From 27cff30833ee145e97a6c84dc976b3def08eb2da Mon Sep 17 00:00:00 2001
From: Laura Wang <3700467+Laurawly@users.noreply.github.com>
Date: Tue, 6 Jan 2026 10:58:01 -0800
Subject: [PATCH 1/8] Provide a vLLM general plugin that registers
oink::rmsnorm and oink::fused_add_rms_norm backed by an SM100 CuTeDSL
RMSNorm kernel.

The ops are torch.compile-friendly (stride-preserving for padded-row inputs)
and the fused op matches vLLM's in-place residual-add RMSNorm semantics.
---
oink/README.md | 57 +
oink/pyproject.toml | 29 +
oink/src/kernelagent_oink/__init__.py | 95 +
.../kernelagent_oink/blackwell/__init__.py | 3 +
.../kernelagent_oink/blackwell/lite_quack.py | 350 +++
.../blackwell/oink_custom_ops.py | 224 ++
.../src/kernelagent_oink/blackwell/rmsnorm.py | 2660 +++++++++++++++++
7 files changed, 3418 insertions(+)
create mode 100644 oink/README.md
create mode 100644 oink/pyproject.toml
create mode 100644 oink/src/kernelagent_oink/__init__.py
create mode 100644 oink/src/kernelagent_oink/blackwell/__init__.py
create mode 100644 oink/src/kernelagent_oink/blackwell/lite_quack.py
create mode 100644 oink/src/kernelagent_oink/blackwell/oink_custom_ops.py
create mode 100644 oink/src/kernelagent_oink/blackwell/rmsnorm.py
diff --git a/oink/README.md b/oink/README.md
new file mode 100644
index 0000000..427f69f
--- /dev/null
+++ b/oink/README.md
@@ -0,0 +1,57 @@
+# KernelAgent Oink (vLLM plugin)
+
+This subproject provides an **out-of-tree vLLM plugin** that registers
+`torch.library.custom_op` entrypoints under the `oink::` namespace:
+
+- `torch.ops.oink.rmsnorm`
+- `torch.ops.oink.fused_add_rms_norm`
+
+The implementation is backed by a CuTeDSL (CUTLASS) RMSNorm kernel tuned for
+**NVIDIA Blackwell (SM100)**.
+
+## Install (editable)
+
+From the `KernelAgent` repo root:
+
+```bash
+pip install -e ./oink
+```
+
+This plugin requires the CuTeDSL stack:
+
+```bash
+pip install nvidia-cutlass-dsl cuda-python
+```
+
+## Use with vLLM
+
+1. Enable the vLLM integration:
+
+```bash
+export VLLM_USE_OINK_RMSNORM=1
+```
+
+2. Ensure vLLM keeps `rms_norm` as a custom op when using `torch.compile` /
+CUDA graphs. In Python:
+
+```python
+from vllm import LLM
+
+llm = LLM(
+ model=...,
+ tensor_parallel_size=...,
+ enforce_eager=False,
+ compilation_config={"custom_ops": ["none", "+rms_norm"]},
+)
+```
+
+Without `+rms_norm`, Inductor may fuse RMSNorm into larger Triton kernels, in
+which case neither vLLM's CUDA RMSNorm kernel nor Oink's will run.
+
+## Notes
+
+- This plugin is designed to be **safe to import even when disabled**; it only
+  registers ops when `VLLM_USE_OINK_RMSNORM` is truthy (`"1"`, `"true"`, `"yes"`,
+  or `"on"`).
+- The ops preserve **padded-row layouts** for 2D tensors (shape `[M, N]`,
+ `stride(1) == 1`, and potentially `stride(0) > N`), which is required for
+ `torch.compile` stride verification on some models (e.g., MLA padded inputs).
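+
+## Quick smoke test (outside vLLM)
+
+A minimal sketch for exercising the registered ops directly (assumes an SM100
+GPU, the CuTeDSL stack installed, and the flag set before `register()` runs):
+
+```python
+import os
+
+os.environ["VLLM_USE_OINK_RMSNORM"] = "1"
+
+import torch
+from kernelagent_oink import register
+
+register()  # normally invoked by vLLM's general-plugin loader
+
+M, N, pad = 8, 4096, 128
+buf = torch.randn(M, N + pad, device="cuda", dtype=torch.bfloat16)
+x = buf[:, :N]  # padded-row view: stride(0) == N + pad, stride(1) == 1
+w = torch.ones(N, device="cuda", dtype=torch.bfloat16)
+
+y = torch.ops.oink.rmsnorm(x, w, 1e-6)
+assert y.stride() == x.stride()  # padded layout is preserved
+
+residual = torch.randn(M, N, device="cuda", dtype=torch.bfloat16)
+torch.ops.oink.fused_add_rms_norm(x, residual, w, 1e-6)  # mutates x and residual
+```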
diff --git a/oink/pyproject.toml b/oink/pyproject.toml
new file mode 100644
index 0000000..a9ec306
--- /dev/null
+++ b/oink/pyproject.toml
@@ -0,0 +1,29 @@
+[build-system]
+requires = ["setuptools>=61.0", "wheel"]
+build-backend = "setuptools.build_meta"
+
+[project]
+name = "kernelagent-oink"
+version = "0.1.0"
+description = "vLLM plugin that registers Oink Blackwell RMSNorm custom ops"
+readme = "README.md"
+requires-python = ">=3.10"
+license = {text = "Apache-2.0"}
+authors = [{name = "PyTorch Labs"}]
+
+# Keep dependencies minimal, but include the CuTeDSL stack required by the
+# Blackwell RMSNorm implementation.
+#
+# We intentionally do NOT depend on `torch` here because vLLM already pins and
+# provides a compatible PyTorch build.
+dependencies = [
+ "nvidia-cutlass-dsl",
+ "cuda-python",
+]
+
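+# vLLM's general-plugin loader discovers this entrypoint and calls
+# `kernelagent_oink.register()` in every process (engine and workers).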
+[project.entry-points."vllm.general_plugins"]
+oink = "kernelagent_oink:register"
+
+[tool.setuptools.packages.find]
+where = ["src"]
+include = ["kernelagent_oink*"]
diff --git a/oink/src/kernelagent_oink/__init__.py b/oink/src/kernelagent_oink/__init__.py
new file mode 100644
index 0000000..542e59e
--- /dev/null
+++ b/oink/src/kernelagent_oink/__init__.py
@@ -0,0 +1,95 @@
+from __future__ import annotations
+
+import logging
+import os
+
+logger = logging.getLogger(__name__)
+
+_OPS_REGISTERED = False
+
+
+def _env_truthy(name: str) -> bool:
+ val = os.environ.get(name)
+ if val is None:
+ return False
+ return val.strip().lower() in ("1", "true", "yes", "on")
+
+
+def _infer_cuda_device_index() -> int:
+ local_rank = os.environ.get("LOCAL_RANK")
+ if local_rank is not None:
+ try:
+ return int(local_rank)
+ except ValueError:
+ pass
+ return 0
+
+
+def _compute_cutedsl_arch(major: int, minor: int) -> str:
+ # CuTeDSL uses an "a" suffix for >= Hopper.
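+    # e.g. (9, 0) -> "sm_90a", (10, 0) -> "sm_100a"; pre-Hopper (8, 0) -> "sm_80".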
+ suffix = "a" if major >= 9 else ""
+ # Match cutlass/base_dsl/env_manager.py: map sm_110 -> sm_101.
+ if major == 11 and minor == 0:
+ major, minor = 10, 1
+ return f"sm_{major}{minor}{suffix}"
+
+
+def register() -> None:
+ """vLLM plugin entrypoint.
+
+ This function must be safe to call multiple times and must not raise.
+ vLLM executes it in multiple processes (engine + workers).
+ """
+ global _OPS_REGISTERED
+
+ if _OPS_REGISTERED:
+ return
+
+ # Gate on the vLLM integration flag so installing the package does not
+ # change behavior unless explicitly enabled.
+ if not _env_truthy("VLLM_USE_OINK_RMSNORM"):
+ return
+
+ try:
+ import torch
+ except Exception as e: # pragma: no cover
+ logger.debug("Oink plugin: torch import failed: %s", e)
+ return
+
+ try:
+ if not torch.cuda.is_available():
+ return
+ device_index = _infer_cuda_device_index()
+ major, minor = torch.cuda.get_device_capability(device_index)
+ sm = 10 * int(major) + int(minor)
+ if sm < 100:
+ return
+
+ # Ensure required deps are importable before registering ops so that vLLM
+ # doesn't detect ops that would later fail at first use.
+ try:
+ import cutlass # noqa: F401
+ import cuda.bindings.driver as _cuda # noqa: F401
+ except Exception as e:
+ logger.warning(
+ "Oink plugin: CuTeDSL deps missing; skipping op registration. "
+ "Install `nvidia-cutlass-dsl` + `cuda-python`. Error: %s",
+ e,
+ )
+ return
+
+ # Ensure CuTeDSL sees a target arch early. If the user has already set it,
+ # respect their choice.
+ os.environ.setdefault("CUTE_DSL_ARCH", _compute_cutedsl_arch(int(major), int(minor)))
+
+ # Import registers the ops via torch.library.custom_op decorators.
+ from .blackwell import oink_custom_ops # noqa: F401
+ except Exception as e: # pragma: no cover
+ # Do not raise: vLLM plugin loader does not guard plugin execution.
+ logger.exception("Oink plugin: failed to register ops: %s", e)
+ return
+
+ _OPS_REGISTERED = True
+
+
+__all__ = ["register"]
diff --git a/oink/src/kernelagent_oink/blackwell/__init__.py b/oink/src/kernelagent_oink/blackwell/__init__.py
new file mode 100644
index 0000000..4d21ee8
--- /dev/null
+++ b/oink/src/kernelagent_oink/blackwell/__init__.py
@@ -0,0 +1,3 @@
+from __future__ import annotations
+
+__all__ = []
diff --git a/oink/src/kernelagent_oink/blackwell/lite_quack.py b/oink/src/kernelagent_oink/blackwell/lite_quack.py
new file mode 100644
index 0000000..3c3f750
--- /dev/null
+++ b/oink/src/kernelagent_oink/blackwell/lite_quack.py
@@ -0,0 +1,350 @@
+"""
+Lightweight local clone of the small subset of helpers that the SM100
+RMSNorm CuTeDSL kernels depend on.
+
+This module intentionally avoids importing the `quack` package so that
+Oink Blackwell kernels can run without Quack installed, while keeping
+numerical behaviour and performance close to the original reference
+implementations.
+"""
+
+from __future__ import annotations
+
+import math
+import operator
+from typing import Callable, Optional, Tuple
+
+import cuda.bindings.driver as cuda # type: ignore
+import torch
+from torch import Tensor
+
+import cutlass
+import cutlass.cute as cute
+from cutlass import Float32, Int32, const_expr
+from cutlass.cute.runtime import from_dlpack
+from cutlass.cutlass_dsl import T, dsl_user_op
+from cutlass._mlir.dialects import llvm, nvvm, vector
+
+
+# -------------------------
+# Dtype mapping
+# -------------------------
+
+TORCH2CUTE_DTYPE = {
+ torch.float16: cutlass.Float16,
+ torch.bfloat16: cutlass.BFloat16,
+ torch.float32: cutlass.Float32,
+}
+
+
+# -------------------------
+# Tensor conversion helpers
+# -------------------------
+
+def convert_from_dlpack(
+ x: Tensor,
+ leading_dim: int,
+ alignment: int = 16,
+ divisibility: int = 1,
+) -> cute.Tensor:
+ """
+    Wrap a torch.Tensor in a CuTeDSL tensor with layout metadata that
+ matches the logical leading dimension and alignment/divisibility
+ constraints expected by SM100 kernels.
+ """
+ return (
+ from_dlpack(x, assumed_align=alignment)
+ .mark_layout_dynamic(leading_dim=leading_dim)
+ .mark_compact_shape_dynamic(
+ mode=leading_dim,
+ stride_order=x.dim_order(),
+ divisibility=divisibility,
+ )
+ )
+
+
+# -------------------------
+# SM90/SM100 cluster helpers
+# -------------------------
+
+
+@dsl_user_op
+def elem_pointer(x: cute.Tensor, coord: cute.Coord, *, loc=None, ip=None) -> cute.Pointer:
+ return x.iterator + cute.crd2idx(coord, x.layout, loc=loc, ip=ip)
+
+
+@dsl_user_op
+def set_block_rank(
+ smem_ptr: cute.Pointer,
+ peer_cta_rank_in_cluster: cute.Int32,
+ *,
+ loc=None,
+ ip=None,
+) -> cutlass.Int32:
+ """Map the given smem pointer to the address at another CTA rank in the cluster."""
+ smem_ptr_i32 = smem_ptr.toint(loc=loc, ip=ip).ir_value()
+ return cutlass.Int32(
+ llvm.inline_asm(
+ T.i32(),
+ [smem_ptr_i32, peer_cta_rank_in_cluster.ir_value()],
+ "mapa.shared::cluster.u32 $0, $1, $2;",
+ "=r,r,r",
+ has_side_effects=False,
+ is_align_stack=False,
+ asm_dialect=llvm.AsmDialect.AD_ATT,
+ )
+ )
+
+
+@dsl_user_op
+def store_shared_remote(
+ val: float | Float32 | Int32 | cutlass.Int64,
+ smem_ptr: cute.Pointer,
+ mbar_ptr: cute.Pointer,
+ peer_cta_rank_in_cluster: cute.typing.Int,
+ *,
+ loc=None,
+ ip=None,
+) -> None:
+ remote_smem_ptr_i32 = set_block_rank(
+ smem_ptr,
+ peer_cta_rank_in_cluster,
+ loc=loc,
+ ip=ip,
+ ).ir_value()
+ remote_mbar_ptr_i32 = set_block_rank(
+ mbar_ptr,
+ peer_cta_rank_in_cluster,
+ loc=loc,
+ ip=ip,
+ ).ir_value()
+ if const_expr(isinstance(val, float)):
+ val = Float32(val)
+ assert isinstance(val, (Float32, Int32, cutlass.Int64)), "val must be Float32, Int32, or Int64"
+ suffix = {Float32: "f32", Int32: "s32", cutlass.Int64: "s64"}[type(val)]
+ constraint = {Float32: "f", Int32: "r", cutlass.Int64: "l"}[type(val)]
+ llvm.inline_asm(
+ None,
+ [remote_smem_ptr_i32, val.ir_value(loc=loc, ip=ip), remote_mbar_ptr_i32],
+ f"st.async.shared::cluster.mbarrier::complete_tx::bytes.{suffix} [$0], $1, [$2];",
+ f"r,{constraint},r",
+ has_side_effects=True,
+ is_align_stack=False,
+ asm_dialect=llvm.AsmDialect.AD_ATT,
+ )
+
+
+@cute.jit
+def predicate_k(tAcA: cute.Tensor, limit: cutlass.Int32) -> cute.Tensor:
+ """
+ Build a predicate tensor for the K dimension only. Values beyond
+ `limit` are masked out.
+ """
+ tApA = cute.make_fragment(
+ cute.make_layout(
+ (cute.size(tAcA, mode=[0, 1]), cute.size(tAcA, mode=[1]), cute.size(tAcA, mode=[2])),
+ stride=(cute.size(tAcA, mode=[2]), 0, 1),
+ ),
+ cutlass.Boolean,
+ )
+ for rest_v in cutlass.range_constexpr(tApA.shape[0]):
+ for rest_k in cutlass.range_constexpr(tApA.shape[2]):
+ tApA[rest_v, 0, rest_k] = cute.elem_less(tAcA[(0, rest_v), 0, rest_k][1], limit)
+ return tApA
+
+
+@dsl_user_op
+def domain_offset_i64(coord: cute.Coord, tensor: cute.Tensor, *, loc=None, ip=None) -> cute.Tensor:
+ """
+ Return a tensor whose iterator is offset by an Int64 byte offset
+ computed from `coord` and the tensor's strides.
+ """
+ flat_coord_i64 = tuple(cutlass.Int64(c) for c in cute.flatten(coord))
+ flat_stride = cute.flatten_to_tuple(tensor.stride)
+ assert len(flat_coord_i64) == len(flat_stride), (
+ "Coordinate and stride must have the same length"
+ )
+ offset = sum(c * s for c, s in zip(flat_coord_i64, flat_stride))
+ assert isinstance(tensor.iterator, cute.Pointer)
+ new_ptr = cute.make_ptr(
+ tensor.element_type,
+ tensor.iterator.toint() + offset * tensor.element_type.width // 8,
+ tensor.memspace,
+ assumed_align=tensor.iterator.max_alignment,
+ )
+ return cute.make_tensor(new_ptr, tensor.layout)
+
+
+# -------------------------
+# Reduction helpers
+# -------------------------
+
+
+@cute.jit
+def warp_reduce(
+ val: cute.TensorSSA | cute.Numeric,
+ op: Callable,
+ width: cutlass.Constexpr[int] = cute.arch.WARP_SIZE,
+) -> cute.TensorSSA | cute.Numeric:
+ """
+ Warp-level reduction for either scalar values or small TensorSSA
+ fragments.
+ """
+ if cutlass.const_expr(isinstance(val, cute.TensorSSA)):
+ res = cute.make_fragment(val.shape, val.dtype)
+ res.store(val)
+ for i in cutlass.range_constexpr(cute.size(val.shape)):
+ res[i] = warp_reduce(res[i], op, width)
+ return res.load()
+ for i in cutlass.range_constexpr(int(math.log2(width))):
+ val = op(val, cute.arch.shuffle_sync_bfly(val, offset=1 << i))
+ return val
+
+
+@cute.jit
+def block_reduce(
+ val: cute.Numeric,
+ op: Callable,
+ reduction_buffer: cute.Tensor,
+ init_val: cute.Numeric = 0.0,
+) -> cute.Numeric:
+ """Block-level reduction across warps."""
+ lane_idx, warp_idx = cute.arch.lane_idx(), cute.arch.warp_idx()
+ warps_per_row = cute.size(reduction_buffer.shape[1])
+ row_idx, col_idx = warp_idx // warps_per_row, warp_idx % warps_per_row
+ if lane_idx == 0:
+ reduction_buffer[row_idx, col_idx] = val
+ cute.arch.barrier()
+ block_reduce_val = init_val
+ if lane_idx < warps_per_row:
+ block_reduce_val = reduction_buffer[row_idx, lane_idx]
+ return warp_reduce(block_reduce_val, op)
+
+
+@cute.jit
+def cluster_reduce(
+ val: cute.Numeric,
+ op: Callable,
+ reduction_buffer: cute.Tensor,
+ mbar_ptr: cute.Pointer,
+ init_val: cute.Numeric = 0.0,
+ phase: Optional[cutlass.Int32] = None,
+) -> cute.Numeric:
+ """
+ Cluster-wide reduction using shared memory and mbarrier. The
+ reduction_buffer has shape (rows_per_block, (warps_per_row, cluster_n)).
+ """
+ cta_rank_in_cluster = cute.arch.block_idx_in_cluster()
+ lane_idx, warp_idx = cute.arch.lane_idx(), cute.arch.warp_idx()
+ rows_per_block, (warps_per_row, cluster_n) = reduction_buffer.shape
+ row_idx, col_idx = warp_idx // warps_per_row, warp_idx % warps_per_row
+ if warp_idx == 0:
+ with cute.arch.elect_one():
+ num_warps = rows_per_block * warps_per_row
+ cute.arch.mbarrier_arrive_and_expect_tx(
+ mbar_ptr,
+ num_warps * cluster_n * reduction_buffer.element_type.width // 8,
+ )
+ if lane_idx < cluster_n:
+ store_shared_remote(
+ val,
+ elem_pointer(reduction_buffer, (row_idx, (col_idx, cta_rank_in_cluster))),
+ mbar_ptr,
+ peer_cta_rank_in_cluster=lane_idx,
+ )
+ cute.arch.mbarrier_wait(mbar_ptr, phase=phase if phase is not None else 0)
+ block_reduce_val = init_val
+ num_iter = cute.ceil_div(warps_per_row * cluster_n, cute.arch.WARP_SIZE)
+ for i in cutlass.range_constexpr(num_iter):
+ idx = lane_idx + i * cute.arch.WARP_SIZE
+ if idx < cute.size(reduction_buffer, mode=[1]):
+ block_reduce_val = op(block_reduce_val, reduction_buffer[row_idx, idx])
+ return warp_reduce(block_reduce_val, op)
+
+
+@cute.jit
+def block_or_cluster_reduce(
+ val: cute.Numeric,
+ op: Callable,
+ reduction_buffer: cute.Tensor,
+ mbar_ptr: Optional[cute.Pointer],
+ phase: Optional[cutlass.Int32] = None,
+ init_val: cute.Numeric = 0.0,
+) -> cute.Numeric:
+ """Dispatch between block or cluster reduction depending on mbar_ptr."""
+ if cutlass.const_expr(mbar_ptr is None):
+ return block_reduce(val, op, reduction_buffer, init_val=init_val)
+ return cluster_reduce(val, op, reduction_buffer, mbar_ptr, init_val=init_val, phase=phase)
+
+
+@cute.jit
+def row_reduce(
+ x: cute.TensorSSA | cute.Numeric,
+ op: cute.ReductionOp,
+ threads_per_row: cutlass.Constexpr[int],
+ reduction_buffer: Optional[cute.Tensor] = None,
+ mbar_ptr: Optional[cute.Pointer] = None,
+ phase: Optional[cutlass.Int32] = None,
+ init_val: cute.Numeric = 0.0,
+ hook_fn: Optional[Callable] = None,
+) -> cute.Numeric:
+ """
+ Row-wise reduction used by RMSNorm and similar kernels.
+
+ reduction_buffer must have shape
+ (num_warps / warps_per_row, (warps_per_row, cluster_n)).
+ """
+ if cutlass.const_expr(isinstance(x, cute.TensorSSA)):
+ val = x.reduce(op, init_val=init_val, reduction_profile=0)
+ else:
+ val = x
+ warp_op = {
+ cute.ReductionOp.ADD: operator.add,
+ cute.ReductionOp.MAX: cute.arch.fmax if cutlass.const_expr(x.dtype == Float32) else max,
+ cute.ReductionOp.MIN: min,
+ cute.ReductionOp.MUL: operator.mul,
+ }[op]
+ val = warp_reduce(
+ val,
+ warp_op,
+ width=min(threads_per_row, cute.arch.WARP_SIZE),
+ )
+ if cutlass.const_expr(hook_fn is not None):
+ hook_fn()
+ if cutlass.const_expr(reduction_buffer is not None):
+ warps_per_row, cluster_n = reduction_buffer.shape[1]
+ assert cluster_n == 1 or mbar_ptr is not None, (
+ "mbar_ptr must be provided for cluster reduction"
+ )
+ if cutlass.const_expr(warps_per_row > 1 or cluster_n > 1):
+ val = block_or_cluster_reduce(
+ val,
+ warp_op,
+ reduction_buffer,
+ mbar_ptr,
+ phase=phase,
+ init_val=init_val,
+ )
+ return val
+
+
+# -------------------------
+# SM count helper
+# -------------------------
+
+
+def get_sm_count(N: int, device: torch.device) -> int:
+ """
+ Heuristic for the number of persistent CTAs (sm_count) based on N and
+ the GPU's SM count. This mirrors the behaviour used in Quack's
+ RMSNorm kernels but lives entirely in this local module.
+ """
+ sm_count_multiple = (
+ 16 if N <= 256 else (8 if N <= 1024 else (4 if N <= 2048 else (2 if N <= 4096 else 1)))
+ )
+ sm_count = torch.cuda.get_device_properties(device).multi_processor_count
+ sm_count = (
+ sm_count * sm_count_multiple if N <= 8192 else sm_count // 2 if N <= 16384 else sm_count * 2
+ )
+ return sm_count
+
diff --git a/oink/src/kernelagent_oink/blackwell/oink_custom_ops.py b/oink/src/kernelagent_oink/blackwell/oink_custom_ops.py
new file mode 100644
index 0000000..8225025
--- /dev/null
+++ b/oink/src/kernelagent_oink/blackwell/oink_custom_ops.py
@@ -0,0 +1,224 @@
+"""
+Torch custom ops wrapping Oink's Blackwell RMSNorm kernels.
+
+These ops are designed to be:
+- Architecture-aware (use CuTeDSL SM100 kernels when available, fall back
+ to a safe reference elsewhere).
+- Layout-preserving for 2D row-major inputs, including padded MLA-style
+ layouts where stride(0) > N and stride(1) == 1.
+- torch.compile-friendly via proper fake implementations that mirror
+ runtime shapes and strides.
+
+Public ops (Python signatures):
+
+ torch.ops.oink.rmsnorm(x: Tensor, weight: Tensor, eps: float) -> Tensor
+ Functional RMSNorm. Returns a new tensor with the same shape and
+ stride as x when using the fast CuTeDSL path.
+
+ torch.ops.oink.fused_add_rms_norm(
+ x: Tensor, residual: Tensor, weight: Tensor, eps: float
+ ) -> None
+ In-place fused residual-add + RMSNorm matching vLLM semantics:
+ residual = x + residual (stored into `residual`)
+ x = RMSNorm(residual, w) (stored into `x`)
+ Mutates `x` and `residual` in-place and returns None.
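+
+Illustrative usage (a sketch; assumes the ops have been registered, e.g. via
+the vLLM plugin entrypoint, and that a capable CUDA device is present):
+
+    x = torch.randn(8, 4096, device="cuda", dtype=torch.bfloat16)
+    w = torch.ones(4096, device="cuda", dtype=torch.bfloat16)
+    y = torch.ops.oink.rmsnorm(x, w, 1e-6)
+
+    residual = torch.randn_like(x)
+    torch.ops.oink.fused_add_rms_norm(x, residual, w, 1e-6)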
+"""
+
+import importlib
+import threading
+
+import torch
+from torch.library import custom_op
+
+_RMSNORM_MOD: object | None = None
+_RMSNORM_MOD_LOCK = threading.Lock()
+
+
+def _get_rmsnorm_mod():
+ """Lazy import to keep plugin registration lightweight.
+
+ Importing the CuTeDSL kernel stack can be expensive and may require a CUDA
+ context. We defer it until the first actual execution of the custom op.
+ """
+ global _RMSNORM_MOD
+
+ cached = _RMSNORM_MOD
+ if cached is not None:
+ return cached
+
+ with _RMSNORM_MOD_LOCK:
+ if _RMSNORM_MOD is None:
+ _RMSNORM_MOD = importlib.import_module("kernelagent_oink.blackwell.rmsnorm")
+ return _RMSNORM_MOD
+
+
+def _get_sm(device: torch.device | None = None) -> int:
+ """Return SM version as an int (e.g., 100 for SM100 / Blackwell)."""
+ if device is None:
+ device = torch.device("cuda")
+ major, minor = torch.cuda.get_device_capability(device)
+ return 10 * major + minor
+
+
+#
+# RMSNorm (functional)
+#
+
+@custom_op("oink::rmsnorm", mutates_args=())
+def oink_rmsnorm(
+ x: torch.Tensor,
+ weight: torch.Tensor,
+ eps: float,
+) -> torch.Tensor:
+ """
+ Functional RMSNorm entrypoint.
+
+ This op is model-agnostic. It expects a 2D [M, N] view of the input
+ where the last dimension is contiguous (stride(1) == 1). The leading
+ dimension stride(0) may be larger than N (padded-row layouts), and
+ will be preserved on the fast CuTeDSL path.
+
+ On SM100 (and newer), this dispatches to the tuned CuTeDSL Blackwell
+ RMSNorm kernel in rmsnorm.rmsnorm_forward, which in turn selects the
+ best internal schedule (including DSv3-specific stage-2 kernels where
+ applicable) and preserves the input's 2D stride when using the
+ pointer-based path.
+
+ On older architectures it falls back to a safe PyTorch reference
+ implementation for correctness.
+ """
+ assert x.is_cuda, "oink::rmsnorm requires CUDA tensors"
+ assert x.dim() == 2, "oink::rmsnorm expects a 2D [M, N] tensor view"
+ assert weight.dim() == 1, "weight must be 1D [N]"
+
+ sm = _get_sm(x.device)
+ if sm >= 100:
+ # Use the tuned CuTeDSL SM100 kernel. The public API already
+ # contains all necessary gating and layout checks internally.
+ _rms = _get_rmsnorm_mod()
+ y, _rstd, _res = _rms.rmsnorm_forward(
+ x,
+ weight=weight,
+ bias=None,
+ residual=None,
+ eps=eps,
+ store_rstd=False,
+ )
+ return y
+
+ # Fallback: reference implementation (correctness-first).
+ _rms = _get_rmsnorm_mod()
+ return _rms.rmsnorm_ref(
+ x,
+ w=weight,
+ b=None,
+ residual=None,
+ eps=eps,
+ )
+
+
+@oink_rmsnorm.register_fake
+def oink_rmsnorm_fake(
+ x: torch.Tensor,
+ weight: torch.Tensor,
+ eps: float,
+) -> torch.Tensor:
+ """
+ Fake (meta) implementation for oink::rmsnorm.
+
+ We must preserve x's logical layout (shape + stride) so that Inductor's
+ CUDA graph capture sees the same stride contract as the real kernel.
+ """
+ # x is a FakeTensor here; x.shape/x.stride()/x.device/x.dtype are defined.
+ return torch.empty_strided(
+ x.shape,
+ x.stride(),
+ device=x.device,
+ dtype=x.dtype,
+ )
+
+
+#
+# Fused residual-add + RMSNorm (in-place, vLLM semantics)
+#
+
+@custom_op("oink::fused_add_rms_norm", mutates_args=("x", "residual"))
+def oink_fused_add_rms_norm(
+ x: torch.Tensor,
+ residual: torch.Tensor,
+ weight: torch.Tensor,
+ eps: float,
+) -> None:
+ """
+ In-place fused residual-add + RMSNorm:
+
+ residual <- x + residual
+ x <- RMSNorm(residual, weight, eps)
+
+ Returns:
+ None (mutates `x` and `residual` in-place).
+ """
+ assert x.is_cuda and residual.is_cuda, "oink::fused_add_rms_norm requires CUDA tensors"
+ assert x.shape == residual.shape, "x and residual must have the same shape"
+ assert x.dtype == residual.dtype, "x and residual must have the same dtype"
+ assert weight.dim() == 1, "weight must be 1D [N]"
+
+ sm = _get_sm(x.device)
+ if sm >= 100:
+ _rms = _get_rmsnorm_mod()
+ # Prefer the lowest-overhead in-place entrypoint (returns None).
+ if hasattr(_rms, "fused_add_rmsnorm_inplace_"):
+ _rms.fused_add_rmsnorm_inplace_( # type: ignore[misc]
+ x,
+ residual,
+ weight,
+ eps=eps,
+ )
+ return None
+ # Backward-compatible wrapper (returns (x, residual)).
+ if hasattr(_rms, "fused_add_rmsnorm_forward_inplace"):
+ _rms.fused_add_rmsnorm_forward_inplace( # type: ignore[misc]
+ x,
+ residual,
+ weight,
+ eps=eps,
+ )
+ return None
+
+ # Extremely defensive fallback if the Oink module doesn't provide
+ # the in-place entrypoint.
+ y, z = _rms.fused_add_rmsnorm_forward(x, residual, weight, eps=eps)
+ x.copy_(y)
+ residual.copy_(z)
+ return None
+
+ # Non-SM100 fallback: keep semantics in-place (correctness-first).
+ residual.add_(x)
+ _rms = _get_rmsnorm_mod()
+ y = _rms.rmsnorm_ref(residual, w=weight, b=None, residual=None, eps=eps)
+ x.copy_(y)
+ return None
+
+
+@oink_fused_add_rms_norm.register_fake
+def oink_fused_add_rms_norm_fake(
+ x: torch.Tensor,
+ residual: torch.Tensor,
+ weight: torch.Tensor,
+ eps: float,
+) -> None:
+ """
+ Fake (meta) implementation for oink::fused_add_rms_norm.
+
+    This op mutates `x` and `residual` in-place and returns None, so there is
+    nothing to allocate here; the mutated buffers keep their original shapes
+    and strides.
+ """
+ return None
+
+
+__all__ = [
+ "oink_rmsnorm",
+ "oink_fused_add_rms_norm",
+]
diff --git a/oink/src/kernelagent_oink/blackwell/rmsnorm.py b/oink/src/kernelagent_oink/blackwell/rmsnorm.py
new file mode 100644
index 0000000..d6c2c20
--- /dev/null
+++ b/oink/src/kernelagent_oink/blackwell/rmsnorm.py
@@ -0,0 +1,2660 @@
+"""
+RMSNorm kernel for SM100 (Blackwell) in CuTeDSL.
+
+This implementation targets Blackwell with:
+- A stride-preserving pointer path for padded-row layouts (e.g. MLA, stride(0) > N).
+- A one-pass fused-add RMSNorm schedule for bf16/fp16 (DSv3 N=7168) that keeps
+ `x + residual` in registers (avoids re-reading gmem) and uses FP32 accumulation.
+- Optional experimental schedule knobs (env vars) to explore copy widths and
+ stage-2 cp.async variants.
+
+Note: This file expects the local CuTeDSL (cutlass) and SM100 helper modules
+to be available in the Python environment (e.g., `nvidia-cutlass-dsl` and
+`cuda-python`). It is shipped as part of the KernelAgent Oink vLLM plugin.
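+
+Illustrative usage of the host-side API (a sketch; assumes an SM100 GPU and a
+2D row-major input whose last dimension is contiguous):
+
+    x = torch.randn(4096, 7168, device="cuda", dtype=torch.bfloat16)
+    w = torch.ones(7168, device="cuda", dtype=torch.bfloat16)
+    y, _rstd, _res = rmsnorm_forward(
+        x, weight=w, bias=None, residual=None, eps=1e-6, store_rstd=False
+    )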
+"""
+
+from __future__ import annotations
+
+import ctypes
+import importlib.metadata
+import os
+import re
+import subprocess
+import sys
+import threading
+from typing import Optional, Tuple
+
+_HERE = os.path.dirname(__file__)
+
+# CuTeDSL caches generated MLIR into a tempdir under a global default
+# (`/tmp/$USER/cutlass_python_cache`). The cache bytecode format can differ across
+# `nvidia-cutlass-dsl` versions (e.g. 4.3.2 vs 4.3.4), and cross-version cache
+# sharing causes noisy "invalid section ID" warnings (and disables cache reuse).
+#
+# If the user has not pinned `CUTE_DSL_CACHE_DIR`, isolate by version so multiple
+# CuTeDSL envs can coexist on the same machine without stepping on each other.
+if "CUTE_DSL_CACHE_DIR" not in os.environ:
+ try:
+ _dsl_ver = importlib.metadata.version("nvidia-cutlass-dsl")
+ except Exception:
+ _dsl_ver = "unknown"
+ _dsl_ver = re.sub(r"[^0-9A-Za-z]+", "_", _dsl_ver)
+ _user = os.environ.get("USER") or os.environ.get("USERNAME") or "user"
+ _tmp = os.environ.get("TMPDIR") or "/tmp"
+ os.environ["CUTE_DSL_CACHE_DIR"] = os.path.join(
+ _tmp, _user, f"cutlass_python_cache_{_dsl_ver}"
+ )
+
+try:
+ import cutlass # type: ignore # noqa: F401
+except Exception as e:
+ raise ImportError(
+ "kernelagent_oink.blackwell.rmsnorm requires CuTeDSL's Python package "
+ "(`cutlass`, typically provided by `nvidia-cutlass-dsl`)."
+ ) from e
+
+import torch
+from torch import Tensor
+
+import cuda.bindings.driver as cuda # provided by NVIDIA cuda-python
+
+import cutlass
+import cutlass.cute as cute
+from cutlass import Float32, Int32, const_expr
+from cutlass.cute import runtime as rt
+
+# Simple compile cache declared early so direct execution works
+_PTR_COMPILE_CACHE = {}
+
+# Thread-local cache for the fast-launch path. We keep per-thread packed args and
+# pointer/scalar storage so concurrent callers don't race on in-place updates.
+_PTR_FAST_LAUNCH_TLS = threading.local()
+
+def _env_flag(name: str, default: bool) -> bool:
+ val = os.environ.get(name)
+ if val is None:
+ return default
+ return val.strip().lower() not in {"0", "false", "no", "off", ""}
+
+
+# Fast-launch uses a few private-ish CuTeDSL internals (packed args plumbing and
+# runtime pointer descriptors). Keep it enabled by default for our pinned CuTeDSL
+# environment, but allow disabling it via env var and auto-disable it if those
+# internals are not present in a future upgrade.
+_ENABLE_FAST_LAUNCH = _env_flag("OINK_CUTEDSL_FAST_LAUNCH", default=True)
+_FAST_LAUNCH_SUPPORTED = True
+
+# Fused-add RMSNorm schedule knobs (read once at import time; set env vars before
+# importing this module if you want to override).
+_DIRECT_GMEM_POLICY = (os.environ.get("OINK_RMSNORM_DIRECT_GMEM", "auto").strip().lower() or "auto")
+_COPY_BITS_POLICY = (os.environ.get("OINK_RMSNORM_COPY_BITS", "auto").strip().lower() or "auto")
+_ENABLE_CLUSTER_ILP = _env_flag("OINK_RMSNORM_ENABLE_CLUSTER_ILP", default=False)
+_ENABLE_CLUSTER_ILP_UNSAFE = _env_flag("OINK_RMSNORM_ENABLE_CLUSTER_ILP_UNSAFE", default=False)
+_ENABLE_TPR256 = _env_flag("OINK_RMSNORM_ENABLE_TPR256", default=False)
+_ENABLE_STAGE2 = _env_flag("OINK_RMSNORM_ENABLE_STAGE2", default=False)
+
+# CuTeDSL stability probe for the experimental cluster_n>1 + direct-GMEM schedule.
+#
+# Some CuTeDSL builds segfault during JIT compilation when combining:
+# - cluster launches (cluster_n>1) and
+# - direct-GMEM loads/stores (no staging SMEM tiles).
+#
+# We keep the schedule gated behind `OINK_RMSNORM_ENABLE_CLUSTER_ILP=1` +
+# `OINK_RMSNORM_ENABLE_CLUSTER_ILP_UNSAFE=1`, and additionally run a one-time
+# out-of-process compile probe so we can safely fall back to the staged SMEM
+# path instead of crashing the parent process.
+#
+# This is (currently) sensitive to the vector width: we have observed
+# reproducible segfaults for the 256b universal-copy path, while the 128b path
+# can succeed. Cache the maximum supported copy width (0 = unsupported).
+_CLUSTER_DIRECT_GMEM_MAX_COPY_BITS: Optional[int] = None
+_CLUSTER_DIRECT_GMEM_PROBE_LOCK = threading.Lock()
+_CLUSTER_DIRECT_GMEM_PROBE_WARNED = False
+
+
+def _probe_cluster_direct_gmem_max_copy_bits() -> int:
+ global _CLUSTER_DIRECT_GMEM_MAX_COPY_BITS
+ global _CLUSTER_DIRECT_GMEM_PROBE_WARNED
+
+ override = os.environ.get("OINK_RMSNORM_CLUSTER_DIRECT_GMEM_MAX_COPY_BITS")
+ if override is not None and override.strip() != "":
+ try:
+ value = int(override)
+ except ValueError:
+ value = 0
+ value = 256 if value >= 256 else 128 if value >= 128 else 0
+ _CLUSTER_DIRECT_GMEM_MAX_COPY_BITS = value
+ return value
+
+ if _CLUSTER_DIRECT_GMEM_MAX_COPY_BITS is not None:
+ return _CLUSTER_DIRECT_GMEM_MAX_COPY_BITS
+
+ with _CLUSTER_DIRECT_GMEM_PROBE_LOCK:
+ if _CLUSTER_DIRECT_GMEM_MAX_COPY_BITS is not None:
+ return _CLUSTER_DIRECT_GMEM_MAX_COPY_BITS
+
+ script_template = r"""
+import os
+
+os.environ["OINK_CUTEDSL_FAST_LAUNCH"] = "0"
+
+import cutlass
+import cutlass.cute as cute
+import cuda.bindings.driver as cuda
+from cutlass import Float32, Int32
+from cutlass.cute import runtime as rt
+
+from kernelagent_oink.blackwell import rmsnorm
+
+N = 7168
+dtype = cutlass.BFloat16
+
+copy_bits = int(os.environ["OINK_PROBE_COPY_BITS"])
+assumed_align = int(os.environ["OINK_PROBE_ASSUMED_ALIGN"])
+
+op = rmsnorm.RMSNormSM100(
+ N,
+ dtype,
+ stage=1,
+ copy_bits=copy_bits,
+ use_async=False,
+ direct_gmem=True,
+)
+op._cluster_n_override = 2 # 2 CTAs per row
+
+ptr_x = rt.make_ptr(dtype, 0, mem_space=rt.AddressSpace.gmem, assumed_align=assumed_align)
+ptr_res = rt.make_ptr(dtype, 0, mem_space=rt.AddressSpace.gmem, assumed_align=assumed_align)
+ptr_w = rt.make_ptr(dtype, 0, mem_space=rt.AddressSpace.gmem, assumed_align=assumed_align)
+
+_ = cute.compile(
+ op.launch_from_ptrs_fused_add_inplace,
+ ptr_x,
+ ptr_w,
+ ptr_res,
+ Int32(4096),
+ Int32(N),
+ Int32(N),
+ cuda.CUstream(0),
+ Float32(1e-6),
+)
+print(f"ok {copy_bits}")
+"""
+
+ env = os.environ.copy()
+ env["PYTHONNOUSERSITE"] = "1"
+
+ def run_probe(copy_bits: int, assumed_align: int):
+ probe_env = env.copy()
+ probe_env["OINK_PROBE_COPY_BITS"] = str(copy_bits)
+ probe_env["OINK_PROBE_ASSUMED_ALIGN"] = str(assumed_align)
+ return subprocess.run(
+ [sys.executable, "-c", script_template],
+ env=probe_env,
+ stdout=subprocess.PIPE,
+ stderr=subprocess.PIPE,
+ text=True,
+ timeout=120.0,
+ )
+
+ proc_256 = None
+ proc_128 = None
+ try:
+ proc_256 = run_probe(256, 32)
+ if proc_256.returncode == 0:
+ max_bits = 256
+ else:
+ proc_128 = run_probe(128, 16)
+ max_bits = 128 if proc_128.returncode == 0 else 0
+ except Exception:
+ max_bits = 0
+
+ if not _CLUSTER_DIRECT_GMEM_PROBE_WARNED and max_bits != 256:
+ _CLUSTER_DIRECT_GMEM_PROBE_WARNED = True
+ if max_bits == 128:
+ print(
+ "Oink: cluster_n>1 + direct_gmem 256b compile probe failed; "
+ "using 128b copies for the cluster ILP schedule.",
+ file=sys.stderr,
+ )
+ if proc_256 is not None and proc_256.stderr:
+ tail = "\n".join(proc_256.stderr.splitlines()[-12:])
+ print(f"Oink: probe stderr tail:\n{tail}", file=sys.stderr)
+ else:
+ rc = (
+ proc_128.returncode
+ if proc_128 is not None
+ else (proc_256.returncode if proc_256 is not None else "unknown")
+ )
+ print(
+ "Oink: cluster_n>1 + direct_gmem compile probe failed; "
+ f"falling back to staged SMEM path (returncode={rc}).",
+ file=sys.stderr,
+ )
+ failing_proc = proc_128 if proc_128 is not None else proc_256
+ if failing_proc is not None and failing_proc.stderr:
+ tail = "\n".join(failing_proc.stderr.splitlines()[-12:])
+ print(f"Oink: probe stderr tail:\n{tail}", file=sys.stderr)
+
+ _CLUSTER_DIRECT_GMEM_MAX_COPY_BITS = max_bits
+ return max_bits
+
+def _parse_version_tuple(version: str) -> Tuple[int, int, int]:
+ parts = version.split(".")
+ nums: list[int] = []
+ for part in parts[:3]:
+ match = re.match(r"^(\d+)", part)
+ nums.append(int(match.group(1)) if match is not None else 0)
+ while len(nums) < 3:
+ nums.append(0)
+ return nums[0], nums[1], nums[2]
+
+
+def _cutlass_dsl_version() -> Optional[Tuple[int, int, int]]:
+ try:
+ return _parse_version_tuple(importlib.metadata.version("nvidia-cutlass-dsl"))
+ except Exception:
+ return None
+
+
+_CUTLASS_DSL_VERSION = _cutlass_dsl_version()
+# CuTeDSL 4.3.4 tightened some kernel argument expectations (notably around
+# passing Layout/Shape/Constexpr objects into @cute.kernel functions). Keep the
+# older signature for 4.3.2, but switch to a 4.3.4-compatible signature when we
+# detect 4.3.4+ (or when version detection is unavailable).
+_KERNEL_ACCEPTS_LAYOUT_ARGS = _CUTLASS_DSL_VERSION is not None and _CUTLASS_DSL_VERSION < (4, 3, 4)
+
+if _ENABLE_CLUSTER_ILP and not _ENABLE_CLUSTER_ILP_UNSAFE:
+ # We have observed reproducible segfaults in some CuTeDSL builds when using
+ # cluster launches for this schedule. Require an explicit UNSAFE opt-in to
+ # avoid accidental crashes.
+ _ENABLE_CLUSTER_ILP = False
+ print(
+ "Oink: OINK_RMSNORM_ENABLE_CLUSTER_ILP requested but disabled by default due to "
+ "known instability; set OINK_RMSNORM_ENABLE_CLUSTER_ILP_UNSAFE=1 to force-enable.",
+ file=sys.stderr,
+ )
+
+
+def _fast_launch_enabled() -> bool:
+ return _ENABLE_FAST_LAUNCH and _FAST_LAUNCH_SUPPORTED
+
+
+def _direct_gmem_from_policy(*, default: bool) -> bool:
+ """Resolve the direct-GMEM schedule flag from the (import-time) policy string."""
+ if _DIRECT_GMEM_POLICY in {"0", "false", "no", "off"}:
+ return False
+ if _DIRECT_GMEM_POLICY in {"1", "true", "yes", "on"}:
+ return True
+ return default
+
+
+def _copy_bits_from_policy(*, default: int, can_use_256: bool) -> int:
+ """Resolve copy width (in bits) from the (import-time) policy string."""
+ if _COPY_BITS_POLICY in {"128"}:
+ return 128
+ if _COPY_BITS_POLICY in {"256"} and can_use_256:
+ return 256
+ return default
+
+
+class _StableI32Arg:
+ """A stable Int32 runtime arg (avoids per-call Int32().__c_pointers__ allocations)."""
+
+ def __init__(self, value: int):
+ self._c_value = ctypes.c_int32(int(value))
+ self._c_pointer = ctypes.cast(ctypes.pointer(self._c_value), ctypes.c_void_p)
+
+ def set(self, value: int) -> None:
+ self._c_value.value = int(value)
+
+ def __c_pointers__(self):
+ return [self._c_pointer]
+
+
+class _StableF32Arg:
+ """A stable Float32 runtime arg (avoids per-call Float32().__c_pointers__ allocations)."""
+
+ def __init__(self, value: float):
+ self._c_value = ctypes.c_float(float(value))
+ self._c_pointer = ctypes.cast(ctypes.pointer(self._c_value), ctypes.c_void_p)
+
+ def set(self, value: float) -> None:
+ self._c_value.value = float(value)
+
+ def __c_pointers__(self):
+ return [self._c_pointer]
+
+
+def _tls_fast_launch_cache() -> dict[tuple[object, ...], object]:
+ cache = getattr(_PTR_FAST_LAUNCH_TLS, "cache", None)
+ if cache is None:
+ cache = {}
+ _PTR_FAST_LAUNCH_TLS.cache = cache
+ return cache
+
+
+def _set_runtime_ptr(ptr: object, device_ptr: int) -> None:
+ # Runtime pointer objects cache a `ctypes.c_void_p` descriptor and pass
+ # its address to the compiled function. Updating `_desc.value` updates
+ # the device pointer without changing the address of the descriptor.
+ #
+ # This relies on internal CuTeDSL runtime pointer fields (`_desc`, `_pointer`,
+ # etc.). If these internals change in a future CuTeDSL upgrade, callers
+ # should catch AttributeError and fall back to the regular launch path.
+ device_ptr = int(device_ptr)
+ ptr._pointer = device_ptr # type: ignore[attr-defined]
+ if getattr(ptr, "_c_pointer", None) is None:
+ ptr.__c_pointers__() # type: ignore[attr-defined]
+ ptr._desc.value = device_ptr # type: ignore[attr-defined]
+
+
+class _PtrRmsnormFastLaunch:
+ def __init__(
+ self,
+ *,
+ compiled: object,
+ executor: object,
+ capi_func: object,
+ ptr_x: object,
+ ptr_w: Optional[object],
+ ptr_out: object,
+ arg_m: _StableI32Arg,
+ arg_n: _StableI32Arg,
+ arg_ld: _StableI32Arg,
+ arg_eps: _StableF32Arg,
+ stream: cuda.CUstream,
+ packed_args: object,
+ keepalive: tuple[object, ...],
+ ):
+ self._compiled = compiled
+ self._executor = executor
+ self._capi_func = capi_func
+ self._ptr_x = ptr_x
+ self._ptr_w = ptr_w
+ self._ptr_out = ptr_out
+ self._arg_m = arg_m
+ self._arg_n = arg_n
+ self._arg_ld = arg_ld
+ self._arg_eps = arg_eps
+ self._stream = stream
+ self._packed_args = packed_args
+ self._keepalive = keepalive
+
+ self._use_fast_launch = True
+
+ self._cuda_result = getattr(executor, "cuda_result", None)
+
+ self._last_x_ptr = -1
+ self._last_w_ptr = -1
+ self._last_out_ptr = -1
+ self._last_m = -1
+ self._last_ld = -1
+ self._last_eps = float("nan")
+
+ def launch(
+ self,
+ *,
+ x: Tensor,
+ weight: Optional[Tensor],
+ out: Tensor,
+ M: int,
+ N: int,
+ ld: int,
+ eps: float,
+ ) -> None:
+ if not _fast_launch_enabled() or not self._use_fast_launch:
+ self._fallback_launch(x=x, weight=weight, out=out, M=M, N=N, ld=ld, eps=eps)
+ return
+
+ x_ptr = x.data_ptr()
+ if x_ptr != self._last_x_ptr:
+ try:
+ _set_runtime_ptr(self._ptr_x, x_ptr)
+ self._last_x_ptr = x_ptr
+ except AttributeError:
+ self._disable_fast_launch()
+ self._fallback_launch(x=x, weight=weight, out=out, M=M, N=N, ld=ld, eps=eps)
+ return
+
+ if self._ptr_w is not None:
+ w_ptr = weight.data_ptr() # type: ignore[union-attr]
+ if w_ptr != self._last_w_ptr:
+ try:
+ _set_runtime_ptr(self._ptr_w, w_ptr)
+ self._last_w_ptr = w_ptr
+ except AttributeError:
+ self._disable_fast_launch()
+ self._fallback_launch(x=x, weight=weight, out=out, M=M, N=N, ld=ld, eps=eps)
+ return
+
+ out_ptr = out.data_ptr()
+ if out_ptr != self._last_out_ptr:
+ try:
+ _set_runtime_ptr(self._ptr_out, out_ptr)
+ self._last_out_ptr = out_ptr
+ except AttributeError:
+ self._disable_fast_launch()
+ self._fallback_launch(x=x, weight=weight, out=out, M=M, N=N, ld=ld, eps=eps)
+ return
+
+ if M != self._last_m:
+ self._arg_m.set(M)
+ self._last_m = M
+ if ld != self._last_ld:
+ self._arg_ld.set(ld)
+ self._last_ld = ld
+ if eps != self._last_eps:
+ self._arg_eps.set(eps)
+ self._last_eps = eps
+
+ # Clear the error slot before launch (mirrors JitExecutor behavior).
+ if self._cuda_result is not None:
+ self._cuda_result.value = 0
+
+ ret = self._capi_func(self._packed_args) # type: ignore[misc]
+ if ret != 0:
+ raise RuntimeError(f"CuTeDSL capi_func returned non-zero: {ret}")
+ if self._cuda_result is not None:
+ err = int(self._cuda_result.value)
+ if err != 0:
+ raise RuntimeError(f"CuTeDSL kernel launch failed (cuda_result={err})")
+
+ def _disable_fast_launch(self) -> None:
+ global _FAST_LAUNCH_SUPPORTED
+ self._use_fast_launch = False
+ _FAST_LAUNCH_SUPPORTED = False
+
+ def _fallback_launch(
+ self,
+ *,
+ x: Tensor,
+ weight: Optional[Tensor],
+ out: Tensor,
+ M: int,
+ N: int,
+ ld: int,
+ eps: float,
+ ) -> None:
+ # If the packed-args or runtime pointer mutation path stops working
+ # (e.g. due to a CuTeDSL upgrade), fall back to the regular call path.
+ dtype = TORCH2CUTE_DTYPE[x.dtype]
+ ptr_x = rt.make_ptr(dtype, x.data_ptr(), mem_space=rt.AddressSpace.gmem, assumed_align=16)
+ ptr_out = rt.make_ptr(dtype, out.data_ptr(), mem_space=rt.AddressSpace.gmem, assumed_align=16)
+ ptr_w = (
+ rt.make_ptr(dtype, weight.data_ptr(), mem_space=rt.AddressSpace.gmem, assumed_align=16)
+ if weight is not None
+ else None
+ )
+ self._compiled(
+ ptr_x,
+ ptr_w,
+ None, # ptr_b
+ None, # ptr_res
+ ptr_out,
+ None, # ptr_res_out
+ None, # ptr_rstd
+ Int32(M),
+ Int32(N),
+ Int32(ld),
+ self._stream,
+ Float32(eps),
+ )
+
+
+class _PtrFusedAddRmsnormFastLaunch:
+ def __init__(
+ self,
+ *,
+ compiled: object,
+ executor: object,
+ capi_func: object,
+ ptr_x: object,
+ ptr_w: object,
+ ptr_res: object,
+ arg_m: _StableI32Arg,
+ arg_n: _StableI32Arg,
+ arg_ld_x: _StableI32Arg,
+ arg_eps: _StableF32Arg,
+ stream: cuda.CUstream,
+ assumed_align: int,
+ packed_args: object,
+ keepalive: tuple[object, ...],
+ ):
+ self._compiled = compiled
+ self._executor = executor
+ self._capi_func = capi_func
+ self._ptr_x = ptr_x
+ self._ptr_w = ptr_w
+ self._ptr_res = ptr_res
+ self._arg_m = arg_m
+ self._arg_n = arg_n
+ self._arg_ld_x = arg_ld_x
+ self._arg_eps = arg_eps
+ self._stream = stream
+ self._assumed_align = int(assumed_align)
+ self._packed_args = packed_args
+ self._keepalive = keepalive
+
+ self._use_fast_launch = True
+
+ self._cuda_result = getattr(executor, "cuda_result", None)
+
+ self._last_x_ptr = -1
+ self._last_w_ptr = -1
+ self._last_res_ptr = -1
+ self._last_m = -1
+ self._last_ld_x = -1
+ self._last_eps = float("nan")
+
+ def launch(
+ self,
+ *,
+ x: Tensor,
+ weight: Tensor,
+ residual: Tensor,
+ M: int,
+ N: int,
+ ld_x: int,
+ eps: float,
+ ) -> None:
+ if not _fast_launch_enabled() or not self._use_fast_launch:
+ self._fallback_launch(
+ x=x, weight=weight, residual=residual, M=M, N=N, ld_x=ld_x, eps=eps
+ )
+ return
+
+ x_ptr = x.data_ptr()
+ if x_ptr != self._last_x_ptr:
+ try:
+ _set_runtime_ptr(self._ptr_x, x_ptr)
+ self._last_x_ptr = x_ptr
+ except AttributeError:
+ self._disable_fast_launch()
+ self._fallback_launch(
+ x=x, weight=weight, residual=residual, M=M, N=N, ld_x=ld_x, eps=eps
+ )
+ return
+
+ w_ptr = weight.data_ptr()
+ if w_ptr != self._last_w_ptr:
+ try:
+ _set_runtime_ptr(self._ptr_w, w_ptr)
+ self._last_w_ptr = w_ptr
+ except AttributeError:
+ self._disable_fast_launch()
+ self._fallback_launch(
+ x=x, weight=weight, residual=residual, M=M, N=N, ld_x=ld_x, eps=eps
+ )
+ return
+
+ res_ptr = residual.data_ptr()
+ if res_ptr != self._last_res_ptr:
+ try:
+ _set_runtime_ptr(self._ptr_res, res_ptr)
+ self._last_res_ptr = res_ptr
+ except AttributeError:
+ self._disable_fast_launch()
+ self._fallback_launch(
+ x=x, weight=weight, residual=residual, M=M, N=N, ld_x=ld_x, eps=eps
+ )
+ return
+
+ if M != self._last_m:
+ self._arg_m.set(M)
+ self._last_m = M
+ if ld_x != self._last_ld_x:
+ self._arg_ld_x.set(ld_x)
+ self._last_ld_x = ld_x
+ if eps != self._last_eps:
+ self._arg_eps.set(eps)
+ self._last_eps = eps
+
+ if self._cuda_result is not None:
+ self._cuda_result.value = 0
+
+ ret = self._capi_func(self._packed_args) # type: ignore[misc]
+ if ret != 0:
+ raise RuntimeError(f"CuTeDSL capi_func returned non-zero: {ret}")
+ if self._cuda_result is not None:
+ err = int(self._cuda_result.value)
+ if err != 0:
+ raise RuntimeError(f"CuTeDSL kernel launch failed (cuda_result={err})")
+
+ def _disable_fast_launch(self) -> None:
+ global _FAST_LAUNCH_SUPPORTED
+ self._use_fast_launch = False
+ _FAST_LAUNCH_SUPPORTED = False
+
+ def _fallback_launch(
+ self,
+ *,
+ x: Tensor,
+ weight: Tensor,
+ residual: Tensor,
+ M: int,
+ N: int,
+ ld_x: int,
+ eps: float,
+ ) -> None:
+ dtype = TORCH2CUTE_DTYPE[x.dtype]
+ ptr_x = rt.make_ptr(
+ dtype,
+ x.data_ptr(),
+ mem_space=rt.AddressSpace.gmem,
+ assumed_align=self._assumed_align,
+ )
+ ptr_res = rt.make_ptr(
+ dtype,
+ residual.data_ptr(),
+ mem_space=rt.AddressSpace.gmem,
+ assumed_align=self._assumed_align,
+ )
+ ptr_w = rt.make_ptr(
+ dtype,
+ weight.data_ptr(),
+ mem_space=rt.AddressSpace.gmem,
+ assumed_align=self._assumed_align,
+ )
+ self._compiled(
+ ptr_x,
+ ptr_w,
+ ptr_res,
+ Int32(M),
+ Int32(N),
+ Int32(ld_x),
+ self._stream,
+ Float32(eps),
+ )
+
+
+def _get_fast_ptr_rmsnorm_launcher(
+ *,
+ compiled: object,
+ dtype: type[cutlass.Numeric],
+ N: int,
+ device_index: int,
+ stream_handle: int,
+ has_weight: bool,
+ eps: float,
+) -> Optional[_PtrRmsnormFastLaunch]:
+ if not _fast_launch_enabled():
+ return None
+ # Keyed by the compiled object identity so schedule changes (e.g. copy width,
+ # async/staged variants, etc.) never alias in the fast-launch cache.
+ key = ("ptr_fast", id(compiled), N, dtype, device_index, int(stream_handle), has_weight)
+ cache = _tls_fast_launch_cache()
+ cached = cache.get(key)
+ if cached is not None:
+ return cached # type: ignore[return-value]
+
+ # Create stable runtime args and pointer descriptors once.
+ ptr_x = rt.make_ptr(dtype, 0, mem_space=rt.AddressSpace.gmem, assumed_align=16)
+ ptr_out = rt.make_ptr(dtype, 0, mem_space=rt.AddressSpace.gmem, assumed_align=16)
+ ptr_w = (
+ rt.make_ptr(dtype, 0, mem_space=rt.AddressSpace.gmem, assumed_align=16) if has_weight else None
+ )
+
+ arg_m = _StableI32Arg(0)
+ arg_n = _StableI32Arg(N)
+ arg_ld = _StableI32Arg(N)
+ arg_eps = _StableF32Arg(eps)
+
+ stream = cuda.CUstream(int(stream_handle))
+
+ # Create an executor (loads the CUDA library once).
+ executor = compiled.to(device_index) # type: ignore[attr-defined]
+
+ # Use generate_execution_args once to build the packed args array, and keep
+ # any adapted args alive for the lifetime of the cache entry.
+ try:
+ exe_args, adapted_args = executor.generate_execution_args(
+ ptr_x,
+ ptr_w,
+ None, # ptr_b
+ None, # ptr_res
+ ptr_out,
+ None, # ptr_res_out
+ None, # ptr_rstd
+ arg_m,
+ arg_n,
+ arg_ld,
+ stream,
+ arg_eps,
+ )
+ packed_args = executor._get_invoke_packed_args(list(exe_args)) # type: ignore[attr-defined]
+ capi_func = compiled.capi_func # type: ignore[attr-defined]
+ except AttributeError:
+ global _FAST_LAUNCH_SUPPORTED
+ _FAST_LAUNCH_SUPPORTED = False
+ return None
+
+ keepalive: tuple[object, ...] = (
+ executor,
+ ptr_x,
+ ptr_w,
+ ptr_out,
+ arg_m,
+ arg_n,
+ arg_ld,
+ arg_eps,
+ stream,
+ *adapted_args,
+ )
+
+ launcher = _PtrRmsnormFastLaunch(
+ compiled=compiled,
+ executor=executor,
+ capi_func=capi_func,
+ ptr_x=ptr_x,
+ ptr_w=ptr_w,
+ ptr_out=ptr_out,
+ arg_m=arg_m,
+ arg_n=arg_n,
+ arg_ld=arg_ld,
+ arg_eps=arg_eps,
+ stream=stream,
+ packed_args=packed_args,
+ keepalive=keepalive,
+ )
+ cache[key] = launcher
+ return launcher
+
+
+def _get_fast_ptr_fused_add_rmsnorm_launcher(
+ *,
+ compiled: object,
+ dtype: type[cutlass.Numeric],
+ N: int,
+ device_index: int,
+ stream_handle: int,
+ copy_bits: int,
+ use_async: bool,
+ tpr: int,
+ direct_gmem: bool,
+ assumed_align: int,
+ eps: float,
+) -> Optional[_PtrFusedAddRmsnormFastLaunch]:
+ if not _fast_launch_enabled():
+ return None
+ key = (
+ "ptr_fused_add_fast",
+ id(compiled),
+ N,
+ dtype,
+ device_index,
+ int(stream_handle),
+ int(copy_bits),
+ bool(use_async),
+ int(tpr),
+ bool(direct_gmem),
+ int(assumed_align),
+ )
+ cache = _tls_fast_launch_cache()
+ cached = cache.get(key)
+ if cached is not None:
+ return cached # type: ignore[return-value]
+
+ ptr_x = rt.make_ptr(dtype, 0, mem_space=rt.AddressSpace.gmem, assumed_align=assumed_align)
+ ptr_res = rt.make_ptr(dtype, 0, mem_space=rt.AddressSpace.gmem, assumed_align=assumed_align)
+ ptr_w = rt.make_ptr(dtype, 0, mem_space=rt.AddressSpace.gmem, assumed_align=assumed_align)
+
+ arg_m = _StableI32Arg(0)
+ arg_n = _StableI32Arg(N)
+ arg_ld_x = _StableI32Arg(N)
+ arg_eps = _StableF32Arg(eps)
+
+ stream = cuda.CUstream(int(stream_handle))
+
+ executor = compiled.to(device_index) # type: ignore[attr-defined]
+
+ try:
+ exe_args, adapted_args = executor.generate_execution_args(
+ ptr_x,
+ ptr_w,
+ ptr_res,
+ arg_m,
+ arg_n,
+ arg_ld_x,
+ stream,
+ arg_eps,
+ )
+ packed_args = executor._get_invoke_packed_args(list(exe_args)) # type: ignore[attr-defined]
+ capi_func = compiled.capi_func # type: ignore[attr-defined]
+ except AttributeError:
+ global _FAST_LAUNCH_SUPPORTED
+ _FAST_LAUNCH_SUPPORTED = False
+ return None
+
+ keepalive: tuple[object, ...] = (
+ executor,
+ ptr_x,
+ ptr_w,
+ ptr_res,
+ arg_m,
+ arg_n,
+ arg_ld_x,
+ arg_eps,
+ stream,
+ *adapted_args,
+ )
+
+ launcher = _PtrFusedAddRmsnormFastLaunch(
+ compiled=compiled,
+ executor=executor,
+ capi_func=capi_func,
+ ptr_x=ptr_x,
+ ptr_w=ptr_w,
+ ptr_res=ptr_res,
+ arg_m=arg_m,
+ arg_n=arg_n,
+ arg_ld_x=arg_ld_x,
+ arg_eps=arg_eps,
+ stream=stream,
+ assumed_align=assumed_align,
+ packed_args=packed_args,
+ keepalive=keepalive,
+ )
+ cache[key] = launcher
+ return launcher
+
+
+# Local helpers for reduction, dtype mapping, and coordinate/predicate utilities.
+#
+# NOTE: Avoid `from . import ...` imports here: CuTeDSL's AST preprocessor may
+# mishandle that form (module=None in the AST). Use fully-qualified imports.
+from kernelagent_oink.blackwell import lite_quack as qutils
+from kernelagent_oink.blackwell.lite_quack import TORCH2CUTE_DTYPE, row_reduce
+
+
+# -------------------------
+# Copy helpers (allow up to 256b)
+# -------------------------
+
+@cute.jit
+def get_copy_atom_bw(
+ dtype: type[cutlass.Numeric], num_copy_elems: int, is_async: bool = False
+) -> cute.CopyAtom:
+ # cp.async (SIMT) supports up to 128b per op; use 256b for sync when possible
+ max_bits = const_expr(128 if is_async else 256)
+ num_copy_bits = const_expr(min(max_bits, num_copy_elems * dtype.width))
+ from cutlass.cute.nvgpu import cpasync
+ # Prefer GLOBAL cache policy for bulk streaming reads at large M
+ copy_op = (
+ cpasync.CopyG2SOp(cache_mode=cpasync.LoadCacheMode.GLOBAL)
+ if is_async
+ else cute.nvgpu.CopyUniversalOp()
+ )
+ return cute.make_copy_atom(copy_op, dtype, num_bits_per_copy=num_copy_bits)
+
+
+@cute.jit
+def copy_tiled(
+ src: cute.Tensor,
+ dst: cute.Tensor,
+ *,
+ pred: Optional[cute.Tensor] = None,
+ num_copy_elems: int = 1,
+ is_async: bool = False,
+) -> None:
+ atom = get_copy_atom_bw(src.element_type, num_copy_elems, is_async)
+ cute.copy(atom, src, dst, pred=pred)
+
+
+# -------------------------
+# RMSNorm Kernel (SM100)
+# -------------------------
+
+
+class RMSNormSM100:
+ def __init__(
+ self,
+ N: int,
+ dtype: type[cutlass.Numeric],
+ stage: Optional[int] = None,
+ *,
+ copy_bits: int = 128,
+ use_async: bool = True,
+ direct_gmem: bool = False,
+ ):
+ self.N = N
+ self.dtype = dtype
+ # Match Quack default for RMSNorm: stage = 1 unless explicitly overridden
+ self.stage = 1 if stage is None else stage
+ self.reduction_dtype = cutlass.Float32
+ self.copy_bits = int(copy_bits)
+ self.use_async = bool(use_async)
+ self.direct_gmem = bool(direct_gmem)
+
+ def _threads_per_row(self) -> int:
+ try:
+ return self._tpr_override # type: ignore[attr-defined]
+ except Exception:
+ pass
+ # Tune mid-size buckets for large-M rows.
+ N = self.N
+ # DSv3 MLA (padded/strided) hot shape. Prefer a threads-per-row that
+ # makes the tile width exactly match N with 128b vectors (bf16/fp16),
+ # avoiding the ~33% padded work from rounding 1536 -> 2048.
+ if N == 1536 and self.dtype.width == 16:
+ return 96
+ # DSv3 default hidden size (7168). Choose a threads-per-row that matches
+ # the selected vector width to avoid padded work:
+ # - 128b copies (vec=8 for bf16/fp16): 7168/8 = 896 = 7 * 128 -> tpr=128
+ # - 256b copies (vec=16 for bf16/fp16): 7168/16 = 448 = 2 * 224 -> tpr=224
+ #
+ # The fused direct-GMEM path often uses 256b copies on 32B-aligned
+ # tensors, while the non-fused path defaults to 128b copies.
+ if N == 7168 and self.dtype.width == 16:
+ return 224 if self.copy_bits >= 256 else 128
+ # For small-N, use at least one full warp per row. The kernel
+ # implementation assumes one row per CTA; returning <32 here can
+ # produce multi-row tiles (cols_per_block > 1) which is not supported.
+ if N <= 1024:
+ return 32
+ elif N <= 4096:
+ return 128
+ elif N <= 8192:
+ # Allow an override (used by 2-rows/CTA path for N≈6k/8k)
+ try:
+ return self._tpr_override # type: ignore[attr-defined]
+ except Exception:
+ return 128
+ elif N <= 16384:
+ return 256
+ else:
+ return 256
+
+ def _cluster_n(self) -> int:
+ try:
+ return self._cluster_n_override # type: ignore[attr-defined]
+ except Exception:
+ pass
+ N = self.N
+ # Default policy
+ if N <= 8192:
+ return 1
+ if const_expr(self.dtype.width == 16):
+ if N <= 16 * 1024:
+ return 2
+ elif N <= 32 * 1024:
+ return 2
+ elif N <= 64 * 1024:
+ return 4
+ elif N <= 128 * 1024:
+ return 8
+ else:
+ return 16
+ else:
+ if N <= 32 * 1024:
+ return 1
+ elif N <= 64 * 1024:
+ return 2
+ elif N <= 128 * 1024:
+ return 4
+ elif N <= 256 * 1024:
+ return 8
+ else:
+ return 16
+
+ def _num_threads(self) -> int:
+ # Favor 128 threads up to N=16k to reduce per-row partitioning overhead.
+ # This keeps cols_per_block=1 at N=8192 (bf16), which benchmarks faster for large-M.
+ try:
+ return self._nt_override # type: ignore[attr-defined]
+ except Exception:
+ if self.N == 1536 and self.dtype.width == 16:
+ return 96
+ if self.N == 7168 and self.dtype.width == 16:
+ return 224 if self.copy_bits >= 256 else 128
+ if self.N <= 1024:
+ return 32
+ return 128 if self.N <= 16384 else 256
+
+ def _tv_layout(self, num_copy_bits: int = 256) -> Tuple[cute.Shape, cute.Layout]:
+ vecsize = num_copy_bits // self.dtype.width
+ num_threads = self._num_threads()
+ assert num_threads % cute.arch.WARP_SIZE == 0
+ tpr = self._threads_per_row()
+ cluster_n = self._cluster_n()
+ # Allow tails: compute number of vector columns with ceil
+ num_cols_vec = cute.ceil_div(self.N, vecsize)
+ num_blocks_N = cute.ceil_div(num_cols_vec, tpr * cluster_n)
+ cols_per_block = num_threads // tpr
+ tiler_mn = (cols_per_block, vecsize * num_blocks_N * tpr)
+ tv_layout = cute.make_layout(
+ ((tpr, cols_per_block), (vecsize, num_blocks_N)),
+ stride=((vecsize * cols_per_block, 1), (cols_per_block, cols_per_block * vecsize * tpr)),
+ )
+ return tiler_mn, tv_layout
+
+ def _smem_bytes(self, tiler_mn, num_warps) -> int:
+ # smem for X tile (+ residual if present) + reduction buffers + mbar(s)
+ return (
+ cute.size_in_bytes(self.dtype, cute.make_layout(tiler_mn))
+ + self.stage * num_warps * self._cluster_n() * (self.reduction_dtype.width // 8)
+ + self.stage * (cutlass.Int64.width // 8)
+ )
+
+ @cute.jit
+ def __call__(
+ self,
+ mX: cute.Tensor,
+ mW: Optional[cute.Tensor],
+ mB: Optional[cute.Tensor],
+ mRes: Optional[cute.Tensor],
+ mO: cute.Tensor,
+ mResO: Optional[cute.Tensor],
+ mRstd: Optional[cute.Tensor],
+ stream: cuda.CUstream,
+ eps: Float32 = 1e-6,
+ ):
+ # Make last dim static (N)
+ semistatic_shape = (*mX.shape[:-1], self.N)
+
+ def new_stride(t):
+ return (
+ cute.assume(t.stride[0], divby=256 // t.element_type.width),
+ t.stride[1],
+ )
+
+ mX, mRes, mO, mResO = [
+ cute.make_tensor(t.iterator, cute.make_layout(semistatic_shape, stride=new_stride(t)))
+ if const_expr(t is not None)
+ else None
+ for t in (mX, mRes, mO, mResO)
+ ]
+ assert mX.element_type == self.dtype
+ assert mO.element_type == self.dtype
+
+ copy_bits = int(self.copy_bits)
+ tiler_mn, tv_layout = self._tv_layout(num_copy_bits=copy_bits)
+ num_threads = cute.size(tv_layout, mode=[0]) if _KERNEL_ACCEPTS_LAYOUT_ARGS else self._num_threads()
+ num_warps = num_threads // cute.arch.WARP_SIZE
+ threads_per_row = tv_layout.shape[0][0] if _KERNEL_ACCEPTS_LAYOUT_ARGS else self._threads_per_row()
+ warps_per_row = max(threads_per_row // cute.arch.WARP_SIZE, 1)
+ cluster_n = self._cluster_n()
+
+ if const_expr(mW is not None):
+ mW = cute.make_tensor(
+ mW.iterator, cute.prepend(mW.layout, cute.make_layout((tiler_mn[0],), stride=(0,)))
+ )
+ if const_expr(mB is not None):
+ mB = cute.make_tensor(
+ mB.iterator, cute.prepend(mB.layout, cute.make_layout((tiler_mn[0],), stride=(0,)))
+ )
+ if const_expr(mRstd is not None):
+ mRstd = cute.make_tensor(
+ mRstd.iterator, cute.append(mRstd.layout, cute.make_layout((self.N,), stride=(0,)))
+ )
+
+ # No SMEM reload mode switch; overlap is controlled in the K-loop path
+
+ # Compute smem usage considering staged buffers.
+ #
+ # In direct-gmem mode, we skip the gmem->smem tiles entirely and only
+ # keep the reduction buffers in shared memory.
+ stage_bufs = 2 if self.stage > 1 else 1
+ tile_bytes_x = (
+ cute.size_in_bytes(self.dtype, cute.make_layout(tiler_mn)) * stage_bufs
+ if const_expr(not self.direct_gmem)
+ else 0
+ )
+ tile_bytes_res = (
+ cute.size_in_bytes(mRes.element_type, cute.make_layout(tiler_mn)) * stage_bufs
+ if const_expr(mRes is not None and not self.direct_gmem)
+ else 0
+ )
+ red_bytes = self.stage * num_warps * cluster_n * (self.reduction_dtype.width // 8)
+ # mbarriers are only allocated/used for cluster_n>1. Some CuTeDSL builds
+ # require mbarrier state to be 16B-aligned in shared memory; account for
+ # the alignment padding when computing dynamic smem bytes.
+ smem_bytes = tile_bytes_x + tile_bytes_res + red_bytes
+ if cluster_n > 1:
+ # Align up to 16B before placing the mbarrier array.
+ smem_bytes = ((smem_bytes + 15) // 16) * 16
+ smem_bytes += self.stage * (cutlass.Int64.width // 8)
+
+ kernel = (
+ self.kernel(
+ mX,
+ mW,
+ mB,
+ mRes,
+ mO,
+ mResO,
+ mRstd,
+ eps,
+ tv_layout,
+ tiler_mn,
+ const_expr(cluster_n),
+ const_expr(num_warps),
+ const_expr(warps_per_row),
+ const_expr(threads_per_row),
+ )
+ if _KERNEL_ACCEPTS_LAYOUT_ARGS
+ else self.kernel(
+ mX,
+ mW,
+ mB,
+ mRes,
+ mO,
+ mResO,
+ mRstd,
+ eps,
+ )
+ )
+ kernel.launch(
+ grid=[cute.ceil_div(mX.shape[0], tiler_mn[0]), cluster_n, 1],
+ block=[num_threads, 1, 1],
+ cluster=([1, cluster_n, 1] if cluster_n > 1 else None),
+ smem=smem_bytes,
+ stream=stream,
+ )
+
+ @cute.jit
+ def launch_from_ptrs(
+ self,
+ ptr_x: cute.Pointer,
+ ptr_w: Optional[cute.Pointer],
+ ptr_b: Optional[cute.Pointer],
+ ptr_res: Optional[cute.Pointer],
+ ptr_out: cute.Pointer,
+ ptr_res_out: Optional[cute.Pointer],
+ ptr_rstd: Optional[cute.Pointer],
+ M: Int32,
+ N_dyn: Int32,
+ ld: Int32,
+ stream: cuda.CUstream,
+ eps: Float32 = 1e-6,
+ ):
+ """Pointer-based entrypoint to reuse the existing RMSNorm schedule.
+
+ This reconstructs cute.Tensor views from raw pointers plus sizes,
+ avoiding any DLPack conversions at the Python boundary.
+ """
+        # Build the [M, N] layouts from the dynamic arguments (M, N_dyn, ld)
+        # rather than plain Python ints, so that the subsequent
+        # cute.assume(...) in __call__ sees a dynamic expression for the
+        # leading-dimension stride.
+        # The compile-time N for the kernel (self.N) is still used to
+        # specialize the schedule.
+        # Assume row-major [M, N] with an arbitrary leading-dimension stride
+        # (common for padded-row / packed-attention layouts).
+ layout_mn = cute.make_layout((M, N_dyn), stride=(ld, 1))
+ layout_n = cute.make_layout((N_dyn,), stride=(1,))
+ layout_m = cute.make_layout((M,), stride=(1,))
+
+ mX = cute.make_tensor(ptr_x, layout_mn)
+ mO = cute.make_tensor(ptr_out, layout_mn)
+
+ mRes = (
+ cute.make_tensor(ptr_res, layout_mn)
+ if const_expr(ptr_res is not None)
+ else None
+ )
+ mResO = (
+ cute.make_tensor(ptr_res_out, layout_mn)
+ if const_expr(ptr_res_out is not None)
+ else None
+ )
+ mW = (
+ cute.make_tensor(ptr_w, layout_n)
+ if const_expr(ptr_w is not None)
+ else None
+ )
+ mB = (
+ cute.make_tensor(ptr_b, layout_n)
+ if const_expr(ptr_b is not None)
+ else None
+ )
+ mRstd = (
+ cute.make_tensor(ptr_rstd, layout_m)
+ if const_expr(ptr_rstd is not None)
+ else None
+ )
+
+ # Reuse the main JIT entry to launch the scheduled kernel.
+ self.__call__(mX, mW, mB, mRes, mO, mResO, mRstd, stream, eps)
+
+ @cute.jit
+ def launch_from_ptrs_fused_add_inplace(
+ self,
+ ptr_x: cute.Pointer,
+ ptr_w: cute.Pointer,
+ ptr_res: cute.Pointer,
+ M: Int32,
+ N_dyn: Int32,
+ ld_x: Int32,
+ stream: cuda.CUstream,
+ eps: Float32 = 1e-6,
+ ):
+ """Pointer-based entrypoint for vLLM-style fused_add_rms_norm semantics.
+
+ This specialized entrypoint supports:
+ - `x` / output with an arbitrary leading-dimension stride (`ld_x`), and
+ - `residual` / residual-out as a contiguous [M, N] tensor (ld_res = N).
+
+ Both `x` and `residual` are updated in-place:
+ residual <- x + residual
+ x <- RMSNorm(residual) * weight
+ """
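+        # Reference semantics in plain PyTorch (sketch, for clarity only; the
+        # kernel performs the add and normalization in fp32 internally). For
+        # torch tensors x / residual of shape [M, N] and weight of shape [N]:
+        #   z = x.float() + residual.float()
+        #   y = z * torch.rsqrt(z.pow(2).mean(-1, keepdim=True) + eps)
+        #   residual.copy_(z.to(residual.dtype))
+        #   x.copy_((y * weight.float()).to(x.dtype))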
+ layout_x = cute.make_layout((M, N_dyn), stride=(ld_x, 1))
+ layout_res = cute.make_layout((M, N_dyn), stride=(N_dyn, 1))
+ layout_n = cute.make_layout((N_dyn,), stride=(1,))
+
+ mX = cute.make_tensor(ptr_x, layout_x)
+ mO = cute.make_tensor(ptr_x, layout_x)
+ mRes = cute.make_tensor(ptr_res, layout_res)
+ mResO = cute.make_tensor(ptr_res, layout_res)
+ mW = cute.make_tensor(ptr_w, layout_n)
+
+ self.__call__(
+ mX,
+ mW,
+ None, # bias
+ mRes,
+ mO,
+ mResO,
+ None, # rstd
+ stream,
+ eps,
+ )
+
+ @cute.jit
+ def _kernel_impl(
+ self,
+ mX: cute.Tensor,
+ mW: Optional[cute.Tensor],
+ mB: Optional[cute.Tensor],
+ mRes: Optional[cute.Tensor],
+ mO: cute.Tensor,
+ mResO: Optional[cute.Tensor],
+ mRstd: Optional[cute.Tensor],
+ eps: Float32,
+ tv_layout: cute.Layout,
+ tiler_mn: cute.Shape,
+ cluster_n: cutlass.Constexpr[int],
+ num_warps: cutlass.Constexpr[int],
+ warps_per_row: cutlass.Constexpr[int],
+ threads_per_row: cutlass.Constexpr[int],
+ ):
+ tidx, _, _ = cute.arch.thread_idx()
+ bidx, _, _ = cute.arch.block_idx()
+ if const_expr(cluster_n > 1):
+ cta_rank_in_cluster = cute.arch.block_idx_in_cluster()
+ else:
+ cta_rank_in_cluster = const_expr(0)
+ n_off = cta_rank_in_cluster * tiler_mn[1]
+
+ smem = cutlass.utils.SmemAllocator()
+ # Allocate one or two SMEM buffers depending on stage depth
+ sX0 = (
+ smem.allocate_tensor(
+ mX.element_type, cute.make_ordered_layout(tiler_mn, order=(1, 0)), byte_alignment=32
+ )
+ if const_expr(not self.direct_gmem)
+ else None
+ )
+ sX1 = (
+ smem.allocate_tensor(
+ mX.element_type,
+ cute.make_ordered_layout(tiler_mn, order=(1, 0)),
+ byte_alignment=32,
+ )
+ if const_expr(self.stage > 1 and not self.direct_gmem)
+ else None
+ )
+ sRes0 = (
+ smem.allocate_tensor(
+ mRes.element_type, cute.make_ordered_layout(tiler_mn, order=(1, 0)), byte_alignment=32
+ )
+ if const_expr(mRes is not None and not self.direct_gmem)
+ else None
+ )
+ sRes1 = (
+ smem.allocate_tensor(
+ mRes.element_type,
+ cute.make_ordered_layout(tiler_mn, order=(1, 0)),
+ byte_alignment=32,
+ )
+ if const_expr(mRes is not None and self.stage > 1 and not self.direct_gmem)
+ else None
+ )
+
+ # Reduction buffers + mbar for cluster reduce (reused by row_reduce helper)
+ red_layout = cute.make_ordered_layout(
+ (num_warps // warps_per_row, (warps_per_row, cluster_n), self.stage),
+ order=(1, 0, 2),
+ )
+ reduction_buffer = smem.allocate_tensor(self.reduction_dtype, red_layout, byte_alignment=4)
+ if const_expr(cluster_n > 1):
+ # Some CuTeDSL builds appear sensitive to the shared-memory alignment of
+ # mbarrier state. `SmemAllocator.allocate_array` does not currently
+ # expose an alignment parameter, so allocate an Int64 tensor with an
+ # explicit alignment and pass its iterator as the pointer.
+ mbar_tensor = smem.allocate_tensor(
+ cutlass.Int64,
+ cute.make_layout((self.stage,), stride=(1,)),
+ byte_alignment=16,
+ )
+ mbar_ptr = mbar_tensor.iterator
+ else:
+ mbar_ptr = None
+
+ shape = mX.shape
+ idX = cute.make_identity_tensor(shape)
+ limit_k = shape[1] - n_off
+
+ # Tiled copy setup
+ num_copy_elems_X = tv_layout.shape[1][0]
+ use_async = const_expr(self.use_async and self.N >= 1024 and not self.direct_gmem)
+ copy_atom = get_copy_atom_bw(mX.element_type, num_copy_elems_X, is_async=use_async)
+ thr_copy = cute.make_tiled_copy(copy_atom, tv_layout, tiler_mn).get_slice(tidx)
+
+ # Tail predicate for the N dimension (when tile width > N). Reuse this
+ # for W/B loads so we never read past the end of those 1D tensors.
+ is_even_N_wb = const_expr(shape[1] == tiler_mn[1] * cluster_n)
+ if const_expr(not is_even_N_wb):
+ cX0 = cute.local_tile(idX, tiler_mn, (0, 0))
+ tXp_wb = qutils.predicate_k(thr_copy.partition_S(cX0), limit=limit_k)
+ else:
+ tXp_wb = None
+
+ # Weight/bias loads:
+ #
+ # - Direct-GMEM schedule: load weight/bias up front to hide latency.
+ # - Staged SMEM schedule: loading after the reduction reduces register
+ # pressure during the long-scoreboard reduction phase (better for large-M),
+ # but it measurably hurts small-M latency for the non-fused (no residual,
+ # no bias) case. For that specific case, prefetch weight up front as well.
+ tXrW = None
+ tXrB = None
+ prefetch_w_early = bool(
+ mW is not None and (self.direct_gmem or (mRes is None and mB is None))
+ )
+ if const_expr(prefetch_w_early):
+ gW = cute.local_tile(qutils.domain_offset_i64((0, n_off), mW), tiler_mn, (0, 0))
+ tXgW = thr_copy.partition_S(gW)
+ tXrW = cute.make_fragment_like(tXgW)
+ if const_expr(not is_even_N_wb):
+ tXrW.fill(0)
+ cute.copy(
+ get_copy_atom_bw(mW.element_type, num_copy_elems_X, is_async=False),
+ tXgW,
+ tXrW,
+ pred=tXp_wb,
+ )
+ if const_expr(self.direct_gmem and mB is not None):
+ gB = cute.local_tile(qutils.domain_offset_i64((0, n_off), mB), tiler_mn, (0, 0))
+ tXgB = thr_copy.partition_S(gB)
+ tXrB = cute.make_fragment_like(tXgB)
+ if const_expr(not is_even_N_wb):
+ tXrB.fill(0)
+ cute.copy(
+ get_copy_atom_bw(mB.element_type, num_copy_elems_X, is_async=False),
+ tXgB,
+ tXrB,
+ pred=tXp_wb,
+ )
+
+ # Non-persistent per-CTA execution (one tile in M)
+ self._init_cluster(tidx, mbar_ptr)
+
+ mX_i, mRes_i, mO_i, mResO_i = [
+ qutils.domain_offset_i64((bidx * tiler_mn[0], 0), t) if t is not None else None
+ for t in (mX, mRes, mO, mResO)
+ ]
+ mX_i, mRes_i, mO_i, mResO_i = [
+ qutils.domain_offset_i64((0, n_off), t) if t is not None else None
+ for t in (mX_i, mRes_i, mO_i, mResO_i)
+ ]
+ gX_i = cute.local_tile(mX_i, tiler_mn, (0, 0))
+ gO_i = cute.local_tile(mO_i, tiler_mn, (0, 0))
+ gRes_i = (
+ cute.local_tile(mRes_i, tiler_mn, (0, 0)) if const_expr(mRes is not None) else None
+ )
+ gResO_i = (
+ cute.local_tile(mResO_i, tiler_mn, (0, 0)) if const_expr(mResO is not None) else None
+ )
+ gRstd_i = (
+ cute.local_tile(mRstd, tiler_mn, (bidx, 0)) if const_expr(mRstd is not None) else None
+ )
+ cX_i = cute.local_tile(idX, tiler_mn, (bidx, 0))
+
+ # Common identity/row index partitions reused by both default and K-loop paths
+ tXcX_i = thr_copy.partition_S(cX_i)[(0, None), None, None]
+ row_i = tXcX_i[0][0]
+ tXgRstd_i = thr_copy.partition_D(gRstd_i) if const_expr(mRstd is not None) else None
+
+ # Stage-2 intra-row K-loop cp.async ping-pong (two tiles). This reduces
+ # per-thread fragment size and can improve memory-latency hiding for
+ # N=7168 at large M. It is enabled by setting `stage=2` when constructing
+ # the RMSNormSM100 op (see `_fused_add_rmsnorm_forward_ptr_inplace`).
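+        # Tiling arithmetic (illustrative, assuming vecsize = 8 for bf16 and
+        # threads_per_row = 128): tile_factor = 4096 // (8 * 128) = 4, so
+        # tile_n = 4096 and num_tiles = ceil(7168 / 4096) = 2 ping-pong tiles.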
+ if const_expr(
+ self.stage > 1 and not self.direct_gmem and use_async and cluster_n == 1 and shape[1] == 7168
+ ):
+ vecsize = tv_layout.shape[1][0]
+ tpr = threads_per_row
+ target_tile_n = const_expr(4096)
+ tile_factor = const_expr(target_tile_n // (vecsize * tpr))
+ if const_expr(tile_factor > 0):
+ tile_n = vecsize * tpr * tile_factor
+ num_tiles = cute.ceil_div(shape[1], tile_n)
+
+ tiler_mn_tile = (tiler_mn[0], tile_n)
+ sX0_tile = cute.local_tile(sX0, tiler_mn_tile, (0, 0))
+ sX1_tile = cute.local_tile(sX1, tiler_mn_tile, (0, 0))
+ sRes0_tile = (
+ cute.local_tile(sRes0, tiler_mn_tile, (0, 0))
+ if const_expr(mRes is not None)
+ else None
+ )
+ sRes1_tile = (
+ cute.local_tile(sRes1, tiler_mn_tile, (0, 0))
+ if const_expr(mRes is not None)
+ else None
+ )
+
+ tv_layout_tile = cute.make_layout(
+ ((tpr, tiler_mn[0]), (vecsize, tile_factor)),
+ stride=(
+ (vecsize * tiler_mn[0], 1),
+ (tiler_mn[0], tiler_mn[0] * vecsize * tpr),
+ ),
+ )
+ thr_copy_tile = cute.make_tiled_copy(copy_atom, tv_layout_tile, tiler_mn_tile).get_slice(
+ tidx
+ )
+
+ # Accumulate per-thread partial sums across tiles; reduce once.
+ sum_sq_thread = cute.Float32(0.0)
+
+ # Preload tile 0 into sX0/sRes0.
+ k_off0 = const_expr(0) * tile_n
+ gX_0 = cute.local_tile(
+ qutils.domain_offset_i64((0, k_off0), mX_i), tiler_mn_tile, (0, 0)
+ )
+ tXgX_0 = thr_copy_tile.partition_S(gX_0)
+ tXsX_0 = thr_copy_tile.partition_D(sX0_tile)
+ cX_0 = cute.local_tile(
+ cute.domain_offset((0, k_off0), cX_i), tiler_mn_tile, (0, 0)
+ )
+ tXc_0 = thr_copy_tile.partition_S(cX_0)
+ tXp_0 = qutils.predicate_k(tXc_0, limit=limit_k)
+
+ tXp_ping = tXp_0
+ tXp_pong = tXp_0
+
+ if row_i < shape[0]:
+ copy_tiled(tXgX_0, tXsX_0, num_copy_elems=vecsize, is_async=True, pred=tXp_0)
+ if const_expr(mRes is not None):
+ gRes_0 = cute.local_tile(
+ qutils.domain_offset_i64((0, k_off0), mRes_i),
+ tiler_mn_tile,
+ (0, 0),
+ )
+ tXgRes_0 = thr_copy_tile.partition_S(gRes_0)
+ tXsRes_0 = thr_copy_tile.partition_D(sRes0_tile)
+ copy_tiled(
+ tXgRes_0,
+ tXsRes_0,
+ num_copy_elems=vecsize,
+ is_async=True,
+ pred=tXp_0,
+ )
+ cute.arch.cp_async_commit_group()
+
+ for t in cutlass.range_constexpr(num_tiles):
+ next_t = t + 1
+ if next_t < num_tiles:
+ k_off_n = next_t * tile_n
+ gX_n = cute.local_tile(
+ qutils.domain_offset_i64((0, k_off_n), mX_i),
+ tiler_mn_tile,
+ (0, 0),
+ )
+ tXgX_n = thr_copy_tile.partition_S(gX_n)
+ cX_n = cute.local_tile(
+ cute.domain_offset((0, k_off_n), cX_i),
+ tiler_mn_tile,
+ (0, 0),
+ )
+ tXc_n = thr_copy_tile.partition_S(cX_n)
+ tXp_n = qutils.predicate_k(tXc_n, limit=limit_k)
+
+ if const_expr((t % 2) == 0):
+ tXsX_n = thr_copy_tile.partition_D(sX1_tile)
+ tXsRes_n = (
+ thr_copy_tile.partition_D(sRes1_tile) if const_expr(mRes is not None) else None
+ )
+ tXp_pong = tXp_n
+ else:
+ tXsX_n = thr_copy_tile.partition_D(sX0_tile)
+ tXsRes_n = (
+ thr_copy_tile.partition_D(sRes0_tile) if const_expr(mRes is not None) else None
+ )
+ tXp_ping = tXp_n
+
+ if row_i < shape[0]:
+ copy_tiled(
+ tXgX_n, tXsX_n, num_copy_elems=vecsize, is_async=True, pred=tXp_n
+ )
+ if const_expr(mRes is not None):
+ gRes_n = cute.local_tile(
+ qutils.domain_offset_i64((0, k_off_n), mRes_i),
+ tiler_mn_tile,
+ (0, 0),
+ )
+ tXgRes_n = thr_copy_tile.partition_S(gRes_n)
+ copy_tiled(
+ tXgRes_n,
+ tXsRes_n,
+ num_copy_elems=vecsize,
+ is_async=True,
+ pred=tXp_n,
+ )
+ cute.arch.cp_async_commit_group()
+
+ cute.arch.cp_async_wait_group(1 if next_t < num_tiles else 0)
+
+ # Current tile buffer (ping/pong).
+ if const_expr((t % 2) == 0):
+ tXsX_cur = thr_copy_tile.partition_D(sX0_tile)
+ tXsRes_cur = (
+ thr_copy_tile.partition_D(sRes0_tile) if const_expr(mRes is not None) else None
+ )
+ pred_cur = tXp_ping
+ else:
+ tXsX_cur = thr_copy_tile.partition_D(sX1_tile)
+ tXsRes_cur = (
+ thr_copy_tile.partition_D(sRes1_tile) if const_expr(mRes is not None) else None
+ )
+ pred_cur = tXp_pong
+
+ k_off = t * tile_n
+ gX_t = cute.local_tile(qutils.domain_offset_i64((0, k_off), mX_i), tiler_mn_tile, (0, 0))
+ tXgX_t = thr_copy_tile.partition_S(gX_t)
+ tXrX_t = cute.make_fragment_like(tXgX_t)
+ cute.autovec_copy(tXsX_cur, tXrX_t)
+ x_t = tXrX_t.load().to(cute.Float32)
+ if const_expr(mRes is not None):
+ gRes_t = cute.local_tile(
+ qutils.domain_offset_i64((0, k_off), mRes_i), tiler_mn_tile, (0, 0)
+ )
+ tXgRes_t = thr_copy_tile.partition_S(gRes_t)
+ tXrRes_t = cute.make_fragment_like(tXgRes_t)
+ cute.autovec_copy(tXsRes_cur, tXrRes_t)
+ x_t += tXrRes_t.load().to(cute.Float32)
+
+ if const_expr(mResO is not None):
+ gResO_t = cute.local_tile(
+ qutils.domain_offset_i64((0, k_off), mResO_i),
+ tiler_mn_tile,
+ (0, 0),
+ )
+ tXgResO_t = thr_copy_tile.partition_D(gResO_t)
+ tXrResO_t = cute.make_fragment_like(tXgResO_t)
+ tXrResO_t.store(x_t.to(tXrResO_t.element_type))
+ if row_i < shape[0]:
+ copy_tiled(
+ tXrResO_t,
+ tXgResO_t,
+ num_copy_elems=vecsize,
+ is_async=False,
+ pred=pred_cur,
+ )
+
+ sum_sq_thread = sum_sq_thread + (x_t * x_t).reduce(
+ cute.ReductionOp.ADD,
+ init_val=0.0,
+ reduction_profile=0,
+ )
+
+ sum_sq = row_reduce(
+ sum_sq_thread,
+ cute.ReductionOp.ADD,
+ threads_per_row,
+ reduction_buffer[None, None, 0],
+ mbar_ptr,
+ init_val=0.0,
+ )
+ rstd = cute.math.rsqrt(sum_sq / shape[1] + eps, fastmath=True)
+
+ if const_expr(mRstd is not None):
+ if tXcX_i[0][1] == 0 and row_i < shape[0]:
+ tXgRstd_i[0] = rstd
+
+ for t in cutlass.range_constexpr(num_tiles):
+ k_off = t * tile_n
+ cX_t = cute.local_tile(cute.domain_offset((0, k_off), cX_i), tiler_mn_tile, (0, 0))
+ tXc_t = thr_copy_tile.partition_S(cX_t)
+ tXp_t = qutils.predicate_k(tXc_t, limit=limit_k)
+
+ if const_expr((t % 2) == 0):
+ tXsX_cur = thr_copy_tile.partition_D(sX0_tile)
+ tXsRes_cur = (
+ thr_copy_tile.partition_D(sRes0_tile) if const_expr(mRes is not None) else None
+ )
+ else:
+ tXsX_cur = thr_copy_tile.partition_D(sX1_tile)
+ tXsRes_cur = (
+ thr_copy_tile.partition_D(sRes1_tile) if const_expr(mRes is not None) else None
+ )
+
+ gX_t = cute.local_tile(qutils.domain_offset_i64((0, k_off), mX_i), tiler_mn_tile, (0, 0))
+ tXgX_t = thr_copy_tile.partition_S(gX_t)
+ tXrX_t = cute.make_fragment_like(tXgX_t)
+ cute.autovec_copy(tXsX_cur, tXrX_t)
+ x_t = tXrX_t.load().to(cute.Float32)
+ if const_expr(mRes is not None):
+ gRes_t = cute.local_tile(
+ qutils.domain_offset_i64((0, k_off), mRes_i), tiler_mn_tile, (0, 0)
+ )
+ tXgRes_t = thr_copy_tile.partition_S(gRes_t)
+ tXrRes_t = cute.make_fragment_like(tXgRes_t)
+ cute.autovec_copy(tXsRes_cur, tXrRes_t)
+ x_t += tXrRes_t.load().to(cute.Float32)
+
+ y_t = x_t * rstd
+ if const_expr(mW is not None):
+ gW_t = cute.local_tile(
+ qutils.domain_offset_i64((0, k_off), mW), tiler_mn_tile, (0, 0)
+ )
+ tWgW_t = thr_copy_tile.partition_S(gW_t)
+ tWrW_t = cute.make_fragment_like(tWgW_t)
+ copy_tiled(tWgW_t, tWrW_t, num_copy_elems=vecsize, is_async=False, pred=tXp_t)
+ y_t = y_t * tWrW_t.load().to(cute.Float32)
+ if const_expr(mB is not None):
+ gB_t = cute.local_tile(
+ qutils.domain_offset_i64((0, k_off), mB), tiler_mn_tile, (0, 0)
+ )
+ tWgB_t = thr_copy_tile.partition_S(gB_t)
+ tWrB_t = cute.make_fragment_like(tWgB_t)
+ copy_tiled(tWgB_t, tWrB_t, num_copy_elems=vecsize, is_async=False, pred=tXp_t)
+ y_t = y_t + tWrB_t.load().to(cute.Float32)
+
+ gO_t = cute.local_tile(qutils.domain_offset_i64((0, k_off), mO_i), tiler_mn_tile, (0, 0))
+ tXgO_t = thr_copy_tile.partition_D(gO_t)
+ tXrO_t = cute.make_fragment_like(tXgO_t)
+ tXrO_t.store(y_t.to(tXrO_t.element_type))
+ if row_i < shape[0]:
+ copy_tiled(tXrO_t, tXgO_t, num_copy_elems=vecsize, is_async=False, pred=tXp_t)
+
+ return
+
+ # Single-stage path: one-row-per-CTA
+ tXgX_i = thr_copy.partition_S(gX_i)
+ tXgRes_i = thr_copy.partition_S(gRes_i) if const_expr(mRes is not None) else None
+ tXgO_i = thr_copy.partition_D(gO_i)
+ tXgResO_i = thr_copy.partition_D(gResO_i) if const_expr(mResO is not None) else None
+ # tXgRstd_i / tXcX_i / row_i prepared above
+ is_even_N_i = const_expr(shape[1] == tiler_mn[1] * cluster_n)
+ tXpX_i = (
+ qutils.predicate_k(thr_copy.partition_S(cX_i), limit=limit_k) if not is_even_N_i else None
+ )
+
+ tXrX = cute.make_fragment_like(tXgX_i)
+ tXrRes = cute.make_fragment_like(tXgRes_i) if const_expr(mRes is not None) else None
+ if const_expr(self.direct_gmem):
+ if const_expr(not is_even_N_i):
+ tXrX.fill(0)
+ if const_expr(tXrRes is not None):
+ tXrRes.fill(0)
+ if row_i < shape[0]:
+ cute.copy(copy_atom, tXgX_i, tXrX, pred=tXpX_i)
+ if const_expr(tXrRes is not None):
+ cute.copy(copy_atom, tXgRes_i, tXrRes, pred=tXpX_i)
+ else:
+ # If N is not a multiple of the tile width, the predicated gmem->smem
+ # copies leave out-of-bounds lanes uninitialized. Clear the SMEM tile
+ # so masked lanes read as 0 for reduction/output.
+ if const_expr(not is_even_N_i):
+ thr_copy.partition_D(sX0).fill(0)
+ if const_expr(mRes is not None):
+ thr_copy.partition_D(sRes0).fill(0)
+
+ if row_i < shape[0]:
+ cute.copy(copy_atom, tXgX_i, thr_copy.partition_D(sX0), pred=tXpX_i)
+ if const_expr(mRes is not None):
+ cute.copy(copy_atom, tXgRes_i, thr_copy.partition_D(sRes0), pred=tXpX_i)
+ if const_expr(use_async):
+ cute.arch.cp_async_commit_group()
+ cute.arch.cp_async_wait_group(0)
+
+ cute.autovec_copy(thr_copy.partition_D(sX0), tXrX)
+ if const_expr(tXrRes is not None):
+ cute.autovec_copy(thr_copy.partition_D(sRes0), tXrRes)
+ x_red = tXrX.load().to(cute.Float32)
+ if const_expr(tXrRes is not None):
+ x_red += tXrRes.load().to(cute.Float32)
+
+ if const_expr(mResO is not None):
+ tXrResO = cute.make_fragment_like(tXgResO_i)
+ tXrResO.store(x_red.to(tXrResO.element_type))
+ if row_i < shape[0]:
+ cute.copy(
+ get_copy_atom_bw(tXrResO.element_type, num_copy_elems_X, is_async=False),
+ tXrResO,
+ tXgResO_i,
+ pred=tXpX_i,
+ )
+
+ sum_sq = row_reduce(
+ x_red * x_red,
+ cute.ReductionOp.ADD,
+ threads_per_row,
+ reduction_buffer[None, None, 0],
+ mbar_ptr,
+ init_val=0.0,
+ )
+ rstd = cute.math.rsqrt(sum_sq / shape[1] + eps, fastmath=True)
+
+ if const_expr(mRstd is not None):
+ if (
+ tXcX_i[0][1] == 0
+ and row_i < shape[0]
+ and (cluster_n == 1 or cute.arch.block_idx_in_cluster() == 0)
+ ):
+ tXgRstd_i[0] = rstd
+
+ if const_expr(not self.direct_gmem and (mRes is not None or mB is not None)):
+ # Load weight/bias after the reduction so they don't inflate register
+ # pressure during the long-scoreboard reduction phase (helping occupancy
+ # when registers are the limiting factor).
+ if const_expr(mW is not None):
+ gW = cute.local_tile(qutils.domain_offset_i64((0, n_off), mW), tiler_mn, (0, 0))
+ tXgW = thr_copy.partition_S(gW)
+ tXrW = cute.make_fragment_like(tXgW)
+ if const_expr(not is_even_N_wb):
+ tXrW.fill(0)
+ cute.copy(
+ get_copy_atom_bw(mW.element_type, num_copy_elems_X, is_async=False),
+ tXgW,
+ tXrW,
+ pred=tXp_wb,
+ )
+ if const_expr(mB is not None):
+ gB = cute.local_tile(qutils.domain_offset_i64((0, n_off), mB), tiler_mn, (0, 0))
+ tXgB = thr_copy.partition_S(gB)
+ tXrB = cute.make_fragment_like(tXgB)
+ if const_expr(not is_even_N_wb):
+ tXrB.fill(0)
+ cute.copy(
+ get_copy_atom_bw(mB.element_type, num_copy_elems_X, is_async=False),
+ tXgB,
+ tXrB,
+ pred=tXp_wb,
+ )
+
+ # Reuse `x_red` (x + residual, in fp32) for the output path so we don't
+ # keep both `tXrX` and `tXrRes` fragments live across the reduction.
+ y = x_red * rstd
+ if const_expr(mW is not None):
+ y = y * tXrW.load().to(cute.Float32)
+ if const_expr(mB is not None):
+ y = y + tXrB.load().to(cute.Float32)
+
+ tXrO = cute.make_fragment_like(tXgO_i)
+ tXrO.store(y.to(tXrO.element_type))
+ if row_i < shape[0]:
+ cute.copy(
+ get_copy_atom_bw(tXrO.element_type, num_copy_elems_X, is_async=False),
+ tXrO,
+ tXgO_i,
+ pred=tXpX_i,
+ )
+
+ if _KERNEL_ACCEPTS_LAYOUT_ARGS:
+ @cute.kernel
+ def kernel(
+ self,
+ mX: cute.Tensor,
+ mW: Optional[cute.Tensor],
+ mB: Optional[cute.Tensor],
+ mRes: Optional[cute.Tensor],
+ mO: cute.Tensor,
+ mResO: Optional[cute.Tensor],
+ mRstd: Optional[cute.Tensor],
+ eps: Float32,
+ tv_layout: cute.Layout,
+ tiler_mn: cute.Shape,
+ cluster_n: cutlass.Constexpr[int],
+ num_warps: cutlass.Constexpr[int],
+ warps_per_row: cutlass.Constexpr[int],
+ threads_per_row: cutlass.Constexpr[int],
+ ):
+ self._kernel_impl(
+ mX,
+ mW,
+ mB,
+ mRes,
+ mO,
+ mResO,
+ mRstd,
+ eps,
+ tv_layout,
+ tiler_mn,
+ cluster_n,
+ num_warps,
+ warps_per_row,
+ threads_per_row,
+ )
+ else:
+ @cute.kernel
+ def kernel(
+ self,
+ mX: cute.Tensor,
+ mW: Optional[cute.Tensor],
+ mB: Optional[cute.Tensor],
+ mRes: Optional[cute.Tensor],
+ mO: cute.Tensor,
+ mResO: Optional[cute.Tensor],
+ mRstd: Optional[cute.Tensor],
+ eps: Float32,
+ ):
+ copy_bits = int(self.copy_bits)
+ tiler_mn, tv_layout = self._tv_layout(num_copy_bits=copy_bits)
+ num_threads = self._num_threads()
+ num_warps = num_threads // cute.arch.WARP_SIZE
+ threads_per_row = self._threads_per_row()
+ warps_per_row = max(threads_per_row // cute.arch.WARP_SIZE, 1)
+ cluster_n = self._cluster_n()
+ self._kernel_impl(
+ mX,
+ mW,
+ mB,
+ mRes,
+ mO,
+ mResO,
+ mRstd,
+ eps,
+ tv_layout,
+ tiler_mn,
+ const_expr(cluster_n),
+ const_expr(num_warps),
+ const_expr(warps_per_row),
+ const_expr(threads_per_row),
+ )
+
+ @cute.jit
+ def _init_cluster(self, tidx: cutlass.Int32, mbar_ptr: Optional[cute.Pointer]):
+ if const_expr(mbar_ptr is not None):
+ if tidx < self.stage:
+ cute.arch.mbarrier_init(mbar_ptr + tidx, 1)
+ cute.arch.mbarrier_init_fence()
+ cute.arch.cluster_arrive_relaxed()
+
+
+def _can_use_ptr_path(
+ x: Tensor,
+ weight: Optional[Tensor],
+ bias: Optional[Tensor],
+ residual: Optional[Tensor],
+) -> bool:
+ """Fast path precondition for the pointer-based CuTeDSL entry.
+
+ We require a row-major 2D layout where the last dimension is
+ contiguous (stride(1) == 1). The leading dimension (stride(0))
+ may be larger than N (padded-row / packed-attention layouts),
+ and is passed to the kernel as `ld`.
+ """
+ if x.stride(1) != 1:
+ return False
+ # All participating tensors are interpreted as the same element type
+ # (derived from x.dtype) in the pointer-based path. If dtypes differ,
+ # we'd read the wrong bit patterns and silently produce incorrect output.
+ if residual is not None and residual.dtype != x.dtype:
+ return False
+ if weight is not None and weight.dtype != x.dtype:
+ return False
+ if bias is not None and bias.dtype != x.dtype:
+ return False
+ # The kernel assumes `ld` satisfies a divisibility constraint used by
+ # cute.assume(..., divby=...) for vectorization.
+ elem_bits = TORCH2CUTE_DTYPE[x.dtype].width
+ divby = 256 // elem_bits
+ if (x.stride(0) % divby) != 0:
+ return False
+ # The kernel uses 128-bit vectorized copies (16B). Require at least 16B
+ # alignment on all participating tensors to avoid misaligned global loads.
+ if (x.data_ptr() % 16) != 0:
+ return False
+ if residual is not None and residual.stride(1) != 1:
+ return False
+ if residual is not None and residual.stride(0) != x.stride(0):
+ return False
+ if residual is not None and (residual.data_ptr() % 16) != 0:
+ return False
+ if weight is not None and not weight.is_contiguous():
+ return False
+ if bias is not None and not bias.is_contiguous():
+ return False
+ if weight is not None and (weight.data_ptr() % 16) != 0:
+ return False
+ if bias is not None and (bias.data_ptr() % 16) != 0:
+ return False
+ return True
+
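+# Example of a layout accepted by this predicate (illustrative sketch; the
+# shapes below are arbitrary and the alignment assumption holds for typical
+# 16B-aligned CUDA allocations):
+#
+#   M, N, ld = 4, 7168, 7680                      # padded rows: ld > N
+#   base = torch.empty(M * ld, device="cuda", dtype=torch.bfloat16)
+#   x = base.as_strided((M, N), (ld, 1))
+#   w = torch.randn(N, device="cuda", dtype=torch.bfloat16)
+#   assert _can_use_ptr_path(x, w, None, None)
+#
+# Here divby = 256 // 16 = 16 and ld % 16 == 0, so the vectorization
+# assumption on the leading-dimension stride is satisfied.
+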
+
+def _can_use_ptr_path_fused_add_inplace(
+ x: Tensor,
+ weight: Tensor,
+ residual: Tensor,
+) -> bool:
+ """Fast-path precondition for fused_add_rmsnorm_forward_inplace.
+
+ We allow the common vLLM layout where:
+ - `x` is strided/padded row-major (stride(1) == 1, stride(0) >= N)
+ - `residual` is contiguous row-major (stride(0) == N)
+ """
+ if x.stride(1) != 1:
+ return False
+ if residual.dtype != x.dtype:
+ return False
+ if weight.dtype != x.dtype:
+ return False
+ if residual.stride(1) != 1:
+ return False
+ if not residual.is_contiguous():
+ return False
+ if not weight.is_contiguous():
+ return False
+
+ dtype = TORCH2CUTE_DTYPE[x.dtype]
+ divby = 256 // dtype.width
+ if (x.stride(0) % divby) != 0:
+ return False
+ if (residual.stride(0) % divby) != 0:
+ return False
+
+ if (x.data_ptr() % 16) != 0:
+ return False
+ if (residual.data_ptr() % 16) != 0:
+ return False
+ if (weight.data_ptr() % 16) != 0:
+ return False
+ return True
+
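+# The fused-add in-place contract differs from the generic one above mainly in
+# requiring a fully contiguous residual. Illustrative sketch (arbitrary shapes):
+#
+#   M, N, ld = 4, 7168, 7680
+#   base = torch.empty(M * ld, device="cuda", dtype=torch.bfloat16)
+#   x = base.as_strided((M, N), (ld, 1))                            # padded rows
+#   res = torch.randn(M, N, device="cuda", dtype=torch.bfloat16)    # contiguous
+#   w = torch.randn(N, device="cuda", dtype=torch.bfloat16)
+#   assert _can_use_ptr_path_fused_add_inplace(x, w, res)
+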
+
+def _rmsnorm_forward_ptr(
+ x: Tensor,
+ weight: Optional[Tensor],
+ bias: Optional[Tensor],
+ residual: Optional[Tensor],
+ eps: float,
+ store_rstd: bool,
+) -> Tuple[Tensor, Optional[Tensor], Optional[Tensor]]:
+ """Pointer-based RMSNorm forward that bypasses DLPack entirely.
+
+ This path reconstructs cute.Tensor views from raw device pointers
+ and explicit layouts inside the JIT graph, avoiding any runtime
+ DLPack conversions while reusing the tuned RMSNormSM100 schedule.
+ """
+ assert x.is_cuda
+ assert x.dim() == 2, "Use (M, N) tensor; flatten batch/seq beforehand."
+ M, N = x.shape
+
+    # Preserve the input's 2D stride so downstream users that rely on
+    # padded-row layouts (stride(0) > N) continue to see the expected layout.
+ out = torch.empty_strided(x.shape, x.stride(), device=x.device, dtype=x.dtype)
+ residual_out: Optional[Tensor] = None
+ rstd: Optional[Tensor] = None
+
+ if residual is not None:
+ residual_out = torch.empty_strided(
+ residual.shape, residual.stride(), device=residual.device, dtype=residual.dtype
+ )
+ if store_rstd:
+ rstd = torch.empty(M, device=x.device, dtype=torch.float32)
+
+ _rmsnorm_forward_ptr_into(
+ x=x,
+ weight=weight,
+ bias=bias,
+ residual=residual,
+ out=out,
+ residual_out=residual_out,
+ rstd=rstd,
+ eps=eps,
+ )
+ return out, rstd, residual_out
+
+
+def _rmsnorm_forward_ptr_into(
+ x: Tensor,
+ weight: Optional[Tensor],
+ bias: Optional[Tensor],
+ residual: Optional[Tensor],
+ out: Tensor,
+ residual_out: Optional[Tensor],
+ rstd: Optional[Tensor],
+ eps: float,
+) -> None:
+ """Internal helper that launches the pointer-based kernel into preallocated outputs.
+
+ This enables integration into frameworks like vLLM that manage their
+ own buffers and prefer in-place or out-parameter semantics.
+ """
+ assert x.is_cuda
+ assert x.dim() == 2, "Use (M, N) tensor; flatten batch/seq beforehand."
+ M, N = x.shape
+ device_index = x.get_device()
+ dtype = TORCH2CUTE_DTYPE[x.dtype]
+
+ if bias is None and residual is None and residual_out is None and rstd is None:
+ # Fast-launch path: cache packed args and update pointers/scalars in-place to
+ # avoid Python-side argument marshalling overhead that dominates small-batch cases.
+ #
+ # If fast-launch is disabled (or CuTeDSL internals changed), we fall back
+ # to calling the compiled function directly.
+ if torch.cuda.current_device() != device_index:
+ torch.cuda.set_device(device_index)
+ stream_handle = int(torch.cuda.current_stream().cuda_stream)
+ has_weight = weight is not None
+
+ stage = 1
+ compiled_key = (
+ "ptr",
+ N,
+ dtype,
+ False, # residual
+ has_weight,
+ False, # bias
+ False, # residual_out
+ False, # rstd
+ stage,
+ device_index,
+ )
+ compiled = _PTR_COMPILE_CACHE.get(compiled_key)
+ if compiled is None:
+ op = RMSNormSM100(N, dtype, stage=stage)
+ ld_val = int(x.stride(0))
+ ptr_x = rt.make_ptr(dtype, x.data_ptr(), mem_space=rt.AddressSpace.gmem, assumed_align=16)
+ ptr_out = rt.make_ptr(dtype, out.data_ptr(), mem_space=rt.AddressSpace.gmem, assumed_align=16)
+ ptr_w = (
+ rt.make_ptr(dtype, weight.data_ptr(), mem_space=rt.AddressSpace.gmem, assumed_align=16)
+ if has_weight
+ else None
+ )
+ stream = cuda.CUstream(stream_handle)
+ ld = Int32(ld_val)
+ compiled = cute.compile(
+ op.launch_from_ptrs,
+ ptr_x,
+ ptr_w,
+ None, # ptr_b
+ None, # ptr_res
+ ptr_out,
+ None, # ptr_res_out
+ None, # ptr_rstd
+ Int32(M),
+ Int32(N),
+ ld,
+ stream,
+ Float32(eps),
+ )
+ _PTR_COMPILE_CACHE[compiled_key] = compiled
+
+ launcher = _get_fast_ptr_rmsnorm_launcher(
+ compiled=compiled,
+ dtype=dtype,
+ N=N,
+ device_index=device_index,
+ stream_handle=stream_handle,
+ has_weight=has_weight,
+ eps=eps,
+ )
+ ld_val = int(x.stride(0))
+ if launcher is not None:
+ launcher.launch(x=x, weight=weight, out=out, M=M, N=N, ld=ld_val, eps=eps)
+ return
+
+ ptr_x = rt.make_ptr(dtype, x.data_ptr(), mem_space=rt.AddressSpace.gmem, assumed_align=16)
+ ptr_out = rt.make_ptr(dtype, out.data_ptr(), mem_space=rt.AddressSpace.gmem, assumed_align=16)
+ ptr_w = (
+ rt.make_ptr(dtype, weight.data_ptr(), mem_space=rt.AddressSpace.gmem, assumed_align=16)
+ if has_weight
+ else None
+ )
+ stream = cuda.CUstream(stream_handle)
+ ld = Int32(ld_val)
+ compiled(
+ ptr_x,
+ ptr_w,
+ None, # ptr_b
+ None, # ptr_res
+ ptr_out,
+ None, # ptr_res_out
+ None, # ptr_rstd
+ Int32(M),
+ Int32(N),
+ ld,
+ stream,
+ Float32(eps),
+ )
+ return
+
+ # Fallback: general path (supports bias/residual/rstd, but is slower to launch).
+ stage = 1
+ if torch.cuda.current_device() != device_index:
+ torch.cuda.set_device(device_index)
+ stream_handle = int(torch.cuda.current_stream().cuda_stream)
+ key = (
+ "ptr",
+ N,
+ dtype,
+ residual is not None,
+ weight is not None,
+ bias is not None,
+ residual_out is not None,
+ rstd is not None,
+ stage,
+ device_index,
+ )
+ compiled = _PTR_COMPILE_CACHE.get(key)
+ if compiled is None:
+ op = RMSNormSM100(N, dtype, stage=stage)
+ ptr_x = rt.make_ptr(dtype, x.data_ptr(), mem_space=rt.AddressSpace.gmem, assumed_align=16)
+ ptr_out = rt.make_ptr(dtype, out.data_ptr(), mem_space=rt.AddressSpace.gmem, assumed_align=16)
+ ptr_res = (
+ rt.make_ptr(dtype, residual.data_ptr(), mem_space=rt.AddressSpace.gmem, assumed_align=16)
+ if residual is not None
+ else None
+ )
+ ptr_res_out = (
+ rt.make_ptr(
+ dtype, residual_out.data_ptr(), mem_space=rt.AddressSpace.gmem, assumed_align=16
+ )
+ if residual_out is not None
+ else None
+ )
+ ptr_w = (
+ rt.make_ptr(dtype, weight.data_ptr(), mem_space=rt.AddressSpace.gmem, assumed_align=16)
+ if weight is not None
+ else None
+ )
+ ptr_b = (
+ rt.make_ptr(dtype, bias.data_ptr(), mem_space=rt.AddressSpace.gmem, assumed_align=16)
+ if bias is not None
+ else None
+ )
+ ptr_rstd = (
+ rt.make_ptr(
+ cutlass.Float32, rstd.data_ptr(), mem_space=rt.AddressSpace.gmem, assumed_align=4
+ )
+ if rstd is not None
+ else None
+ )
+ stream = cuda.CUstream(stream_handle)
+ ld = Int32(int(x.stride(0)))
+ compiled = cute.compile(
+ op.launch_from_ptrs,
+ ptr_x,
+ ptr_w,
+ ptr_b,
+ ptr_res,
+ ptr_out,
+ ptr_res_out,
+ ptr_rstd,
+ Int32(M),
+ Int32(N),
+ ld,
+ stream,
+ Float32(eps),
+ )
+ _PTR_COMPILE_CACHE[key] = compiled
+ ptr_x = rt.make_ptr(dtype, x.data_ptr(), mem_space=rt.AddressSpace.gmem, assumed_align=16)
+ ptr_out = rt.make_ptr(dtype, out.data_ptr(), mem_space=rt.AddressSpace.gmem, assumed_align=16)
+ ptr_res = (
+ rt.make_ptr(dtype, residual.data_ptr(), mem_space=rt.AddressSpace.gmem, assumed_align=16)
+ if residual is not None
+ else None
+ )
+ ptr_res_out = (
+ rt.make_ptr(dtype, residual_out.data_ptr(), mem_space=rt.AddressSpace.gmem, assumed_align=16)
+ if residual_out is not None
+ else None
+ )
+ ptr_w = (
+ rt.make_ptr(dtype, weight.data_ptr(), mem_space=rt.AddressSpace.gmem, assumed_align=16)
+ if weight is not None
+ else None
+ )
+ ptr_b = (
+ rt.make_ptr(dtype, bias.data_ptr(), mem_space=rt.AddressSpace.gmem, assumed_align=16)
+ if bias is not None
+ else None
+ )
+ ptr_rstd = (
+ rt.make_ptr(cutlass.Float32, rstd.data_ptr(), mem_space=rt.AddressSpace.gmem, assumed_align=4)
+ if rstd is not None
+ else None
+ )
+ stream = cuda.CUstream(stream_handle)
+ ld = Int32(int(x.stride(0)))
+ compiled(
+ ptr_x,
+ ptr_w,
+ ptr_b,
+ ptr_res,
+ ptr_out,
+ ptr_res_out,
+ ptr_rstd,
+ Int32(M),
+ Int32(N),
+ ld,
+ stream,
+ Float32(eps),
+ )
+
+
+def _fused_add_rmsnorm_forward_ptr_inplace(
+ x: Tensor,
+ residual: Tensor,
+ weight: Tensor,
+ eps: float,
+) -> None:
+ """Pointer-based fused_add_rmsnorm that updates `x` and `residual` in-place."""
+ assert x.is_cuda
+ assert x.dim() == 2
+ assert residual.is_cuda
+ assert residual.dim() == 2
+ assert x.shape == residual.shape
+
+ M, N = x.shape
+ device_index = x.get_device()
+ dtype = TORCH2CUTE_DTYPE[x.dtype]
+ stage = 1
+
+ if torch.cuda.current_device() != device_index:
+ torch.cuda.set_device(device_index)
+ stream_handle = int(torch.cuda.current_stream().cuda_stream)
+
+ # Latency-optimized schedule for small-M cases: avoid the gmem->smem
+ # staging path (large dynamic smem + extra barriers) and load directly
+ # from gmem into registers.
+ copy_bits = 128
+ # Use a direct-GMEM schedule (no staging SMEM tiles) for DSv3 hidden size
+ # (7168, bf16/fp16). This improves both:
+ # - small-M latency (fewer barriers + less dynamic shared memory), and
+ # - large-M bandwidth (lower overhead, better vectorization when 32B-aligned).
+ #
+ # This is a policy decision: it is tuned for DSv3's N=7168. If you want to
+ # benchmark other models/shapes, you can override it with:
+ # - OINK_RMSNORM_DIRECT_GMEM=0 (force staging/cp.async path)
+ # - OINK_RMSNORM_DIRECT_GMEM=1 (force direct-gmem path)
+ direct_gmem = _direct_gmem_from_policy(default=bool(dtype.width == 16 and N == 7168))
+ use_async = not direct_gmem
+ tpr_override: Optional[int] = None
+ nt_override: Optional[int] = None
+ cluster_n_override: Optional[int] = None
+ direct_gmem_max_copy_bits: Optional[int] = None
+
+ # Experimental stage-2 cp.async path (2-tile ping-pong) for N=7168. This is
+ # primarily about improving memory-latency hiding / reducing long-scoreboard
+ # stalls for large-M workloads.
+ if _ENABLE_STAGE2 and dtype.width == 16 and N == 7168 and M >= 4096:
+ stage = 2
+ direct_gmem = False
+ use_async = True
+
+ # Experimental ILP variant (clusters): split each row across 2 CTAs.
+ #
+ # NOTE: This is currently opt-in because some CuTeDSL builds exhibit
+ # instability with cluster launches for this specific schedule. To reduce
+ # the chance of accidental crashes, we require an additional explicit
+ # opt-in via `OINK_RMSNORM_ENABLE_CLUSTER_ILP_UNSAFE=1`.
+ if _ENABLE_CLUSTER_ILP and not _ENABLE_STAGE2:
+ if dtype.width == 16 and N == 7168 and M >= 4096:
+ cluster_n_override = 2
+ if direct_gmem:
+ # Cluster launches + direct-GMEM has exhibited reproducible compiler
+ # instability (segfaults) in some CuTeDSL builds, especially for the
+ # 256b vector path. Probe it out-of-process once so we can safely
+ # select a working copy width (or fall back to the staged SMEM path)
+ # instead of crashing the parent process.
+ max_bits = _probe_cluster_direct_gmem_max_copy_bits()
+ if max_bits == 0:
+ direct_gmem = False
+ use_async = True
+ else:
+ direct_gmem_max_copy_bits = max_bits
+
+ # Experimental per-row partitioning: use 256 threads/row for N=7168 to
+ # increase concurrency/ILP (accepts a small tail-predicate region).
+ if _ENABLE_TPR256 and cluster_n_override is None and not _ENABLE_STAGE2:
+ if dtype.width == 16 and N == 7168 and M >= 4096:
+ tpr_override = 256
+ nt_override = 256
+
+ can_use_256 = bool(
+ direct_gmem
+ and (
+ direct_gmem_max_copy_bits is None
+ or direct_gmem_max_copy_bits >= 256
+ )
+ and dtype.width == 16
+ and (x.data_ptr() % 32) == 0
+ and (residual.data_ptr() % 32) == 0
+ and (weight.data_ptr() % 32) == 0
+ )
+ assumed_align = 32 if can_use_256 else 16
+ if can_use_256:
+ copy_bits = 256
+
+ copy_bits = _copy_bits_from_policy(default=copy_bits, can_use_256=can_use_256)
+ if copy_bits == 128:
+ assumed_align = 16
+ elif copy_bits == 256 and can_use_256:
+ assumed_align = 32
+ else:
+ copy_bits = 128
+ assumed_align = 16
+
+ key = (
+ "ptr_fused_add_inplace",
+ N,
+ dtype,
+ stage,
+ device_index,
+ copy_bits,
+ use_async,
+ tpr_override,
+ nt_override,
+ direct_gmem,
+ cluster_n_override,
+ )
+ compiled = _PTR_COMPILE_CACHE.get(key)
+ if compiled is None:
+ op = RMSNormSM100(
+ N,
+ dtype,
+ stage=stage,
+ copy_bits=copy_bits,
+ use_async=use_async,
+ direct_gmem=direct_gmem,
+ )
+ if tpr_override is not None:
+ op._tpr_override = tpr_override # type: ignore[attr-defined]
+ if nt_override is not None:
+ op._nt_override = nt_override # type: ignore[attr-defined]
+ if cluster_n_override is not None:
+ op._cluster_n_override = cluster_n_override # type: ignore[attr-defined]
+ ptr_x = rt.make_ptr(
+ dtype, x.data_ptr(), mem_space=rt.AddressSpace.gmem, assumed_align=assumed_align
+ )
+ ptr_res = rt.make_ptr(
+ dtype, residual.data_ptr(), mem_space=rt.AddressSpace.gmem, assumed_align=assumed_align
+ )
+ ptr_w = rt.make_ptr(
+ dtype, weight.data_ptr(), mem_space=rt.AddressSpace.gmem, assumed_align=assumed_align
+ )
+ stream = cuda.CUstream(stream_handle)
+ ld_x = Int32(int(x.stride(0)))
+ compiled = cute.compile(
+ op.launch_from_ptrs_fused_add_inplace,
+ ptr_x,
+ ptr_w,
+ ptr_res,
+ Int32(M),
+ Int32(N),
+ ld_x,
+ stream,
+ Float32(eps),
+ )
+ _PTR_COMPILE_CACHE[key] = compiled
+ launcher = _get_fast_ptr_fused_add_rmsnorm_launcher(
+ compiled=compiled,
+ dtype=dtype,
+ N=N,
+ device_index=device_index,
+ stream_handle=stream_handle,
+ copy_bits=copy_bits,
+ use_async=use_async,
+ tpr=tpr_override or 0,
+ direct_gmem=direct_gmem,
+ assumed_align=assumed_align,
+ eps=eps,
+ )
+ if launcher is not None:
+ launcher.launch(
+ x=x,
+ weight=weight,
+ residual=residual,
+ M=M,
+ N=N,
+ ld_x=int(x.stride(0)),
+ eps=eps,
+ )
+ return
+
+ # Fast-launch is disabled/unavailable (or CuTeDSL internals changed). Fall back
+ # to calling the compiled function directly.
+ ptr_x = rt.make_ptr(dtype, x.data_ptr(), mem_space=rt.AddressSpace.gmem, assumed_align=assumed_align)
+ ptr_res = rt.make_ptr(
+ dtype, residual.data_ptr(), mem_space=rt.AddressSpace.gmem, assumed_align=assumed_align
+ )
+ ptr_w = rt.make_ptr(
+ dtype, weight.data_ptr(), mem_space=rt.AddressSpace.gmem, assumed_align=assumed_align
+ )
+ stream = cuda.CUstream(stream_handle)
+ ld_x = Int32(int(x.stride(0)))
+ compiled(ptr_x, ptr_w, ptr_res, Int32(M), Int32(N), ld_x, stream, Float32(eps))
+
+
+# -------------------------
+# Public API (forward + verify)
+# -------------------------
+
+
+def rmsnorm_forward(
+ x: Tensor,
+ weight: Optional[Tensor] = None,
+ bias: Optional[Tensor] = None,
+ residual: Optional[Tensor] = None,
+ eps: float = 1e-6,
+ store_rstd: bool = False,
+) -> Tuple[Tensor, Optional[Tensor], Optional[Tensor]]:
+ assert x.is_cuda
+ assert x.dim() == 2, "Use (M, N) tensor; flatten batch/seq beforehand."
+ M, N = x.shape
+ dtype = TORCH2CUTE_DTYPE[x.dtype]
+
+ # For DSv3 big-M outliers on SM100, keep using the dedicated
+ # stage-2 K-loop implementation, which is already tuned and
+ # parity-checked against the reference.
+ use_stage2_big_dsv3 = bool(
+ M >= 65536 and N in (6144, 8192) and x.dtype in (torch.float16, torch.bfloat16)
+ )
+ if use_stage2_big_dsv3:
+ try:
+ import rmsnorm_with_stage2 as rms2 # type: ignore[import-not-found]
+ except Exception:
+ rms2 = None # type: ignore[assignment]
+ if rms2 is not None:
+ y, rstd, residual_out = rms2.rmsnorm_forward_with_stage2(
+ x, weight=weight, bias=bias, residual=residual, eps=eps, store_rstd=store_rstd
+ )
+ # Preserve stride contracts for torch.compile consistency, even
+ # when using the optional stage-2 implementation.
+ if y.stride() != x.stride():
+ y_strided = torch.empty_strided(
+ x.shape, x.stride(), device=x.device, dtype=x.dtype
+ )
+ y_strided.copy_(y)
+ y = y_strided
+ if residual is not None and residual_out is not None:
+ if residual_out.stride() != residual.stride():
+ residual_out_strided = torch.empty_strided(
+ residual.shape,
+ residual.stride(),
+ device=residual.device,
+ dtype=residual.dtype,
+ )
+ residual_out_strided.copy_(residual_out)
+ residual_out = residual_out_strided
+ return y, rstd, residual_out
+
+    # Default: use the pointer-based entry whenever we can represent the
+    # inputs as a row-major [M, N] view with stride(1) == 1. For the rare
+    # layouts we cannot express safely without DLPack, fall back to the
+    # PyTorch reference implementation.
+ if _can_use_ptr_path(x, weight, bias, residual):
+ return _rmsnorm_forward_ptr(x, weight, bias, residual, eps, store_rstd)
+
+ # Safe fallback (correctness-first). This is expected to be rare in vLLM.
+ y = rmsnorm_ref(x, weight, bias, residual, eps)
+ # Preserve the input stride contract even on the fallback path so
+ # torch.compile sees a consistent output layout across all branches.
+ if y.stride() != x.stride():
+ y_strided = torch.empty_strided(x.shape, x.stride(), device=x.device, dtype=x.dtype)
+ y_strided.copy_(y)
+ y = y_strided
+ rstd = None
+ if store_rstd:
+ xf = x.float()
+ if residual is not None:
+ xf = xf + residual.float()
+ rstd = torch.rsqrt(xf.square().mean(dim=-1) + eps).to(torch.float32)
+ residual_out = None
+ if residual is not None:
+ residual_out = (x.float() + residual.float()).to(x.dtype)
+ if residual_out.stride() != residual.stride():
+ residual_out_strided = torch.empty_strided(
+ residual.shape,
+ residual.stride(),
+ device=residual.device,
+ dtype=residual.dtype,
+ )
+ residual_out_strided.copy_(residual_out)
+ residual_out = residual_out_strided
+ return y, rstd, residual_out
+
+
+def rmsnorm_ref(
+ x: Tensor,
+ w: Optional[Tensor] = None,
+ b: Optional[Tensor] = None,
+ residual: Optional[Tensor] = None,
+ eps: float = 1e-6,
+) -> Tensor:
+ xf = x.float()
+ if residual is not None:
+ xf = xf + residual.float()
+ rstd = torch.rsqrt(xf.square().mean(dim=-1, keepdim=True) + eps)
+ y = xf * rstd
+ if w is not None:
+ y = y * w.float()
+ if b is not None:
+ y = y + b.float()
+ return y.to(x.dtype)
+
+
+def fused_add_rmsnorm_forward(
+ x: Tensor,
+ residual: Tensor,
+ weight: Tensor,
+ eps: float = 1e-6,
+) -> Tuple[Tensor, Tensor]:
+    """Fused residual-add + RMSNorm for SM100 in CuTeDSL.
+
+ This is a convenience wrapper around ``rmsnorm_forward`` that matches the
+ semantics of vLLM's ``fused_add_rms_norm``:
+
+ z = x + residual
+ y = RMSNorm(z, weight, eps)
+
+ It returns ``(y, z)`` where ``z`` has the same dtype/shape as the inputs.
+ """
+ assert x.is_cuda and residual.is_cuda
+ assert x.shape == residual.shape
+ assert x.dtype == residual.dtype
+
+ orig_shape = x.shape
+ N = orig_shape[-1]
+
+ x_2d = x.view(-1, N)
+ res_2d = residual.view(-1, N)
+
+ y_2d, _rstd, z_2d = rmsnorm_forward(
+ x_2d,
+ weight=weight,
+ bias=None,
+ residual=res_2d,
+ eps=eps,
+ store_rstd=False,
+ )
+
+ y = y_2d.view(orig_shape)
+ z = z_2d.view(orig_shape)
+ return y, z
+
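+# Usage sketch (illustrative; requires an SM100 GPU and the CuTeDSL stack):
+#
+#   x = torch.randn(32, 7168, device="cuda", dtype=torch.bfloat16)
+#   res = torch.randn_like(x)
+#   w = torch.randn(7168, device="cuda", dtype=torch.bfloat16)
+#   y, z = fused_add_rmsnorm_forward(x, res, w, eps=1e-6)
+#   torch.testing.assert_close(z, (x.float() + res.float()).to(x.dtype))
+#   torch.testing.assert_close(y, rmsnorm_ref(x, w, residual=res), rtol=2e-2, atol=2e-2)
+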
+
+def fused_add_rmsnorm_forward_inplace(
+ x: Tensor,
+ residual: Tensor,
+ weight: Tensor,
+ eps: float = 1e-6,
+) -> Tuple[Tensor, Tensor]:
+ """In-place fused residual-add + RMSNorm matching vLLM semantics.
+
+ This variant writes:
+
+ z = x + residual (stored into ``residual``)
+ y = RMSNorm(z, w) (stored into ``x``)
+
+ i.e., it uses ``x`` as the normalized output buffer and ``residual`` as
+ the residual-out buffer, mirroring vLLM's fused_add_rms_norm kernel.
+ """
+ fused_add_rmsnorm_inplace_(x, residual, weight, eps=eps)
+ return x, residual
+
+
+def fused_add_rmsnorm_inplace_(
+ x: Tensor,
+ residual: Tensor,
+ weight: Tensor,
+ eps: float = 1e-6,
+) -> None:
+ """In-place fused residual-add + RMSNorm matching vLLM semantics.
+
+ This is the lowest-overhead Python entrypoint (returns `None`) intended
+ for performance-critical call sites like `torch.ops.oink.fused_add_rms_norm`.
+ """
+ assert x.is_cuda and residual.is_cuda
+ assert x.shape == residual.shape
+ assert x.dtype == residual.dtype
+
+ N = x.shape[-1]
+ x_2d = x if x.dim() == 2 else x.view(-1, N)
+ res_2d = residual if residual.dim() == 2 else residual.view(-1, N)
+
+ # Fast path: vLLM-compatible layout where x may be strided/padded but
+ # residual is contiguous. This updates both tensors in-place without
+ # additional allocations.
+ if _can_use_ptr_path_fused_add_inplace(x_2d, weight, res_2d):
+ _fused_add_rmsnorm_forward_ptr_inplace(x_2d, res_2d, weight, eps)
+ return None
+
+ # Fallback: allocate via the regular fused path, then copy results into
+ # the user-provided buffers so that semantics remain identical.
+ y, z = fused_add_rmsnorm_forward(x, residual, weight, eps)
+ x.copy_(y)
+ residual.copy_(z)
+ return None
+
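+# In-place usage sketch (illustrative): both buffers are mutated, matching
+# torch.ops.oink.fused_add_rms_norm semantics.
+#
+#   x = torch.randn(8, 7168, device="cuda", dtype=torch.bfloat16)
+#   res = torch.randn_like(x)
+#   w = torch.randn(7168, device="cuda", dtype=torch.bfloat16)
+#   x_ref = rmsnorm_ref(x, w, residual=res)
+#   z_ref = (x.float() + res.float()).to(x.dtype)
+#   fused_add_rmsnorm_inplace_(x, res, w)      # mutates x and res
+#   torch.testing.assert_close(res, z_ref)
+#   torch.testing.assert_close(x, x_ref, rtol=2e-2, atol=2e-2)
+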
+
+if __name__ == "__main__":
+ # Minimal ad-hoc test (functionality only). For performance comparisons, use the benchmark harness.
+ if not torch.cuda.is_available():
+ print("CUDA not available; functional test skipped.")
+ sys.exit(0)
+ M, N = 1024, 8192
+ dtype = torch.bfloat16
+ x = torch.randn(M, N, device="cuda", dtype=dtype)
+ w = torch.randn(N, device="cuda", dtype=dtype)
+ y_ref = rmsnorm_ref(x, w)
+ y, _, _ = rmsnorm_forward(x, w)
+ torch.testing.assert_close(y, y_ref, rtol=1e-3, atol=1e-3)
+ print("RMSNormSM100 correctness check passed.")
+
+# (compile cache moved to top)
From 3003c1398f700cf13095eb40f1e5bab84de013f7 Mon Sep 17 00:00:00 2001
From: Laura Wang <3700467+Laurawly@users.noreply.github.com>
Date: Tue, 6 Jan 2026 12:25:31 -0800
Subject: [PATCH 2/8] Fix oink ruff lint and add license headers
---
oink/src/kernelagent_oink/__init__.py | 14 ++++++++
.../kernelagent_oink/blackwell/__init__.py | 14 ++++++++
.../kernelagent_oink/blackwell/lite_quack.py | 20 ++++++++---
.../blackwell/oink_custom_ops.py | 16 ++++++++-
.../src/kernelagent_oink/blackwell/rmsnorm.py | 33 +++++++++++++------
5 files changed, 82 insertions(+), 15 deletions(-)
diff --git a/oink/src/kernelagent_oink/__init__.py b/oink/src/kernelagent_oink/__init__.py
index 542e59e..d9f25d0 100644
--- a/oink/src/kernelagent_oink/__init__.py
+++ b/oink/src/kernelagent_oink/__init__.py
@@ -1,3 +1,17 @@
+# Copyright (c) Meta Platforms, Inc. and affiliates.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
from __future__ import annotations
import logging
diff --git a/oink/src/kernelagent_oink/blackwell/__init__.py b/oink/src/kernelagent_oink/blackwell/__init__.py
index 4d21ee8..a92109a 100644
--- a/oink/src/kernelagent_oink/blackwell/__init__.py
+++ b/oink/src/kernelagent_oink/blackwell/__init__.py
@@ -1,3 +1,17 @@
+# Copyright (c) Meta Platforms, Inc. and affiliates.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
from __future__ import annotations
__all__ = []
diff --git a/oink/src/kernelagent_oink/blackwell/lite_quack.py b/oink/src/kernelagent_oink/blackwell/lite_quack.py
index 3c3f750..8c05b47 100644
--- a/oink/src/kernelagent_oink/blackwell/lite_quack.py
+++ b/oink/src/kernelagent_oink/blackwell/lite_quack.py
@@ -1,3 +1,17 @@
+# Copyright (c) Meta Platforms, Inc. and affiliates.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
"""
Lightweight local clone of the small subset of helpers that the SM100
RMSNorm CuteDSL kernels depend on.
@@ -12,9 +26,8 @@
import math
import operator
-from typing import Callable, Optional, Tuple
+from typing import Callable, Optional
-import cuda.bindings.driver as cuda # type: ignore
import torch
from torch import Tensor
@@ -23,7 +36,7 @@
from cutlass import Float32, Int32, const_expr
from cutlass.cute.runtime import from_dlpack
from cutlass.cutlass_dsl import T, dsl_user_op
-from cutlass._mlir.dialects import llvm, nvvm, vector
+from cutlass._mlir.dialects import llvm
# -------------------------
@@ -347,4 +360,3 @@ def get_sm_count(N: int, device: torch.device) -> int:
sm_count * sm_count_multiple if N <= 8192 else sm_count // 2 if N <= 16384 else sm_count * 2
)
return sm_count
-
diff --git a/oink/src/kernelagent_oink/blackwell/oink_custom_ops.py b/oink/src/kernelagent_oink/blackwell/oink_custom_ops.py
index 8225025..92423d9 100644
--- a/oink/src/kernelagent_oink/blackwell/oink_custom_ops.py
+++ b/oink/src/kernelagent_oink/blackwell/oink_custom_ops.py
@@ -1,4 +1,16 @@
-from __future__ import annotations
+# Copyright (c) Meta Platforms, Inc. and affiliates.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
"""
Torch custom ops wrapping Oink's Blackwell RMSNorm kernels.
@@ -26,6 +38,8 @@
Mutates `x` and `residual` in-place and returns None.
"""
+from __future__ import annotations
+
import importlib
import threading
diff --git a/oink/src/kernelagent_oink/blackwell/rmsnorm.py b/oink/src/kernelagent_oink/blackwell/rmsnorm.py
index d6c2c20..a77938c 100644
--- a/oink/src/kernelagent_oink/blackwell/rmsnorm.py
+++ b/oink/src/kernelagent_oink/blackwell/rmsnorm.py
@@ -1,3 +1,17 @@
+# Copyright (c) Meta Platforms, Inc. and affiliates.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
"""
RMSNorm kernel for SM100 (Blackwell) in CuteDSL.
@@ -53,15 +67,15 @@
"(`cutlass`, typically provided by `nvidia-cutlass-dsl`)."
) from e
-import torch
-from torch import Tensor
+import torch # noqa: E402
+from torch import Tensor # noqa: E402
-import cuda.bindings.driver as cuda # provided by NVIDIA cuda-python
+import cuda.bindings.driver as cuda # provided by NVIDIA cuda-python # noqa: E402
-import cutlass
-import cutlass.cute as cute
-from cutlass import Float32, Int32, const_expr
-from cutlass.cute import runtime as rt
+import cutlass # noqa: E402
+import cutlass.cute as cute # noqa: E402
+from cutlass import Float32, Int32, const_expr # noqa: E402
+from cutlass.cute import runtime as rt # noqa: E402
# Simple compile cache declared early so direct execution works
_PTR_COMPILE_CACHE = {}
@@ -862,8 +876,8 @@ def _get_fast_ptr_fused_add_rmsnorm_launcher(
#
# NOTE: Avoid `from . import ...` imports here: CuTeDSL's AST preprocessor may
# mishandle that form (module=None in the AST). Use fully-qualified imports.
-from kernelagent_oink.blackwell import lite_quack as qutils
-from kernelagent_oink.blackwell.lite_quack import TORCH2CUTE_DTYPE, row_reduce
+from kernelagent_oink.blackwell import lite_quack as qutils # noqa: E402
+from kernelagent_oink.blackwell.lite_quack import TORCH2CUTE_DTYPE, row_reduce # noqa: E402
# -------------------------
@@ -2458,7 +2472,6 @@ def rmsnorm_forward(
assert x.is_cuda
assert x.dim() == 2, "Use (M, N) tensor; flatten batch/seq beforehand."
M, N = x.shape
- dtype = TORCH2CUTE_DTYPE[x.dtype]
# For DSv3 big-M outliers on SM100, keep using the dedicated
# stage-2 K-loop implementation, which is already tuned and
From 1468088ecab42c675d8e4d9e0bf465bd5f68644a Mon Sep 17 00:00:00 2001
From: Laura Wang <3700467+Laurawly@users.noreply.github.com>
Date: Tue, 6 Jan 2026 12:27:54 -0800
Subject: [PATCH 3/8] Format oink with ruff
---
oink/src/kernelagent_oink/__init__.py | 4 +-
.../kernelagent_oink/blackwell/lite_quack.py | 41 +-
.../blackwell/oink_custom_ops.py | 6 +-
.../src/kernelagent_oink/blackwell/rmsnorm.py | 472 ++++++++++++++----
4 files changed, 403 insertions(+), 120 deletions(-)
diff --git a/oink/src/kernelagent_oink/__init__.py b/oink/src/kernelagent_oink/__init__.py
index d9f25d0..bbbd7c1 100644
--- a/oink/src/kernelagent_oink/__init__.py
+++ b/oink/src/kernelagent_oink/__init__.py
@@ -94,7 +94,9 @@ def register() -> None:
# Ensure CuTeDSL sees a target arch early. If the user has already set it,
# respect their choice.
- os.environ.setdefault("CUTE_DSL_ARCH", _compute_cutedsl_arch(int(major), int(minor)))
+ os.environ.setdefault(
+ "CUTE_DSL_ARCH", _compute_cutedsl_arch(int(major), int(minor))
+ )
# Import registers the ops via torch.library.custom_op decorators.
from .blackwell import oink_custom_ops # noqa: F401
diff --git a/oink/src/kernelagent_oink/blackwell/lite_quack.py b/oink/src/kernelagent_oink/blackwell/lite_quack.py
index 8c05b47..14ae723 100644
--- a/oink/src/kernelagent_oink/blackwell/lite_quack.py
+++ b/oink/src/kernelagent_oink/blackwell/lite_quack.py
@@ -54,6 +54,7 @@
# Tensor conversion helpers
# -------------------------
+
def convert_from_dlpack(
x: Tensor,
leading_dim: int,
@@ -82,7 +83,9 @@ def convert_from_dlpack(
@dsl_user_op
-def elem_pointer(x: cute.Tensor, coord: cute.Coord, *, loc=None, ip=None) -> cute.Pointer:
+def elem_pointer(
+ x: cute.Tensor, coord: cute.Coord, *, loc=None, ip=None
+) -> cute.Pointer:
return x.iterator + cute.crd2idx(coord, x.layout, loc=loc, ip=ip)
@@ -133,7 +136,9 @@ def store_shared_remote(
).ir_value()
if const_expr(isinstance(val, float)):
val = Float32(val)
- assert isinstance(val, (Float32, Int32, cutlass.Int64)), "val must be Float32, Int32, or Int64"
+ assert isinstance(val, (Float32, Int32, cutlass.Int64)), (
+ "val must be Float32, Int32, or Int64"
+ )
suffix = {Float32: "f32", Int32: "s32", cutlass.Int64: "s64"}[type(val)]
constraint = {Float32: "f", Int32: "r", cutlass.Int64: "l"}[type(val)]
llvm.inline_asm(
@@ -155,19 +160,27 @@ def predicate_k(tAcA: cute.Tensor, limit: cutlass.Int32) -> cute.Tensor:
"""
tApA = cute.make_fragment(
cute.make_layout(
- (cute.size(tAcA, mode=[0, 1]), cute.size(tAcA, mode=[1]), cute.size(tAcA, mode=[2])),
+ (
+ cute.size(tAcA, mode=[0, 1]),
+ cute.size(tAcA, mode=[1]),
+ cute.size(tAcA, mode=[2]),
+ ),
stride=(cute.size(tAcA, mode=[2]), 0, 1),
),
cutlass.Boolean,
)
for rest_v in cutlass.range_constexpr(tApA.shape[0]):
for rest_k in cutlass.range_constexpr(tApA.shape[2]):
- tApA[rest_v, 0, rest_k] = cute.elem_less(tAcA[(0, rest_v), 0, rest_k][1], limit)
+ tApA[rest_v, 0, rest_k] = cute.elem_less(
+ tAcA[(0, rest_v), 0, rest_k][1], limit
+ )
return tApA
@dsl_user_op
-def domain_offset_i64(coord: cute.Coord, tensor: cute.Tensor, *, loc=None, ip=None) -> cute.Tensor:
+def domain_offset_i64(
+ coord: cute.Coord, tensor: cute.Tensor, *, loc=None, ip=None
+) -> cute.Tensor:
"""
Return a tensor whose iterator is offset by an Int64 byte offset
computed from `coord` and the tensor's strides.
@@ -287,7 +300,9 @@ def block_or_cluster_reduce(
"""Dispatch between block or cluster reduction depending on mbar_ptr."""
if cutlass.const_expr(mbar_ptr is None):
return block_reduce(val, op, reduction_buffer, init_val=init_val)
- return cluster_reduce(val, op, reduction_buffer, mbar_ptr, init_val=init_val, phase=phase)
+ return cluster_reduce(
+ val, op, reduction_buffer, mbar_ptr, init_val=init_val, phase=phase
+ )
@cute.jit
@@ -313,7 +328,9 @@ def row_reduce(
val = x
warp_op = {
cute.ReductionOp.ADD: operator.add,
- cute.ReductionOp.MAX: cute.arch.fmax if cutlass.const_expr(x.dtype == Float32) else max,
+ cute.ReductionOp.MAX: cute.arch.fmax
+ if cutlass.const_expr(x.dtype == Float32)
+ else max,
cute.ReductionOp.MIN: min,
cute.ReductionOp.MUL: operator.mul,
}[op]
@@ -353,10 +370,16 @@ def get_sm_count(N: int, device: torch.device) -> int:
RMSNorm kernels but lives entirely in this local module.
"""
sm_count_multiple = (
- 16 if N <= 256 else (8 if N <= 1024 else (4 if N <= 2048 else (2 if N <= 4096 else 1)))
+ 16
+ if N <= 256
+ else (8 if N <= 1024 else (4 if N <= 2048 else (2 if N <= 4096 else 1)))
)
sm_count = torch.cuda.get_device_properties(device).multi_processor_count
sm_count = (
- sm_count * sm_count_multiple if N <= 8192 else sm_count // 2 if N <= 16384 else sm_count * 2
+ sm_count * sm_count_multiple
+ if N <= 8192
+ else sm_count // 2
+ if N <= 16384
+ else sm_count * 2
)
return sm_count
diff --git a/oink/src/kernelagent_oink/blackwell/oink_custom_ops.py b/oink/src/kernelagent_oink/blackwell/oink_custom_ops.py
index 92423d9..a96a4c7 100644
--- a/oink/src/kernelagent_oink/blackwell/oink_custom_ops.py
+++ b/oink/src/kernelagent_oink/blackwell/oink_custom_ops.py
@@ -80,6 +80,7 @@ def _get_sm(device: torch.device | None = None) -> int:
# RMSNorm (functional)
#
+
@custom_op("oink::rmsnorm", mutates_args=())
def oink_rmsnorm(
x: torch.Tensor,
@@ -158,6 +159,7 @@ def oink_rmsnorm_fake(
# Fused residual-add + RMSNorm (in-place, vLLM semantics)
#
+
@custom_op("oink::fused_add_rms_norm", mutates_args=("x", "residual"))
def oink_fused_add_rms_norm(
x: torch.Tensor,
@@ -174,7 +176,9 @@ def oink_fused_add_rms_norm(
Returns:
None (mutates `x` and `residual` in-place).
"""
- assert x.is_cuda and residual.is_cuda, "oink::fused_add_rms_norm requires CUDA tensors"
+ assert x.is_cuda and residual.is_cuda, (
+ "oink::fused_add_rms_norm requires CUDA tensors"
+ )
assert x.shape == residual.shape, "x and residual must have the same shape"
assert x.dtype == residual.dtype, "x and residual must have the same dtype"
assert weight.dim() == 1, "weight must be 1D [N]"
diff --git a/oink/src/kernelagent_oink/blackwell/rmsnorm.py b/oink/src/kernelagent_oink/blackwell/rmsnorm.py
index a77938c..c7fc1b3 100644
--- a/oink/src/kernelagent_oink/blackwell/rmsnorm.py
+++ b/oink/src/kernelagent_oink/blackwell/rmsnorm.py
@@ -84,6 +84,7 @@
# pointer/scalar storage so concurrent callers don't race on in-place updates.
_PTR_FAST_LAUNCH_TLS = threading.local()
+
def _env_flag(name: str, default: bool) -> bool:
val = os.environ.get(name)
if val is None:
@@ -100,10 +101,16 @@ def _env_flag(name: str, default: bool) -> bool:
# Fused-add RMSNorm schedule knobs (read once at import time; set env vars before
# importing this module if you want to override).
-_DIRECT_GMEM_POLICY = (os.environ.get("OINK_RMSNORM_DIRECT_GMEM", "auto").strip().lower() or "auto")
-_COPY_BITS_POLICY = (os.environ.get("OINK_RMSNORM_COPY_BITS", "auto").strip().lower() or "auto")
+_DIRECT_GMEM_POLICY = (
+ os.environ.get("OINK_RMSNORM_DIRECT_GMEM", "auto").strip().lower() or "auto"
+)
+_COPY_BITS_POLICY = (
+ os.environ.get("OINK_RMSNORM_COPY_BITS", "auto").strip().lower() or "auto"
+)
_ENABLE_CLUSTER_ILP = _env_flag("OINK_RMSNORM_ENABLE_CLUSTER_ILP", default=False)
-_ENABLE_CLUSTER_ILP_UNSAFE = _env_flag("OINK_RMSNORM_ENABLE_CLUSTER_ILP_UNSAFE", default=False)
+_ENABLE_CLUSTER_ILP_UNSAFE = _env_flag(
+ "OINK_RMSNORM_ENABLE_CLUSTER_ILP_UNSAFE", default=False
+)
_ENABLE_TPR256 = _env_flag("OINK_RMSNORM_ENABLE_TPR256", default=False)
_ENABLE_STAGE2 = _env_flag("OINK_RMSNORM_ENABLE_STAGE2", default=False)
@@ -252,6 +259,7 @@ def run_probe(copy_bits: int, assumed_align: int):
_CLUSTER_DIRECT_GMEM_MAX_COPY_BITS = max_bits
return max_bits
+
def _parse_version_tuple(version: str) -> Tuple[int, int, int]:
parts = version.split(".")
nums: list[int] = []
@@ -275,7 +283,9 @@ def _cutlass_dsl_version() -> Optional[Tuple[int, int, int]]:
# passing Layout/Shape/Constexpr objects into @cute.kernel functions). Keep the
# older signature for 4.3.2, but switch to a 4.3.4-compatible signature when we
# detect 4.3.4+ (or when version detection is unavailable).
-_KERNEL_ACCEPTS_LAYOUT_ARGS = _CUTLASS_DSL_VERSION is not None and _CUTLASS_DSL_VERSION < (4, 3, 4)
+_KERNEL_ACCEPTS_LAYOUT_ARGS = (
+ _CUTLASS_DSL_VERSION is not None and _CUTLASS_DSL_VERSION < (4, 3, 4)
+)
if _ENABLE_CLUSTER_ILP and not _ENABLE_CLUSTER_ILP_UNSAFE:
# We have observed reproducible segfaults in some CuTeDSL builds when using
@@ -427,7 +437,9 @@ def launch(
self._last_x_ptr = x_ptr
except AttributeError:
self._disable_fast_launch()
- self._fallback_launch(x=x, weight=weight, out=out, M=M, N=N, ld=ld, eps=eps)
+ self._fallback_launch(
+ x=x, weight=weight, out=out, M=M, N=N, ld=ld, eps=eps
+ )
return
if self._ptr_w is not None:
@@ -438,7 +450,9 @@ def launch(
self._last_w_ptr = w_ptr
except AttributeError:
self._disable_fast_launch()
- self._fallback_launch(x=x, weight=weight, out=out, M=M, N=N, ld=ld, eps=eps)
+ self._fallback_launch(
+ x=x, weight=weight, out=out, M=M, N=N, ld=ld, eps=eps
+ )
return
out_ptr = out.data_ptr()
@@ -448,7 +462,9 @@ def launch(
self._last_out_ptr = out_ptr
except AttributeError:
self._disable_fast_launch()
- self._fallback_launch(x=x, weight=weight, out=out, M=M, N=N, ld=ld, eps=eps)
+ self._fallback_launch(
+ x=x, weight=weight, out=out, M=M, N=N, ld=ld, eps=eps
+ )
return
if M != self._last_m:
@@ -492,10 +508,19 @@ def _fallback_launch(
# If the packed-args or runtime pointer mutation path stops working
# (e.g. due to a CuTeDSL upgrade), fall back to the regular call path.
dtype = TORCH2CUTE_DTYPE[x.dtype]
- ptr_x = rt.make_ptr(dtype, x.data_ptr(), mem_space=rt.AddressSpace.gmem, assumed_align=16)
- ptr_out = rt.make_ptr(dtype, out.data_ptr(), mem_space=rt.AddressSpace.gmem, assumed_align=16)
+ ptr_x = rt.make_ptr(
+ dtype, x.data_ptr(), mem_space=rt.AddressSpace.gmem, assumed_align=16
+ )
+ ptr_out = rt.make_ptr(
+ dtype, out.data_ptr(), mem_space=rt.AddressSpace.gmem, assumed_align=16
+ )
ptr_w = (
- rt.make_ptr(dtype, weight.data_ptr(), mem_space=rt.AddressSpace.gmem, assumed_align=16)
+ rt.make_ptr(
+ dtype,
+ weight.data_ptr(),
+ mem_space=rt.AddressSpace.gmem,
+ assumed_align=16,
+ )
if weight is not None
else None
)
@@ -695,7 +720,15 @@ def _get_fast_ptr_rmsnorm_launcher(
return None
# Keyed by the compiled object identity so schedule changes (e.g. copy width,
# async/staged variants, etc.) never alias in the fast-launch cache.
- key = ("ptr_fast", id(compiled), N, dtype, device_index, int(stream_handle), has_weight)
+ key = (
+ "ptr_fast",
+ id(compiled),
+ N,
+ dtype,
+ device_index,
+ int(stream_handle),
+ has_weight,
+ )
cache = _tls_fast_launch_cache()
cached = cache.get(key)
if cached is not None:
@@ -705,7 +738,9 @@ def _get_fast_ptr_rmsnorm_launcher(
ptr_x = rt.make_ptr(dtype, 0, mem_space=rt.AddressSpace.gmem, assumed_align=16)
ptr_out = rt.make_ptr(dtype, 0, mem_space=rt.AddressSpace.gmem, assumed_align=16)
ptr_w = (
- rt.make_ptr(dtype, 0, mem_space=rt.AddressSpace.gmem, assumed_align=16) if has_weight else None
+ rt.make_ptr(dtype, 0, mem_space=rt.AddressSpace.gmem, assumed_align=16)
+ if has_weight
+ else None
)
arg_m = _StableI32Arg(0)
@@ -808,9 +843,15 @@ def _get_fast_ptr_fused_add_rmsnorm_launcher(
if cached is not None:
return cached # type: ignore[return-value]
- ptr_x = rt.make_ptr(dtype, 0, mem_space=rt.AddressSpace.gmem, assumed_align=assumed_align)
- ptr_res = rt.make_ptr(dtype, 0, mem_space=rt.AddressSpace.gmem, assumed_align=assumed_align)
- ptr_w = rt.make_ptr(dtype, 0, mem_space=rt.AddressSpace.gmem, assumed_align=assumed_align)
+ ptr_x = rt.make_ptr(
+ dtype, 0, mem_space=rt.AddressSpace.gmem, assumed_align=assumed_align
+ )
+ ptr_res = rt.make_ptr(
+ dtype, 0, mem_space=rt.AddressSpace.gmem, assumed_align=assumed_align
+ )
+ ptr_w = rt.make_ptr(
+ dtype, 0, mem_space=rt.AddressSpace.gmem, assumed_align=assumed_align
+ )
arg_m = _StableI32Arg(0)
arg_n = _StableI32Arg(N)
@@ -884,6 +925,7 @@ def _get_fast_ptr_fused_add_rmsnorm_launcher(
# Copy helpers (allow up to 256b)
# -------------------------
+
@cute.jit
def get_copy_atom_bw(
dtype: type[cutlass.Numeric], num_copy_elems: int, is_async: bool = False
@@ -892,6 +934,7 @@ def get_copy_atom_bw(
max_bits = const_expr(128 if is_async else 256)
num_copy_bits = const_expr(min(max_bits, num_copy_elems * dtype.width))
from cutlass.cute.nvgpu import cpasync
+
# Prefer GLOBAL cache policy for bulk streaming reads at large M
copy_op = (
cpasync.CopyG2SOp(cache_mode=cpasync.LoadCacheMode.GLOBAL)
@@ -1037,7 +1080,10 @@ def _tv_layout(self, num_copy_bits: int = 256) -> Tuple[cute.Shape, cute.Layout]
tiler_mn = (cols_per_block, vecsize * num_blocks_N * tpr)
tv_layout = cute.make_layout(
((tpr, cols_per_block), (vecsize, num_blocks_N)),
- stride=((vecsize * cols_per_block, 1), (cols_per_block, cols_per_block * vecsize * tpr)),
+ stride=(
+ (vecsize * cols_per_block, 1),
+ (cols_per_block, cols_per_block * vecsize * tpr),
+ ),
)
return tiler_mn, tv_layout
@@ -1045,7 +1091,10 @@ def _smem_bytes(self, tiler_mn, num_warps) -> int:
# smem for X tile (+ residual if present) + reduction buffers + mbar(s)
return (
cute.size_in_bytes(self.dtype, cute.make_layout(tiler_mn))
- + self.stage * num_warps * self._cluster_n() * (self.reduction_dtype.width // 8)
+ + self.stage
+ * num_warps
+ * self._cluster_n()
+ * (self.reduction_dtype.width // 8)
+ self.stage * (cutlass.Int64.width // 8)
)
@@ -1072,7 +1121,9 @@ def new_stride(t):
)
mX, mRes, mO, mResO = [
- cute.make_tensor(t.iterator, cute.make_layout(semistatic_shape, stride=new_stride(t)))
+ cute.make_tensor(
+ t.iterator, cute.make_layout(semistatic_shape, stride=new_stride(t))
+ )
if const_expr(t is not None)
else None
for t in (mX, mRes, mO, mResO)
@@ -1082,23 +1133,34 @@ def new_stride(t):
copy_bits = int(self.copy_bits)
tiler_mn, tv_layout = self._tv_layout(num_copy_bits=copy_bits)
- num_threads = cute.size(tv_layout, mode=[0]) if _KERNEL_ACCEPTS_LAYOUT_ARGS else self._num_threads()
+ num_threads = (
+ cute.size(tv_layout, mode=[0])
+ if _KERNEL_ACCEPTS_LAYOUT_ARGS
+ else self._num_threads()
+ )
num_warps = num_threads // cute.arch.WARP_SIZE
- threads_per_row = tv_layout.shape[0][0] if _KERNEL_ACCEPTS_LAYOUT_ARGS else self._threads_per_row()
+ threads_per_row = (
+ tv_layout.shape[0][0]
+ if _KERNEL_ACCEPTS_LAYOUT_ARGS
+ else self._threads_per_row()
+ )
warps_per_row = max(threads_per_row // cute.arch.WARP_SIZE, 1)
cluster_n = self._cluster_n()
if const_expr(mW is not None):
mW = cute.make_tensor(
- mW.iterator, cute.prepend(mW.layout, cute.make_layout((tiler_mn[0],), stride=(0,)))
+ mW.iterator,
+ cute.prepend(mW.layout, cute.make_layout((tiler_mn[0],), stride=(0,))),
)
if const_expr(mB is not None):
mB = cute.make_tensor(
- mB.iterator, cute.prepend(mB.layout, cute.make_layout((tiler_mn[0],), stride=(0,)))
+ mB.iterator,
+ cute.prepend(mB.layout, cute.make_layout((tiler_mn[0],), stride=(0,))),
)
if const_expr(mRstd is not None):
mRstd = cute.make_tensor(
- mRstd.iterator, cute.append(mRstd.layout, cute.make_layout((self.N,), stride=(0,)))
+ mRstd.iterator,
+ cute.append(mRstd.layout, cute.make_layout((self.N,), stride=(0,))),
)
# No SMEM reload mode switch; overlap is controlled in the K-loop path
@@ -1114,11 +1176,14 @@ def new_stride(t):
else 0
)
tile_bytes_res = (
- cute.size_in_bytes(mRes.element_type, cute.make_layout(tiler_mn)) * stage_bufs
+ cute.size_in_bytes(mRes.element_type, cute.make_layout(tiler_mn))
+ * stage_bufs
if const_expr(mRes is not None and not self.direct_gmem)
else 0
)
- red_bytes = self.stage * num_warps * cluster_n * (self.reduction_dtype.width // 8)
+ red_bytes = (
+ self.stage * num_warps * cluster_n * (self.reduction_dtype.width // 8)
+ )
# mbarriers are only allocated/used for cluster_n>1. Some CuTeDSL builds
# require mbarrier state to be 16B-aligned in shared memory; account for
# the alignment padding when computing dynamic smem bytes.
@@ -1211,14 +1276,10 @@ def launch_from_ptrs(
else None
)
mW = (
- cute.make_tensor(ptr_w, layout_n)
- if const_expr(ptr_w is not None)
- else None
+ cute.make_tensor(ptr_w, layout_n) if const_expr(ptr_w is not None) else None
)
mB = (
- cute.make_tensor(ptr_b, layout_n)
- if const_expr(ptr_b is not None)
- else None
+ cute.make_tensor(ptr_b, layout_n) if const_expr(ptr_b is not None) else None
)
mRstd = (
cute.make_tensor(ptr_rstd, layout_m)
@@ -1303,7 +1364,9 @@ def _kernel_impl(
# Allocate one or two SMEM buffers depending on stage depth
sX0 = (
smem.allocate_tensor(
- mX.element_type, cute.make_ordered_layout(tiler_mn, order=(1, 0)), byte_alignment=32
+ mX.element_type,
+ cute.make_ordered_layout(tiler_mn, order=(1, 0)),
+ byte_alignment=32,
)
if const_expr(not self.direct_gmem)
else None
@@ -1319,7 +1382,9 @@ def _kernel_impl(
)
sRes0 = (
smem.allocate_tensor(
- mRes.element_type, cute.make_ordered_layout(tiler_mn, order=(1, 0)), byte_alignment=32
+ mRes.element_type,
+ cute.make_ordered_layout(tiler_mn, order=(1, 0)),
+ byte_alignment=32,
)
if const_expr(mRes is not None and not self.direct_gmem)
else None
@@ -1339,7 +1404,9 @@ def _kernel_impl(
(num_warps // warps_per_row, (warps_per_row, cluster_n), self.stage),
order=(1, 0, 2),
)
- reduction_buffer = smem.allocate_tensor(self.reduction_dtype, red_layout, byte_alignment=4)
+ reduction_buffer = smem.allocate_tensor(
+ self.reduction_dtype, red_layout, byte_alignment=4
+ )
if const_expr(cluster_n > 1):
# Some CuTeDSL builds appear sensitive to the shared-memory alignment of
# mbarrier state. `SmemAllocator.allocate_array` does not currently
@@ -1360,8 +1427,12 @@ def _kernel_impl(
# Tiled copy setup
num_copy_elems_X = tv_layout.shape[1][0]
- use_async = const_expr(self.use_async and self.N >= 1024 and not self.direct_gmem)
- copy_atom = get_copy_atom_bw(mX.element_type, num_copy_elems_X, is_async=use_async)
+ use_async = const_expr(
+ self.use_async and self.N >= 1024 and not self.direct_gmem
+ )
+ copy_atom = get_copy_atom_bw(
+ mX.element_type, num_copy_elems_X, is_async=use_async
+ )
thr_copy = cute.make_tiled_copy(copy_atom, tv_layout, tiler_mn).get_slice(tidx)
# Tail predicate for the N dimension (when tile width > N). Reuse this
@@ -1386,7 +1457,9 @@ def _kernel_impl(
mW is not None and (self.direct_gmem or (mRes is None and mB is None))
)
if const_expr(prefetch_w_early):
- gW = cute.local_tile(qutils.domain_offset_i64((0, n_off), mW), tiler_mn, (0, 0))
+ gW = cute.local_tile(
+ qutils.domain_offset_i64((0, n_off), mW), tiler_mn, (0, 0)
+ )
tXgW = thr_copy.partition_S(gW)
tXrW = cute.make_fragment_like(tXgW)
if const_expr(not is_even_N_wb):
@@ -1398,7 +1471,9 @@ def _kernel_impl(
pred=tXp_wb,
)
if const_expr(self.direct_gmem and mB is not None):
- gB = cute.local_tile(qutils.domain_offset_i64((0, n_off), mB), tiler_mn, (0, 0))
+ gB = cute.local_tile(
+ qutils.domain_offset_i64((0, n_off), mB), tiler_mn, (0, 0)
+ )
tXgB = thr_copy.partition_S(gB)
tXrB = cute.make_fragment_like(tXgB)
if const_expr(not is_even_N_wb):
@@ -1414,7 +1489,9 @@ def _kernel_impl(
self._init_cluster(tidx, mbar_ptr)
mX_i, mRes_i, mO_i, mResO_i = [
- qutils.domain_offset_i64((bidx * tiler_mn[0], 0), t) if t is not None else None
+ qutils.domain_offset_i64((bidx * tiler_mn[0], 0), t)
+ if t is not None
+ else None
for t in (mX, mRes, mO, mResO)
]
mX_i, mRes_i, mO_i, mResO_i = [
@@ -1424,27 +1501,39 @@ def _kernel_impl(
gX_i = cute.local_tile(mX_i, tiler_mn, (0, 0))
gO_i = cute.local_tile(mO_i, tiler_mn, (0, 0))
gRes_i = (
- cute.local_tile(mRes_i, tiler_mn, (0, 0)) if const_expr(mRes is not None) else None
+ cute.local_tile(mRes_i, tiler_mn, (0, 0))
+ if const_expr(mRes is not None)
+ else None
)
gResO_i = (
- cute.local_tile(mResO_i, tiler_mn, (0, 0)) if const_expr(mResO is not None) else None
+ cute.local_tile(mResO_i, tiler_mn, (0, 0))
+ if const_expr(mResO is not None)
+ else None
)
gRstd_i = (
- cute.local_tile(mRstd, tiler_mn, (bidx, 0)) if const_expr(mRstd is not None) else None
+ cute.local_tile(mRstd, tiler_mn, (bidx, 0))
+ if const_expr(mRstd is not None)
+ else None
)
cX_i = cute.local_tile(idX, tiler_mn, (bidx, 0))
# Common identity/row index partitions reused by both default and K-loop paths
tXcX_i = thr_copy.partition_S(cX_i)[(0, None), None, None]
row_i = tXcX_i[0][0]
- tXgRstd_i = thr_copy.partition_D(gRstd_i) if const_expr(mRstd is not None) else None
+ tXgRstd_i = (
+ thr_copy.partition_D(gRstd_i) if const_expr(mRstd is not None) else None
+ )
# Stage-2 intra-row K-loop cp.async ping-pong (two tiles). This reduces
# per-thread fragment size and can improve memory-latency hiding for
# N=7168 at large M. It is enabled by setting `stage=2` when constructing
# the RMSNormSM100 op (see `_fused_add_rmsnorm_forward_ptr_inplace`).
if const_expr(
- self.stage > 1 and not self.direct_gmem and use_async and cluster_n == 1 and shape[1] == 7168
+ self.stage > 1
+ and not self.direct_gmem
+ and use_async
+ and cluster_n == 1
+ and shape[1] == 7168
):
vecsize = tv_layout.shape[1][0]
tpr = threads_per_row
@@ -1475,9 +1564,9 @@ def _kernel_impl(
(tiler_mn[0], tiler_mn[0] * vecsize * tpr),
),
)
- thr_copy_tile = cute.make_tiled_copy(copy_atom, tv_layout_tile, tiler_mn_tile).get_slice(
- tidx
- )
+ thr_copy_tile = cute.make_tiled_copy(
+ copy_atom, tv_layout_tile, tiler_mn_tile
+ ).get_slice(tidx)
# Accumulate per-thread partial sums across tiles; reduce once.
sum_sq_thread = cute.Float32(0.0)
@@ -1499,7 +1588,13 @@ def _kernel_impl(
tXp_pong = tXp_0
if row_i < shape[0]:
- copy_tiled(tXgX_0, tXsX_0, num_copy_elems=vecsize, is_async=True, pred=tXp_0)
+ copy_tiled(
+ tXgX_0,
+ tXsX_0,
+ num_copy_elems=vecsize,
+ is_async=True,
+ pred=tXp_0,
+ )
if const_expr(mRes is not None):
gRes_0 = cute.local_tile(
qutils.domain_offset_i64((0, k_off0), mRes_i),
@@ -1538,19 +1633,27 @@ def _kernel_impl(
if const_expr((t % 2) == 0):
tXsX_n = thr_copy_tile.partition_D(sX1_tile)
tXsRes_n = (
- thr_copy_tile.partition_D(sRes1_tile) if const_expr(mRes is not None) else None
+ thr_copy_tile.partition_D(sRes1_tile)
+ if const_expr(mRes is not None)
+ else None
)
tXp_pong = tXp_n
else:
tXsX_n = thr_copy_tile.partition_D(sX0_tile)
tXsRes_n = (
- thr_copy_tile.partition_D(sRes0_tile) if const_expr(mRes is not None) else None
+ thr_copy_tile.partition_D(sRes0_tile)
+ if const_expr(mRes is not None)
+ else None
)
tXp_ping = tXp_n
if row_i < shape[0]:
copy_tiled(
- tXgX_n, tXsX_n, num_copy_elems=vecsize, is_async=True, pred=tXp_n
+ tXgX_n,
+ tXsX_n,
+ num_copy_elems=vecsize,
+ is_async=True,
+ pred=tXp_n,
)
if const_expr(mRes is not None):
gRes_n = cute.local_tile(
@@ -1574,25 +1677,35 @@ def _kernel_impl(
if const_expr((t % 2) == 0):
tXsX_cur = thr_copy_tile.partition_D(sX0_tile)
tXsRes_cur = (
- thr_copy_tile.partition_D(sRes0_tile) if const_expr(mRes is not None) else None
+ thr_copy_tile.partition_D(sRes0_tile)
+ if const_expr(mRes is not None)
+ else None
)
pred_cur = tXp_ping
else:
tXsX_cur = thr_copy_tile.partition_D(sX1_tile)
tXsRes_cur = (
- thr_copy_tile.partition_D(sRes1_tile) if const_expr(mRes is not None) else None
+ thr_copy_tile.partition_D(sRes1_tile)
+ if const_expr(mRes is not None)
+ else None
)
pred_cur = tXp_pong
k_off = t * tile_n
- gX_t = cute.local_tile(qutils.domain_offset_i64((0, k_off), mX_i), tiler_mn_tile, (0, 0))
+ gX_t = cute.local_tile(
+ qutils.domain_offset_i64((0, k_off), mX_i),
+ tiler_mn_tile,
+ (0, 0),
+ )
tXgX_t = thr_copy_tile.partition_S(gX_t)
tXrX_t = cute.make_fragment_like(tXgX_t)
cute.autovec_copy(tXsX_cur, tXrX_t)
x_t = tXrX_t.load().to(cute.Float32)
if const_expr(mRes is not None):
gRes_t = cute.local_tile(
- qutils.domain_offset_i64((0, k_off), mRes_i), tiler_mn_tile, (0, 0)
+ qutils.domain_offset_i64((0, k_off), mRes_i),
+ tiler_mn_tile,
+ (0, 0),
)
tXgRes_t = thr_copy_tile.partition_S(gRes_t)
tXrRes_t = cute.make_fragment_like(tXgRes_t)
@@ -1639,29 +1752,41 @@ def _kernel_impl(
for t in cutlass.range_constexpr(num_tiles):
k_off = t * tile_n
- cX_t = cute.local_tile(cute.domain_offset((0, k_off), cX_i), tiler_mn_tile, (0, 0))
+ cX_t = cute.local_tile(
+ cute.domain_offset((0, k_off), cX_i), tiler_mn_tile, (0, 0)
+ )
tXc_t = thr_copy_tile.partition_S(cX_t)
tXp_t = qutils.predicate_k(tXc_t, limit=limit_k)
if const_expr((t % 2) == 0):
tXsX_cur = thr_copy_tile.partition_D(sX0_tile)
tXsRes_cur = (
- thr_copy_tile.partition_D(sRes0_tile) if const_expr(mRes is not None) else None
+ thr_copy_tile.partition_D(sRes0_tile)
+ if const_expr(mRes is not None)
+ else None
)
else:
tXsX_cur = thr_copy_tile.partition_D(sX1_tile)
tXsRes_cur = (
- thr_copy_tile.partition_D(sRes1_tile) if const_expr(mRes is not None) else None
+ thr_copy_tile.partition_D(sRes1_tile)
+ if const_expr(mRes is not None)
+ else None
)
- gX_t = cute.local_tile(qutils.domain_offset_i64((0, k_off), mX_i), tiler_mn_tile, (0, 0))
+ gX_t = cute.local_tile(
+ qutils.domain_offset_i64((0, k_off), mX_i),
+ tiler_mn_tile,
+ (0, 0),
+ )
tXgX_t = thr_copy_tile.partition_S(gX_t)
tXrX_t = cute.make_fragment_like(tXgX_t)
cute.autovec_copy(tXsX_cur, tXrX_t)
x_t = tXrX_t.load().to(cute.Float32)
if const_expr(mRes is not None):
gRes_t = cute.local_tile(
- qutils.domain_offset_i64((0, k_off), mRes_i), tiler_mn_tile, (0, 0)
+ qutils.domain_offset_i64((0, k_off), mRes_i),
+ tiler_mn_tile,
+ (0, 0),
)
tXgRes_t = thr_copy_tile.partition_S(gRes_t)
tXrRes_t = cute.make_fragment_like(tXgRes_t)
@@ -1671,43 +1796,77 @@ def _kernel_impl(
y_t = x_t * rstd
if const_expr(mW is not None):
gW_t = cute.local_tile(
- qutils.domain_offset_i64((0, k_off), mW), tiler_mn_tile, (0, 0)
+ qutils.domain_offset_i64((0, k_off), mW),
+ tiler_mn_tile,
+ (0, 0),
)
tWgW_t = thr_copy_tile.partition_S(gW_t)
tWrW_t = cute.make_fragment_like(tWgW_t)
- copy_tiled(tWgW_t, tWrW_t, num_copy_elems=vecsize, is_async=False, pred=tXp_t)
+ copy_tiled(
+ tWgW_t,
+ tWrW_t,
+ num_copy_elems=vecsize,
+ is_async=False,
+ pred=tXp_t,
+ )
y_t = y_t * tWrW_t.load().to(cute.Float32)
if const_expr(mB is not None):
gB_t = cute.local_tile(
- qutils.domain_offset_i64((0, k_off), mB), tiler_mn_tile, (0, 0)
+ qutils.domain_offset_i64((0, k_off), mB),
+ tiler_mn_tile,
+ (0, 0),
)
tWgB_t = thr_copy_tile.partition_S(gB_t)
tWrB_t = cute.make_fragment_like(tWgB_t)
- copy_tiled(tWgB_t, tWrB_t, num_copy_elems=vecsize, is_async=False, pred=tXp_t)
+ copy_tiled(
+ tWgB_t,
+ tWrB_t,
+ num_copy_elems=vecsize,
+ is_async=False,
+ pred=tXp_t,
+ )
y_t = y_t + tWrB_t.load().to(cute.Float32)
- gO_t = cute.local_tile(qutils.domain_offset_i64((0, k_off), mO_i), tiler_mn_tile, (0, 0))
+ gO_t = cute.local_tile(
+ qutils.domain_offset_i64((0, k_off), mO_i),
+ tiler_mn_tile,
+ (0, 0),
+ )
tXgO_t = thr_copy_tile.partition_D(gO_t)
tXrO_t = cute.make_fragment_like(tXgO_t)
tXrO_t.store(y_t.to(tXrO_t.element_type))
if row_i < shape[0]:
- copy_tiled(tXrO_t, tXgO_t, num_copy_elems=vecsize, is_async=False, pred=tXp_t)
+ copy_tiled(
+ tXrO_t,
+ tXgO_t,
+ num_copy_elems=vecsize,
+ is_async=False,
+ pred=tXp_t,
+ )
return
# Single-stage path: one-row-per-CTA
tXgX_i = thr_copy.partition_S(gX_i)
- tXgRes_i = thr_copy.partition_S(gRes_i) if const_expr(mRes is not None) else None
+ tXgRes_i = (
+ thr_copy.partition_S(gRes_i) if const_expr(mRes is not None) else None
+ )
tXgO_i = thr_copy.partition_D(gO_i)
- tXgResO_i = thr_copy.partition_D(gResO_i) if const_expr(mResO is not None) else None
+ tXgResO_i = (
+ thr_copy.partition_D(gResO_i) if const_expr(mResO is not None) else None
+ )
# tXgRstd_i / tXcX_i / row_i prepared above
is_even_N_i = const_expr(shape[1] == tiler_mn[1] * cluster_n)
tXpX_i = (
- qutils.predicate_k(thr_copy.partition_S(cX_i), limit=limit_k) if not is_even_N_i else None
+ qutils.predicate_k(thr_copy.partition_S(cX_i), limit=limit_k)
+ if not is_even_N_i
+ else None
)
tXrX = cute.make_fragment_like(tXgX_i)
- tXrRes = cute.make_fragment_like(tXgRes_i) if const_expr(mRes is not None) else None
+ tXrRes = (
+ cute.make_fragment_like(tXgRes_i) if const_expr(mRes is not None) else None
+ )
if const_expr(self.direct_gmem):
if const_expr(not is_even_N_i):
tXrX.fill(0)
@@ -1729,7 +1888,9 @@ def _kernel_impl(
if row_i < shape[0]:
cute.copy(copy_atom, tXgX_i, thr_copy.partition_D(sX0), pred=tXpX_i)
if const_expr(mRes is not None):
- cute.copy(copy_atom, tXgRes_i, thr_copy.partition_D(sRes0), pred=tXpX_i)
+ cute.copy(
+ copy_atom, tXgRes_i, thr_copy.partition_D(sRes0), pred=tXpX_i
+ )
if const_expr(use_async):
cute.arch.cp_async_commit_group()
cute.arch.cp_async_wait_group(0)
@@ -1746,7 +1907,9 @@ def _kernel_impl(
tXrResO.store(x_red.to(tXrResO.element_type))
if row_i < shape[0]:
cute.copy(
- get_copy_atom_bw(tXrResO.element_type, num_copy_elems_X, is_async=False),
+ get_copy_atom_bw(
+ tXrResO.element_type, num_copy_elems_X, is_async=False
+ ),
tXrResO,
tXgResO_i,
pred=tXpX_i,
@@ -1775,7 +1938,9 @@ def _kernel_impl(
# pressure during the long-scoreboard reduction phase (helping occupancy
# when registers are the limiting factor).
if const_expr(mW is not None):
- gW = cute.local_tile(qutils.domain_offset_i64((0, n_off), mW), tiler_mn, (0, 0))
+ gW = cute.local_tile(
+ qutils.domain_offset_i64((0, n_off), mW), tiler_mn, (0, 0)
+ )
tXgW = thr_copy.partition_S(gW)
tXrW = cute.make_fragment_like(tXgW)
if const_expr(not is_even_N_wb):
@@ -1787,7 +1952,9 @@ def _kernel_impl(
pred=tXp_wb,
)
if const_expr(mB is not None):
- gB = cute.local_tile(qutils.domain_offset_i64((0, n_off), mB), tiler_mn, (0, 0))
+ gB = cute.local_tile(
+ qutils.domain_offset_i64((0, n_off), mB), tiler_mn, (0, 0)
+ )
tXgB = thr_copy.partition_S(gB)
tXrB = cute.make_fragment_like(tXgB)
if const_expr(not is_even_N_wb):
@@ -1818,6 +1985,7 @@ def _kernel_impl(
)
if _KERNEL_ACCEPTS_LAYOUT_ARGS:
+
@cute.kernel
def kernel(
self,
@@ -1853,6 +2021,7 @@ def kernel(
threads_per_row,
)
else:
+
@cute.kernel
def kernel(
self,
@@ -2015,7 +2184,10 @@ def _rmsnorm_forward_ptr(
if residual is not None:
residual_out = torch.empty_strided(
- residual.shape, residual.stride(), device=residual.device, dtype=residual.dtype
+ residual.shape,
+ residual.stride(),
+ device=residual.device,
+ dtype=residual.dtype,
)
if store_rstd:
rstd = torch.empty(M, device=x.device, dtype=torch.float32)
@@ -2082,10 +2254,19 @@ def _rmsnorm_forward_ptr_into(
if compiled is None:
op = RMSNormSM100(N, dtype, stage=stage)
ld_val = int(x.stride(0))
- ptr_x = rt.make_ptr(dtype, x.data_ptr(), mem_space=rt.AddressSpace.gmem, assumed_align=16)
- ptr_out = rt.make_ptr(dtype, out.data_ptr(), mem_space=rt.AddressSpace.gmem, assumed_align=16)
+ ptr_x = rt.make_ptr(
+ dtype, x.data_ptr(), mem_space=rt.AddressSpace.gmem, assumed_align=16
+ )
+ ptr_out = rt.make_ptr(
+ dtype, out.data_ptr(), mem_space=rt.AddressSpace.gmem, assumed_align=16
+ )
ptr_w = (
- rt.make_ptr(dtype, weight.data_ptr(), mem_space=rt.AddressSpace.gmem, assumed_align=16)
+ rt.make_ptr(
+ dtype,
+ weight.data_ptr(),
+ mem_space=rt.AddressSpace.gmem,
+ assumed_align=16,
+ )
if has_weight
else None
)
@@ -2122,10 +2303,19 @@ def _rmsnorm_forward_ptr_into(
launcher.launch(x=x, weight=weight, out=out, M=M, N=N, ld=ld_val, eps=eps)
return
- ptr_x = rt.make_ptr(dtype, x.data_ptr(), mem_space=rt.AddressSpace.gmem, assumed_align=16)
- ptr_out = rt.make_ptr(dtype, out.data_ptr(), mem_space=rt.AddressSpace.gmem, assumed_align=16)
+ ptr_x = rt.make_ptr(
+ dtype, x.data_ptr(), mem_space=rt.AddressSpace.gmem, assumed_align=16
+ )
+ ptr_out = rt.make_ptr(
+ dtype, out.data_ptr(), mem_space=rt.AddressSpace.gmem, assumed_align=16
+ )
ptr_w = (
- rt.make_ptr(dtype, weight.data_ptr(), mem_space=rt.AddressSpace.gmem, assumed_align=16)
+ rt.make_ptr(
+ dtype,
+ weight.data_ptr(),
+ mem_space=rt.AddressSpace.gmem,
+ assumed_align=16,
+ )
if has_weight
else None
)
@@ -2167,33 +2357,55 @@ def _rmsnorm_forward_ptr_into(
compiled = _PTR_COMPILE_CACHE.get(key)
if compiled is None:
op = RMSNormSM100(N, dtype, stage=stage)
- ptr_x = rt.make_ptr(dtype, x.data_ptr(), mem_space=rt.AddressSpace.gmem, assumed_align=16)
- ptr_out = rt.make_ptr(dtype, out.data_ptr(), mem_space=rt.AddressSpace.gmem, assumed_align=16)
+ ptr_x = rt.make_ptr(
+ dtype, x.data_ptr(), mem_space=rt.AddressSpace.gmem, assumed_align=16
+ )
+ ptr_out = rt.make_ptr(
+ dtype, out.data_ptr(), mem_space=rt.AddressSpace.gmem, assumed_align=16
+ )
ptr_res = (
- rt.make_ptr(dtype, residual.data_ptr(), mem_space=rt.AddressSpace.gmem, assumed_align=16)
+ rt.make_ptr(
+ dtype,
+ residual.data_ptr(),
+ mem_space=rt.AddressSpace.gmem,
+ assumed_align=16,
+ )
if residual is not None
else None
)
ptr_res_out = (
rt.make_ptr(
- dtype, residual_out.data_ptr(), mem_space=rt.AddressSpace.gmem, assumed_align=16
+ dtype,
+ residual_out.data_ptr(),
+ mem_space=rt.AddressSpace.gmem,
+ assumed_align=16,
)
if residual_out is not None
else None
)
ptr_w = (
- rt.make_ptr(dtype, weight.data_ptr(), mem_space=rt.AddressSpace.gmem, assumed_align=16)
+ rt.make_ptr(
+ dtype,
+ weight.data_ptr(),
+ mem_space=rt.AddressSpace.gmem,
+ assumed_align=16,
+ )
if weight is not None
else None
)
ptr_b = (
- rt.make_ptr(dtype, bias.data_ptr(), mem_space=rt.AddressSpace.gmem, assumed_align=16)
+ rt.make_ptr(
+ dtype, bias.data_ptr(), mem_space=rt.AddressSpace.gmem, assumed_align=16
+ )
if bias is not None
else None
)
ptr_rstd = (
rt.make_ptr(
- cutlass.Float32, rstd.data_ptr(), mem_space=rt.AddressSpace.gmem, assumed_align=4
+ cutlass.Float32,
+ rstd.data_ptr(),
+ mem_space=rt.AddressSpace.gmem,
+ assumed_align=4,
)
if rstd is not None
else None
@@ -2216,30 +2428,50 @@ def _rmsnorm_forward_ptr_into(
Float32(eps),
)
_PTR_COMPILE_CACHE[key] = compiled
- ptr_x = rt.make_ptr(dtype, x.data_ptr(), mem_space=rt.AddressSpace.gmem, assumed_align=16)
- ptr_out = rt.make_ptr(dtype, out.data_ptr(), mem_space=rt.AddressSpace.gmem, assumed_align=16)
+ ptr_x = rt.make_ptr(
+ dtype, x.data_ptr(), mem_space=rt.AddressSpace.gmem, assumed_align=16
+ )
+ ptr_out = rt.make_ptr(
+ dtype, out.data_ptr(), mem_space=rt.AddressSpace.gmem, assumed_align=16
+ )
ptr_res = (
- rt.make_ptr(dtype, residual.data_ptr(), mem_space=rt.AddressSpace.gmem, assumed_align=16)
+ rt.make_ptr(
+ dtype, residual.data_ptr(), mem_space=rt.AddressSpace.gmem, assumed_align=16
+ )
if residual is not None
else None
)
ptr_res_out = (
- rt.make_ptr(dtype, residual_out.data_ptr(), mem_space=rt.AddressSpace.gmem, assumed_align=16)
+ rt.make_ptr(
+ dtype,
+ residual_out.data_ptr(),
+ mem_space=rt.AddressSpace.gmem,
+ assumed_align=16,
+ )
if residual_out is not None
else None
)
ptr_w = (
- rt.make_ptr(dtype, weight.data_ptr(), mem_space=rt.AddressSpace.gmem, assumed_align=16)
+ rt.make_ptr(
+ dtype, weight.data_ptr(), mem_space=rt.AddressSpace.gmem, assumed_align=16
+ )
if weight is not None
else None
)
ptr_b = (
- rt.make_ptr(dtype, bias.data_ptr(), mem_space=rt.AddressSpace.gmem, assumed_align=16)
+ rt.make_ptr(
+ dtype, bias.data_ptr(), mem_space=rt.AddressSpace.gmem, assumed_align=16
+ )
if bias is not None
else None
)
ptr_rstd = (
- rt.make_ptr(cutlass.Float32, rstd.data_ptr(), mem_space=rt.AddressSpace.gmem, assumed_align=4)
+ rt.make_ptr(
+ cutlass.Float32,
+ rstd.data_ptr(),
+ mem_space=rt.AddressSpace.gmem,
+ assumed_align=4,
+ )
if rstd is not None
else None
)
@@ -2296,7 +2528,9 @@ def _fused_add_rmsnorm_forward_ptr_inplace(
# benchmark other models/shapes, you can override it with:
# - OINK_RMSNORM_DIRECT_GMEM=0 (force staging/cp.async path)
# - OINK_RMSNORM_DIRECT_GMEM=1 (force direct-gmem path)
- direct_gmem = _direct_gmem_from_policy(default=bool(dtype.width == 16 and N == 7168))
+ direct_gmem = _direct_gmem_from_policy(
+ default=bool(dtype.width == 16 and N == 7168)
+ )
use_async = not direct_gmem
tpr_override: Optional[int] = None
nt_override: Optional[int] = None
@@ -2340,13 +2574,9 @@ def _fused_add_rmsnorm_forward_ptr_inplace(
tpr_override = 256
nt_override = 256
-
can_use_256 = bool(
direct_gmem
- and (
- direct_gmem_max_copy_bits is None
- or direct_gmem_max_copy_bits >= 256
- )
+ and (direct_gmem_max_copy_bits is None or direct_gmem_max_copy_bits >= 256)
and dtype.width == 16
and (x.data_ptr() % 32) == 0
and (residual.data_ptr() % 32) == 0
@@ -2395,13 +2625,22 @@ def _fused_add_rmsnorm_forward_ptr_inplace(
if cluster_n_override is not None:
op._cluster_n_override = cluster_n_override # type: ignore[attr-defined]
ptr_x = rt.make_ptr(
- dtype, x.data_ptr(), mem_space=rt.AddressSpace.gmem, assumed_align=assumed_align
+ dtype,
+ x.data_ptr(),
+ mem_space=rt.AddressSpace.gmem,
+ assumed_align=assumed_align,
)
ptr_res = rt.make_ptr(
- dtype, residual.data_ptr(), mem_space=rt.AddressSpace.gmem, assumed_align=assumed_align
+ dtype,
+ residual.data_ptr(),
+ mem_space=rt.AddressSpace.gmem,
+ assumed_align=assumed_align,
)
ptr_w = rt.make_ptr(
- dtype, weight.data_ptr(), mem_space=rt.AddressSpace.gmem, assumed_align=assumed_align
+ dtype,
+ weight.data_ptr(),
+ mem_space=rt.AddressSpace.gmem,
+ assumed_align=assumed_align,
)
stream = cuda.CUstream(stream_handle)
ld_x = Int32(int(x.stride(0)))
@@ -2444,12 +2683,20 @@ def _fused_add_rmsnorm_forward_ptr_inplace(
# Fast-launch is disabled/unavailable (or CuTeDSL internals changed). Fall back
# to calling the compiled function directly.
- ptr_x = rt.make_ptr(dtype, x.data_ptr(), mem_space=rt.AddressSpace.gmem, assumed_align=assumed_align)
+ ptr_x = rt.make_ptr(
+ dtype, x.data_ptr(), mem_space=rt.AddressSpace.gmem, assumed_align=assumed_align
+ )
ptr_res = rt.make_ptr(
- dtype, residual.data_ptr(), mem_space=rt.AddressSpace.gmem, assumed_align=assumed_align
+ dtype,
+ residual.data_ptr(),
+ mem_space=rt.AddressSpace.gmem,
+ assumed_align=assumed_align,
)
ptr_w = rt.make_ptr(
- dtype, weight.data_ptr(), mem_space=rt.AddressSpace.gmem, assumed_align=assumed_align
+ dtype,
+ weight.data_ptr(),
+ mem_space=rt.AddressSpace.gmem,
+ assumed_align=assumed_align,
)
stream = cuda.CUstream(stream_handle)
ld_x = Int32(int(x.stride(0)))
@@ -2486,7 +2733,12 @@ def rmsnorm_forward(
rms2 = None # type: ignore[assignment]
if rms2 is not None:
y, rstd, residual_out = rms2.rmsnorm_forward_with_stage2(
- x, weight=weight, bias=bias, residual=residual, eps=eps, store_rstd=store_rstd
+ x,
+ weight=weight,
+ bias=bias,
+ residual=residual,
+ eps=eps,
+ store_rstd=store_rstd,
)
# Preserve stride contracts for torch.compile consistency, even
# when using the optional stage-2 implementation.
@@ -2519,7 +2771,9 @@ def rmsnorm_forward(
# Preserve the input stride contract even on the fallback path so
# torch.compile sees a consistent output layout across all branches.
if y.stride() != x.stride():
- y_strided = torch.empty_strided(x.shape, x.stride(), device=x.device, dtype=x.dtype)
+ y_strided = torch.empty_strided(
+ x.shape, x.stride(), device=x.device, dtype=x.dtype
+ )
y_strided.copy_(y)
y = y_strided
rstd = None
From 4c9a826cb2b1a3d48fbffe156f10943b41ec8f80 Mon Sep 17 00:00:00 2001
From: Laura Wang <3700467+Laurawly@users.noreply.github.com>
Date: Wed, 21 Jan 2026 11:06:37 -0800
Subject: [PATCH 4/8] oink: SM100 suite refresh (strict parity + quack-style
benches)
- Switch correctness gate to PyTorch ref + record err stats
- Tighten Softmax/LayerNorm tolerances (Quack-like)
- Quack-style benchmark suite layout + SVG plots
- Packaging/README polish for publishability
---
oink/README.md | 94 +-
oink/benchmarks/README.md | 146 +
oink/benchmarks/benchmark/bench_utils.py | 255 ++
.../benchmark_cross_entropy_sm100.py | 426 +++
.../benchmark_fused_add_rmsnorm_sm100.py | 296 ++
.../benchmark/benchmark_hbm_roofline_sm100.py | 226 ++
.../benchmark/benchmark_layernorm_sm100.py | 393 +++
.../benchmark/benchmark_rmsnorm_bwd_sm100.py | 434 +++
.../benchmark/benchmark_rmsnorm_sm100.py | 337 ++
.../benchmark/benchmark_softmax_sm100.py | 292 ++
.../media/sm100_bf16_oink_vs_quack.svg | 2259 +++++++++++++
.../media/sm100_bf16_oink_vs_quack_dsv3.svg | 2600 +++++++++++++++
.../sm100_bf16_oink_vs_quack_dsv3_all.svg | 2936 ++++++++++++++++
..._bf16_oink_vs_quack_dsv3_cross_entropy.svg | 1687 ++++++++++
...bf16_oink_vs_quack_dsv3_with_layernorm.svg | 2720 +++++++++++++++
...m100_bf16_oink_vs_quack_with_layernorm.svg | 2580 ++++++++++++++
.../media/sm100_fp16_oink_vs_quack.svg | 2280 +++++++++++++
.../media/sm100_fp16_oink_vs_quack_dsv3.svg | 2621 +++++++++++++++
.../sm100_fp16_oink_vs_quack_dsv3_all.svg | 2957 +++++++++++++++++
..._fp16_oink_vs_quack_dsv3_cross_entropy.svg | 1708 ++++++++++
...fp16_oink_vs_quack_dsv3_with_layernorm.svg | 2741 +++++++++++++++
...m100_fp16_oink_vs_quack_with_layernorm.svg | 2601 +++++++++++++++
.../benchmarks/readme/plot_quack_style_svg.py | 431 +++
oink/benchmarks/readme/run_sm100_suite.py | 302 ++
oink/benchmarks/readme/summarize_results.py | 205 ++
oink/pyproject.toml | 24 +-
oink/src/kernelagent_oink/__init__.py | 28 +-
.../blackwell/cross_entropy.py | 1209 +++++++
.../kernelagent_oink/blackwell/layernorm.py | 1368 ++++++++
.../kernelagent_oink/blackwell/lite_quack.py | 1001 +++++-
.../src/kernelagent_oink/blackwell/rmsnorm.py | 467 ++-
.../blackwell/rmsnorm_with_stage2.py | 805 +++++
.../src/kernelagent_oink/blackwell/softmax.py | 749 +++++
33 files changed, 39026 insertions(+), 152 deletions(-)
create mode 100644 oink/benchmarks/README.md
create mode 100644 oink/benchmarks/benchmark/bench_utils.py
create mode 100644 oink/benchmarks/benchmark/benchmark_cross_entropy_sm100.py
create mode 100644 oink/benchmarks/benchmark/benchmark_fused_add_rmsnorm_sm100.py
create mode 100644 oink/benchmarks/benchmark/benchmark_hbm_roofline_sm100.py
create mode 100644 oink/benchmarks/benchmark/benchmark_layernorm_sm100.py
create mode 100644 oink/benchmarks/benchmark/benchmark_rmsnorm_bwd_sm100.py
create mode 100644 oink/benchmarks/benchmark/benchmark_rmsnorm_sm100.py
create mode 100644 oink/benchmarks/benchmark/benchmark_softmax_sm100.py
create mode 100644 oink/benchmarks/media/sm100_bf16_oink_vs_quack.svg
create mode 100644 oink/benchmarks/media/sm100_bf16_oink_vs_quack_dsv3.svg
create mode 100644 oink/benchmarks/media/sm100_bf16_oink_vs_quack_dsv3_all.svg
create mode 100644 oink/benchmarks/media/sm100_bf16_oink_vs_quack_dsv3_cross_entropy.svg
create mode 100644 oink/benchmarks/media/sm100_bf16_oink_vs_quack_dsv3_with_layernorm.svg
create mode 100644 oink/benchmarks/media/sm100_bf16_oink_vs_quack_with_layernorm.svg
create mode 100644 oink/benchmarks/media/sm100_fp16_oink_vs_quack.svg
create mode 100644 oink/benchmarks/media/sm100_fp16_oink_vs_quack_dsv3.svg
create mode 100644 oink/benchmarks/media/sm100_fp16_oink_vs_quack_dsv3_all.svg
create mode 100644 oink/benchmarks/media/sm100_fp16_oink_vs_quack_dsv3_cross_entropy.svg
create mode 100644 oink/benchmarks/media/sm100_fp16_oink_vs_quack_dsv3_with_layernorm.svg
create mode 100644 oink/benchmarks/media/sm100_fp16_oink_vs_quack_with_layernorm.svg
create mode 100644 oink/benchmarks/readme/plot_quack_style_svg.py
create mode 100644 oink/benchmarks/readme/run_sm100_suite.py
create mode 100644 oink/benchmarks/readme/summarize_results.py
create mode 100644 oink/src/kernelagent_oink/blackwell/cross_entropy.py
create mode 100644 oink/src/kernelagent_oink/blackwell/layernorm.py
create mode 100644 oink/src/kernelagent_oink/blackwell/rmsnorm_with_stage2.py
create mode 100644 oink/src/kernelagent_oink/blackwell/softmax.py
diff --git a/oink/README.md b/oink/README.md
index 427f69f..aeb0c09 100644
--- a/oink/README.md
+++ b/oink/README.md
@@ -1,13 +1,33 @@
-# KernelAgent Oink (vLLM plugin)
+# KernelAgent-Oink
-This subproject provides an **out-of-tree vLLM plugin** that registers
-`torch.library.custom_op` entrypoints under the `oink::` namespace:
+KernelAgent-Oink is a small **CuTeDSL (CUTLASS DSL) kernel library** for
+**NVIDIA Blackwell (SM100 / GB200 / B200-class)**, bundled as a lightweight
+Python package that can be used standalone or as a **vLLM general plugin**.
-- `torch.ops.oink.rmsnorm`
-- `torch.ops.oink.fused_add_rms_norm`
+At the moment, the vLLM integration exposes the following `torch.library.custom_op`
+entrypoints under the `oink::` namespace:
-The implementation is backed by a CuTeDSL (CUTLASS) RMSNorm kernel tuned for
-**NVIDIA Blackwell (SM100)**.
+- `torch.ops.oink.rmsnorm(x, weight, eps) -> Tensor`
+- `torch.ops.oink.fused_add_rms_norm(x, residual, weight, eps) -> None` (in-place)
+
+The package also includes additional SM100 kernels used by the benchmark suite:
+LayerNorm, Softmax (fwd+bwd), and CrossEntropy (fwd+bwd).
+
+## Requirements
+
+- GPU: **SM100** for the fast CuTeDSL paths. On other GPUs, Oink falls back to
+ reference PyTorch implementations for correctness.
+- Python dependencies:
+ - `nvidia-cutlass-dsl` (CuTeDSL)
+ - `cuda-python`
+ - `torch` (provided by your environment / vLLM)
+
+Recommended env vars:
+
+```bash
+export CUTE_DSL_ARCH=sm_100a
+export PYTORCH_ALLOC_CONF=expandable_segments:True
+```
## Install (editable)
@@ -17,22 +37,23 @@ From the `KernelAgent` repo root:
pip install -e ./oink
```
-This plugin requires the CuTeDSL stack:
+For running the in-repo benchmark suite / plots:
```bash
-pip install nvidia-cutlass-dsl cuda-python
+pip install -e "./oink[bench]"
```
-## Use with vLLM
+## Usage
+
+### vLLM (general plugin)
-1. Enable the vLLM integration:
+1) Enable the plugin:
```bash
export VLLM_USE_OINK_RMSNORM=1
```
-2. Ensure vLLM keeps `rms_norm` as a custom op when using `torch.compile` /
-CUDA graphs. In Python:
+2) Ensure vLLM keeps `rms_norm` as a custom op when using `torch.compile` / CUDA graphs:
```python
from vllm import LLM
@@ -45,13 +66,44 @@ llm = LLM(
)
```
-Without `+rms_norm`, Inductor may fuse RMSNorm into larger Triton kernels and
-neither vLLM's CUDA RMSNorm nor Oink will run.
+Without `+rms_norm`, Inductor may fuse RMSNorm into larger kernels and neither
+vLLM’s CUDA RMSNorm nor Oink will run.
+
+### Direct PyTorch usage (manual op registration)
+
+For standalone use (outside vLLM), register the custom ops once:
+
+```python
+import kernelagent_oink
+import torch
+
+kernelagent_oink.register(force=True)
+
+x = torch.randn(1024, 4096, device="cuda", dtype=torch.bfloat16)
+w = torch.randn(4096, device="cuda", dtype=torch.bfloat16)
+y = torch.ops.oink.rmsnorm(x, w, 1e-6)
+```
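+
+The fused op mutates `x` and `residual` in-place and returns `None`. A minimal
+sketch continuing the example above (tensor names are illustrative):
+
+```python
+residual = torch.randn_like(x)
+# Follows vLLM's fused_add_rms_norm semantics: the residual sum is written back
+# into `residual`, and the weighted RMSNorm of that sum is written back into `x`.
+torch.ops.oink.fused_add_rms_norm(x, residual, w, 1e-6)
+```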
+
+## Benchmarks
+
+The repo includes a Quack-style benchmark suite (tables + SVG plots) to compare
+Oink against Quack on SM100 and to reproduce the reported speedups.
+
+- How to run + methodology: `oink/benchmarks/README.md`
+- Pre-generated plots: `oink/benchmarks/media/`
+
+
+<!-- Oink-vs-Quack SM100 benchmark plots (SVGs from oink/benchmarks/media/) are embedded here. -->
+
-## Notes
+## Links
-- This plugin is designed to be **safe to import even when disabled**; it only
- registers ops when `VLLM_USE_OINK_RMSNORM` is truthy (`"1"` / `"true"`).
-- The ops preserve **padded-row layouts** for 2D tensors (shape `[M, N]`,
- `stride(1) == 1`, and potentially `stride(0) > N`), which is required for
- `torch.compile` stride verification on some models (e.g., MLA padded inputs).
+| What | Link |
+|---|---|
+| Quack (expert baseline) | https://github.com/Dao-AILab/quack |
+| KernelAgent (agentic framework) | https://github.com/meta-pytorch/KernelAgent |
+| vLLM PR (Oink RMSNorm integration) | https://github.com/vllm-project/vllm/pull/31828 |
diff --git a/oink/benchmarks/README.md b/oink/benchmarks/README.md
new file mode 100644
index 0000000..ceb7932
--- /dev/null
+++ b/oink/benchmarks/README.md
@@ -0,0 +1,146 @@
+# SM100 Benchmarks (KernelAgent-Oink vs Quack)
+
+This folder contains SM100 (GB200 / Blackwell) microbenchmarks for the Oink
+CuTeDSL kernels vendored into KernelAgent, comparing against Quack’s SM100
+kernels where Quack provides an equivalent API.
+
+## Prereqs
+
+- GPU: **SM100** (`torch.cuda.get_device_capability() == (10, 0)`).
+- Python deps in your environment:
+ - `torch`
+ - `nvidia-cutlass-dsl` (CuTeDSL)
+ - `cuda-python`
+ - `triton` (only for `triton.testing.do_bench`)
+ - `quack` (optional; only needed for Oink-vs-Quack comparisons)
+
+Recommended env vars:
+
+```bash
+export PYTORCH_ALLOC_CONF=expandable_segments:True
+export CUTE_DSL_ARCH=sm_100a
+```
+
+## Shape suites
+
+- **Quack-suite**: `(batch, seq) ∈ {1,4,8,16,32} × {8192,16384,32768,65536,131072}`,
+ with `hidden = 4096` so `M = batch * seq`, `N = 4096`.
+- **DeepSeek-V3-like (DSv3)**
+ - RMSNorm / LayerNorm / Softmax: `M ∈ {4096, 16384, 65536}`, `N ∈ {6144, 7168, 8192}`
+ - Cross-entropy: `M ∈ {4096, 16384, 65536}`, `N ∈ {3072, 6144, 8192, 12288}`
+
+## Correctness gates
+
+By default, each script runs a per-shape `torch.testing.assert_close` check
+vs a **pure-PyTorch reference** **before** emitting timing numbers. When Quack
+is available for that op/path, the script also validates Quack vs the *same*
+reference (so speedups can’t come from looser numerics).
+
+Disable with `--skip-verify` only for quick smoke tests.
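+
+As a minimal illustration of the gate (not the exact harness code; the real
+scripts stream the reference block-by-block and record error statistics), an
+RMSNorm check against a pure-PyTorch reference looks roughly like:
+
+```python
+import torch
+
+def rmsnorm_ref(x: torch.Tensor, w: torch.Tensor, eps: float = 1e-6) -> torch.Tensor:
+    # Float32 reference, cast back to the activation dtype.
+    xf = x.float()
+    rstd = torch.rsqrt(xf.pow(2).mean(dim=-1, keepdim=True) + eps)
+    return (xf * rstd * w.float()).to(x.dtype)
+
+x = torch.randn(4096, 4096, device="cuda", dtype=torch.bfloat16)
+w = torch.randn(4096, device="cuda", dtype=torch.bfloat16)
+y = torch.ops.oink.rmsnorm(x, w, 1e-6)  # assumes the oink ops are registered
+# Tolerances here are illustrative; each script uses op-specific tolerances.
+torch.testing.assert_close(y, rmsnorm_ref(x, w), atol=2e-2, rtol=2e-2)
+```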
+
+## Running benchmarks
+
+All scripts support:
+
+- `--quack-suite` or `--dsv3` (or `--configs MxN,...`)
+- `--dtype {bf16,fp16,fp32}`
+- `--iters <n>` and `--warmup-ms <ms>` for kernel-only timing
+- `--json <path>` and/or `--csv <path>` outputs (meta + rows)
+
+### One-command suite
+
+Run the full Quack-suite + DSv3 set (Oink vs Quack) and write all JSON artifacts
+to a timestamped directory:
+
+```bash
+python oink/benchmarks/readme/run_sm100_suite.py --dtype bf16
+```
+
+Turn the JSON artifacts into Markdown tables (with geomean speedups):
+
+```bash
+python oink/benchmarks/readme/summarize_results.py --in-dir /tmp/kernelagent_oink_sm100_suite_<timestamp> \
+ --out /tmp/kernelagent_oink_sm100_suite_summary.md
+```
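+
+If you prefer to post-process the JSON yourself, each artifact is a dict with
+`meta` and `rows` keys. A sketch of a geomean-speedup computation (the per-row
+timing field names below are assumptions; check the rows your run actually emits):
+
+```python
+import json
+import math
+
+with open("/tmp/oink_rmsnorm_fwd_quack_suite.json") as f:
+    rows = json.load(f)["rows"]
+# "oink_ms" / "quack_ms" are placeholder names for the per-shape mean times.
+speedups = [r["quack_ms"] / r["oink_ms"] for r in rows if r.get("quack_ms")]
+geomean = math.exp(sum(math.log(s) for s in speedups) / len(speedups))
+print(f"geomean speedup: {geomean:.2f}x over {len(speedups)} shapes")
+```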
+
+### Measured HBM roofline (STREAM-like)
+
+To contextualize the `*_tbps` numbers as a fraction of a *measured* bandwidth
+ceiling (rather than a theoretical spec), run:
+
+```bash
+CUDA_VISIBLE_DEVICES=0 python oink/benchmarks/benchmark/benchmark_hbm_roofline_sm100.py --dtype bf16 --op both --gb 2 \
+ --json /tmp/hbm_roofline_sm100_bf16.json
+```
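+
+The script times STREAM-like memory operations with `triton.testing.do_bench`;
+a stripped-down sketch of a plain copy measurement (buffer size and timing
+knobs are illustrative; the real script exposes `--op` and `--gb`):
+
+```python
+import torch
+from triton.testing import do_bench
+
+nbytes = 2 * 1024**3  # ~2 GiB per buffer
+src = torch.empty(nbytes // 2, device="cuda", dtype=torch.bfloat16)
+dst = torch.empty_like(src)
+ms = do_bench(lambda: dst.copy_(src), warmup=25, rep=100, return_mode="mean")
+# A copy reads `src` and writes `dst`, so 2 * nbytes move per iteration.
+print(f"measured copy bandwidth ≈ {2 * nbytes / (ms * 1e-3) / 1e12:.2f} TB/s")
+```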
+
+### RMSNorm forward
+
+```bash
+python oink/benchmarks/benchmark/benchmark_rmsnorm_sm100.py --dtype bf16 --weight-dtype fp32 --quack-suite --iters 200 --warmup-ms 25 \
+ --json /tmp/oink_rmsnorm_fwd_quack_suite.json
+
+python oink/benchmarks/benchmark/benchmark_rmsnorm_sm100.py --dtype bf16 --weight-dtype fp32 --dsv3 --iters 200 --warmup-ms 25 \
+ --json /tmp/oink_rmsnorm_fwd_dsv3.json
+
+# vLLM-style inference weights (weight dtype == activation dtype)
+python oink/benchmarks/benchmark/benchmark_rmsnorm_sm100.py --dtype bf16 --weight-dtype same --quack-suite --iters 200 --warmup-ms 25 \
+ --json /tmp/oink_rmsnorm_fwd_quack_suite_wsame.json
+```
+
+### Fused Add + RMSNorm (vLLM-style, in-place)
+
+This is a good "roofline case study" kernel (heavy read/write traffic, very little extra math):
+
+```bash
+CUDA_VISIBLE_DEVICES=0 python oink/benchmarks/benchmark/benchmark_fused_add_rmsnorm_sm100.py --dtype bf16 --M 65536 --N 4096 \
+ --json /tmp/fused_add_rmsnorm_sm100_bf16.json
+```
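+
+For reference, the IO model behind the reported `*_tbps` numbers for this op is
+simple; a sketch (the benchmark script's exact accounting may differ slightly):
+
+```python
+def fused_add_rmsnorm_tbps(M: int, N: int, ms: float, elem_bytes: int = 2) -> float:
+    """Achieved bandwidth given a measured kernel time in milliseconds.
+
+    In-place semantics: read x, residual, and weight; write x and residual.
+    """
+    bytes_moved = (4 * M * N + N) * elem_bytes
+    return bytes_moved / (ms * 1e-3) / 1e12
+```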
+
+### RMSNorm backward
+
+```bash
+python oink/benchmarks/benchmark/benchmark_rmsnorm_bwd_sm100.py --dtype bf16 --weight-dtype fp32 --quack-suite --iters 100 --warmup-ms 25 \
+ --csv /tmp/oink_rmsnorm_bwd_quack_suite.csv
+
+python oink/benchmarks/benchmark/benchmark_rmsnorm_bwd_sm100.py --dtype bf16 --weight-dtype fp32 --dsv3 --iters 100 --warmup-ms 25 \
+ --csv /tmp/oink_rmsnorm_bwd_dsv3.csv
+```
+
+### Softmax (forward + backward)
+
+```bash
+python oink/benchmarks/benchmark/benchmark_softmax_sm100.py --dtype bf16 --mode fwd_bwd --quack-suite --iters 50 --warmup-ms 25 \
+ --json /tmp/oink_softmax_fwd_bwd_quack_suite.json
+
+python oink/benchmarks/benchmark/benchmark_softmax_sm100.py --dtype bf16 --mode fwd_bwd --dsv3 --iters 50 --warmup-ms 25 \
+ --json /tmp/oink_softmax_fwd_bwd_dsv3.json
+```
+
+### Cross-entropy (forward + backward)
+
+```bash
+python oink/benchmarks/benchmark/benchmark_cross_entropy_sm100.py --dtype bf16 --mode fwd_bwd --quack-suite --iters 50 --warmup-ms 25 \
+ --json /tmp/oink_cross_entropy_fwd_bwd_quack_suite.json
+
+python oink/benchmarks/benchmark/benchmark_cross_entropy_sm100.py --dtype bf16 --mode fwd_bwd --dsv3 --iters 50 --warmup-ms 25 \
+ --json /tmp/oink_cross_entropy_fwd_bwd_dsv3.json
+```
+
+### LayerNorm forward
+
+```bash
+python oink/benchmarks/benchmark/benchmark_layernorm_sm100.py --dtype bf16 --quack-suite --iters 200 --warmup-ms 25 \
+ --json /tmp/oink_layernorm_fwd_quack_suite.json
+
+python oink/benchmarks/benchmark/benchmark_layernorm_sm100.py --dtype bf16 --dsv3 --iters 200 --warmup-ms 25 \
+ --json /tmp/oink_layernorm_fwd_dsv3.json
+```
+
+## Notes
+
+- These scripts intentionally avoid importing any external Oink checkout so the
+ results reflect the in-tree KernelAgent Oink kernels.
+- For RMSNorm, the `rmsnorm_with_stage2` implementation is a **fallback** that
+ is only used when the pointer-based fast path cannot be used (e.g. when
+ `weight.dtype != x.dtype`, or when layouts/alignments are incompatible). You
+ can force it for A/B testing via `KERNELAGENT_OINK_FORCE_RMSNORM_STAGE2=1`.
diff --git a/oink/benchmarks/benchmark/bench_utils.py b/oink/benchmarks/benchmark/bench_utils.py
new file mode 100644
index 0000000..0abb005
--- /dev/null
+++ b/oink/benchmarks/benchmark/bench_utils.py
@@ -0,0 +1,255 @@
+from __future__ import annotations
+
+import csv
+import json
+import math
+import os
+import subprocess
+import sys
+from dataclasses import asdict, dataclass
+from datetime import datetime
+from typing import Any, Callable, Dict, Iterable, List, Optional, Sequence, Tuple
+
+import torch
+from triton.testing import do_bench as triton_do_bench
+
+
+@dataclass(frozen=True)
+class DeviceMeta:
+ device: str
+ capability: Tuple[int, int]
+ torch: str
+ cuda: str
+ cute_dsl_arch: str
+ git_sha: str
+ timestamp: str
+
+
+def _try_git_sha() -> str:
+ here = os.path.dirname(os.path.abspath(__file__))
+ repo_root = os.path.abspath(os.path.join(here, "..", ".."))
+ try:
+ out = subprocess.check_output(
+ ["git", "rev-parse", "HEAD"],
+ cwd=repo_root,
+ stderr=subprocess.DEVNULL,
+ text=True,
+ )
+ return out.strip()
+ except Exception:
+ return ""
+
+
+def collect_device_meta(device: Optional[torch.device] = None) -> DeviceMeta:
+ if device is None:
+ device = torch.device("cuda")
+ props = torch.cuda.get_device_properties(device)
+ timestamp = datetime.now().isoformat(timespec="seconds")
+ return DeviceMeta(
+ device=str(props.name),
+ capability=(int(props.major), int(props.minor)),
+ torch=str(torch.__version__),
+ cuda=str(getattr(torch.version, "cuda", "unknown")),
+ cute_dsl_arch=os.environ.get("CUTE_DSL_ARCH", ""),
+ git_sha=_try_git_sha(),
+ timestamp=timestamp,
+ )
+
+
+def detect_hbm_peak_gbps(device: Optional[torch.device] = None) -> float:
+ """Approximate HBM peak bandwidth in GB/s for roofline fractions."""
+ if device is None:
+ device = torch.device("cuda")
+ props = torch.cuda.get_device_properties(device)
+ sm = props.major * 10 + props.minor
+ if sm >= 100:
+ return 8000.0
+ return 2000.0
+
+
+def do_bench_triton(fn: Callable[[], Any], *, warmup_ms: int = 25, rep_ms: int = 100) -> float:
+ """Kernel-only timing consistent with the Oink benchmark harnesses."""
+ return float(triton_do_bench(fn, warmup=warmup_ms, rep=rep_ms, return_mode="mean"))
+
+
+def parse_dtype(s: str) -> torch.dtype:
+ s = s.lower()
+ if s == "bf16":
+ return torch.bfloat16
+ if s == "fp16":
+ return torch.float16
+ if s == "fp32":
+ return torch.float32
+ raise ValueError(f"Unsupported dtype: {s}")
+
+
+def parse_configs(s: str) -> List[Tuple[int, int]]:
+ out: List[Tuple[int, int]] = []
+ for part in s.split(","):
+ m, n = part.lower().split("x")
+ out.append((int(m), int(n)))
+ return out
+
+
+def quack_suite_configs() -> List[Tuple[int, int, int]]:
+ """Return (batch, seq, hidden) triples following Quack's common grid (hidden=4096)."""
+ batch_sizes = [1, 4, 8, 16, 32]
+ seq_lengths = [8192, 16384, 32768, 65536, 131072]
+ hidden = 4096
+ cfgs: List[Tuple[int, int, int]] = []
+ for bs in batch_sizes:
+ for sl in seq_lengths:
+ M = bs * sl
+ if M * hidden > (2**31):
+ continue
+ cfgs.append((bs, sl, hidden))
+ return cfgs
+
+
+def ensure_oink_src_on_path() -> None:
+ """Make the in-repo KernelAgent Oink package importable without an editable install."""
+ here = os.path.dirname(os.path.abspath(__file__))
+ oink_src = os.path.abspath(os.path.join(here, "..", "..", "src"))
+ if oink_src not in sys.path:
+ sys.path.insert(0, oink_src)
+
+
+def write_csv(path: str, rows: Sequence[Dict[str, Any]]) -> None:
+ if not rows:
+ return
+ os.makedirs(os.path.dirname(os.path.abspath(path)), exist_ok=True)
+ file_exists = os.path.exists(path)
+ with open(path, "a", newline="") as f:
+ writer = csv.DictWriter(f, fieldnames=sorted(rows[0].keys()))
+ if not file_exists:
+ writer.writeheader()
+ for row in rows:
+ writer.writerow(row)
+
+
+def write_json(path: str, meta: DeviceMeta, rows: Sequence[Dict[str, Any]], *, extra: Dict[str, Any] | None = None) -> None:
+ os.makedirs(os.path.dirname(os.path.abspath(path)), exist_ok=True)
+ payload: Dict[str, Any] = {
+ "meta": {**asdict(meta), **(extra or {})},
+ "rows": list(rows),
+ }
+ with open(path, "w") as f:
+ json.dump(payload, f, indent=2)
+
+
+def iter_row_blocks(M: int, block_rows: int) -> Iterable[Tuple[int, int]]:
+ """Yield (start, end) row index ranges for a 2D (M, N) matrix.
+
+ The intent is to make correctness references for large tensors tractable
+ without materializing full float32 intermediates.
+ """
+ if M < 0:
+ raise ValueError(f"M must be non-negative, got {M}")
+ if block_rows <= 0:
+ raise ValueError(f"block_rows must be > 0, got {block_rows}")
+ for start in range(0, M, block_rows):
+ yield start, min(M, start + block_rows)
+
+
+@dataclass
+class ErrorStats:
+ """Numerical error stats between an output and a reference.
+
+ Notes:
+ - `max_abs` and `rel_l2` are computed exactly (streamed).
+ - `p99_abs` is computed over a deterministic strided sample of abs error
+ values (to keep very large tensors tractable).
+ """
+
+ max_abs: float
+ p99_abs: float
+ rel_l2: float
+ p99_sample_elems: int
+ p99_sample_stride: int
+
+
+class ErrorStatsAccumulator:
+ """Stream error stats over (output_block, ref_block) pairs.
+
+ This is intended for large 2D tensors where we compute reference results
+ block-by-block to avoid materializing full float32 intermediates.
+ """
+
+ def __init__(self, *, total_elems: int, p99_target_samples: int = 1_000_000):
+ if total_elems <= 0:
+ raise ValueError(f"total_elems must be > 0, got {total_elems}")
+ if p99_target_samples <= 0:
+ raise ValueError(f"p99_target_samples must be > 0, got {p99_target_samples}")
+ self.total_elems = int(total_elems)
+ self.p99_target_samples = int(p99_target_samples)
+ # Deterministic strided sampling across the flattened tensor order.
+ self.sample_stride = max(1, self.total_elems // self.p99_target_samples)
+ self._global_offset = 0
+
+ self._max_abs = 0.0
+ self._err_sq_sum = 0.0
+ self._ref_sq_sum = 0.0
+ self._abs_err_samples: List[torch.Tensor] = []
+
+ def update(self, out: torch.Tensor, ref: torch.Tensor) -> None:
+ if out.shape != ref.shape:
+ raise ValueError(f"shape mismatch: out={tuple(out.shape)} ref={tuple(ref.shape)}")
+
+ # Compute error in float32 for stable reductions.
+ err_f32 = (out - ref).to(torch.float32)
+ abs_err = err_f32.abs()
+
+ # Exact reductions.
+ self._max_abs = max(self._max_abs, float(abs_err.max().item()))
+ self._err_sq_sum += float((err_f32 * err_f32).sum(dtype=torch.float64).item())
+ ref_f32 = ref.to(torch.float32)
+ self._ref_sq_sum += float((ref_f32 * ref_f32).sum(dtype=torch.float64).item())
+
+ # Deterministic strided sample for p99_abs.
+ flat = abs_err.flatten()
+ block_elems = int(flat.numel())
+ if block_elems <= 0:
+ return
+
+ stride = int(self.sample_stride)
+ first = (-int(self._global_offset)) % stride
+ if first < block_elems:
+ idx = torch.arange(first, block_elems, step=stride, device=flat.device, dtype=torch.int64)
+ # Gather a modest number of values (≈ block_elems/stride).
+ vals = flat.index_select(0, idx).detach().to(device="cpu", dtype=torch.float32)
+ self._abs_err_samples.append(vals)
+
+ self._global_offset += block_elems
+
+ def finalize(self) -> ErrorStats:
+ if self._abs_err_samples:
+ samples = torch.cat(self._abs_err_samples, dim=0)
+ if samples.numel() > self.p99_target_samples:
+ samples = samples[: self.p99_target_samples]
+ p99 = float(torch.quantile(samples, 0.99).item()) if samples.numel() > 0 else 0.0
+ sample_elems = int(samples.numel())
+ else:
+ p99 = 0.0
+ sample_elems = 0
+
+ denom = math.sqrt(self._ref_sq_sum) if self._ref_sq_sum > 0 else 0.0
+ rel_l2 = (math.sqrt(self._err_sq_sum) / denom) if denom > 0 else 0.0
+
+ return ErrorStats(
+ max_abs=float(self._max_abs),
+ p99_abs=float(p99),
+ rel_l2=float(rel_l2),
+ p99_sample_elems=int(sample_elems),
+ p99_sample_stride=int(self.sample_stride),
+ )
+
+
+def error_stats_to_row(prefix: str, stats: ErrorStats) -> Dict[str, Any]:
+ """Flatten ErrorStats into JSON-friendly row fields."""
+ return {
+ f"{prefix}_max_abs": float(stats.max_abs),
+ f"{prefix}_p99_abs": float(stats.p99_abs),
+ f"{prefix}_rel_l2": float(stats.rel_l2),
+ f"{prefix}_p99_sample_elems": int(stats.p99_sample_elems),
+ f"{prefix}_p99_sample_stride": int(stats.p99_sample_stride),
+ }
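+
+
+# A minimal usage sketch (illustrative only; `out_blocks` / `ref_blocks` stand in
+# for whatever block-wise outputs and references a harness produces):
+#
+#   acc = ErrorStatsAccumulator(total_elems=M * N)
+#   for out_blk, ref_blk in zip(out_blocks, ref_blocks):
+#       acc.update(out_blk, ref_blk)
+#   row.update(error_stats_to_row("ours_err_y", acc.finalize()))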
diff --git a/oink/benchmarks/benchmark/benchmark_cross_entropy_sm100.py b/oink/benchmarks/benchmark/benchmark_cross_entropy_sm100.py
new file mode 100644
index 0000000..8bcac15
--- /dev/null
+++ b/oink/benchmarks/benchmark/benchmark_cross_entropy_sm100.py
@@ -0,0 +1,426 @@
+from __future__ import annotations
+
+import argparse
+import os
+from typing import Any, Dict, List, Optional, Tuple
+
+import torch
+
+# Reduce fragmentation pressure on busy GPUs.
+os.environ.setdefault("PYTORCH_ALLOC_CONF", "expandable_segments:True")
+
+# Ensure SM100 (GB200) architecture is recognized by CuTeDSL when running outside vLLM.
+os.environ.setdefault("CUTE_DSL_ARCH", "sm_100a")
+
+from bench_utils import ( # noqa: E402
+ ErrorStatsAccumulator,
+ collect_device_meta,
+ detect_hbm_peak_gbps,
+ do_bench_triton,
+ error_stats_to_row,
+ ensure_oink_src_on_path,
+ iter_row_blocks,
+ parse_configs,
+ parse_dtype,
+ quack_suite_configs,
+ write_csv,
+ write_json,
+)
+
+ensure_oink_src_on_path()
+
+from kernelagent_oink.blackwell import cross_entropy as oink_ce # noqa: E402
+
+try:
+ from quack.cross_entropy import cross_entropy_bwd as quack_ce_bwd # type: ignore
+ from quack.cross_entropy import cross_entropy_fwd as quack_ce_fwd # type: ignore
+except Exception:
+ quack_ce_fwd = None
+ quack_ce_bwd = None
+
+
+# Match Quack's unit-test defaults (tests/test_cross_entropy.py).
+_VERIFY_TOL_LOSS = dict(atol=5e-5, rtol=1e-5) # float32 outputs (loss/lse)
+_VERIFY_TOL_DX = {
+ torch.float32: dict(atol=5e-5, rtol=1e-5),
+ # FP16 `dx` is low-precision; allow ~1 ulp at typical magnitudes.
+ torch.float16: dict(atol=1e-3, rtol=1e-3),
+ # BF16 `dx` is low-precision; allow ~1 ulp at typical magnitudes.
+ torch.bfloat16: dict(atol=1e-2, rtol=1e-2),
+}
+
+
+def bytes_io_model_ce(
+ M: int,
+ N: int,
+ dtype: torch.dtype,
+ *,
+ target_dtype: torch.dtype = torch.int64,
+ mode: str,
+) -> int:
+ elem = torch.tensor(0, dtype=dtype).element_size()
+ t_elem = torch.tensor(0, dtype=target_dtype).element_size()
+ # Forward:
+ # read logits (M*N) + read target (M) + write loss (M fp32) + write lse (M fp32)
+ fwd = M * N * elem + M * t_elem + 2 * M * 4
+ # Backward (reduction="none" path):
+ # read logits (M*N) + read target (M) + read dloss (M fp32) + read lse (M fp32) + write dx (M*N)
+ bwd = 2 * M * N * elem + M * t_elem + 2 * M * 4
+
+ if mode == "fwd":
+ return int(fwd)
+ if mode == "bwd":
+ return int(bwd)
+ if mode == "fwd_bwd":
+ # Logical IO for dx given (logits, target, dloss): read logits + read target
+ # + read dloss + write dx. (Intermediate lse/loss are implementation details.)
+ return int(2 * M * N * elem + M * t_elem + M * 4)
+ raise ValueError(f"Unsupported mode: {mode}")
+
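+# Worked example of the IO model above (assumed shape, not a measured number):
+# M=8192, N=4096, bf16 logits (2 B), int64 targets (8 B), mode="fwd_bwd":
+#   2*8192*4096*2 + 8192*8 + 8192*4 = 134,316,032 bytes ≈ 0.134 GB per call,
+# which is the numerator used for the reported GB/s.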
+
+def dsv3_configs() -> List[Tuple[int, int]]:
+ Ms = [4096, 16384, 65536]
+ Ns = [3072, 6144, 8192, 12288]
+ return [(m, n) for m in Ms for n in Ns]
+
+
+def _verify_parity(logits: torch.Tensor, target: torch.Tensor, *, ignore_index: int) -> dict[str, object]:
+ dtype = logits.dtype
+ ref_block_rows = 512
+ dloss = torch.randn(logits.size(0), device=logits.device, dtype=torch.float32) # upstream grad
+
+ with torch.no_grad():
+ loss_o, lse_o = oink_ce.cross_entropy_forward(
+ logits, target, ignore_index=ignore_index, reduction="none"
+ )
+ dx_o = oink_ce.cross_entropy_backward(dloss, logits, target, lse_o, ignore_index=ignore_index)
+ dx_fused_o = oink_ce.cross_entropy_fwd_bwd(
+ dloss,
+ logits,
+ target,
+ ignore_index=ignore_index,
+ )
+
+ loss_q = None
+ lse_q = None
+ dx_q = None
+ if quack_ce_fwd is not None and quack_ce_bwd is not None:
+ loss_q, lse_q = quack_ce_fwd(
+ logits,
+ target,
+ target_logit=None,
+ ignore_index=ignore_index,
+ return_lse=True,
+ return_dx=False,
+ inplace_backward=False,
+ )
+ dx_q = quack_ce_bwd(
+ logits,
+ target,
+ dloss,
+ lse_q,
+ ignore_index=ignore_index,
+ inplace_backward=False,
+ )
+
+ M = int(logits.shape[0])
+ N = int(logits.shape[1])
+ loss_acc_ours = ErrorStatsAccumulator(total_elems=M, p99_target_samples=min(M, 1_000_000))
+ lse_acc_ours = ErrorStatsAccumulator(total_elems=M, p99_target_samples=min(M, 1_000_000))
+ dx_acc_ours = ErrorStatsAccumulator(total_elems=M * N)
+ dx_fused_acc_ours = ErrorStatsAccumulator(total_elems=M * N)
+ loss_acc_quack = (
+ ErrorStatsAccumulator(total_elems=M, p99_target_samples=min(M, 1_000_000))
+ if (quack_ce_fwd is not None and quack_ce_bwd is not None)
+ else None
+ )
+ lse_acc_quack = (
+ ErrorStatsAccumulator(total_elems=M, p99_target_samples=min(M, 1_000_000))
+ if (quack_ce_fwd is not None and quack_ce_bwd is not None)
+ else None
+ )
+ dx_acc_quack = (
+ ErrorStatsAccumulator(total_elems=M * N)
+ if (quack_ce_fwd is not None and quack_ce_bwd is not None)
+ else None
+ )
+
+ # Match Quack tests: compare to a PyTorch reference computed on float32 logits.
+ # Chunk over rows so we don't materialize a full (M, N) float32 tensor.
+ for start, end in iter_row_blocks(M, ref_block_rows):
+ logits_f32 = logits[start:end].float().requires_grad_(True)
+ target_blk = target[start:end]
+ dloss_blk = dloss[start:end]
+
+ loss_ref = torch.nn.functional.cross_entropy(
+ logits_f32,
+ target_blk,
+ reduction="none",
+ ignore_index=ignore_index,
+ )
+ lse_ref = torch.logsumexp(logits_f32, dim=-1)
+ (dx_ref_f32,) = torch.autograd.grad(loss_ref, logits_f32, grad_outputs=dloss_blk)
+ dx_ref = dx_ref_f32.to(dtype)
+
+ torch.testing.assert_close(loss_o[start:end], loss_ref.detach(), **_VERIFY_TOL_LOSS)
+ torch.testing.assert_close(lse_o[start:end], lse_ref.detach(), **_VERIFY_TOL_LOSS)
+ torch.testing.assert_close(dx_o[start:end], dx_ref, **_VERIFY_TOL_DX[dtype])
+ torch.testing.assert_close(dx_fused_o[start:end], dx_ref, **_VERIFY_TOL_DX[dtype])
+ loss_acc_ours.update(loss_o[start:end], loss_ref.detach())
+ lse_acc_ours.update(lse_o[start:end], lse_ref.detach())
+ dx_acc_ours.update(dx_o[start:end], dx_ref)
+ dx_fused_acc_ours.update(dx_fused_o[start:end], dx_ref)
+
+ if loss_q is not None and lse_q is not None and dx_q is not None:
+ torch.testing.assert_close(loss_q[start:end], loss_ref.detach(), **_VERIFY_TOL_LOSS)
+ torch.testing.assert_close(lse_q[start:end], lse_ref.detach(), **_VERIFY_TOL_LOSS)
+ torch.testing.assert_close(dx_q[start:end], dx_ref, **_VERIFY_TOL_DX[dtype])
+ assert loss_acc_quack is not None and lse_acc_quack is not None and dx_acc_quack is not None
+ loss_acc_quack.update(loss_q[start:end], loss_ref.detach())
+ lse_acc_quack.update(lse_q[start:end], lse_ref.detach())
+ dx_acc_quack.update(dx_q[start:end], dx_ref)
+
+ stats: dict[str, object] = {}
+ stats.update(error_stats_to_row("ours_err_loss", loss_acc_ours.finalize()))
+ stats.update(error_stats_to_row("ours_err_lse", lse_acc_ours.finalize()))
+ stats.update(error_stats_to_row("ours_err_dx", dx_acc_ours.finalize()))
+ stats.update(error_stats_to_row("ours_err_dx_fused", dx_fused_acc_ours.finalize()))
+ if loss_acc_quack is not None and lse_acc_quack is not None and dx_acc_quack is not None:
+ stats.update(error_stats_to_row("quack_err_loss", loss_acc_quack.finalize()))
+ stats.update(error_stats_to_row("quack_err_lse", lse_acc_quack.finalize()))
+ stats.update(error_stats_to_row("quack_err_dx", dx_acc_quack.finalize()))
+ return stats
+
+
+def bench_single(
+ M: int,
+ N: int,
+ dtype: torch.dtype,
+ *,
+ warmup_ms: int,
+ iters_ms: int,
+ mode: str,
+ verify: bool,
+ ignore_index: int,
+) -> Tuple[Tuple[float, float], Optional[Tuple[float, float]], dict[str, object]]:
+ device = torch.device("cuda")
+ logits = 0.1 * torch.randn(M, N, device=device, dtype=dtype)
+ target = torch.randint(0, N, (M,), device=device, dtype=torch.int64)
+ # Sprinkle some ignore_index entries for robustness (and to match reduction semantics).
+ if ignore_index is not None:
+ mask = torch.rand(M, device=device) < 0.01
+ target[mask] = int(ignore_index)
+ dloss = torch.randn(M, device=device, dtype=torch.float32)
+
+ stats: dict[str, object] = {}
+ if verify:
+ stats = _verify_parity(logits, target, ignore_index=int(ignore_index))
+
+ bytes_io = bytes_io_model_ce(M, N, dtype, target_dtype=target.dtype, mode=mode)
+
+ if mode == "fwd":
+ fn_oink = lambda: oink_ce.cross_entropy_forward(
+ logits, target, ignore_index=int(ignore_index), reduction="none"
+ )
+ fn_quack = (
+ None
+ if quack_ce_fwd is None
+ else (
+ lambda: quack_ce_fwd(
+ logits,
+ target,
+ target_logit=None,
+ ignore_index=int(ignore_index),
+ return_lse=True,
+ return_dx=False,
+ inplace_backward=False,
+ )
+ )
+ )
+ elif mode == "bwd":
+ with torch.no_grad():
+ _loss_o, lse_o = oink_ce.cross_entropy_forward(
+ logits, target, ignore_index=int(ignore_index), reduction="none"
+ )
+ if quack_ce_fwd is not None:
+ _loss_q, lse_q = quack_ce_fwd(
+ logits,
+ target,
+ target_logit=None,
+ ignore_index=int(ignore_index),
+ return_lse=True,
+ return_dx=False,
+ inplace_backward=False,
+ )
+ else:
+ lse_q = None
+ fn_oink = lambda: oink_ce.cross_entropy_backward(
+ dloss, logits, target, lse_o, ignore_index=int(ignore_index)
+ )
+ fn_quack = (
+ None
+ if (quack_ce_bwd is None or lse_q is None)
+ else (
+ lambda: quack_ce_bwd(
+ logits,
+ target,
+ dloss,
+ lse_q,
+ ignore_index=int(ignore_index),
+ inplace_backward=False,
+ )
+ )
+ )
+ elif mode == "fwd_bwd":
+ fn_oink = lambda: oink_ce.cross_entropy_fwd_bwd(
+ dloss,
+ logits,
+ target,
+ ignore_index=int(ignore_index),
+ )
+ fn_quack = (
+ None
+ if (quack_ce_fwd is None or quack_ce_bwd is None)
+ else (
+ lambda: quack_ce_bwd(
+ logits,
+ target,
+ dloss,
+ quack_ce_fwd(
+ logits,
+ target,
+ target_logit=None,
+ ignore_index=int(ignore_index),
+ return_lse=True,
+ return_dx=False,
+ inplace_backward=False,
+ )[1],
+ ignore_index=int(ignore_index),
+ inplace_backward=False,
+ )
+ )
+ )
+ else:
+ raise ValueError(f"Unsupported mode: {mode}")
+
+ ms_oink = do_bench_triton(fn_oink, warmup_ms=warmup_ms, rep_ms=iters_ms)
+ gbps_oink = bytes_io / (ms_oink * 1e-3) / 1e9
+
+ if fn_quack is None:
+ return (ms_oink, gbps_oink), None, stats
+
+ ms_quack = do_bench_triton(fn_quack, warmup_ms=warmup_ms, rep_ms=iters_ms)
+ gbps_quack = bytes_io / (ms_quack * 1e-3) / 1e9
+ return (ms_oink, gbps_oink), (ms_quack, gbps_quack), stats
+
+
+def main() -> None:
+ if not torch.cuda.is_available():
+ raise SystemExit("CUDA not available")
+
+ torch.cuda.set_device(0)
+ device = torch.device("cuda")
+ props = torch.cuda.get_device_properties(device)
+ sm = props.major * 10 + props.minor
+ print(f"Running on {torch.cuda.get_device_name(device)} (SM{sm})")
+
+ p = argparse.ArgumentParser()
+ p.add_argument("--dtype", type=str, default="bf16", choices=["fp16", "bf16", "fp32"])
+ p.add_argument("--mode", type=str, default="fwd_bwd", choices=["fwd", "bwd", "fwd_bwd"])
+ p.add_argument("--ignore-index", type=int, default=-100)
+ p.add_argument("--iters", type=int, default=50, help="Triton do_bench rep_ms (kernel-only).")
+ p.add_argument("--warmup-ms", type=int, default=25)
+ p.add_argument("--csv", type=str, default=None, help="Optional CSV output path; appends rows")
+ p.add_argument("--json", type=str, default=None, help="Optional JSON output path (meta + rows)")
+ p.add_argument("--configs", type=str, default="1024x4096,8192x4096")
+ p.add_argument("--quack-suite", action="store_true", help="Run Quack-style batch/seq grid (vocab=4096)")
+ p.add_argument("--dsv3", action="store_true", help="Run DSv3 set: M in {4096,16384,65536}, N in {3072,6144,8192,12288}")
+ p.add_argument(
+ "--skip-verify",
+ action="store_true",
+ help="Skip correctness checks (Oink/Quack vs PyTorch float32-logits cross entropy)",
+ )
+ args = p.parse_args()
+
+ dtype = parse_dtype(args.dtype)
+
+ if args.quack_suite:
+ cfgs = [(bs * sl, hidden) for (bs, sl, hidden) in quack_suite_configs()]
+ elif args.dsv3:
+ cfgs = dsv3_configs()
+ else:
+ cfgs = parse_configs(args.configs)
+
+ hbm_peak = detect_hbm_peak_gbps(device)
+ meta = collect_device_meta(device)
+
+ rows_out: List[Dict[str, Any]] = []
+ for (M, N) in cfgs:
+ print(f"bench M={M:<8d} N={N:<6d} dtype={args.dtype} mode={args.mode} ...", flush=True)
+ (ms_oink, gbps_oink), quack, stats = bench_single(
+ M=M,
+ N=N,
+ dtype=dtype,
+ warmup_ms=int(args.warmup_ms),
+ iters_ms=int(args.iters),
+ mode=str(args.mode),
+ verify=not args.skip_verify,
+ ignore_index=int(args.ignore_index),
+ )
+ row: Dict[str, Any] = {
+ "M": M,
+ "N": N,
+ "dtype": args.dtype,
+ "mode": args.mode,
+ "ignore_index": int(args.ignore_index),
+ "ours_ms": ms_oink,
+ "ours_gbps": gbps_oink,
+ "ours_tbps": gbps_oink / 1000.0,
+ "ours_hbm_frac": gbps_oink / hbm_peak,
+ }
+ if quack is not None:
+ ms_q, gbps_q = quack
+ row.update(
+ {
+ "quack_ms": ms_q,
+ "quack_gbps": gbps_q,
+ "quack_tbps": gbps_q / 1000.0,
+ "speedup_vs_quack": ms_q / ms_oink,
+ }
+ )
+ row.update(stats)
+ rows_out.append(row)
+
+ if args.csv is not None:
+ write_csv(args.csv, rows_out)
+ if args.json is not None:
+ write_json(
+ args.json,
+ meta,
+ rows_out,
+ extra={
+ "method": "triton.testing.do_bench(mean)",
+ "warmup_ms": int(args.warmup_ms),
+ "rep_ms": int(args.iters),
+ "io_model_bytes": "mode-dependent; see bytes_io_model_ce in script",
+ },
+ )
+
+ headers = ["M", "N", "mode", "ours_ms", "ours_tbps"]
+ if quack_ce_fwd is not None and quack_ce_bwd is not None:
+ headers += ["quack_ms", "quack_tbps", "speedup_vs_quack"]
+ print("\nSummary:")
+ print(" ".join(h.rjust(14) for h in headers))
+ for r in rows_out:
+ parts: List[str] = []
+ for h in headers:
+ v = r.get(h)
+ if isinstance(v, float):
+ parts.append(f"{v:14.4f}")
+ else:
+ parts.append(f"{str(v):>14}")
+ print(" ".join(parts))
+
+
+if __name__ == "__main__":
+ main()
diff --git a/oink/benchmarks/benchmark/benchmark_fused_add_rmsnorm_sm100.py b/oink/benchmarks/benchmark/benchmark_fused_add_rmsnorm_sm100.py
new file mode 100644
index 0000000..b75f892
--- /dev/null
+++ b/oink/benchmarks/benchmark/benchmark_fused_add_rmsnorm_sm100.py
@@ -0,0 +1,296 @@
+"""
+Benchmark fused_add_rmsnorm (in-place) on SM100.
+
+This matches vLLM's fused_add_rms_norm semantics:
+ z = x + residual (stored into residual)
+ y = RMSNorm(z, w) (stored into x)
+
+Why this exists:
+- It is a common inference hot path (vLLM).
+- It is strongly memory-bound (reads/writes two MxN tensors), making it a good
+ roofline case study for Blackwell.
+
+Example:
+ CUDA_VISIBLE_DEVICES=0 python oink/benchmarks/benchmark/benchmark_fused_add_rmsnorm_sm100.py --dtype bf16 --M 65536 --N 4096 \\
+ --json /tmp/fused_add_rmsnorm_sm100_bf16.json
+
+DSv3 suite (Oink vs Quack, multi-shape):
+ CUDA_VISIBLE_DEVICES=0 python oink/benchmarks/benchmark/benchmark_fused_add_rmsnorm_sm100.py --dtype bf16 --dsv3 \\
+ --json /tmp/kernelagent_oink_sm100_suite_bf16/fused_add_rmsnorm_dsv3.json
+"""
+
+import argparse
+import os
+from typing import Any, Dict, List, Tuple
+
+import torch
+
+# Reduce fragmentation pressure on busy GPUs.
+os.environ.setdefault("PYTORCH_ALLOC_CONF", "expandable_segments:True")
+
+# Ensure SM100 (GB200) architecture is recognized by CuTeDSL when running outside vLLM.
+os.environ.setdefault("CUTE_DSL_ARCH", "sm_100a")
+
+from bench_utils import ( # noqa: E402
+ ErrorStatsAccumulator,
+ collect_device_meta,
+ detect_hbm_peak_gbps,
+ do_bench_triton,
+ error_stats_to_row,
+ ensure_oink_src_on_path,
+ iter_row_blocks,
+ parse_dtype,
+ write_json,
+)
+
+ensure_oink_src_on_path()
+
+from kernelagent_oink.blackwell import rmsnorm as oink_rmsnorm # noqa: E402
+
+_VERIFY_TOL = {
+ # Align with Quack's RMSNorm unit-test defaults (tests/test_rmsnorm.py).
+ torch.float32: dict(atol=1e-4, rtol=1e-3),
+ torch.float16: dict(atol=1e-2, rtol=1e-3),
+ torch.bfloat16: dict(atol=1e-1, rtol=1e-2),
+}
+
+try:
+ # Use the low-level mutating custom op to avoid per-iteration allocations
+ # (critical for fair comparisons on small/medium M).
+ from quack.rmsnorm import _rmsnorm_fwd as quack_rmsnorm_fwd_mut # type: ignore
+except Exception:
+ quack_rmsnorm_fwd_mut = None
+
+
+def dsv3_configs() -> List[Tuple[int, int]]:
+ Ms = [4096, 16384, 65536]
+ Ns = [6144, 7168, 8192]
+ return [(m, n) for m in Ms for n in Ns]
+
+
+def bytes_io_model_fused_add_rmsnorm_inplace(M: int, N: int, dtype: torch.dtype) -> int:
+ elem = torch.tensor(0, dtype=dtype).element_size()
+ # Read x + read residual + write x + write residual + read weight
+ return int((4 * M * N + N) * elem)
+
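+# Worked example of the IO model above (assumed shape, not a measured number):
+# M=65536, N=4096, bf16 (2 B): (4*65536*4096 + 4096) * 2 = 2,147,491,840 bytes
+# ≈ 2.15 GB per call, so a ~0.27 ms run would correspond to ~8 TB/s.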
+
+def _verify_parity(
+ *,
+ x: torch.Tensor,
+ residual: torch.Tensor,
+ w: torch.Tensor,
+ eps: float,
+) -> dict[str, object]:
+ tol = _VERIFY_TOL[x.dtype]
+ ref_block_rows = 4096
+ M = int(x.shape[0])
+ N = int(x.shape[1])
+
+ y_acc_ours = ErrorStatsAccumulator(total_elems=M * N)
+ z_acc_ours = ErrorStatsAccumulator(total_elems=M * N)
+ y_acc_quack = ErrorStatsAccumulator(total_elems=M * N) if quack_rmsnorm_fwd_mut is not None else None
+ z_acc_quack = ErrorStatsAccumulator(total_elems=M * N) if quack_rmsnorm_fwd_mut is not None else None
+
+ x_o = x.clone()
+ r_o = residual.clone()
+ out_q = None
+ res_out_q = None
+ with torch.no_grad():
+ oink_rmsnorm.fused_add_rmsnorm_inplace_(x_o, r_o, w, eps=eps)
+
+ if quack_rmsnorm_fwd_mut is not None:
+ out_q = torch.empty_like(x)
+ res_out_q = torch.empty_like(residual)
+ quack_rmsnorm_fwd_mut(
+ x,
+ w,
+ out_q,
+ None, # bias
+ None, # rstd
+ None, # mean
+ residual,
+ res_out_q,
+ eps,
+ False, # is_layernorm
+ )
+
+    # Pure-PyTorch reference (float32 accumulation), chunked over rows.
+    w_f32 = w.float()
+ for start, end in iter_row_blocks(M, ref_block_rows):
+ z = x[start:end] + residual[start:end]
+ zf = z.float()
+ rstd = torch.rsqrt(zf.square().mean(dim=-1, keepdim=True) + eps)
+ y_ref = ((zf * rstd) * w_f32).to(x.dtype)
+
+ torch.testing.assert_close(x_o[start:end], y_ref, **tol)
+ torch.testing.assert_close(r_o[start:end], z, **tol)
+ y_acc_ours.update(x_o[start:end], y_ref)
+ z_acc_ours.update(r_o[start:end], z)
+ if out_q is not None and res_out_q is not None:
+ torch.testing.assert_close(out_q[start:end], y_ref, **tol)
+ torch.testing.assert_close(res_out_q[start:end], z, **tol)
+ assert y_acc_quack is not None and z_acc_quack is not None
+ y_acc_quack.update(out_q[start:end], y_ref)
+ z_acc_quack.update(res_out_q[start:end], z)
+
+ stats: dict[str, object] = {}
+ stats.update(error_stats_to_row("ours_err_y", y_acc_ours.finalize()))
+ stats.update(error_stats_to_row("ours_err_residual_out", z_acc_ours.finalize()))
+ if y_acc_quack is not None and z_acc_quack is not None:
+ stats.update(error_stats_to_row("quack_err_y", y_acc_quack.finalize()))
+ stats.update(error_stats_to_row("quack_err_residual_out", z_acc_quack.finalize()))
+ return stats
+
+
+def bench_one(
+ *,
+ M: int,
+ N: int,
+ dtype: torch.dtype,
+ warmup_ms: int,
+ iters_ms: int,
+ verify: bool,
+) -> Dict[str, Any]:
+ device = torch.device("cuda")
+ x = torch.randn((M, N), device=device, dtype=dtype)
+ residual = torch.randn_like(x)
+ w = torch.randn((N,), device=device, dtype=dtype)
+
+ stats: dict[str, object] = {}
+ if verify:
+ stats = _verify_parity(x=x, residual=residual, w=w, eps=1e-6)
+
+ bytes_io = bytes_io_model_fused_add_rmsnorm_inplace(M, N, dtype)
+
+ fn = lambda: oink_rmsnorm.fused_add_rmsnorm_inplace_(x, residual, w, eps=1e-6)
+ ms = do_bench_triton(fn, warmup_ms=warmup_ms, rep_ms=iters_ms)
+
+ gbps = bytes_io / (ms * 1e-3) / 1e9
+ tbps = gbps / 1000.0
+ hbm_frac = gbps / detect_hbm_peak_gbps(device)
+
+ row: Dict[str, Any] = dict(
+ M=int(M),
+ N=int(N),
+ dtype="bf16" if dtype is torch.bfloat16 else ("fp16" if dtype is torch.float16 else "fp32"),
+ ours_ms=float(ms),
+ ours_gbps=float(gbps),
+ ours_tbps=float(tbps),
+ ours_hbm_frac=float(hbm_frac),
+ )
+ row.update(stats)
+
+ if quack_rmsnorm_fwd_mut is not None:
+ out_q = torch.empty_like(x)
+ res_out_q = torch.empty_like(residual)
+
+ fn_q = lambda: quack_rmsnorm_fwd_mut(
+ x,
+ w,
+ out_q,
+ None, # bias
+ None, # rstd
+ None, # mean
+ residual,
+ res_out_q,
+ 1e-6,
+ False, # is_layernorm
+ )
+ ms_q = do_bench_triton(fn_q, warmup_ms=warmup_ms, rep_ms=iters_ms)
+ gbps_q = bytes_io / (ms_q * 1e-3) / 1e9
+ row.update(
+ dict(
+ quack_ms=float(ms_q),
+ quack_gbps=float(gbps_q),
+ quack_tbps=float(gbps_q / 1000.0),
+ speedup_vs_quack=float(ms_q / ms),
+ )
+ )
+
+ return row
+
+
+def _dtype_label(dtype: torch.dtype) -> str:
+ if dtype is torch.bfloat16:
+ return "bf16"
+ if dtype is torch.float16:
+ return "fp16"
+ return "fp32"
+
+
+def _print_table(rows: List[Dict[str, Any]]) -> None:
+ if not rows:
+ return
+ headers = ["M", "N", "ours_ms", "ours_tbps"]
+ has_quack = any("quack_ms" in r for r in rows)
+ if has_quack:
+ headers += ["quack_ms", "quack_tbps", "speedup_vs_quack"]
+ print("\nSummary:")
+ print(" ".join(h.rjust(14) for h in headers))
+ for r in rows:
+ parts: List[str] = []
+ for h in headers:
+ v = r.get(h)
+ if isinstance(v, float):
+ parts.append(f"{v:14.4f}")
+ else:
+ parts.append(f"{str(v):>14}")
+ print(" ".join(parts))
+
+
+def main() -> None:
+ p = argparse.ArgumentParser()
+ p.add_argument("--dtype", type=str, default="bf16", choices=["bf16", "fp16", "fp32"])
+ p.add_argument("--M", type=int, default=65536)
+ p.add_argument("--N", type=int, default=4096)
+ p.add_argument(
+ "--dsv3",
+ action="store_true",
+ help="Run DSv3 set: M in {4096,16384,65536}, N in {6144,7168,8192}",
+ )
+ p.add_argument("--warmup-ms", type=int, default=25)
+ p.add_argument("--iters", type=int, default=200, help="rep_ms for do_bench (default: 200)")
+ p.add_argument("--skip-verify", action="store_true")
+ p.add_argument("--json", type=str, default=None)
+ args = p.parse_args()
+
+ dtype = parse_dtype(args.dtype)
+ meta = collect_device_meta(torch.device("cuda"))
+
+ cfgs = dsv3_configs() if bool(args.dsv3) else [(int(args.M), int(args.N))]
+ rows: List[Dict[str, Any]] = []
+ for (M, N) in cfgs:
+ print(f"bench M={M:<8d} N={N:<6d} dtype={_dtype_label(dtype)} fused_add_rmsnorm ...", flush=True)
+ rows.append(
+ bench_one(
+ M=int(M),
+ N=int(N),
+ dtype=dtype,
+ warmup_ms=int(args.warmup_ms),
+ iters_ms=int(args.iters),
+ verify=not bool(args.skip_verify),
+ )
+ )
+
+ _print_table(rows)
+
+ if args.json:
+ write_json(
+ args.json,
+ meta,
+ rows,
+ extra=dict(
+ io_model_bytes="(4*M*N + N)*elem_size",
+ warmup_ms=int(args.warmup_ms),
+ rep_ms=int(args.iters),
+ method="triton.testing.do_bench(mean)",
+ note="Oink fused_add_rmsnorm_inplace_ vs Quack quack::_rmsnorm_fwd(residual=..., residual_out=...) when available",
+ ),
+ )
+
+
+if __name__ == "__main__":
+ main()
diff --git a/oink/benchmarks/benchmark/benchmark_hbm_roofline_sm100.py b/oink/benchmarks/benchmark/benchmark_hbm_roofline_sm100.py
new file mode 100644
index 0000000..971a03c
--- /dev/null
+++ b/oink/benchmarks/benchmark/benchmark_hbm_roofline_sm100.py
@@ -0,0 +1,226 @@
+"""
+HBM roofline microbenchmark for SM100 (GB200 / Blackwell).
+
+This script measures a STREAM-like bandwidth ceiling using a simple Triton kernel
+that performs a large contiguous copy (read + write) and/or triad (read + read + write)
+over a large buffer.
+
+Why this exists:
+- The benchmark harnesses for Oink ops report an "ours_tbps" derived from an IO model.
+- For roofline discussions, comparing against a *measured* device bandwidth ceiling
+ is often more meaningful than quoting a marketing/theoretical spec.
+
+Example:
+ CUDA_VISIBLE_DEVICES=0 python oink/benchmarks/benchmark/benchmark_hbm_roofline_sm100.py --dtype bf16 --op copy --gb 2
+ CUDA_VISIBLE_DEVICES=0 python oink/benchmarks/benchmark/benchmark_hbm_roofline_sm100.py --dtype fp16 --op triad --gb 2
+"""
+
+import argparse
+import os
+from typing import Any, Dict, List, Tuple
+
+import torch
+import triton
+import triton.language as tl
+
+# Reduce fragmentation pressure on busy GPUs.
+os.environ.setdefault("PYTORCH_ALLOC_CONF", "expandable_segments:True")
+
+from bench_utils import ( # noqa: E402
+ collect_device_meta,
+ do_bench_triton,
+ parse_dtype,
+ write_json,
+)
+
+
+@triton.jit
+def _copy_kernel(
+ x_ptr,
+ y_ptr,
+ n_elements,
+ BLOCK: tl.constexpr,
+):
+    # Use 64-bit offsets so buffers with more than 2**31 elements (large --gb
+    # values) do not overflow the index math.
+    pid = tl.program_id(0).to(tl.int64)
+    offsets = pid * BLOCK + tl.arange(0, BLOCK)
+ mask = offsets < n_elements
+ x = tl.load(x_ptr + offsets, mask=mask, other=0)
+ tl.store(y_ptr + offsets, x, mask=mask)
+
+
+@triton.jit
+def _triad_kernel(
+ x_ptr,
+ y_ptr,
+ n_elements,
+ BLOCK: tl.constexpr,
+):
+    # Use 64-bit offsets so buffers with more than 2**31 elements (large --gb
+    # values) do not overflow the index math.
+    pid = tl.program_id(0).to(tl.int64)
+    offsets = pid * BLOCK + tl.arange(0, BLOCK)
+ mask = offsets < n_elements
+ x = tl.load(x_ptr + offsets, mask=mask, other=0)
+ y = tl.load(y_ptr + offsets, mask=mask, other=0)
+ tl.store(y_ptr + offsets, x + y, mask=mask)
+
+
+def _bytes_moved(n_elements: int, elem_size: int, *, op: str) -> int:
+ if op == "copy":
+ return int(2 * n_elements * elem_size) # read x + write y
+ if op == "triad":
+ return int(3 * n_elements * elem_size) # read x + read y + write y
+ raise ValueError(f"Unsupported op: {op}")
+
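+# Worked example (assumed --gb 2 with bf16, not a measured number): each tensor
+# holds 2*1024**3 / 2 = 1,073,741,824 elements, so a copy moves
+# 2 * 1,073,741,824 * 2 = 4,294,967,296 bytes ≈ 4.29 GB; a 1.0 ms copy would
+# therefore report ≈ 4.29 TB/s, and triad moves 1.5x as many bytes.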
+
+def bench_one(
+ *,
+ n_elements: int,
+ dtype: torch.dtype,
+ op: str,
+ block: int,
+ num_warps: int,
+ warmup_ms: int,
+ iters_ms: int,
+) -> Tuple[float, float]:
+ device = torch.device("cuda")
+ x = torch.empty((n_elements,), device=device, dtype=dtype)
+ y = torch.empty_like(x)
+ # Avoid pathological compression-friendly patterns (e.g. all-zeros) that can
+ # artificially inflate apparent bandwidth on some GPUs. Random-ish data is
+ # a closer match to ML workloads.
+ x.uniform_(-1, 1)
+ y.uniform_(-1, 1)
+
+ grid = (triton.cdiv(n_elements, block),)
+
+ if op == "copy":
+ launch = lambda: _copy_kernel[grid](
+ x,
+ y,
+ n_elements,
+ BLOCK=block,
+ num_warps=num_warps,
+ num_stages=4,
+ )
+ elif op == "triad":
+ launch = lambda: _triad_kernel[grid](
+ x,
+ y,
+ n_elements,
+ BLOCK=block,
+ num_warps=num_warps,
+ num_stages=4,
+ )
+ else:
+ raise ValueError(f"Unsupported op: {op}")
+
+ # Force compilation out of the timed region.
+ launch()
+ torch.cuda.synchronize()
+
+ ms = do_bench_triton(launch, warmup_ms=warmup_ms, rep_ms=iters_ms)
+ moved = _bytes_moved(n_elements, x.element_size(), op=op)
+ tbps = moved / (ms * 1e-3) / 1e12
+ return ms, tbps
+
+
+def _print_summary(rows: List[Dict[str, Any]]) -> None:
+ if not rows:
+ return
+ best = max(rows, key=lambda r: float(r["tbps"]))
+ print("\nSummary (STREAM-like):")
+ print(f"- best_tbps: {best['tbps']:.3f} TB/s ({best['op']}, BLOCK={best['block']}, warps={best['num_warps']})")
+
+
+def main() -> None:
+ p = argparse.ArgumentParser()
+ p.add_argument("--dtype", type=str, default="bf16", choices=["bf16", "fp16", "fp32"])
+ p.add_argument("--op", type=str, default="copy", choices=["copy", "triad", "both"])
+ p.add_argument("--gb", type=float, default=2.0, help="Size per tensor in GB (default: 2)")
+ p.add_argument("--warmup-ms", type=int, default=25)
+ p.add_argument("--iters", type=int, default=100, help="rep_ms for do_bench (default: 100)")
+ p.add_argument("--json", type=str, default=None, help="Write JSON results to this path")
+ p.add_argument("--no-sweep", action="store_true", help="Disable tuning sweep; run a single config")
+ p.add_argument("--block", type=int, default=2048, help="BLOCK size when --no-sweep is set")
+ p.add_argument("--warps", type=int, default=8, help="num_warps when --no-sweep is set")
+ args = p.parse_args()
+
+ dtype = parse_dtype(args.dtype)
+ device = torch.device("cuda")
+ props = torch.cuda.get_device_properties(device)
+ cap = (int(props.major), int(props.minor))
+ if cap != (10, 0):
+ raise RuntimeError(f"Expected SM100 (10,0), got {cap} ({props.name})")
+
+ elem_size = torch.tensor(0, dtype=dtype).element_size()
+ bytes_per_tensor = int(args.gb * (1024**3))
+ n_elements = max(1, bytes_per_tensor // elem_size)
+
+ ops: List[str]
+ if args.op == "both":
+ ops = ["copy", "triad"]
+ else:
+ ops = [args.op]
+
+ if args.no_sweep:
+ sweep: List[Tuple[int, int]] = [(int(args.block), int(args.warps))]
+ else:
+ # A tiny hand-tuned sweep that keeps compile overhead reasonable.
+ sweep = [
+ (1024, 4),
+ (1024, 8),
+ (2048, 4),
+ (2048, 8),
+ (4096, 8),
+ ]
+
+ print(f"Running on {props.name} (SM{props.major}{props.minor})")
+ print(f"- dtype: {args.dtype} (elem={elem_size}B)")
+ print(f"- n_elements: {n_elements:,} (~{(n_elements * elem_size) / (1024**3):.2f} GiB per tensor)")
+ print(f"- ops: {ops}")
+ print(f"- sweep: {sweep}")
+
+ meta = collect_device_meta(device)
+ rows: List[Dict[str, Any]] = []
+ for op in ops:
+ for block, warps in sweep:
+ ms, tbps = bench_one(
+ n_elements=n_elements,
+ dtype=dtype,
+ op=op,
+ block=block,
+ num_warps=warps,
+ warmup_ms=int(args.warmup_ms),
+ iters_ms=int(args.iters),
+ )
+ rows.append(
+ dict(
+ op=op,
+ dtype=str(args.dtype),
+ n_elements=int(n_elements),
+ elem_size_B=int(elem_size),
+ block=int(block),
+ num_warps=int(warps),
+ warmup_ms=int(args.warmup_ms),
+ rep_ms=int(args.iters),
+ ms=float(ms),
+ tbps=float(tbps),
+ )
+ )
+ print(f"- {op:5s} BLOCK={block:4d} warps={warps}: {tbps:.3f} TB/s ({ms:.4f} ms)")
+
+ _print_summary(rows)
+
+ if args.json:
+ # Write meta + detailed rows for reproducibility.
+ extra = dict(
+ bytes_model="copy:2*N*elem, triad:3*N*elem",
+ bytes_per_tensor=int(bytes_per_tensor),
+ gb_per_tensor=float(args.gb),
+ )
+ write_json(args.json, meta, rows, extra=extra)
+
+
+if __name__ == "__main__":
+ main()
diff --git a/oink/benchmarks/benchmark/benchmark_layernorm_sm100.py b/oink/benchmarks/benchmark/benchmark_layernorm_sm100.py
new file mode 100644
index 0000000..778e3e2
--- /dev/null
+++ b/oink/benchmarks/benchmark/benchmark_layernorm_sm100.py
@@ -0,0 +1,393 @@
+from __future__ import annotations
+
+import argparse
+import os
+from typing import Any, Dict, List, Optional, Tuple
+
+import torch
+
+# Reduce fragmentation pressure on busy GPUs.
+os.environ.setdefault("PYTORCH_ALLOC_CONF", "expandable_segments:True")
+
+# Ensure SM100 (GB200) architecture is recognized by CuTeDSL when running outside vLLM.
+os.environ.setdefault("CUTE_DSL_ARCH", "sm_100a")
+
+from bench_utils import ( # noqa: E402
+ ErrorStatsAccumulator,
+ collect_device_meta,
+ detect_hbm_peak_gbps,
+ do_bench_triton,
+ error_stats_to_row,
+ ensure_oink_src_on_path,
+ iter_row_blocks,
+ parse_configs,
+ parse_dtype,
+ quack_suite_configs,
+ write_csv,
+ write_json,
+)
+
+ensure_oink_src_on_path()
+
+from kernelagent_oink.blackwell import layernorm as oink_ln # noqa: E402
+
+try:
+ # Quack exposes LayerNorm through the RMSNorm module (is_layernorm=True path).
+ from quack.rmsnorm import layernorm_fwd as quack_layernorm # type: ignore
+except Exception:
+ quack_layernorm = None
+
+_VERIFY_TOL_Y = {
+ # Match Quack's unit-test defaults (tests/test_layernorm.py).
+ torch.float32: dict(atol=1e-4, rtol=1e-4),
+ torch.float16: dict(atol=1e-3, rtol=1e-3),
+ torch.bfloat16: dict(atol=1e-2, rtol=1e-2),
+}
+
+# Quack checks rstd/mean (fp32) with a tighter fixed tolerance.
+_VERIFY_TOL_STATS = dict(atol=6e-4, rtol=6e-4)
+
+
+def bytes_io_model_layernorm(
+ M: int,
+ N: int,
+ dtype: torch.dtype,
+ *,
+ has_bias: bool,
+ return_rstd: bool,
+ return_mean: bool,
+ weight_dtype: torch.dtype = torch.float32,
+) -> int:
+ elem = torch.tensor(0, dtype=dtype).element_size()
+ w_elem = torch.tensor(0, dtype=weight_dtype).element_size()
+ total = 0
+ # Read x + write y
+ total += 2 * M * N * elem
+ # Read weight (+ optional bias) along feature dim
+ total += N * w_elem
+ if has_bias:
+ total += N * w_elem
+ # Optional per-row stats (fp32)
+ if return_rstd:
+ total += M * 4
+ if return_mean:
+ total += M * 4
+ return int(total)
+
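+# Worked example of the IO model above (assumed shape, not a measured number):
+# M=16384, N=7168, bf16 activations (2 B), fp32 weight (4 B), no bias/stats:
+#   2*16384*7168*2 + 7168*4 = 469,790,720 bytes ≈ 0.47 GB per call.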
+
+def dsv3_configs() -> List[Tuple[int, int]]:
+ Ms = [4096, 16384, 65536]
+ Ns = [6144, 7168, 8192]
+ return [(m, n) for m in Ms for n in Ns]
+
+
+def _verify_parity(
+ x: torch.Tensor,
+ w: torch.Tensor,
+ b: torch.Tensor | None,
+ *,
+ eps: float,
+ return_rstd: bool,
+ return_mean: bool,
+) -> dict[str, object]:
+ tol_y = _VERIFY_TOL_Y[x.dtype]
+ ref_block_rows = 4096
+ M = int(x.shape[0])
+ N = int(x.shape[1])
+
+ y_acc_ours = ErrorStatsAccumulator(total_elems=M * N)
+ y_acc_quack = (
+ ErrorStatsAccumulator(total_elems=M * N) if (quack_layernorm is not None and b is None) else None
+ )
+ with torch.no_grad():
+ ours = oink_ln.layernorm(
+ x,
+ w,
+ bias=b,
+ eps=eps,
+ return_rstd=return_rstd,
+ return_mean=return_mean,
+ )
+ quack = None
+ if quack_layernorm is not None and b is None:
+ quack = quack_layernorm(
+ x,
+ w,
+ eps=eps,
+ return_rstd=return_rstd,
+ return_mean=return_mean,
+ )
+ torch.cuda.synchronize()
+
+ def _unpack(out):
+ if return_rstd and return_mean:
+ y, rstd, mean = out
+ elif return_rstd and not return_mean:
+ y, rstd = out
+ mean = None
+ elif return_mean and not return_rstd:
+ y, mean = out
+ rstd = None
+ else:
+ y, rstd, mean = out, None, None
+ return y, rstd, mean
+
+ y_o, rstd_o, mean_o = _unpack(ours)
+ y_q, rstd_q, mean_q = _unpack(quack) if quack is not None else (None, None, None)
+
+ # Pure-PyTorch reference (float32 accumulation), matching Quack's unit tests:
+ # - compute ref output via F.layer_norm on float32
+ # - compute mean/rstd from float32 input
+ rstd_ref_all = torch.empty((M,), device=x.device, dtype=torch.float32) if return_rstd else None
+ mean_ref_all = torch.empty((M,), device=x.device, dtype=torch.float32) if return_mean else None
+
+ for start, end in iter_row_blocks(M, ref_block_rows):
+ x_f32 = x[start:end].float()
+ y_ref_f32 = torch.nn.functional.layer_norm(x_f32, w.shape, w, b, eps)
+ y_ref = y_ref_f32.to(x.dtype)
+ torch.testing.assert_close(y_o[start:end], y_ref, **tol_y)
+ y_acc_ours.update(y_o[start:end], y_ref)
+ if y_q is not None:
+ torch.testing.assert_close(y_q[start:end], y_ref, **tol_y)
+ assert y_acc_quack is not None
+ y_acc_quack.update(y_q[start:end], y_ref)
+
+ # Per-row stats in fp32, as in Quack's tests.
+ if return_rstd or return_mean:
+ mean_f32 = x_f32.mean(dim=-1)
+ if return_mean:
+ assert mean_ref_all is not None
+ mean_ref_all[start:end] = mean_f32
+ if return_rstd:
+ var_f32 = ((x_f32 - mean_f32.unsqueeze(1)) ** 2).mean(dim=-1)
+ rstd_ref = 1.0 / torch.sqrt(var_f32 + eps)
+ assert rstd_ref_all is not None
+ rstd_ref_all[start:end] = rstd_ref
+
+ assert rstd_o is not None
+ torch.testing.assert_close(rstd_o[start:end], rstd_ref, **_VERIFY_TOL_STATS)
+ if rstd_q is not None:
+ torch.testing.assert_close(rstd_q[start:end], rstd_ref, **_VERIFY_TOL_STATS)
+
+ if return_mean:
+ mean_ref = mean_f32
+ assert mean_o is not None
+ torch.testing.assert_close(mean_o[start:end], mean_ref, **_VERIFY_TOL_STATS)
+ if mean_q is not None:
+ torch.testing.assert_close(mean_q[start:end], mean_ref, **_VERIFY_TOL_STATS)
+
+ stats: dict[str, object] = {}
+ stats.update(error_stats_to_row("ours_err_y", y_acc_ours.finalize()))
+ if y_acc_quack is not None:
+ stats.update(error_stats_to_row("quack_err_y", y_acc_quack.finalize()))
+
+ if return_rstd:
+ assert rstd_o is not None and rstd_ref_all is not None
+ rstd_acc_ours = ErrorStatsAccumulator(
+ total_elems=int(rstd_ref_all.numel()), p99_target_samples=int(rstd_ref_all.numel())
+ )
+ rstd_acc_ours.update(rstd_o, rstd_ref_all)
+ stats.update(error_stats_to_row("ours_err_rstd", rstd_acc_ours.finalize()))
+ if rstd_q is not None:
+ rstd_acc_quack = ErrorStatsAccumulator(
+ total_elems=int(rstd_ref_all.numel()), p99_target_samples=int(rstd_ref_all.numel())
+ )
+ rstd_acc_quack.update(rstd_q, rstd_ref_all)
+ stats.update(error_stats_to_row("quack_err_rstd", rstd_acc_quack.finalize()))
+
+ if return_mean:
+ assert mean_o is not None and mean_ref_all is not None
+ mean_acc_ours = ErrorStatsAccumulator(
+ total_elems=int(mean_ref_all.numel()), p99_target_samples=int(mean_ref_all.numel())
+ )
+ mean_acc_ours.update(mean_o, mean_ref_all)
+ stats.update(error_stats_to_row("ours_err_mean", mean_acc_ours.finalize()))
+ if mean_q is not None:
+ mean_acc_quack = ErrorStatsAccumulator(
+ total_elems=int(mean_ref_all.numel()), p99_target_samples=int(mean_ref_all.numel())
+ )
+ mean_acc_quack.update(mean_q, mean_ref_all)
+ stats.update(error_stats_to_row("quack_err_mean", mean_acc_quack.finalize()))
+
+ return stats
+
+
+def bench_single(
+ M: int,
+ N: int,
+ dtype: torch.dtype,
+ *,
+ eps: float,
+ warmup_ms: int,
+ iters_ms: int,
+ verify: bool,
+ return_rstd: bool,
+ return_mean: bool,
+ has_bias: bool,
+) -> Tuple[Tuple[float, float], Optional[Tuple[float, float]], dict[str, object]]:
+ device = torch.device("cuda")
+ x = torch.randn(M, N, device=device, dtype=dtype)
+ w = torch.randn(N, device=device, dtype=torch.float32)
+ b = torch.randn(N, device=device, dtype=torch.float32) if has_bias else None
+
+ stats: dict[str, object] = {}
+ if verify:
+ stats = _verify_parity(x, w, b, eps=eps, return_rstd=return_rstd, return_mean=return_mean)
+
+ bytes_io = bytes_io_model_layernorm(
+ M,
+ N,
+ dtype,
+ has_bias=has_bias,
+ return_rstd=return_rstd,
+ return_mean=return_mean,
+ weight_dtype=w.dtype,
+ )
+
+ fn_oink = lambda: oink_ln.layernorm(
+ x,
+ w,
+ bias=b,
+ eps=eps,
+ return_rstd=return_rstd,
+ return_mean=return_mean,
+ )
+ ms_oink = do_bench_triton(fn_oink, warmup_ms=warmup_ms, rep_ms=iters_ms)
+ gbps_oink = bytes_io / (ms_oink * 1e-3) / 1e9
+
+ if quack_layernorm is None or has_bias:
+ return (ms_oink, gbps_oink), None, stats
+
+ fn_quack = lambda: quack_layernorm(
+ x,
+ w,
+ eps=eps,
+ return_rstd=return_rstd,
+ return_mean=return_mean,
+ )
+ ms_quack = do_bench_triton(fn_quack, warmup_ms=warmup_ms, rep_ms=iters_ms)
+ gbps_quack = bytes_io / (ms_quack * 1e-3) / 1e9
+ return (ms_oink, gbps_oink), (ms_quack, gbps_quack), stats
+
+
+def main() -> None:
+ if not torch.cuda.is_available():
+ raise SystemExit("CUDA not available")
+
+ torch.cuda.set_device(0)
+ device = torch.device("cuda")
+ props = torch.cuda.get_device_properties(device)
+ sm = props.major * 10 + props.minor
+ print(f"Running on {torch.cuda.get_device_name(device)} (SM{sm})")
+
+ p = argparse.ArgumentParser()
+ p.add_argument("--dtype", type=str, default="bf16", choices=["fp16", "bf16", "fp32"])
+ p.add_argument("--eps", type=float, default=1e-6)
+ p.add_argument("--return-rstd", action="store_true")
+ p.add_argument("--return-mean", action="store_true")
+ p.add_argument("--with-bias", action="store_true", help="Benchmark bias path (Quack compare skipped)")
+ p.add_argument("--iters", type=int, default=100, help="Triton do_bench rep_ms (kernel-only).")
+ p.add_argument("--warmup-ms", type=int, default=25)
+ p.add_argument("--csv", type=str, default=None, help="Optional CSV output path; appends rows")
+ p.add_argument("--json", type=str, default=None, help="Optional JSON output path (meta + rows)")
+ p.add_argument("--configs", type=str, default="1024x4096,8192x4096")
+ p.add_argument("--quack-suite", action="store_true", help="Run Quack-style batch/seq grid (hidden=4096)")
+ p.add_argument(
+ "--dsv3",
+ action="store_true",
+ help="Run DSv3 set: M in {4096,16384,65536}, N in {6144,7168,8192}",
+ )
+ p.add_argument(
+ "--skip-verify",
+ action="store_true",
+ help="Skip correctness checks (Oink/Quack vs a pure-PyTorch reference; Quack compare skipped when bias is enabled)",
+ )
+ args = p.parse_args()
+
+ dtype = parse_dtype(args.dtype)
+ eps = float(args.eps)
+
+ if args.quack_suite:
+ cfgs = [(bs * sl, hidden) for (bs, sl, hidden) in quack_suite_configs()]
+ elif args.dsv3:
+ cfgs = dsv3_configs()
+ else:
+ cfgs = parse_configs(args.configs)
+
+ hbm_peak = detect_hbm_peak_gbps(device)
+ meta = collect_device_meta(device)
+
+ rows_out: List[Dict[str, Any]] = []
+ for (M, N) in cfgs:
+ print(f"bench M={M:<8d} N={N:<6d} dtype={args.dtype} ...", flush=True)
+ (ms_oink, gbps_oink), quack, stats = bench_single(
+ M=M,
+ N=N,
+ dtype=dtype,
+ eps=eps,
+ warmup_ms=int(args.warmup_ms),
+ iters_ms=int(args.iters),
+ verify=not args.skip_verify,
+ return_rstd=bool(args.return_rstd),
+ return_mean=bool(args.return_mean),
+ has_bias=bool(args.with_bias),
+ )
+ row: Dict[str, Any] = {
+ "M": M,
+ "N": N,
+ "dtype": args.dtype,
+ "eps": eps,
+ "return_rstd": bool(args.return_rstd),
+ "return_mean": bool(args.return_mean),
+ "with_bias": bool(args.with_bias),
+ "ours_ms": ms_oink,
+ "ours_gbps": gbps_oink,
+ "ours_tbps": gbps_oink / 1000.0,
+ "ours_hbm_frac": gbps_oink / hbm_peak,
+ }
+ if quack is not None:
+ ms_q, gbps_q = quack
+ row.update(
+ {
+ "quack_ms": ms_q,
+ "quack_gbps": gbps_q,
+ "quack_tbps": gbps_q / 1000.0,
+ "speedup_vs_quack": ms_q / ms_oink,
+ }
+ )
+ row.update(stats)
+ rows_out.append(row)
+
+ if args.csv is not None:
+ write_csv(args.csv, rows_out)
+ if args.json is not None:
+ write_json(
+ args.json,
+ meta,
+ rows_out,
+ extra={
+ "method": "triton.testing.do_bench(mean)",
+ "warmup_ms": int(args.warmup_ms),
+ "rep_ms": int(args.iters),
+ "io_model_bytes": "see bytes_io_model_layernorm in script",
+ },
+ )
+
+ headers = ["M", "N", "ours_ms", "ours_tbps"]
+ if quack_layernorm is not None and (not args.with_bias):
+ headers += ["quack_ms", "quack_tbps", "speedup_vs_quack"]
+ print("\nSummary:")
+ print(" ".join(h.rjust(14) for h in headers))
+ for r in rows_out:
+ parts: List[str] = []
+ for h in headers:
+ v = r.get(h)
+ if isinstance(v, float):
+ parts.append(f"{v:14.4f}")
+ else:
+ parts.append(f"{str(v):>14}")
+ print(" ".join(parts))
+
+
+if __name__ == "__main__":
+ main()
diff --git a/oink/benchmarks/benchmark/benchmark_rmsnorm_bwd_sm100.py b/oink/benchmarks/benchmark/benchmark_rmsnorm_bwd_sm100.py
new file mode 100644
index 0000000..01c390d
--- /dev/null
+++ b/oink/benchmarks/benchmark/benchmark_rmsnorm_bwd_sm100.py
@@ -0,0 +1,434 @@
+from __future__ import annotations
+
+import argparse
+import csv
+import os
+import sys
+from dataclasses import dataclass
+from typing import List, Optional, Tuple
+
+import torch
+from triton.testing import do_bench as triton_do_bench
+
+# Reduce fragmentation pressure on busy GPUs.
+os.environ.setdefault("PYTORCH_ALLOC_CONF", "expandable_segments:True")
+
+# Ensure SM100 (GB200) architecture is recognized by CuTeDSL when running outside vLLM.
+os.environ.setdefault("CUTE_DSL_ARCH", "sm_100a")
+
+# Make the in-repo KernelAgent Oink package importable without an editable install.
+_HERE = os.path.dirname(os.path.abspath(__file__))
+_OINK_SRC = os.path.abspath(os.path.join(_HERE, "..", "src"))
+if _OINK_SRC not in sys.path:
+ sys.path.insert(0, _OINK_SRC)
+
+from bench_utils import ( # noqa: E402
+ ErrorStatsAccumulator,
+ collect_device_meta,
+ error_stats_to_row,
+ iter_row_blocks,
+ write_json,
+)
+from kernelagent_oink.blackwell import rmsnorm as oink_rmsnorm # noqa: E402
+
+try:
+ from quack.rmsnorm import rmsnorm_bwd as quack_rmsnorm_bwd # type: ignore
+except Exception:
+ quack_rmsnorm_bwd = None
+
+_VERIFY_TOL_DX = {
+ # Match Quack's unit-test defaults (tests/test_rmsnorm.py).
+ torch.float32: dict(atol=1e-4, rtol=1e-3),
+ torch.float16: dict(atol=1e-2, rtol=1e-3),
+ torch.bfloat16: dict(atol=1e-1, rtol=1e-2),
+}
+
+
+def detect_hbm_peak_gbps(device: Optional[torch.device] = None) -> float:
+ """Approximate HBM peak bandwidth in GB/s for roofline fractions."""
+ if device is None:
+ device = torch.device("cuda")
+ props = torch.cuda.get_device_properties(device)
+ sm = props.major * 10 + props.minor
+ if sm >= 100:
+ return 8000.0
+ return 2000.0
+
+
+@dataclass
+class Result:
+ ms: float
+ gbps: float
+
+
+def do_bench_triton(fn, warmup_ms: int = 25, rep_ms: int = 100) -> float:
+ # Kernel-only timing consistent with the existing Oink forward harness.
+ return float(triton_do_bench(fn, warmup=warmup_ms, rep=rep_ms, return_mode="mean"))
+
+
+def bytes_io_model_bwd(
+ M: int, N: int, dtype: torch.dtype, *, weight_dtype: torch.dtype = torch.float32
+) -> int:
+ """A simple IO model for RMSNorm backward.
+
+ This intentionally ignores partial-reduction scratch buffers (`dw_partial` /
+ `db_partial`) since those are highly implementation-specific and depend on
+ sm_count; we still report speedups and times regardless.
+ """
+ elem = torch.tensor(0, dtype=dtype).element_size()
+ w_elem = torch.tensor(0, dtype=weight_dtype).element_size()
+ # Read x + dout + write dx
+ total = 3 * M * N * elem
+ # Read weight + write dw
+ total += 2 * N * w_elem
+ # Read rstd (fp32 per row)
+ total += M * 4
+ return int(total)
+
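+# Worked example of the IO model above (assumed shape, not a measured number):
+# M=65536, N=8192, bf16 activations (2 B), fp32 weight (4 B):
+#   3*65536*8192*2 + 2*8192*4 + 65536*4 = 3,221,553,152 bytes ≈ 3.22 GB per call.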
+
+def parse_dtype(s: str) -> torch.dtype:
+ s = s.lower()
+ if s == "bf16":
+ return torch.bfloat16
+ if s == "fp16":
+ return torch.float16
+ if s == "fp32":
+ return torch.float32
+ raise ValueError(f"Unsupported dtype: {s}")
+
+
+def parse_configs(s: str) -> List[Tuple[int, int]]:
+ out: List[Tuple[int, int]] = []
+ for part in s.split(","):
+ m, n = part.lower().split("x")
+ out.append((int(m), int(n)))
+ return out
+
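+# Example: parse_configs("1024x4096,8192x4096") -> [(1024, 4096), (8192, 4096)].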
+
+def quack_suite_configs() -> List[Tuple[int, int, int]]:
+ """Return (batch, seq, hidden) triples following Quack's grid (hidden=4096)."""
+ batch_sizes = [1, 4, 8, 16, 32]
+ seq_lengths = [8192, 16384, 32768, 65536, 131072]
+ hidden = 4096
+ cfgs: List[Tuple[int, int, int]] = []
+ for bs in batch_sizes:
+ for sl in seq_lengths:
+ M = bs * sl
+ if M * hidden > (2**31):
+ continue
+ cfgs.append((bs, sl, hidden))
+ return cfgs
+
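+# Example of the M * hidden > 2**31 guard above: (bs=32, sl=131072) gives
+# 32*131072*4096 ≈ 1.7e10 elements and is skipped, while (bs=4, sl=131072)
+# gives exactly 2**31 and is kept (the comparison is strict).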
+
+def dsv3_configs() -> List[Tuple[int, int]]:
+ Ms = [4096, 16384, 65536]
+ Ns = [6144, 7168, 8192]
+ return [(m, n) for m in Ms for n in Ns]
+
+
+def _verify_parity(
+ x: torch.Tensor,
+ w: torch.Tensor,
+ dout: torch.Tensor,
+ rstd: torch.Tensor,
+ *,
+ has_bias: bool,
+ has_residual: bool,
+) -> dict[str, object]:
+ tol_dx = _VERIFY_TOL_DX[x.dtype]
+ ref_block_rows = 1024
+ M, N = int(x.shape[0]), int(x.shape[1])
+
+ dx_acc_ours = ErrorStatsAccumulator(total_elems=M * N)
+ dx_acc_quack = ErrorStatsAccumulator(total_elems=M * N) if quack_rmsnorm_bwd is not None else None
+
+ with torch.no_grad():
+ dx_oink, dw_oink, db_oink, dres_oink = oink_rmsnorm.rmsnorm_backward(
+ x,
+ w,
+ dout,
+ rstd,
+ dresidual_out=None,
+ has_bias=has_bias,
+ has_residual=has_residual,
+ )
+
+ dx_quack = None
+ dw_quack = None
+ db_quack = None
+ dres_quack = None
+ if quack_rmsnorm_bwd is not None:
+ dx_quack, dw_quack, db_quack, dres_quack = quack_rmsnorm_bwd(
+ x,
+ w,
+ dout,
+ rstd,
+ dresidual_out=None,
+ has_bias=has_bias,
+ has_residual=has_residual,
+ )
+ torch.cuda.synchronize()
+
+ # Pure-PyTorch reference, matching Quack's rmsnorm_bwd_ref (float32 math for x_hat).
+ # Chunk over rows to avoid materializing an (M, N) float32 tensor for large shapes.
+ dw_accum = torch.zeros((N,), device=x.device, dtype=torch.float32)
+ w_f32 = w.float()
+ for start, end in iter_row_blocks(M, ref_block_rows):
+ x_f32 = x[start:end].float()
+ rstd_blk = rstd[start:end]
+ x_hat = x_f32 * rstd_blk.unsqueeze(1)
+ # Match Quack/PyTorch reference behavior: gradient math uses float32
+ # intermediates even when (x, w, dout) are bf16/fp16.
+ dout_f32 = dout[start:end].float()
+ wdy = dout_f32 * w_f32
+ c1 = (x_hat * wdy).mean(dim=-1, keepdim=True)
+ dx_ref = ((wdy - x_hat * c1) * rstd_blk.unsqueeze(1)).to(x.dtype)
+
+ torch.testing.assert_close(dx_oink[start:end], dx_ref, **tol_dx)
+ dx_acc_ours.update(dx_oink[start:end], dx_ref)
+ if dx_quack is not None:
+ torch.testing.assert_close(dx_quack[start:end], dx_ref, **tol_dx)
+ assert dx_acc_quack is not None
+ dx_acc_quack.update(dx_quack[start:end], dx_ref)
+
+ if dw_oink is not None:
+ dw_accum += (dout_f32 * x_hat).sum(dim=0)
+
+ stats: dict[str, object] = {}
+ stats.update(error_stats_to_row("ours_err_dx", dx_acc_ours.finalize()))
+ if dx_acc_quack is not None:
+ stats.update(error_stats_to_row("quack_err_dx", dx_acc_quack.finalize()))
+
+ if dw_oink is not None:
+ dw_ref = dw_accum.to(w.dtype)
+        if w.dtype == torch.float32:
+            # Weight grad is sensitive to reduction order; use a slightly larger
+            # absolute tolerance in the suite harness (Quack's unit tests use
+            # smaller M, where dw is typically tighter).
+            dw_tol = dict(atol=2e-3, rtol=1e-3)
+            torch.testing.assert_close(dw_oink, dw_ref, **dw_tol)
+            if dw_quack is not None:
+                torch.testing.assert_close(dw_quack, dw_ref, **dw_tol)
+        else:
+            # For fp16/bf16 weights, `dw` is low-precision and grows with M; use an
+            # ulp/magnitude-aware tolerance rather than a fixed epsilon, comparing
+            # in float32.
+            dw_ref_f32 = dw_ref.to(torch.float32)
+            dw_oink_f32 = dw_oink.to(torch.float32)
+            scale = float(dw_ref_f32.abs().max().item())
+            dw_atol = max(2.0 * torch.finfo(w.dtype).eps * scale, 1e-3)
+            dw_tol = dict(atol=dw_atol, rtol=1e-3)
+            torch.testing.assert_close(dw_oink_f32, dw_ref_f32, **dw_tol)
+            if dw_quack is not None:
+                torch.testing.assert_close(dw_quack.to(torch.float32), dw_ref_f32, **dw_tol)
+
+ # Record weight-grad error stats (small, so exact p99 over the full vector).
+ dw_acc_ours = ErrorStatsAccumulator(total_elems=int(dw_ref.numel()), p99_target_samples=int(dw_ref.numel()))
+ dw_acc_ours.update(dw_oink, dw_ref)
+ stats.update(error_stats_to_row("ours_err_dw", dw_acc_ours.finalize()))
+ if dw_quack is not None:
+ dw_acc_quack = ErrorStatsAccumulator(
+ total_elems=int(dw_ref.numel()), p99_target_samples=int(dw_ref.numel())
+ )
+ dw_acc_quack.update(dw_quack, dw_ref)
+ stats.update(error_stats_to_row("quack_err_dw", dw_acc_quack.finalize()))
+
+ assert db_oink is None and db_quack is None
+ assert dres_oink is None and dres_quack is None
+ return stats
+
+
+def bench_single(
+ M: int,
+ N: int,
+ dtype: torch.dtype,
+ weight_dtype: torch.dtype,
+ iters_ms: int,
+ eps: float,
+ warmup_ms: int,
+ verify: bool,
+) -> Tuple[Result, Result | None, dict[str, object]]:
+ device = torch.device("cuda")
+ x = torch.randn(M, N, device=device, dtype=dtype)
+ w = torch.randn(N, device=device, dtype=weight_dtype)
+ dout = torch.randn(M, N, device=device, dtype=dtype)
+ # rstd is fp32 per row; compute once outside the timed region.
+ with torch.no_grad():
+ xf = x.float()
+ rstd = torch.rsqrt(xf.square().mean(dim=-1) + eps).to(torch.float32)
+
+ stats: dict[str, object] = {}
+ if verify:
+ stats = _verify_parity(x, w, dout, rstd, has_bias=False, has_residual=False)
+
+ fn_oink = lambda: oink_rmsnorm.rmsnorm_backward(
+ x,
+ w,
+ dout,
+ rstd,
+ dresidual_out=None,
+ has_bias=False,
+ has_residual=False,
+ )
+
+ ms_oink = do_bench_triton(fn_oink, warmup_ms=warmup_ms, rep_ms=iters_ms)
+ bytes_io = bytes_io_model_bwd(M, N, dtype, weight_dtype=w.dtype)
+ gbps_oink = bytes_io / (ms_oink * 1e-3) / 1e9
+ ours = Result(ms=ms_oink, gbps=gbps_oink)
+
+ if quack_rmsnorm_bwd is None:
+ return ours, None, stats
+
+ fn_quack = lambda: quack_rmsnorm_bwd(
+ x,
+ w,
+ dout,
+ rstd,
+ dresidual_out=None,
+ has_bias=False,
+ has_residual=False,
+ )
+ ms_quack = do_bench_triton(fn_quack, warmup_ms=warmup_ms, rep_ms=iters_ms)
+ gbps_quack = bytes_io / (ms_quack * 1e-3) / 1e9
+ return ours, Result(ms=ms_quack, gbps=gbps_quack), stats
+
+
+def main() -> None:
+ if not torch.cuda.is_available():
+ raise SystemExit("CUDA not available")
+
+ torch.cuda.set_device(0)
+ device = torch.device("cuda")
+ props = torch.cuda.get_device_properties(device)
+ sm = props.major * 10 + props.minor
+ print(f"Running on {torch.cuda.get_device_name(device)} (SM{sm})")
+
+ p = argparse.ArgumentParser()
+ p.add_argument("--dtype", type=str, default="bf16", choices=["fp16", "bf16", "fp32"])
+ p.add_argument(
+ "--weight-dtype",
+ type=str,
+ default="fp32",
+ choices=["same", "fp16", "bf16", "fp32"],
+ help="RMSNorm weight dtype. `same` matches activation dtype.",
+ )
+ p.add_argument("--eps", type=float, default=1e-6)
+ p.add_argument(
+ "--iters",
+ type=int,
+ default=100,
+ help="Triton do_bench rep_ms (kernel-only).",
+ )
+ p.add_argument("--warmup-ms", type=int, default=25)
+ p.add_argument("--csv", type=str, default=None, help="Optional CSV output path; appends rows")
+ p.add_argument("--json", type=str, default=None, help="Optional JSON output path (meta + rows)")
+ p.add_argument("--configs", type=str, default="1024x4096,8192x4096")
+ p.add_argument("--quack-suite", action="store_true", help="Run Quack-style batch/seq grid")
+ p.add_argument(
+ "--dsv3",
+ action="store_true",
+ help="Run DSv3 set: M in {4096,16384,65536}, N in {6144,7168,8192}",
+ )
+ p.add_argument(
+ "--skip-verify",
+ action="store_true",
+ help="Skip correctness checks (Oink/Quack vs a pure-PyTorch RMSNorm backward reference)",
+ )
+ args = p.parse_args()
+
+ dtype = parse_dtype(args.dtype)
+ if args.weight_dtype == "same":
+ weight_dtype = dtype
+ else:
+ weight_dtype = parse_dtype(args.weight_dtype)
+ eps = float(args.eps)
+
+ if args.quack_suite:
+ cfgs = [(bs * sl, hidden) for (bs, sl, hidden) in quack_suite_configs()]
+ elif args.dsv3:
+ cfgs = dsv3_configs()
+ else:
+ cfgs = parse_configs(args.configs)
+
+ hbm_peak = detect_hbm_peak_gbps(device)
+
+ rows_out: list[dict[str, object]] = []
+
+ for (M, N) in cfgs:
+ print(f"bench M={M:<8d} N={N:<6d} dtype={args.dtype} ...", flush=True)
+ ours, quack, stats = bench_single(
+ M=M,
+ N=N,
+ dtype=dtype,
+ weight_dtype=weight_dtype,
+ iters_ms=int(args.iters),
+ eps=eps,
+ warmup_ms=int(args.warmup_ms),
+ verify=not args.skip_verify,
+ )
+
+ row: dict[str, object] = {
+ "M": M,
+ "N": N,
+ "dtype": args.dtype,
+ "weight_dtype": args.weight_dtype,
+ "ours_ms": ours.ms,
+ "ours_gbps": ours.gbps,
+ "ours_tbps": ours.gbps / 1000.0,
+ "ours_hbm_frac": ours.gbps / hbm_peak,
+ }
+ if quack is not None:
+ row.update(
+ {
+ "quack_ms": quack.ms,
+ "quack_gbps": quack.gbps,
+ "quack_tbps": quack.gbps / 1000.0,
+ "speedup_vs_quack": quack.ms / ours.ms,
+ }
+ )
+ row.update(stats)
+ rows_out.append(row)
+
+ if args.csv is not None:
+ file_exists = os.path.exists(args.csv)
+ with open(args.csv, "a", newline="") as f:
+ writer = csv.DictWriter(f, fieldnames=sorted(row.keys()))
+ if not file_exists:
+ writer.writeheader()
+ writer.writerow(row)
+
+ if args.json is not None:
+ meta = collect_device_meta(device)
+ write_json(
+ args.json,
+ meta,
+ rows_out,
+ extra={
+ "method": "triton.testing.do_bench(mean)",
+ "warmup_ms": int(args.warmup_ms),
+ "rep_ms": int(args.iters),
+ "io_model_bytes": "see bytes_io_model_bwd in script",
+ "weight_dtype": str(args.weight_dtype),
+ },
+ )
+
+ # Print a small summary table.
+ headers = ["M", "N", "dtype", "ours_ms", "ours_tbps", "ours_hbm_frac"]
+ if quack_rmsnorm_bwd is not None:
+ headers += ["quack_ms", "quack_tbps", "speedup_vs_quack"]
+ print("\nSummary:")
+ print(" ".join(h.rjust(14) for h in headers))
+ for r in rows_out:
+ parts: list[str] = []
+ for h in headers:
+ v = r.get(h)
+ if isinstance(v, float):
+ parts.append(f"{v:14.4f}")
+ else:
+ parts.append(f"{str(v):>14}")
+ print(" ".join(parts))
+
+
+if __name__ == "__main__":
+ main()
diff --git a/oink/benchmarks/benchmark/benchmark_rmsnorm_sm100.py b/oink/benchmarks/benchmark/benchmark_rmsnorm_sm100.py
new file mode 100644
index 0000000..e55e9ff
--- /dev/null
+++ b/oink/benchmarks/benchmark/benchmark_rmsnorm_sm100.py
@@ -0,0 +1,337 @@
+from __future__ import annotations
+
+import argparse
+import os
+from typing import Any, Dict, List, Optional, Tuple
+
+import torch
+
+# Reduce fragmentation pressure on busy GPUs.
+os.environ.setdefault("PYTORCH_ALLOC_CONF", "expandable_segments:True")
+
+# Ensure SM100 (GB200) architecture is recognized by CuTeDSL when running outside vLLM.
+os.environ.setdefault("CUTE_DSL_ARCH", "sm_100a")
+
+from bench_utils import ( # noqa: E402
+ ErrorStatsAccumulator,
+ collect_device_meta,
+ detect_hbm_peak_gbps,
+ do_bench_triton,
+ error_stats_to_row,
+ ensure_oink_src_on_path,
+ iter_row_blocks,
+ parse_configs,
+ parse_dtype,
+ quack_suite_configs,
+ write_csv,
+ write_json,
+)
+
+ensure_oink_src_on_path()
+
+from kernelagent_oink.blackwell import rmsnorm as oink_rmsnorm # noqa: E402
+
+try:
+ from quack.rmsnorm import rmsnorm_fwd as quack_rmsnorm_fwd # type: ignore
+except Exception:
+ quack_rmsnorm_fwd = None
+
+_VERIFY_TOL_Y = {
+ # Match Quack's unit-test defaults (tests/test_rmsnorm.py).
+ torch.float32: dict(atol=1e-4, rtol=1e-3),
+ torch.float16: dict(atol=1e-2, rtol=1e-3),
+ # NOTE: bf16 ulp grows with magnitude; a slightly larger rtol is more robust
+ # for the large-M suite shapes (and fused paths that can see larger values).
+ torch.bfloat16: dict(atol=1e-1, rtol=1e-2),
+}
+
+_VERIFY_TOL_RSTD = {
+ torch.float32: dict(atol=1e-5, rtol=1e-5),
+ torch.float16: dict(atol=1e-3, rtol=1e-3),
+ torch.bfloat16: dict(atol=1e-3, rtol=1e-3),
+}
+
+
+def bytes_io_model_fwd(
+ M: int, N: int, dtype: torch.dtype, *, weight_dtype: torch.dtype = torch.float32
+) -> int:
+ elem = torch.tensor(0, dtype=dtype).element_size()
+ w_elem = torch.tensor(0, dtype=weight_dtype).element_size()
+ # Read x + write y
+ total = 2 * M * N * elem
+ # Read weight
+ total += N * w_elem
+ return int(total)
+
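+# Worked example of the IO model above (assumed shape, not a measured number):
+# M=65536, N=8192, bf16 activations (2 B), fp32 weight (4 B):
+#   2*65536*8192*2 + 8192*4 = 2,147,516,416 bytes ≈ 2.15 GB per call.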
+
+def dsv3_configs() -> List[Tuple[int, int]]:
+ # DSv3-ish hidden sizes used throughout the Oink/Quack SM100 suite tables.
+ Ms = [4096, 16384, 65536]
+ Ns = [6144, 7168, 8192]
+ return [(m, n) for m in Ms for n in Ns]
+
+
+def _verify_parity(
+ x: torch.Tensor,
+ w: torch.Tensor,
+ *,
+ eps: float,
+ store_rstd: bool,
+) -> dict[str, object]:
+ tol_y = _VERIFY_TOL_Y[x.dtype]
+ tol_rstd = _VERIFY_TOL_RSTD[x.dtype]
+ ref_block_rows = 4096
+ M = int(x.shape[0])
+ N = int(x.shape[1])
+
+ y_acc_ours = ErrorStatsAccumulator(total_elems=M * N)
+ y_acc_quack = ErrorStatsAccumulator(total_elems=M * N) if quack_rmsnorm_fwd is not None else None
+
+ with torch.no_grad():
+ y_o, rstd_o, res_o = oink_rmsnorm.rmsnorm_forward(
+ x,
+ weight=w,
+ bias=None,
+ residual=None,
+ eps=eps,
+ store_rstd=store_rstd,
+ )
+ y_q = None
+ rstd_q = None
+ if quack_rmsnorm_fwd is not None:
+ # Quack returns (out, residual_out, rstd).
+ y_q, res_q, rstd_q = quack_rmsnorm_fwd(
+ x,
+ w,
+ bias=None,
+ residual=None,
+ out_dtype=None,
+ residual_dtype=None,
+ eps=eps,
+ store_rstd=store_rstd,
+ )
+
+ # Pure-PyTorch reference (float32 accumulation), chunked over rows to avoid
+ # materializing an (M, N) float32 tensor for large Quack-suite shapes.
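+    # RMSNorm reference: y = x * rsqrt(mean(x^2, dim=-1) + eps) * w, accumulated
+    # in fp32 and cast back to x.dtype.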
+ w_f32 = w.float()
+ rstd_ref = torch.empty((M,), device=x.device, dtype=torch.float32)
+ for start, end in iter_row_blocks(M, ref_block_rows):
+ x_f32 = x[start:end].float()
+ rstd_blk = torch.rsqrt(x_f32.square().mean(dim=-1) + eps)
+ rstd_ref[start:end] = rstd_blk
+
+ y_ref_blk_f32 = (x_f32 * rstd_blk.unsqueeze(1)) * w_f32
+ y_ref_blk = y_ref_blk_f32.to(x.dtype)
+ torch.testing.assert_close(y_o[start:end], y_ref_blk, **tol_y)
+ y_acc_ours.update(y_o[start:end], y_ref_blk)
+ if y_q is not None:
+ torch.testing.assert_close(y_q[start:end], y_ref_blk, **tol_y)
+ assert y_acc_quack is not None
+ y_acc_quack.update(y_q[start:end], y_ref_blk)
+
+ stats: dict[str, object] = {}
+ stats.update(error_stats_to_row("ours_err_y", y_acc_ours.finalize()))
+ if y_acc_quack is not None:
+ stats.update(error_stats_to_row("quack_err_y", y_acc_quack.finalize()))
+
+ if store_rstd:
+ assert rstd_o is not None
+ torch.testing.assert_close(rstd_o, rstd_ref, **tol_rstd)
+ if y_q is not None:
+ assert rstd_q is not None
+ torch.testing.assert_close(rstd_q, rstd_ref, **tol_rstd)
+ # Stats for rstd are cheap (M elements); compute exact p99 over all rows.
+        rstd_acc_ours = ErrorStatsAccumulator(
+            total_elems=int(rstd_ref.numel()), p99_target_samples=int(rstd_ref.numel())
+        )
+ rstd_acc_ours.update(rstd_o, rstd_ref)
+ stats.update(error_stats_to_row("ours_err_rstd", rstd_acc_ours.finalize()))
+ if rstd_q is not None:
+ rstd_acc_quack = ErrorStatsAccumulator(
+ total_elems=int(rstd_ref.numel()), p99_target_samples=int(rstd_ref.numel())
+ )
+ rstd_acc_quack.update(rstd_q, rstd_ref)
+ stats.update(error_stats_to_row("quack_err_rstd", rstd_acc_quack.finalize()))
+ # Residual output semantics differ slightly across implementations:
+ # - Oink returns `None` when residual is None.
+ # - Quack returns `x` as a safe alias in that case.
+ #
+ # For parity we focus on `y` (and optional `rstd`) for the residual=None path.
+ assert res_o is None
+ if quack_rmsnorm_fwd is not None:
+ assert res_q is x
+ return stats
+
+
+def bench_single(
+ M: int,
+ N: int,
+ dtype: torch.dtype,
+ *,
+ weight_dtype: torch.dtype,
+ eps: float,
+ warmup_ms: int,
+ iters_ms: int,
+ verify: bool,
+ store_rstd: bool,
+) -> Tuple[Tuple[float, float], Optional[Tuple[float, float]], dict[str, object]]:
+ device = torch.device("cuda")
+ x = torch.randn(M, N, device=device, dtype=dtype)
+ w = torch.randn(N, device=device, dtype=weight_dtype)
+
+ stats: dict[str, object] = {}
+ if verify:
+ stats = _verify_parity(x, w, eps=eps, store_rstd=store_rstd)
+
+ bytes_io = bytes_io_model_fwd(M, N, dtype, weight_dtype=w.dtype)
+
+ fn_oink = lambda: oink_rmsnorm.rmsnorm_forward(
+ x,
+ weight=w,
+ bias=None,
+ residual=None,
+ eps=eps,
+ store_rstd=store_rstd,
+ )
+ ms_oink = do_bench_triton(fn_oink, warmup_ms=warmup_ms, rep_ms=iters_ms)
+ gbps_oink = bytes_io / (ms_oink * 1e-3) / 1e9
+
+ if quack_rmsnorm_fwd is None:
+ return (ms_oink, gbps_oink), None, stats
+
+ fn_quack = lambda: quack_rmsnorm_fwd(
+ x,
+ w,
+ bias=None,
+ residual=None,
+ out_dtype=None,
+ residual_dtype=None,
+ eps=eps,
+ store_rstd=store_rstd,
+ )
+ ms_quack = do_bench_triton(fn_quack, warmup_ms=warmup_ms, rep_ms=iters_ms)
+ gbps_quack = bytes_io / (ms_quack * 1e-3) / 1e9
+ return (ms_oink, gbps_oink), (ms_quack, gbps_quack), stats
+
+
+def main() -> None:
+ if not torch.cuda.is_available():
+ raise SystemExit("CUDA not available")
+
+ torch.cuda.set_device(0)
+ device = torch.device("cuda")
+ props = torch.cuda.get_device_properties(device)
+ sm = props.major * 10 + props.minor
+ print(f"Running on {torch.cuda.get_device_name(device)} (SM{sm})")
+
+ p = argparse.ArgumentParser()
+ p.add_argument("--dtype", type=str, default="bf16", choices=["fp16", "bf16", "fp32"])
+ p.add_argument(
+ "--weight-dtype",
+ type=str,
+ default="fp32",
+ choices=["same", "fp16", "bf16", "fp32"],
+ help="RMSNorm weight dtype. `same` matches activation dtype (vLLM-style inference).",
+ )
+ p.add_argument("--eps", type=float, default=1e-6)
+ p.add_argument("--store-rstd", action="store_true", help="Also write rstd (fp32 per row)")
+ p.add_argument("--iters", type=int, default=100, help="Triton do_bench rep_ms (kernel-only).")
+ p.add_argument("--warmup-ms", type=int, default=25)
+ p.add_argument("--csv", type=str, default=None, help="Optional CSV output path; appends rows")
+ p.add_argument("--json", type=str, default=None, help="Optional JSON output path (meta + rows)")
+ p.add_argument("--configs", type=str, default="1024x4096,8192x4096")
+ p.add_argument("--quack-suite", action="store_true", help="Run Quack-style batch/seq grid")
+ p.add_argument("--dsv3", action="store_true", help="Run DSv3 set: M in {4096,16384,65536}, N in {6144,7168,8192}")
+ p.add_argument("--skip-verify", action="store_true", help="Skip correctness checks (Oink/Quack vs a pure-PyTorch reference)")
+ args = p.parse_args()
+
+ dtype = parse_dtype(args.dtype)
+ if args.weight_dtype == "same":
+ weight_dtype = dtype
+ else:
+ weight_dtype = parse_dtype(args.weight_dtype)
+ eps = float(args.eps)
+
+ if args.quack_suite:
+ cfgs = [(bs * sl, hidden) for (bs, sl, hidden) in quack_suite_configs()]
+ elif args.dsv3:
+ cfgs = dsv3_configs()
+ else:
+ cfgs = parse_configs(args.configs)
+
+ hbm_peak = detect_hbm_peak_gbps(device)
+ meta = collect_device_meta(device)
+
+ rows_out: List[Dict[str, Any]] = []
+ for (M, N) in cfgs:
+ print(f"bench M={M:<8d} N={N:<6d} dtype={args.dtype} ...", flush=True)
+ (ms_oink, gbps_oink), quack, stats = bench_single(
+ M=M,
+ N=N,
+ dtype=dtype,
+ weight_dtype=weight_dtype,
+ eps=eps,
+ warmup_ms=int(args.warmup_ms),
+ iters_ms=int(args.iters),
+ verify=not args.skip_verify,
+ store_rstd=bool(args.store_rstd),
+ )
+ row: Dict[str, Any] = {
+ "M": M,
+ "N": N,
+ "dtype": args.dtype,
+ "weight_dtype": args.weight_dtype,
+ "eps": eps,
+ "store_rstd": bool(args.store_rstd),
+ "ours_ms": ms_oink,
+ "ours_gbps": gbps_oink,
+ "ours_tbps": gbps_oink / 1000.0,
+ "ours_hbm_frac": gbps_oink / hbm_peak,
+ }
+ if quack is not None:
+ ms_q, gbps_q = quack
+ row.update(
+ {
+ "quack_ms": ms_q,
+ "quack_gbps": gbps_q,
+ "quack_tbps": gbps_q / 1000.0,
+ "speedup_vs_quack": ms_q / ms_oink,
+ }
+ )
+ row.update(stats)
+ rows_out.append(row)
+
+ if args.csv is not None:
+ write_csv(args.csv, rows_out)
+ if args.json is not None:
+ write_json(
+ args.json,
+ meta,
+ rows_out,
+ extra={
+ "method": "triton.testing.do_bench(mean)",
+ "warmup_ms": int(args.warmup_ms),
+ "rep_ms": int(args.iters),
+ "io_model_bytes": "(2*M*N)*elem_size + N*weight_elem_size",
+ "store_rstd": bool(args.store_rstd),
+ "weight_dtype": str(args.weight_dtype),
+ },
+ )
+
+ # Print a compact summary table.
+ headers = ["M", "N", "ours_ms", "ours_tbps"]
+ if quack_rmsnorm_fwd is not None:
+ headers += ["quack_ms", "quack_tbps", "speedup_vs_quack"]
+ print("\nSummary:")
+ print(" ".join(h.rjust(14) for h in headers))
+ for r in rows_out:
+ parts: List[str] = []
+ for h in headers:
+ v = r.get(h)
+ if isinstance(v, float):
+ parts.append(f"{v:14.4f}")
+ else:
+ parts.append(f"{str(v):>14}")
+ print(" ".join(parts))
+
+
+if __name__ == "__main__":
+ main()
diff --git a/oink/benchmarks/benchmark/benchmark_softmax_sm100.py b/oink/benchmarks/benchmark/benchmark_softmax_sm100.py
new file mode 100644
index 0000000..93c5af3
--- /dev/null
+++ b/oink/benchmarks/benchmark/benchmark_softmax_sm100.py
@@ -0,0 +1,292 @@
+from __future__ import annotations
+
+import argparse
+import os
+from typing import Any, Dict, List, Optional, Tuple
+
+import torch
+
+# Reduce fragmentation pressure on busy GPUs.
+os.environ.setdefault("PYTORCH_ALLOC_CONF", "expandable_segments:True")
+
+# Ensure SM100 (GB200) architecture is recognized by CuTeDSL when running outside vLLM.
+os.environ.setdefault("CUTE_DSL_ARCH", "sm_100a")
+
+from bench_utils import ( # noqa: E402
+ ErrorStatsAccumulator,
+ collect_device_meta,
+ detect_hbm_peak_gbps,
+ do_bench_triton,
+ error_stats_to_row,
+ ensure_oink_src_on_path,
+ iter_row_blocks,
+ parse_configs,
+ parse_dtype,
+ quack_suite_configs,
+ write_csv,
+ write_json,
+)
+
+ensure_oink_src_on_path()
+
+from kernelagent_oink.blackwell import softmax as oink_softmax # noqa: E402
+
+try:
+ from quack.softmax import softmax_bwd as quack_softmax_bwd # type: ignore
+ from quack.softmax import softmax_fwd as quack_softmax_fwd # type: ignore
+except Exception:
+ quack_softmax_fwd = None
+ quack_softmax_bwd = None
+
+_VERIFY_TOL = {
+ # Match Quack's unit-test defaults (tests/test_softmax.py).
+ torch.float32: dict(atol=1e-4, rtol=1e-4),
+ torch.float16: dict(atol=1e-3, rtol=1e-3),
+ torch.bfloat16: dict(atol=1e-2, rtol=1e-2),
+}
+
+
+def bytes_io_model_softmax(M: int, N: int, dtype: torch.dtype, *, mode: str) -> int:
+ elem = torch.tensor(0, dtype=dtype).element_size()
+ if mode == "fwd":
+ return int(2 * M * N * elem) # read x + write y
+ if mode == "bwd":
+ return int(3 * M * N * elem) # read dy + read y + write dx
+ if mode == "fwd_bwd":
+ # Logical IO for dx given (x, dy): read x + read dy + write dx.
+ # (The intermediate y=softmax(x) is an implementation detail and is
+ # intentionally not counted here.)
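+        # e.g. (illustrative) M=8192, N=4096 in bf16: 3*8192*4096*2 = 201_326_592 bytes.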
+ return int(3 * M * N * elem)
+ raise ValueError(f"Unsupported mode: {mode}")
+
+
+def dsv3_configs() -> List[Tuple[int, int]]:
+ Ms = [4096, 16384, 65536]
+ Ns = [6144, 7168, 8192]
+ return [(m, n) for m in Ms for n in Ns]
+
+
+def _verify_parity(x: torch.Tensor) -> dict[str, object]:
+ tol = _VERIFY_TOL[x.dtype]
+ ref_block_rows = 4096
+ dy = torch.randn_like(x) # upstream grad
+
+ with torch.no_grad():
+ y_o = oink_softmax.softmax_forward(x)
+ dx_o = oink_softmax.softmax_backward(dy, y_o)
+ dx_fused_o = oink_softmax.softmax_fwd_bwd(dy, x)
+
+ y_q = None
+ dx_q = None
+ if quack_softmax_fwd is not None and quack_softmax_bwd is not None:
+ y_q = quack_softmax_fwd(x)
+ dx_q = quack_softmax_bwd(dy, y_q)
+
+ M = int(x.shape[0])
+ N = int(x.shape[1])
+ y_acc_ours = ErrorStatsAccumulator(total_elems=M * N)
+ dx_acc_ours = ErrorStatsAccumulator(total_elems=M * N)
+ dx_fused_acc_ours = ErrorStatsAccumulator(total_elems=M * N)
+ y_acc_quack = (
+ ErrorStatsAccumulator(total_elems=M * N)
+ if (quack_softmax_fwd is not None and quack_softmax_bwd is not None)
+ else None
+ )
+ dx_acc_quack = (
+ ErrorStatsAccumulator(total_elems=M * N)
+ if (quack_softmax_fwd is not None and quack_softmax_bwd is not None)
+ else None
+ )
+
+ # Match Quack tests: compare to PyTorch softmax refs (fwd+bwd), chunked.
+ for start, end in iter_row_blocks(M, ref_block_rows):
+ x_blk = x[start:end]
+ dy_blk = dy[start:end]
+ y_ref_blk = torch.softmax(x_blk, dim=-1)
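+        # Softmax backward identity: dx = y * (dy - sum(dy * y, dim=-1, keepdim=True)).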
+ dot = torch.sum(dy_blk * y_ref_blk, dim=-1, keepdim=True, dtype=torch.float32)
+ dx_ref_blk = (dy_blk - dot.to(dy_blk.dtype)) * y_ref_blk
+
+ torch.testing.assert_close(y_o[start:end], y_ref_blk, **tol)
+ torch.testing.assert_close(dx_o[start:end], dx_ref_blk, **tol)
+ torch.testing.assert_close(dx_fused_o[start:end], dx_ref_blk, **tol)
+ y_acc_ours.update(y_o[start:end], y_ref_blk)
+ dx_acc_ours.update(dx_o[start:end], dx_ref_blk)
+ dx_fused_acc_ours.update(dx_fused_o[start:end], dx_ref_blk)
+ if y_q is not None and dx_q is not None:
+ torch.testing.assert_close(y_q[start:end], y_ref_blk, **tol)
+ torch.testing.assert_close(dx_q[start:end], dx_ref_blk, **tol)
+ assert y_acc_quack is not None and dx_acc_quack is not None
+ y_acc_quack.update(y_q[start:end], y_ref_blk)
+ dx_acc_quack.update(dx_q[start:end], dx_ref_blk)
+
+ stats: dict[str, object] = {}
+ stats.update(error_stats_to_row("ours_err_y", y_acc_ours.finalize()))
+ stats.update(error_stats_to_row("ours_err_dx", dx_acc_ours.finalize()))
+ stats.update(error_stats_to_row("ours_err_dx_fused", dx_fused_acc_ours.finalize()))
+ if y_acc_quack is not None and dx_acc_quack is not None:
+ stats.update(error_stats_to_row("quack_err_y", y_acc_quack.finalize()))
+ stats.update(error_stats_to_row("quack_err_dx", dx_acc_quack.finalize()))
+ return stats
+
+
+def bench_single(
+ M: int,
+ N: int,
+ dtype: torch.dtype,
+ *,
+ warmup_ms: int,
+ iters_ms: int,
+ mode: str,
+ verify: bool,
+) -> Tuple[Tuple[float, float], Optional[Tuple[float, float]], dict[str, object]]:
+ device = torch.device("cuda")
+ x = torch.randn(M, N, device=device, dtype=dtype)
+ dy = torch.randn_like(x)
+
+ stats: dict[str, object] = {}
+ if verify:
+ stats = _verify_parity(x)
+
+ bytes_io = bytes_io_model_softmax(M, N, dtype, mode=mode)
+
+ if mode == "fwd":
+ fn_oink = lambda: oink_softmax.softmax_forward(x)
+ fn_quack = None if quack_softmax_fwd is None else (lambda: quack_softmax_fwd(x))
+ elif mode == "bwd":
+ with torch.no_grad():
+ y_o = oink_softmax.softmax_forward(x)
+ y_q = quack_softmax_fwd(x) if quack_softmax_fwd is not None else None
+ fn_oink = lambda: oink_softmax.softmax_backward(dy, y_o)
+ fn_quack = (
+ None
+ if (quack_softmax_bwd is None or y_q is None)
+ else (lambda: quack_softmax_bwd(dy, y_q))
+ )
+ elif mode == "fwd_bwd":
+ fn_oink = lambda: oink_softmax.softmax_fwd_bwd(dy, x)
+ fn_quack = (
+ None
+ if (quack_softmax_fwd is None or quack_softmax_bwd is None)
+ else (lambda: quack_softmax_bwd(dy, quack_softmax_fwd(x)))
+ )
+ else:
+ raise ValueError(f"Unsupported mode: {mode}")
+
+ ms_oink = do_bench_triton(fn_oink, warmup_ms=warmup_ms, rep_ms=iters_ms)
+ gbps_oink = bytes_io / (ms_oink * 1e-3) / 1e9
+
+ if fn_quack is None:
+ return (ms_oink, gbps_oink), None, stats
+
+ ms_quack = do_bench_triton(fn_quack, warmup_ms=warmup_ms, rep_ms=iters_ms)
+ gbps_quack = bytes_io / (ms_quack * 1e-3) / 1e9
+ return (ms_oink, gbps_oink), (ms_quack, gbps_quack), stats
+
+
+def main() -> None:
+ if not torch.cuda.is_available():
+ raise SystemExit("CUDA not available")
+
+ torch.cuda.set_device(0)
+ device = torch.device("cuda")
+ props = torch.cuda.get_device_properties(device)
+ sm = props.major * 10 + props.minor
+ print(f"Running on {torch.cuda.get_device_name(device)} (SM{sm})")
+
+ p = argparse.ArgumentParser()
+ p.add_argument("--dtype", type=str, default="bf16", choices=["fp16", "bf16", "fp32"])
+ p.add_argument("--mode", type=str, default="fwd_bwd", choices=["fwd", "bwd", "fwd_bwd"])
+ p.add_argument("--iters", type=int, default=50, help="Triton do_bench rep_ms (kernel-only).")
+ p.add_argument("--warmup-ms", type=int, default=25)
+ p.add_argument("--csv", type=str, default=None, help="Optional CSV output path; appends rows")
+ p.add_argument("--json", type=str, default=None, help="Optional JSON output path (meta + rows)")
+ p.add_argument("--configs", type=str, default="1024x4096,8192x4096")
+ p.add_argument("--quack-suite", action="store_true", help="Run Quack-style batch/seq grid")
+ p.add_argument(
+ "--dsv3",
+ action="store_true",
+ help="Run DSv3 set: M in {4096,16384,65536}, N in {6144,7168,8192}",
+ )
+ p.add_argument("--skip-verify", action="store_true", help="Skip correctness checks (Oink/Quack vs PyTorch softmax)")
+ args = p.parse_args()
+
+ dtype = parse_dtype(args.dtype)
+
+ if args.quack_suite:
+ cfgs = [(bs * sl, hidden) for (bs, sl, hidden) in quack_suite_configs()]
+ elif args.dsv3:
+ cfgs = dsv3_configs()
+ else:
+ cfgs = parse_configs(args.configs)
+
+ hbm_peak = detect_hbm_peak_gbps(device)
+ meta = collect_device_meta(device)
+
+ rows_out: List[Dict[str, Any]] = []
+ for (M, N) in cfgs:
+ print(f"bench M={M:<8d} N={N:<6d} dtype={args.dtype} mode={args.mode} ...", flush=True)
+ (ms_oink, gbps_oink), quack, stats = bench_single(
+ M=M,
+ N=N,
+ dtype=dtype,
+ warmup_ms=int(args.warmup_ms),
+ iters_ms=int(args.iters),
+ mode=str(args.mode),
+ verify=not args.skip_verify,
+ )
+ row: Dict[str, Any] = {
+ "M": M,
+ "N": N,
+ "dtype": args.dtype,
+ "mode": args.mode,
+ "ours_ms": ms_oink,
+ "ours_gbps": gbps_oink,
+ "ours_tbps": gbps_oink / 1000.0,
+ "ours_hbm_frac": gbps_oink / hbm_peak,
+ }
+ if quack is not None:
+ ms_q, gbps_q = quack
+ row.update(
+ {
+ "quack_ms": ms_q,
+ "quack_gbps": gbps_q,
+ "quack_tbps": gbps_q / 1000.0,
+ "speedup_vs_quack": ms_q / ms_oink,
+ }
+ )
+ row.update(stats)
+ rows_out.append(row)
+
+ if args.csv is not None:
+ write_csv(args.csv, rows_out)
+ if args.json is not None:
+ write_json(
+ args.json,
+ meta,
+ rows_out,
+ extra={
+ "method": "triton.testing.do_bench(mean)",
+ "warmup_ms": int(args.warmup_ms),
+ "rep_ms": int(args.iters),
+ "io_model_bytes": "mode-dependent: fwd=2*M*N, bwd=3*M*N, fwd_bwd=3*M*N (all * elem_size; fwd_bwd counts logical x+dy+dx)",
+ },
+ )
+
+ headers = ["M", "N", "mode", "ours_ms", "ours_tbps"]
+ if quack_softmax_fwd is not None and quack_softmax_bwd is not None:
+ headers += ["quack_ms", "quack_tbps", "speedup_vs_quack"]
+ print("\nSummary:")
+ print(" ".join(h.rjust(14) for h in headers))
+ for r in rows_out:
+ parts: List[str] = []
+ for h in headers:
+ v = r.get(h)
+ if isinstance(v, float):
+ parts.append(f"{v:14.4f}")
+ else:
+ parts.append(f"{str(v):>14}")
+ print(" ".join(parts))
+
+
+if __name__ == "__main__":
+ main()
diff --git a/oink/benchmarks/media/sm100_bf16_oink_vs_quack.svg b/oink/benchmarks/media/sm100_bf16_oink_vs_quack.svg
new file mode 100644
index 0000000..e32e3a7
--- /dev/null
+++ b/oink/benchmarks/media/sm100_bf16_oink_vs_quack.svg
@@ -0,0 +1,2259 @@
+<!-- SVG markup omitted from this patch excerpt -->
diff --git a/oink/benchmarks/media/sm100_bf16_oink_vs_quack_dsv3.svg b/oink/benchmarks/media/sm100_bf16_oink_vs_quack_dsv3.svg
new file mode 100644
index 0000000..b70ba9b
--- /dev/null
+++ b/oink/benchmarks/media/sm100_bf16_oink_vs_quack_dsv3.svg
@@ -0,0 +1,2600 @@
+<!-- SVG markup omitted from this patch excerpt -->
diff --git a/oink/benchmarks/media/sm100_bf16_oink_vs_quack_dsv3_all.svg b/oink/benchmarks/media/sm100_bf16_oink_vs_quack_dsv3_all.svg
new file mode 100644
index 0000000..f5cd53c
--- /dev/null
+++ b/oink/benchmarks/media/sm100_bf16_oink_vs_quack_dsv3_all.svg
@@ -0,0 +1,2936 @@
+<!-- SVG markup omitted from this patch excerpt -->
diff --git a/oink/benchmarks/media/sm100_bf16_oink_vs_quack_dsv3_cross_entropy.svg b/oink/benchmarks/media/sm100_bf16_oink_vs_quack_dsv3_cross_entropy.svg
new file mode 100644
index 0000000..db39e3c
--- /dev/null
+++ b/oink/benchmarks/media/sm100_bf16_oink_vs_quack_dsv3_cross_entropy.svg
@@ -0,0 +1,1687 @@
+<!-- SVG markup omitted from this patch excerpt -->
diff --git a/oink/benchmarks/media/sm100_bf16_oink_vs_quack_dsv3_with_layernorm.svg b/oink/benchmarks/media/sm100_bf16_oink_vs_quack_dsv3_with_layernorm.svg
new file mode 100644
index 0000000..e8d4cc6
--- /dev/null
+++ b/oink/benchmarks/media/sm100_bf16_oink_vs_quack_dsv3_with_layernorm.svg
@@ -0,0 +1,2720 @@
+<!-- SVG markup omitted from this patch excerpt -->
diff --git a/oink/benchmarks/media/sm100_bf16_oink_vs_quack_with_layernorm.svg b/oink/benchmarks/media/sm100_bf16_oink_vs_quack_with_layernorm.svg
new file mode 100644
index 0000000..a5670bd
--- /dev/null
+++ b/oink/benchmarks/media/sm100_bf16_oink_vs_quack_with_layernorm.svg
@@ -0,0 +1,2580 @@
+<!-- SVG markup omitted from this patch excerpt -->
diff --git a/oink/benchmarks/media/sm100_fp16_oink_vs_quack.svg b/oink/benchmarks/media/sm100_fp16_oink_vs_quack.svg
new file mode 100644
index 0000000..0a021e9
--- /dev/null
+++ b/oink/benchmarks/media/sm100_fp16_oink_vs_quack.svg
@@ -0,0 +1,2280 @@
+<!-- SVG markup omitted from this patch excerpt -->
diff --git a/oink/benchmarks/media/sm100_fp16_oink_vs_quack_dsv3.svg b/oink/benchmarks/media/sm100_fp16_oink_vs_quack_dsv3.svg
new file mode 100644
index 0000000..9a58fde
--- /dev/null
+++ b/oink/benchmarks/media/sm100_fp16_oink_vs_quack_dsv3.svg
@@ -0,0 +1,2621 @@
+<!-- SVG markup omitted from this patch excerpt -->
diff --git a/oink/benchmarks/media/sm100_fp16_oink_vs_quack_dsv3_all.svg b/oink/benchmarks/media/sm100_fp16_oink_vs_quack_dsv3_all.svg
new file mode 100644
index 0000000..bff56d0
--- /dev/null
+++ b/oink/benchmarks/media/sm100_fp16_oink_vs_quack_dsv3_all.svg
@@ -0,0 +1,2957 @@
+<!-- SVG markup omitted from this patch excerpt -->
diff --git a/oink/benchmarks/media/sm100_fp16_oink_vs_quack_dsv3_cross_entropy.svg b/oink/benchmarks/media/sm100_fp16_oink_vs_quack_dsv3_cross_entropy.svg
new file mode 100644
index 0000000..6a16fe8
--- /dev/null
+++ b/oink/benchmarks/media/sm100_fp16_oink_vs_quack_dsv3_cross_entropy.svg
@@ -0,0 +1,1708 @@
+<!-- SVG markup omitted from this patch excerpt -->
diff --git a/oink/benchmarks/media/sm100_fp16_oink_vs_quack_dsv3_with_layernorm.svg b/oink/benchmarks/media/sm100_fp16_oink_vs_quack_dsv3_with_layernorm.svg
new file mode 100644
index 0000000..242d013
--- /dev/null
+++ b/oink/benchmarks/media/sm100_fp16_oink_vs_quack_dsv3_with_layernorm.svg
@@ -0,0 +1,2741 @@
+<!-- SVG markup omitted from this patch excerpt -->
diff --git a/oink/benchmarks/media/sm100_fp16_oink_vs_quack_with_layernorm.svg b/oink/benchmarks/media/sm100_fp16_oink_vs_quack_with_layernorm.svg
new file mode 100644
index 0000000..dac54ac
--- /dev/null
+++ b/oink/benchmarks/media/sm100_fp16_oink_vs_quack_with_layernorm.svg
@@ -0,0 +1,2601 @@
+<!-- SVG markup omitted from this patch excerpt -->
diff --git a/oink/benchmarks/readme/plot_quack_style_svg.py b/oink/benchmarks/readme/plot_quack_style_svg.py
new file mode 100644
index 0000000..1799f2e
--- /dev/null
+++ b/oink/benchmarks/readme/plot_quack_style_svg.py
@@ -0,0 +1,431 @@
+from __future__ import annotations
+
+"""
+Generate Quack-style SVG performance plots (Oink vs Quack) from the SM100 suite
+JSON artifacts under `/tmp/kernelagent_oink_sm100_suite_{bf16,fp16}`.
+
+The intent is to match Quack's README visual style:
+ - one horizontal row of panels (panel count depends on the suite):
+ - Quack-suite: RMSNorm / Softmax / CrossEntropy
+ - DSv3 (hidden-size): Fused Add+RMSNorm / Softmax / LayerNorm
+ - DSv3 (all ops, 4-panel): Fused Add+RMSNorm / Softmax / LayerNorm / CrossEntropy
+ - DSv3 CrossEntropy: CrossEntropy-only (single panel)
+ - y-axis: model memory bandwidth (GB/s) derived from an IO model
+ - x-axis: a small set of labeled (M, N) shape points
+ - thick lines + markers, dashed y-grid, compact legend
+ - optional horizontal roofline line (measured STREAM-like HBM peak)
+
+Example:
+ python oink/benchmarks/readme/plot_quack_style_svg.py \\
+ --in-dir /tmp/kernelagent_oink_sm100_suite_bf16 \\
+ --suite quack_suite \\
+ --roofline-json /tmp/hbm_roofline_sm100_bf16.json \\
+ --out oink/benchmarks/media/sm100_bf16_oink_vs_quack.svg
+
+For completeness, we can also include LayerNorm as an extra panel (Quack's
+own README plot does not include LayerNorm):
+ python oink/benchmarks/readme/plot_quack_style_svg.py \\
+ --in-dir /tmp/kernelagent_oink_sm100_suite_bf16 \\
+ --suite quack_suite \\
+ --include-layernorm \\
+ --roofline-json /tmp/hbm_roofline_sm100_bf16.json \\
+ --out oink/benchmarks/media/sm100_bf16_oink_vs_quack_with_layernorm.svg
+
+Note on DSv3 suite:
+- The DSv3 plot intentionally covers only the hidden-size ops (fused Add+RMSNorm,
+ Softmax, LayerNorm) which share the same `(M, N)` sweep.
+- CrossEntropy in DSv3 uses a vocab-size-like `N` sweep and is plotted separately
+ via `--suite dsv3_cross_entropy` to avoid a mixed x-axis with gaps.
+- For README embedding convenience, `--suite dsv3_all` renders a 4-panel
+ single-row figure where the CrossEntropy panel uses its own x-axis.
+- The RMSNorm panel uses the real block primitive (fused residual-add + RMSNorm)
+ when available: `fused_add_rmsnorm_dsv3.json`.
+"""
+
+import argparse
+import json
+import math
+import os
+from collections import defaultdict
+from statistics import median
+from typing import Any, Dict, Iterable, List, Mapping, Optional, Sequence, Tuple
+
+
+def _load_json(path: str) -> Dict[str, Any]:
+ with open(path) as f:
+ return json.load(f)
+
+
+def _fmt_k(v: int) -> str:
+ # Match Quack's x-axis labels: "32K" means 32768 (1024-based).
+ if v % 1024 == 0:
+ return f"{v // 1024}K"
+ return str(v)
+
+
+def _shape_label(m: int, n: int) -> str:
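+    # e.g. _fmt_k(32768) == "32K", so _shape_label(65536, 7168) == "(64K, 7K)".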
+ return f"({_fmt_k(m)}, {_fmt_k(n)})"
+
+
+def _gbps_from_row(prefix: str, row: Mapping[str, Any]) -> Optional[float]:
+ # Prefer GB/s in the JSON if present; otherwise fall back to TB/s.
+ gbps_key = f"{prefix}_gbps"
+ tbps_key = f"{prefix}_tbps"
+ if gbps_key in row and row[gbps_key] is not None:
+ return float(row[gbps_key])
+ if tbps_key in row and row[tbps_key] is not None:
+ return float(row[tbps_key]) * 1000.0
+ return None
+
+
+def _aggregate_by_shape(rows: Sequence[Mapping[str, Any]]) -> Dict[Tuple[int, int], Dict[str, float]]:
+ """Aggregate duplicate (M, N) rows using median (more robust than mean)."""
+ buckets: dict[tuple[int, int], dict[str, list[float]]] = defaultdict(
+ lambda: defaultdict(list)
+ )
+ for r in rows:
+ m = int(r["M"])
+ n = int(r["N"])
+ ours = _gbps_from_row("ours", r)
+ quack = _gbps_from_row("quack", r)
+ if ours is not None:
+ buckets[(m, n)]["ours"].append(ours)
+ if quack is not None:
+ buckets[(m, n)]["quack"].append(quack)
+
+ out: Dict[Tuple[int, int], Dict[str, float]] = {}
+ for k, vs in buckets.items():
+ if not vs["ours"] or not vs["quack"]:
+ continue
+ out[k] = dict(ours=float(median(vs["ours"])), quack=float(median(vs["quack"])))
+ return out
+
+
+def _sort_shapes(shapes: Iterable[Tuple[int, int]]) -> List[Tuple[int, int]]:
+ # Sort by N then M to keep the x-axis stable across panels.
+ return sorted(set(shapes), key=lambda x: (x[1], x[0]))
+
+
+def _read_roofline_gbps(path: str) -> float:
+ payload = _load_json(path)
+ rows = payload.get("rows", [])
+ best_tbps = max(float(r["tbps"]) for r in rows)
+ return best_tbps * 1000.0
+
+
+def _ensure_matplotlib():
+ try:
+ import matplotlib as mpl # noqa: F401
+ import matplotlib.pyplot as plt # noqa: F401
+ except Exception as e: # pragma: no cover
+ raise SystemExit(
+ "matplotlib is required to generate SVG plots.\n"
+ "Install with: `python -m pip install matplotlib`"
+ ) from e
+
+
+def _plot(
+ *,
+ panels: Sequence[Tuple[str, Dict[Tuple[int, int], Dict[str, float]]]],
+ roofline_gbps: Optional[float],
+ out_path: str,
+ title: str,
+ shape_policy: str,
+ per_panel_x: bool,
+) -> None:
+ _ensure_matplotlib()
+ import matplotlib as mpl
+ import matplotlib.pyplot as plt
+
+ mpl.rcParams.update(
+ {
+ # Quack-style: embed glyphs as paths for consistent rendering.
+ "svg.fonttype": "path",
+ "font.family": "DejaVu Sans",
+ "axes.titlesize": 18,
+ "axes.labelsize": 16,
+ "xtick.labelsize": 10,
+ "ytick.labelsize": 12,
+ }
+ )
+
+ # Colors roughly matching Quack's SVG palette.
+ COLOR_OINK = "#5ba3f5"
+ COLOR_QUACK = "#ff4444"
+ COLOR_ROOF = "#4d4d4d"
+
+ fig, axes = plt.subplots(
+ nrows=1,
+ ncols=len(panels),
+ figsize=(6.0 * len(panels), 5.6),
+ constrained_layout=False,
+ sharey=True,
+ )
+ if len(panels) == 1:
+ axes = [axes]
+
+ max_y = 0.0
+ for ax, (panel_title, data) in zip(axes, panels):
+ if per_panel_x:
+ shapes = _sort_shapes(data.keys())
+ else:
+ # Quack-style plots use a single shared x-axis across panels. Prefer
+ # the intersection so every panel has a value at every x tick
+ # (cleaner than rendering gaps), and fall back to the union if the
+ # intersection is empty.
+ shape_sets = [set(d.keys()) for _n, d in panels]
+ if shape_policy in {"first", "primary"}:
+ shapes = _sort_shapes(shape_sets[0]) if shape_sets else []
+ elif shape_policy == "intersection" and shape_sets:
+ common = set.intersection(*shape_sets)
+ shapes = _sort_shapes(common) if common else []
+ elif shape_policy == "union":
+ shapes = _sort_shapes(s for _n, d in panels for s in d.keys())
+ else:
+ raise ValueError(f"Unsupported shape_policy: {shape_policy}")
+ if not shapes:
+ shapes = _sort_shapes(s for _n, d in panels for s in d.keys())
+
+ x = list(range(len(shapes)))
+ x_labels = [_shape_label(m, n) for (m, n) in shapes]
+
+ ours_y: List[float] = []
+ quack_y: List[float] = []
+ for s in shapes:
+ rec = data.get(s)
+ if rec is None: # only possible in shared-x mode with union
+ ours_y.append(math.nan)
+ quack_y.append(math.nan)
+ continue
+ ours_y.append(float(rec["ours"]))
+ quack_y.append(float(rec["quack"]))
+        max_y = max(
+            max_y,
+            *(v for v in ours_y if math.isfinite(v)),
+            *(v for v in quack_y if math.isfinite(v)),
+        )
+
+ ax.plot(
+ x,
+ ours_y,
+ marker="o",
+ linewidth=5,
+ markersize=7,
+ color=COLOR_OINK,
+ label="KernelAgent-Oink (ours)",
+ )
+ ax.plot(
+ x,
+ quack_y,
+ marker="o",
+ linewidth=5,
+ markersize=7,
+ color=COLOR_QUACK,
+ label="Quack",
+ )
+ if roofline_gbps is not None:
+ ax.axhline(
+ roofline_gbps,
+ color=COLOR_ROOF,
+ linewidth=3,
+ linestyle=(0, (4, 6)),
+ label="HBM peak (measured)" if ax is axes[0] else None,
+ )
+ max_y = max(max_y, float(roofline_gbps))
+
+ ax.set_title(panel_title)
+ ax.set_xticks(x)
+ ax.set_xticklabels(x_labels, rotation=-45, ha="left")
+ if per_panel_x:
+ # DSv3 "all ops" figure: each panel has its own x-axis. Make the
+ # semantics explicit so readers don't assume the same `N` meaning
+ # across panels (CrossEntropy uses a classes/vocab-shard-like axis).
+ if "cross" in panel_title.lower():
+ ax.set_xlabel("Shape (M, C classes)")
+ else:
+ ax.set_xlabel("Shape (M, N hidden)")
+
+ # Quack-like dashed y-grid.
+ ax.grid(axis="y", linestyle=(0, (4, 7.2)), linewidth=0.8, color="#b0b0b0")
+ ax.set_axisbelow(True)
+
+ # Light spines (Quack SVG uses a light gray frame).
+ for spine in ax.spines.values():
+ spine.set_color("#d3d3d3")
+ spine.set_linewidth(1.5)
+
+ axes[0].set_ylabel("Memory Bandwidth (GB/s)")
+
+ # A little headroom above the tallest curve/roofline.
+ ymax = max_y * 1.08 if max_y > 0 else 1.0
+ for ax in axes:
+ ax.set_ylim(0.0, ymax)
+
+ # Tight layout for the axes area, reserving headroom for the suptitle and a
+ # shared legend. In some matplotlib versions, figure-level legends can
+ # overlap the middle panel title unless we reserve a slightly taller header
+ # band.
+ fig.tight_layout(rect=(0.0, 0.0, 1.0, 0.70))
+
+ # Single shared legend across the top (like Quack), but keep it inside the
+ # reserved header band so it doesn't overlap the middle panel title.
+ handles, labels = axes[0].get_legend_handles_labels()
+ # Quack's legend fits nicely in one row because their plots are 3-panel and
+ # therefore wide. For single-panel figures, a 3-column legend can overflow
+ # the canvas and get clipped in the SVG, so we stack it vertically.
+ legend_ncol = min(3, len(labels))
+ legend_fontsize = 13
+ if len(panels) == 1:
+ legend_ncol = 1
+ legend_fontsize = 12
+ fig.legend(
+ handles,
+ labels,
+ loc="upper center",
+ ncol=legend_ncol,
+ frameon=False,
+ bbox_to_anchor=(0.5, 0.91),
+ fontsize=legend_fontsize,
+ handlelength=2.5,
+ )
+ # Single-panel figures (e.g. DSv3 CrossEntropy) are much narrower than the
+ # Quack-style 3-panel plots; use a slightly smaller suptitle font to avoid
+ # clipping in the exported SVG.
+ suptitle_fs = 22 if len(panels) > 1 else 18
+ fig.suptitle(title, y=0.98, fontsize=suptitle_fs)
+
+ out_path = os.path.abspath(out_path)
+ os.makedirs(os.path.dirname(out_path), exist_ok=True)
+ # Use a tight bounding box so rotated x tick labels and the figure-level
+ # legend don't get clipped in SVG exports (matplotlib can be fragile here
+ # across versions).
+ fig.savefig(out_path, format="svg", bbox_inches="tight", pad_inches=0.02)
+ plt.close(fig)
+
+
+def _panel_files_for_suite(suite: str) -> List[Tuple[str, str]]:
+ if suite == "quack_suite":
+ return [
+ ("RMSNorm (fp32 weight)", "rmsnorm_fwd_quack_suite_wfp32.json"),
+ ("Softmax (fwd+bwd)", "softmax_fwd_bwd_quack_suite.json"),
+ ("Cross-Entropy (fwd+bwd)", "cross_entropy_fwd_bwd_quack_suite.json"),
+ ]
+ if suite == "dsv3":
+ return [
+ ("Fused Add+RMSNorm (fwd)", "fused_add_rmsnorm_dsv3.json"),
+ ("Softmax (fwd+bwd)", "softmax_fwd_bwd_dsv3.json"),
+ ("LayerNorm (fwd)", "layernorm_fwd_dsv3.json"),
+ ]
+ if suite == "dsv3_all":
+ return [
+ ("Fused Add+RMSNorm (fwd)", "fused_add_rmsnorm_dsv3.json"),
+ ("Softmax (fwd+bwd)", "softmax_fwd_bwd_dsv3.json"),
+ ("LayerNorm (fwd)", "layernorm_fwd_dsv3.json"),
+ ("Cross-Entropy (fwd+bwd)", "cross_entropy_fwd_bwd_dsv3.json"),
+ ]
+ if suite == "dsv3_cross_entropy":
+ return [
+ ("Cross-Entropy (fwd+bwd)", "cross_entropy_fwd_bwd_dsv3.json"),
+ ]
+ raise ValueError(f"Unsupported suite: {suite}")
+
+
+def _layernorm_file_for_suite(suite: str) -> str:
+ if suite == "quack_suite":
+ return "layernorm_fwd_quack_suite.json"
+ raise ValueError(f"Unsupported suite: {suite}")
+
+
+def main() -> None:
+ p = argparse.ArgumentParser(
+ description="Generate Quack-style SVG plots from KernelAgent-Oink suite JSONs."
+ )
+ p.add_argument(
+ "--in-dir", type=str, required=True, help="Directory containing suite JSON outputs"
+ )
+ p.add_argument(
+ "--suite",
+ type=str,
+ default="quack_suite",
+ choices=["quack_suite", "dsv3", "dsv3_all", "dsv3_cross_entropy"],
+ )
+ p.add_argument(
+ "--include-layernorm",
+ action="store_true",
+ help="Add a LayerNorm (fwd) panel (only meaningful for `--suite quack_suite`).",
+ )
+ p.add_argument(
+ "--shape-policy",
+ type=str,
+ default="intersection",
+ choices=["intersection", "union", "first"],
+ help=(
+ "How to pick x-axis shapes across panels. "
+ "`intersection` matches Quack-style (only shapes common to every panel). "
+ "`first` uses the first panel's shapes (keeps DSv3 N=7168 visible). "
+ "`union` includes every shape across panels (may create gaps)."
+ ),
+ )
+ p.add_argument("--roofline-json", type=str, default=None, help="Optional /tmp/hbm_roofline_sm100_*.json path")
+ p.add_argument("--out", type=str, required=True, help="Output SVG path")
+ p.add_argument("--title", type=str, default=None, help="Optional figure title override")
+ args = p.parse_args()
+
+ in_dir = os.path.abspath(args.in_dir)
+ if not os.path.isdir(in_dir):
+ raise SystemExit(f"--in-dir is not a directory: {in_dir}")
+
+ roofline_gbps = _read_roofline_gbps(args.roofline_json) if args.roofline_json else None
+
+ panel_files = list(_panel_files_for_suite(str(args.suite)))
+ if args.include_layernorm:
+ if args.suite != "quack_suite":
+ raise SystemExit("--include-layernorm is only supported for `--suite quack_suite`.")
+ panel_files.append(("LayerNorm (fwd)", _layernorm_file_for_suite(str(args.suite))))
+
+ panels: List[Tuple[str, Dict[Tuple[int, int], Dict[str, float]]]] = []
+ for panel_title, filename in panel_files:
+ path = os.path.join(in_dir, filename)
+ if not os.path.exists(path):
+ raise SystemExit(f"Missing required JSON: {path}")
+ payload = _load_json(path)
+ rows = payload.get("rows", [])
+ if not isinstance(rows, list):
+ rows = []
+ panels.append((panel_title, _aggregate_by_shape(rows)))
+
+ if args.title is not None:
+ title = str(args.title)
+ else:
+ # Try to infer dtype from the first panel's JSON.
+ first_json = os.path.join(in_dir, panel_files[0][1])
+ payload = _load_json(first_json)
+ rows = payload.get("rows", [])
+ dtype = rows[0].get("dtype", "") if rows else ""
+ if args.suite == "quack_suite":
+ suite_name = "Quack-suite"
+ elif args.suite == "dsv3":
+ suite_name = "DSv3 (hidden-size ops)"
+ elif args.suite == "dsv3_all":
+ suite_name = "DSv3 (4 ops)"
+ elif args.suite == "dsv3_cross_entropy":
+ # Keep this short: this suite is rendered as a single panel, so the
+ # figure is much narrower than the 3-panel plots.
+ suite_name = "DSv3 CrossEntropy"
+ else:
+ suite_name = str(args.suite)
+ suffix = " (+LayerNorm)" if (args.suite == "quack_suite" and args.include_layernorm) else ""
+ if args.suite == "dsv3_cross_entropy":
+ title = f"SM100 {dtype.upper()} — {suite_name}{suffix}"
+ else:
+ title = f"SM100 {dtype.upper()} Kernel Benchmarks (Oink vs Quack) — {suite_name}{suffix}"
+
+ _plot(
+ panels=panels,
+ roofline_gbps=roofline_gbps,
+ out_path=str(args.out),
+ title=title,
+ shape_policy=str(args.shape_policy),
+ per_panel_x=(str(args.suite) == "dsv3_all"),
+ )
+ print(f"Wrote: {os.path.abspath(args.out)}")
+
+
+if __name__ == "__main__":
+ main()
diff --git a/oink/benchmarks/readme/run_sm100_suite.py b/oink/benchmarks/readme/run_sm100_suite.py
new file mode 100644
index 0000000..5ac1091
--- /dev/null
+++ b/oink/benchmarks/readme/run_sm100_suite.py
@@ -0,0 +1,302 @@
+from __future__ import annotations
+
+import argparse
+import os
+import subprocess
+import sys
+from datetime import datetime
+from typing import List, Tuple
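+
+# Typical invocation (illustrative; the output directory is just an example):
+#   python oink/benchmarks/readme/run_sm100_suite.py --dtype bf16 \
+#       --out-dir /tmp/kernelagent_oink_sm100_suite_bf16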
+
+
+def _ts() -> str:
+ return datetime.now().strftime("%Y%m%d_%H%M%S")
+
+
+def _run(cmd: List[str], *, dry_run: bool) -> None:
+ print("+", " ".join(cmd), flush=True)
+ if dry_run:
+ return
+ subprocess.run(cmd, check=True)
+
+
+def main() -> None:
+ p = argparse.ArgumentParser()
+ p.add_argument("--dtype", type=str, default="bf16", choices=["fp16", "bf16", "fp32"])
+ p.add_argument(
+ "--out-dir",
+ type=str,
+ default=None,
+ help="Directory to write JSON outputs (default: /tmp/kernelagent_oink_sm100_suite_)",
+ )
+ p.add_argument(
+ "--skip-verify",
+ action="store_true",
+ help="Skip correctness checks (Oink/Quack vs PyTorch / pure-PyTorch references)",
+ )
+ p.add_argument("--dry-run", action="store_true", help="Print commands without executing them")
+ args = p.parse_args()
+
+ # Standardize env for standalone runs outside the vLLM plugin.
+ os.environ.setdefault("PYTORCH_ALLOC_CONF", "expandable_segments:True")
+ os.environ.setdefault("CUTE_DSL_ARCH", "sm_100a")
+
+ out_dir = args.out_dir or f"/tmp/kernelagent_oink_sm100_suite_{_ts()}"
+ os.makedirs(out_dir, exist_ok=True)
+
+ here = os.path.dirname(os.path.abspath(__file__))
+ bench_dir = os.path.abspath(os.path.join(here, "..", "benchmark"))
+ py = sys.executable
+
+ def script(name: str) -> str:
+ return os.path.join(bench_dir, name)
+
+ common = ["--dtype", args.dtype]
+ if args.skip_verify:
+ common = [*common, "--skip-verify"]
+
+ runs: List[Tuple[str, List[str]]] = [
+ (
+ "rmsnorm_fwd_quack_suite_wfp32",
+ [
+ py,
+ script("benchmark_rmsnorm_sm100.py"),
+ *common,
+ "--weight-dtype",
+ "fp32",
+ "--quack-suite",
+ "--iters",
+ "200",
+ "--warmup-ms",
+ "25",
+ "--json",
+ os.path.join(out_dir, "rmsnorm_fwd_quack_suite_wfp32.json"),
+ ],
+ ),
+ (
+ "rmsnorm_fwd_dsv3_wfp32",
+ [
+ py,
+ script("benchmark_rmsnorm_sm100.py"),
+ *common,
+ "--weight-dtype",
+ "fp32",
+ "--dsv3",
+ "--iters",
+ "200",
+ "--warmup-ms",
+ "25",
+ "--json",
+ os.path.join(out_dir, "rmsnorm_fwd_dsv3_wfp32.json"),
+ ],
+ ),
+ (
+ "rmsnorm_bwd_quack_suite_wfp32",
+ [
+ py,
+ script("benchmark_rmsnorm_bwd_sm100.py"),
+ *common,
+ "--weight-dtype",
+ "fp32",
+ "--quack-suite",
+ "--iters",
+ "100",
+ "--warmup-ms",
+ "25",
+ "--json",
+ os.path.join(out_dir, "rmsnorm_bwd_quack_suite_wfp32.json"),
+ ],
+ ),
+ (
+ "rmsnorm_bwd_dsv3_wfp32",
+ [
+ py,
+ script("benchmark_rmsnorm_bwd_sm100.py"),
+ *common,
+ "--weight-dtype",
+ "fp32",
+ "--dsv3",
+ "--iters",
+ "100",
+ "--warmup-ms",
+ "25",
+ "--json",
+ os.path.join(out_dir, "rmsnorm_bwd_dsv3_wfp32.json"),
+ ],
+ ),
+ # vLLM inference-style RMSNorm (weight dtype == activation dtype).
+ (
+ "rmsnorm_fwd_quack_suite_wsame",
+ [
+ py,
+ script("benchmark_rmsnorm_sm100.py"),
+ *common,
+ "--weight-dtype",
+ "same",
+ "--quack-suite",
+ "--iters",
+ "200",
+ "--warmup-ms",
+ "25",
+ "--json",
+ os.path.join(out_dir, "rmsnorm_fwd_quack_suite_wsame.json"),
+ ],
+ ),
+ (
+ "rmsnorm_fwd_dsv3_wsame",
+ [
+ py,
+ script("benchmark_rmsnorm_sm100.py"),
+ *common,
+ "--weight-dtype",
+ "same",
+ "--dsv3",
+ "--iters",
+ "200",
+ "--warmup-ms",
+ "25",
+ "--json",
+ os.path.join(out_dir, "rmsnorm_fwd_dsv3_wsame.json"),
+ ],
+ ),
+ (
+ "rmsnorm_bwd_quack_suite_wsame",
+ [
+ py,
+ script("benchmark_rmsnorm_bwd_sm100.py"),
+ *common,
+ "--weight-dtype",
+ "same",
+ "--quack-suite",
+ "--iters",
+ "100",
+ "--warmup-ms",
+ "25",
+ "--json",
+ os.path.join(out_dir, "rmsnorm_bwd_quack_suite_wsame.json"),
+ ],
+ ),
+ (
+ "rmsnorm_bwd_dsv3_wsame",
+ [
+ py,
+ script("benchmark_rmsnorm_bwd_sm100.py"),
+ *common,
+ "--weight-dtype",
+ "same",
+ "--dsv3",
+ "--iters",
+ "100",
+ "--warmup-ms",
+ "25",
+ "--json",
+ os.path.join(out_dir, "rmsnorm_bwd_dsv3_wsame.json"),
+ ],
+ ),
+ (
+ "softmax_fwd_bwd_quack_suite",
+ [
+ py,
+ script("benchmark_softmax_sm100.py"),
+ *common,
+ "--mode",
+ "fwd_bwd",
+ "--quack-suite",
+ "--iters",
+ "50",
+ "--warmup-ms",
+ "25",
+ "--json",
+ os.path.join(out_dir, "softmax_fwd_bwd_quack_suite.json"),
+ ],
+ ),
+ (
+ "softmax_fwd_bwd_dsv3",
+ [
+ py,
+ script("benchmark_softmax_sm100.py"),
+ *common,
+ "--mode",
+ "fwd_bwd",
+ "--dsv3",
+ "--iters",
+ "50",
+ "--warmup-ms",
+ "25",
+ "--json",
+ os.path.join(out_dir, "softmax_fwd_bwd_dsv3.json"),
+ ],
+ ),
+ (
+ "cross_entropy_fwd_bwd_quack_suite",
+ [
+ py,
+ script("benchmark_cross_entropy_sm100.py"),
+ *common,
+ "--mode",
+ "fwd_bwd",
+ "--quack-suite",
+ "--iters",
+ "50",
+ "--warmup-ms",
+ "25",
+ "--json",
+ os.path.join(out_dir, "cross_entropy_fwd_bwd_quack_suite.json"),
+ ],
+ ),
+ (
+ "cross_entropy_fwd_bwd_dsv3",
+ [
+ py,
+ script("benchmark_cross_entropy_sm100.py"),
+ *common,
+ "--mode",
+ "fwd_bwd",
+ "--dsv3",
+ "--iters",
+ "50",
+ "--warmup-ms",
+ "25",
+ "--json",
+ os.path.join(out_dir, "cross_entropy_fwd_bwd_dsv3.json"),
+ ],
+ ),
+ (
+ "layernorm_fwd_quack_suite",
+ [
+ py,
+ script("benchmark_layernorm_sm100.py"),
+ *common,
+ "--quack-suite",
+ "--iters",
+ "200",
+ "--warmup-ms",
+ "25",
+ "--json",
+ os.path.join(out_dir, "layernorm_fwd_quack_suite.json"),
+ ],
+ ),
+ (
+ "layernorm_fwd_dsv3",
+ [
+ py,
+ script("benchmark_layernorm_sm100.py"),
+ *common,
+ "--dsv3",
+ "--iters",
+ "200",
+ "--warmup-ms",
+ "25",
+ "--json",
+ os.path.join(out_dir, "layernorm_fwd_dsv3.json"),
+ ],
+ ),
+ ]
+
+ print(f"Writing results to: {out_dir}", flush=True)
+ for name, cmd in runs:
+ print(f"\n== {name} ==", flush=True)
+ _run(cmd, dry_run=bool(args.dry_run))
+
+
+if __name__ == "__main__":
+ main()
diff --git a/oink/benchmarks/readme/summarize_results.py b/oink/benchmarks/readme/summarize_results.py
new file mode 100644
index 0000000..70782dd
--- /dev/null
+++ b/oink/benchmarks/readme/summarize_results.py
@@ -0,0 +1,205 @@
+from __future__ import annotations
+
+import argparse
+import json
+import math
+import os
+from typing import Any, Dict, Iterable, List, Optional, Sequence
+
+
+def _load_json(path: str) -> Dict[str, Any]:
+ with open(path) as f:
+ return json.load(f)
+
+
+def _fmt_cell(v: object) -> str:
+ if v is None:
+ return ""
+ if isinstance(v, float):
+ if math.isfinite(v):
+ av = abs(v)
+ # Use scientific notation for very small values so we don't render
+ # meaningful error stats as "0.0000".
+ if av != 0.0 and av < 1e-3:
+ return f"{v:.2e}"
+ return f"{v:.4f}"
+ return str(v)
+ return str(v)
+
+
+def _md_table(rows: Sequence[Dict[str, Any]], columns: Sequence[str]) -> str:
+ header = "| " + " | ".join(columns) + " |"
+ sep = "|" + "|".join(["---"] * len(columns)) + "|"
+ lines = [header, sep]
+ for r in rows:
+ lines.append("| " + " | ".join(_fmt_cell(r.get(c)) for c in columns) + " |")
+ return "\n".join(lines)
+
+
+def _pick_columns(rows: Sequence[Dict[str, Any]]) -> List[str]:
+ preferred = [
+ "M",
+ "N",
+ "dtype",
+ "weight_dtype",
+ "mode",
+ "eps",
+ "store_rstd",
+ "return_rstd",
+ "return_mean",
+ "ignore_index",
+ "ours_ms",
+ "ours_tbps",
+ "ours_hbm_frac",
+ "quack_ms",
+ "quack_tbps",
+ "speedup_vs_quack",
+ ]
+ present = set().union(*(r.keys() for r in rows)) if rows else set()
+ cols = [c for c in preferred if c in present]
+ # Fall back to a stable sorted view if we missed everything (shouldn't happen).
+ return cols or sorted(present)
+
+
+def _geomean(values: Iterable[float]) -> Optional[float]:
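+    # Geometric mean of the positive, finite entries; e.g. [2.0, 0.5] -> 1.0.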
+ logs: List[float] = []
+ for v in values:
+ if v <= 0 or not math.isfinite(v):
+ continue
+ logs.append(math.log(v))
+ if not logs:
+ return None
+ return math.exp(sum(logs) / len(logs))
+
+
+def _collect_error_prefixes(rows: Sequence[Dict[str, Any]]) -> List[str]:
+ """Infer error-stat prefixes like `ours_err_dx` from row keys."""
+ prefixes: set[str] = set()
+ for r in rows:
+ for k in r.keys():
+ if not isinstance(k, str):
+ continue
+ if not k.endswith("_max_abs"):
+ continue
+ if "err_" not in k:
+ continue
+ prefixes.add(k[: -len("_max_abs")])
+ return sorted(prefixes)
+
+
+def _summarize_error_stats(rows: Sequence[Dict[str, Any]]) -> str:
+ prefixes = _collect_error_prefixes(rows)
+ if not prefixes:
+ return ""
+
+ out_rows: List[Dict[str, Any]] = []
+ for pfx in prefixes:
+ # Per-prefix worst-case across rows.
+ max_abs_vals = [float(r[pfx + "_max_abs"]) for r in rows if (pfx + "_max_abs") in r]
+ p99_abs_vals = [float(r[pfx + "_p99_abs"]) for r in rows if (pfx + "_p99_abs") in r]
+ rel_l2_vals = [float(r[pfx + "_rel_l2"]) for r in rows if (pfx + "_rel_l2") in r]
+ if not max_abs_vals and not p99_abs_vals and not rel_l2_vals:
+ continue
+ out_rows.append(
+ {
+ "metric": pfx,
+ "max_abs (max over shapes)": max(max_abs_vals) if max_abs_vals else None,
+ "p99_abs (max over shapes)": max(p99_abs_vals) if p99_abs_vals else None,
+ "rel_l2 (max over shapes)": max(rel_l2_vals) if rel_l2_vals else None,
+ }
+ )
+
+ if not out_rows:
+ return ""
+
+ cols = ["metric", "max_abs (max over shapes)", "p99_abs (max over shapes)", "rel_l2 (max over shapes)"]
+ return "\n".join(["", "### Error Stats (vs PyTorch ref)", "", _md_table(out_rows, cols), ""])
+
+
+def summarize_one(path: str) -> str:
+ payload = _load_json(path)
+ meta = payload.get("meta", {})
+ rows = payload.get("rows", [])
+ if not isinstance(rows, list):
+ rows = []
+
+ cols = _pick_columns(rows)
+ parts: List[str] = []
+
+ base = os.path.basename(path)
+ parts.append(f"## `{base}`")
+ if meta:
+ device = meta.get("device")
+ cap = meta.get("capability")
+ torch_ver = meta.get("torch")
+ cuda_ver = meta.get("cuda")
+ git_sha = meta.get("git_sha")
+ ts = meta.get("timestamp")
+ parts.append("")
+ parts.append(
+ f"- device: `{device}` | capability: `{cap}` | torch: `{torch_ver}` | cuda: `{cuda_ver}` | git_sha: `{git_sha}` | timestamp: `{ts}`"
+ )
+ method = meta.get("method")
+ if method is not None:
+ parts.append(f"- method: `{method}`")
+ if meta.get("warmup_ms") is not None and meta.get("rep_ms") is not None:
+ parts.append(f"- warmup_ms: `{meta.get('warmup_ms')}` | rep_ms: `{meta.get('rep_ms')}`")
+
+ if rows:
+ parts.append("")
+ parts.append(_md_table(rows, cols))
+
+ speeds = [float(r["speedup_vs_quack"]) for r in rows if "speedup_vs_quack" in r]
+ gm = _geomean(speeds)
+ if gm is not None:
+ parts.append("")
+ parts.append(f"- geomean speedup vs Quack: `{gm:.3f}x` (over {len(speeds)} shapes)")
+
+ err_block = _summarize_error_stats(rows)
+ if err_block:
+ parts.append(err_block.rstrip())
+ else:
+ parts.append("")
+ parts.append("_No rows found in JSON._")
+
+ parts.append("")
+ return "\n".join(parts)
+
+
+def main() -> None:
+ p = argparse.ArgumentParser(description="Summarize KernelAgent-Oink benchmark JSONs into Markdown tables.")
+ p.add_argument("--in-dir", type=str, required=True, help="Directory containing benchmark JSON files")
+ p.add_argument("--out", type=str, default=None, help="Optional output markdown path (default: stdout)")
+ args = p.parse_args()
+
+ in_dir = os.path.abspath(args.in_dir)
+ if not os.path.isdir(in_dir):
+ raise SystemExit(f"--in-dir is not a directory: {in_dir}")
+
+ json_paths = sorted(
+ os.path.join(in_dir, name) for name in os.listdir(in_dir) if name.endswith(".json")
+ )
+ if not json_paths:
+ raise SystemExit(f"No .json files found under: {in_dir}")
+
+ out_parts: List[str] = []
+ out_parts.append("# KernelAgent-Oink SM100 Benchmark Summary")
+ out_parts.append("")
+ out_parts.append(f"Input directory: `{in_dir}`")
+ out_parts.append("")
+ for path in json_paths:
+ out_parts.append(summarize_one(path))
+
+ text = "\n".join(out_parts).rstrip() + "\n"
+ if args.out is None:
+ print(text, end="")
+ return
+
+ out_path = os.path.abspath(args.out)
+ os.makedirs(os.path.dirname(out_path), exist_ok=True)
+ with open(out_path, "w") as f:
+ f.write(text)
+
+
+if __name__ == "__main__":
+ main()
diff --git a/oink/pyproject.toml b/oink/pyproject.toml
index a9ec306..0d19d6e 100644
--- a/oink/pyproject.toml
+++ b/oink/pyproject.toml
@@ -5,11 +5,26 @@ build-backend = "setuptools.build_meta"
[project]
name = "kernelagent-oink"
version = "0.1.0"
-description = "vLLM plugin that registers Oink Blackwell RMSNorm custom ops"
+description = "CuTeDSL kernels for Blackwell (SM100), shipped as a vLLM plugin"
readme = "README.md"
requires-python = ">=3.10"
license = {text = "Apache-2.0"}
authors = [{name = "PyTorch Labs"}]
+keywords = ["cuda", "cutlass", "cute", "cutedsl", "blackwell", "sm100", "vllm"]
+classifiers = [
+ "License :: OSI Approved :: Apache Software License",
+ "Programming Language :: Python :: 3",
+ "Programming Language :: Python :: 3 :: Only",
+ "Programming Language :: Python :: 3.10",
+ "Programming Language :: Python :: 3.11",
+ "Programming Language :: Python :: 3.12",
+ "Topic :: Scientific/Engineering :: Artificial Intelligence",
+]
+
+[project.urls]
+Repository = "https://github.com/meta-pytorch/KernelAgent"
+Documentation = "https://github.com/meta-pytorch/KernelAgent/tree/main/oink"
+Issues = "https://github.com/meta-pytorch/KernelAgent/issues"
# Keep dependencies minimal, but include the CuTeDSL stack required by the
# Blackwell RMSNorm implementation.
@@ -21,6 +36,13 @@ dependencies = [
"cuda-python",
]
+[project.optional-dependencies]
+# Optional extras for running the in-repo benchmark suite (not needed for vLLM integration).
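+# Install with, e.g.: pip install -e "./oink[bench]"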
+bench = [
+ "matplotlib",
+ "triton",
+]
+
[project.entry-points."vllm.general_plugins"]
oink = "kernelagent_oink:register"
diff --git a/oink/src/kernelagent_oink/__init__.py b/oink/src/kernelagent_oink/__init__.py
index bbbd7c1..f5c36a6 100644
--- a/oink/src/kernelagent_oink/__init__.py
+++ b/oink/src/kernelagent_oink/__init__.py
@@ -12,6 +12,18 @@
# See the License for the specific language governing permissions and
# limitations under the License.
+"""
+KernelAgent-Oink: SM100 CuTeDSL kernels + optional vLLM plugin.
+
+This package can be loaded as a vLLM "general plugin" (entrypoint group
+`vllm.general_plugins`). In that mode it registers Oink custom ops only when
+explicitly enabled via an environment variable (so installing the package does
+not change behavior by default).
+
+For standalone usage (outside vLLM), call `kernelagent_oink.register(force=True)`
+to register the custom ops explicitly.
+"""
+
from __future__ import annotations
import logging
@@ -48,11 +60,14 @@ def _compute_cutedsl_arch(major: int, minor: int) -> str:
return f"sm_{major}{minor}{suffix}"
-def register() -> None:
- """vLLM plugin entrypoint.
+def register(*, force: bool = False) -> None:
+ """Register Oink torch custom ops.
+
+ - vLLM plugin mode (default): no-op unless `VLLM_USE_OINK_RMSNORM` is truthy.
+ - Standalone mode: pass `force=True` to register explicitly.
- This function must be safe to call multiple times and must not raise.
- vLLM executes it in multiple processes (engine + workers).
+ This function must be safe to call multiple times and must not raise. vLLM
+ executes it in multiple processes (engine + workers).
"""
global _OPS_REGISTERED
@@ -60,8 +75,9 @@ def register() -> None:
return
# Gate on the vLLM integration flag so installing the package does not
- # change behavior unless explicitly enabled.
- if not _env_truthy("VLLM_USE_OINK_RMSNORM"):
+ # change behavior unless explicitly enabled. For standalone usage (outside
+ # vLLM), callers can pass force=True to register the ops explicitly.
+ if not force and not _env_truthy("VLLM_USE_OINK_RMSNORM"):
return
try:
diff --git a/oink/src/kernelagent_oink/blackwell/cross_entropy.py b/oink/src/kernelagent_oink/blackwell/cross_entropy.py
new file mode 100644
index 0000000..94f052f
--- /dev/null
+++ b/oink/src/kernelagent_oink/blackwell/cross_entropy.py
@@ -0,0 +1,1209 @@
+# Copyright (c) Meta Platforms, Inc. and affiliates.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+"""
+Cross-entropy forward + backward kernels for SM100 (Blackwell) in CuTeDSL.
+
+This module implements numerically stable cross-entropy over the last
+dimension of 2D logits tensors `(M, N)` together with its backward pass,
+targeting SM100 with Quack-style tiling, cp.async pipelines, and (for the
+forward pass) optional cluster-wide online softmax reductions, but without
+depending on the external `quack` package at runtime.
+
+Public APIs:
+
+- ``cross_entropy_forward(logits, target, ignore_index=-100, reduction="none")``
+ returns ``(loss, lse)`` where ``loss`` follows the requested reduction and
+ ``lse`` is always per-example log-sum-exp (shape ``(M,)``).
+- ``cross_entropy_backward(dloss, logits, target, lse, ignore_index=-100)``
+ returns per-logit gradients ``dlogits`` matching PyTorch /
+ ``quack.cross_entropy_bwd`` semantics for ``reduction="none"``.
+- ``cross_entropy(logits, target, ignore_index=-100, reduction="mean"|"sum"|"none")``
+ is a convenience wrapper that mirrors ``torch.nn.functional.cross_entropy``
+ reductions using the SM100 CuteDSL kernels for the forward pass.
+
+The kernels are self-contained and use only local helpers in
+`kernelagent_oink.blackwell.lite_quack` plus CuTeDSL/CUTLASS.
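+
+Example (illustrative sketch; shapes, dtypes, and device placement are arbitrary):
+
+    logits = torch.randn(8192, 32768, device="cuda", dtype=torch.bfloat16)
+    target = torch.randint(0, 32768, (8192,), device="cuda")
+    loss, lse = cross_entropy_forward(logits, target, reduction="none")
+    dlogits = cross_entropy_backward(torch.ones_like(loss), logits, target, lse)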
+"""
+
+from __future__ import annotations
+
+import importlib.metadata
+import math
+import os
+import re
+from typing import Literal, Optional, Type
+
+import torch
+from torch import Tensor
+
+import cuda.bindings.driver as cuda # provided by NVIDIA cuda-python
+
+# CuTeDSL caches generated MLIR into a tempdir under a global default
+# (`/tmp/$USER/cutlass_python_cache`). The cache bytecode format can differ across
+# `nvidia-cutlass-dsl` versions, and cross-version cache sharing causes noisy
+# warnings (and disables cache reuse), so default to a per-version cache directory.
+if "CUTE_DSL_CACHE_DIR" not in os.environ:
+ try:
+ _dsl_ver = importlib.metadata.version("nvidia-cutlass-dsl")
+ except Exception:
+ _dsl_ver = "unknown"
+ _dsl_ver = re.sub(r"[^0-9A-Za-z]+", "_", _dsl_ver)
+ _user = os.environ.get("USER") or os.environ.get("USERNAME") or "user"
+ _tmp = os.environ.get("TMPDIR") or "/tmp"
+ os.environ["CUTE_DSL_CACHE_DIR"] = os.path.join(
+ _tmp, _user, f"cutlass_python_cache_{_dsl_ver}"
+ )
+
+try:
+ import cutlass # type: ignore # noqa: F401
+except Exception as e:
+ raise ImportError(
+ "kernelagent_oink.blackwell.cross_entropy requires CuTeDSL's Python package "
+ "(`cutlass`, typically provided by `nvidia-cutlass-dsl`)."
+ ) from e
+
+import cutlass.cute as cute
+from cutlass import Boolean, Float32, Int32, const_expr
+from cutlass.cute import runtime as rt
+from cutlass.cute.runtime import from_dlpack
+
+from kernelagent_oink.blackwell.lite_quack import (
+ _KERNEL_ACCEPTS_LAYOUT_ARGS,
+ TORCH2CUTE_DTYPE,
+ ReductionBase,
+ domain_offset_i64,
+ fill_oob,
+ online_softmax_reduce,
+ predicate_k,
+)
+
+_FWD_COMPILE_CACHE: dict[tuple[type[cutlass.Numeric], int], cute.Kernel] = {}
+_BWD_COMPILE_CACHE: dict[tuple[type[cutlass.Numeric], int], cute.Kernel] = {}
+_PTR_FWD_COMPILE_CACHE: dict[tuple[object, ...], object] = {}
+_PTR_BWD_COMPILE_CACHE: dict[tuple[object, ...], object] = {}
+
+
+def _convert_logits_2d(x: Tensor) -> cute.Tensor:
+ """Convert a 2D logits tensor (M, N) into a CuTe tensor.
+
+ We assume 16-byte alignment and mark the layout compact and row-major
+ in the last dimension, matching the conventions used in the SM100
+ softmax and RMSNorm kernels.
+ """
+ assert x.dim() == 2, "Input logits must be 2D (M, N)"
+ return (
+ from_dlpack(x.detach(), assumed_align=16)
+ .mark_compact_shape_dynamic(mode=0, stride_order=(0, 1))
+ )
+
+
+def _convert_1d(t: Tensor, assumed_align: int) -> cute.Tensor:
+ """Convert a 1D tensor with a fully dynamic layout."""
+ assert t.dim() == 1, "Expected a 1D tensor"
+ return from_dlpack(t.detach(), assumed_align=assumed_align).mark_layout_dynamic()
+
+
+class CrossEntropyFwdSM100(ReductionBase):
+ """SM100-tuned cross-entropy forward kernel.
+
+ This mirrors the structure of ``quack.cross_entropy.CrossEntropy`` but
+ is simplified to always use the single-pass online softmax reduction and
+ never computes gradients inside the forward kernel.
+ """
+
+ def __init__(self, dtype: Type[cutlass.Numeric], N: int):
+ # Use one stage with an Int64 reduction buffer packing (max, sum_exp)
+ # pairs via lite_quack.online_softmax_reduce.
+ super().__init__(dtype, N, stage=1, reduction_dtype=cutlass.Int64)
+
+ def _calculate_threads_per_row(self) -> int:
+ N = self.N
+ return (
+ 8
+ if N <= 64
+ else (
+ 16
+ if N <= 128
+ else (32 if N <= 3072 else (64 if N <= 6144 else (128 if N <= 16384 else 256)))
+ )
+ )
+
+ def _set_cluster_n(self) -> None:
+ # Match Quack's cluster_n growth policy while keeping it explicit so
+ # we can tune SM100-specific shapes later if needed.
+ N = self.N
+ if const_expr(self.dtype.width == 16):
+ cluster_n = (
+ 1
+ if N <= 16 * 1024
+ else (
+ 2
+ if N <= 32 * 1024
+ else (4 if N <= 64 * 1024 else (8 if N <= 128 * 1024 else 16))
+ )
+ )
+ else: # fp32
+ cluster_n = (
+ 1
+ if N <= 16 * 1024
+ else (
+ 2
+ if N <= 64 * 1024
+ else (4 if N <= 128 * 1024 else (8 if N <= 256 * 1024 else 16))
+ )
+ )
+ self.cluster_n = cluster_n
+
+ @cute.jit
+ def __call__(
+ self,
+ mX: cute.Tensor, # (M, N)
+ mTarget: cute.Tensor, # (M,)
+ mLoss: cute.Tensor, # (M,)
+ mLSE: Optional[cute.Tensor], # (M,)
+ ignore_index: Int32,
+ stream: cuda.CUstream,
+ ) -> None:
+ assert mX.element_type == self.dtype
+ self._set_cluster_n()
+ # If N is not divisible by the full 128-bit vector width, step down
+ # to the largest compatible vector size as in Quack.
+ num_copy_bits = math.gcd(self.N, 128 // self.dtype.width) * self.dtype.width
+ tiler_mn, tv_layout = self._get_tv_layout(num_copy_bits=num_copy_bits)
+ num_threads = (
+ cute.size(tv_layout, mode=[0]) if _KERNEL_ACCEPTS_LAYOUT_ARGS else self._get_num_threads()
+ )
+ num_warps = num_threads // cute.arch.WARP_SIZE
+ kernel = (
+ self.kernel(
+ mX,
+ mTarget,
+ mLoss,
+ mLSE,
+ ignore_index,
+ tv_layout,
+ tiler_mn,
+ )
+ if _KERNEL_ACCEPTS_LAYOUT_ARGS
+ else self.kernel(
+ mX,
+ mTarget,
+ mLoss,
+ mLSE,
+ ignore_index,
+ )
+ )
+ kernel.launch(
+ grid=[cute.ceil_div(mX.shape[0], tiler_mn[0]), self.cluster_n, 1],
+ block=[num_threads, 1, 1],
+ cluster=[1, self.cluster_n, 1] if const_expr(self.cluster_n > 1) else None,
+ smem=self._smem_size_in_bytes(tiler_mn, num_warps),
+ stream=stream,
+ )
+
+ @cute.jit
+ def launch_from_ptrs(
+ self,
+ ptr_logits: cute.Pointer,
+ ptr_target: cute.Pointer,
+ ptr_loss: cute.Pointer,
+ ptr_lse: cute.Pointer,
+ M: Int32,
+ ld: Int32,
+ ignore_index: Int32,
+ stream: cuda.CUstream,
+ ) -> None:
+ """Pointer-based entrypoint that bypasses DLPack conversions."""
+ ld_assumed = cute.assume(ld, divby=128 // self.dtype.width)
+ layout_mn = cute.make_layout((M, self.N), stride=(ld_assumed, 1))
+ layout_m = cute.make_layout((M,), stride=(1,))
+ mX = cute.make_tensor(ptr_logits, layout_mn)
+ mTarget = cute.make_tensor(ptr_target, layout_m)
+ mLoss = cute.make_tensor(ptr_loss, layout_m)
+ mLSE = cute.make_tensor(ptr_lse, layout_m)
+ self.__call__(mX, mTarget, mLoss, mLSE, ignore_index, stream)
+
+ @cute.jit
+ def _kernel_impl(
+ self,
+ mX: cute.Tensor, # (M, N)
+ mTarget: cute.Tensor, # (M,)
+ mLoss: cute.Tensor, # (M,)
+ mLSE: Optional[cute.Tensor], # (M,)
+ ignore_index: Int32, # Index to ignore in loss computation
+ tv_layout: cute.Layout,
+ tiler_mn: cute.Shape,
+ ) -> None:
+ tidx, _, _ = cute.arch.thread_idx()
+ bidx, _, _ = cute.arch.block_idx()
+ if const_expr(self.cluster_n > 1):
+ cluster_y = cute.arch.block_idx()[1]
+ else:
+ cluster_y = const_expr(0)
+
+ shape: cute.Shape = mX.shape
+ idX = cute.make_identity_tensor(shape)
+
+ # Slice per-CTA region; use 64-bit indexing for large tensors.
+ mX_off = domain_offset_i64((bidx * tiler_mn[0], 0), mX)
+ gX = cute.local_tile(mX_off, tiler_mn, (0, cluster_y))
+ cX = cute.local_tile(idX, tiler_mn, (bidx, cluster_y))
+
+ smem = cutlass.utils.SmemAllocator()
+ sX = smem.allocate_tensor(
+ mX.element_type,
+ cute.make_ordered_layout(tiler_mn, order=(1, 0)),
+ byte_alignment=16,
+ )
+ reduction_buffer, mbar_ptr = self._allocate_reduction_buffer_and_mbar(smem, tv_layout)
+
+ # Copy setup: gmem -> smem via cp.async, 128-bit or narrower as needed.
+ num_copy_elems_X = tv_layout.shape[1][0]
+ num_copy_bits_X = mX.element_type.width * num_copy_elems_X
+ copy_atom_load_X = cute.make_copy_atom(
+ cute.nvgpu.cpasync.CopyG2SOp(),
+ gX.element_type,
+ num_bits_per_copy=num_copy_bits_X,
+ )
+ thr_copy_X = cute.make_tiled_copy(copy_atom_load_X, tv_layout, tiler_mn).get_slice(tidx)
+
+ tXgX = thr_copy_X.partition_S(gX)
+ tXsX = thr_copy_X.partition_D(sX)
+ tXcX = thr_copy_X.partition_S(cX)[(0, None), None, None]
+ tXrX = cute.make_fragment_like(tXgX)
+
+ num_warps = cute.size(tv_layout, mode=[0]) // cute.arch.WARP_SIZE
+ self._initialize_cluster(tidx, mbar_ptr, num_warps)
+
+ row = tXcX[0][0]
+ target = Int32.zero
+ if row < shape[0]:
+ target = Int32(mTarget[row])
+
+ is_even_N = const_expr(shape[1] == tiler_mn[1] * self.cluster_n)
+ tXpX = (
+ predicate_k(thr_copy_X.partition_S(cX), limit=shape[1])
+ if const_expr(not is_even_N)
+ else None
+ )
+ if row < shape[0]:
+ cute.copy(copy_atom_load_X, tXgX, tXsX, pred=tXpX)
+ cute.arch.cp_async_commit_group()
+ cute.arch.cp_async_wait_group(0)
+
+ # Fill out-of-bounds values with -inf so they are ignored in max/sum.
+ if const_expr(not is_even_N):
+ fill_oob(tXsX, tXpX, -tXsX.element_type.inf)
+
+ cute.autovec_copy(tXsX, tXrX)
+ x = tXrX.load().to(Float32)
+
+ should_ignore = Boolean(target == ignore_index)
+
+ # Load the target logit if this row is not ignored. Use Int64 indexing
+ # to safely handle very large tensors.
+ target_logit = Float32.zero
+ if row < shape[0] and tXcX[0][1] == 0 and not should_ignore:
+ mX_row = domain_offset_i64((row, 0), mX)
+ target_logit = Float32(mX_row[0, target])
+
+ threads_per_row = tv_layout.shape[0][0]
+ max_x, denom, _ = online_softmax_reduce(
+ x,
+ threads_per_row,
+ reduction_buffer[None, None, 0],
+ mbar_ptr,
+ hook_fn=cute.arch.cluster_wait if const_expr(self.cluster_n > 1) else None,
+ phase=None,
+ return_exp_x=False,
+ )
+
+ # Write loss and lse to gmem. Only one CTA in the cluster writes to
+ # avoid duplicate stores.
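+        # For a kept row: lse = max(x) + log(sum(exp(x - max(x)))) and
+        # loss = lse - x[target] = -log(softmax(x)[target]); ignored rows
+        # get loss = 0 while lse is still written.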
+ if (
+ tXcX[0][1] == 0
+ and row < shape[0]
+ and (self.cluster_n == 1 or cute.arch.block_idx_in_cluster() == 0)
+ ):
+ lse = max_x + cute.math.log(denom, fastmath=True)
+ loss_val = (lse - target_logit) if not should_ignore else Float32.zero
+ mLoss[row] = mLoss.element_type(loss_val)
+ if const_expr(mLSE is not None):
+ mLSE[row] = lse
+
+ if _KERNEL_ACCEPTS_LAYOUT_ARGS:
+
+ @cute.kernel
+ def kernel(
+ self,
+ mX: cute.Tensor, # (M, N)
+ mTarget: cute.Tensor, # (M,)
+ mLoss: cute.Tensor, # (M,)
+ mLSE: Optional[cute.Tensor], # (M,)
+ ignore_index: Int32,
+ tv_layout: cute.Layout,
+ tiler_mn: cute.Shape,
+ ) -> None:
+ self._kernel_impl(
+ mX,
+ mTarget,
+ mLoss,
+ mLSE,
+ ignore_index,
+ tv_layout,
+ tiler_mn,
+ )
+ else:
+
+ @cute.kernel
+ def kernel(
+ self,
+ mX: cute.Tensor, # (M, N)
+ mTarget: cute.Tensor, # (M,)
+ mLoss: cute.Tensor, # (M,)
+ mLSE: Optional[cute.Tensor], # (M,)
+ ignore_index: Int32,
+ ) -> None:
+ num_copy_bits = math.gcd(self.N, 128 // self.dtype.width) * self.dtype.width
+ tiler_mn, tv_layout = self._get_tv_layout(num_copy_bits=num_copy_bits)
+ self._kernel_impl(
+ mX,
+ mTarget,
+ mLoss,
+ mLSE,
+ ignore_index,
+ tv_layout,
+ tiler_mn,
+ )
+
+
+class CrossEntropyBackwardSM100:
+ """SM100-tuned cross-entropy backward kernel.
+
+ This is a direct port of ``quack.cross_entropy.CrossEntropyBackward`` to
+    the local lite_quack helpers, using cp.async tiling over the (M, N)
+    logits and broadcasting ``dloss`` / ``lse`` / ``target`` along the N
+    dimension with stride 0.
+ """
+
+ def __init__(self, dtype: Type[cutlass.Numeric], N: int):
+ self.dtype = dtype
+ self.N = N
+
+ def _get_num_threads(self) -> int:
+        # Keep in sync with _get_tv_layout(): N is capped at 16k per tile, so
+        # this currently always resolves to 128 threads.
+ N = min(self.N, 16384)
+ return 128 if N <= 16384 else 256
+
+ def _calculate_threads_per_row(self) -> int:
+ N = min(self.N, 16384) # We split by blocks of 16k in N.
+ return (
+ 8
+ if N <= 64
+ else (
+ 16
+ if N <= 128
+ else (32 if N <= 3072 else (64 if N <= 6144 else (128 if N <= 16384 else 256)))
+ )
+ )
+
+ def _get_tv_layout(self, num_copy_bits: int = 128) -> tuple[cute.Shape, cute.Layout]:
+ vecsize = num_copy_bits // self.dtype.width
+ assert self.N % vecsize == 0, f"Input N {self.N} is not divisible by vector size {vecsize}"
+ N = min(self.N, 16384)
+ num_threads = 128 if N <= 16384 else 256
+ threads_per_row = self._calculate_threads_per_row()
+ cols_per_block = num_threads // threads_per_row
+ num_blocks_N = cute.ceil_div(N // vecsize, threads_per_row)
+ tiler_mn = (cols_per_block, vecsize * num_blocks_N * threads_per_row)
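+        # Worked example (illustrative), bf16 logits with N=8192 and 128-bit
+        # copies: vecsize=8, threads_per_row=128, cols_per_block=1,
+        # num_blocks_N=8, so tiler_mn=(1, 8192) covers a full row per CTA.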
+ tv_layout = cute.make_layout(
+ ((threads_per_row, cols_per_block), (vecsize, num_blocks_N)),
+ stride=(
+ (vecsize * cols_per_block, 1),
+ (cols_per_block, cols_per_block * vecsize * threads_per_row),
+ ),
+ )
+ return tiler_mn, tv_layout
+
+ @cute.jit
+ def __call__(
+ self,
+ mX: cute.Tensor,
+ mTarget: cute.Tensor,
+ mDLoss: cute.Tensor,
+ mdX: cute.Tensor,
+ mLSE: cute.Tensor,
+ ignore_index: Int32, # Index to ignore in gradient computation
+ stream: cuda.CUstream,
+ ) -> None:
+ assert mX.element_type == self.dtype
+ assert mdX.element_type == self.dtype
+ num_copy_bits = math.gcd(self.N, 128 // self.dtype.width) * self.dtype.width
+ tiler_mn, tv_layout = self._get_tv_layout(num_copy_bits=num_copy_bits)
+ num_threads = (
+ cute.size(tv_layout, mode=[0]) if _KERNEL_ACCEPTS_LAYOUT_ARGS else self._get_num_threads()
+ )
+ # Broadcast (M,) tensors along the N dimension with stride 0.
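+        # e.g. a (M,) tensor with stride (1,) becomes an (M, N) view with
+        # stride (1, 0), so each row's scalar is broadcast across all columns.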
+ mDLoss, mTarget, mLSE = [
+ cute.make_tensor(
+ X.iterator,
+ cute.append(X.layout, cute.make_layout((self.N,), stride=(0,))),
+ )
+ for X in (mDLoss, mTarget, mLSE)
+ ]
+ smem_size = cute.size_in_bytes(
+ mX.element_type,
+ cute.make_ordered_layout(tiler_mn, order=(1, 0)),
+ )
+ kernel = (
+ self.kernel(
+ mX,
+ mTarget,
+ mDLoss,
+ mdX,
+ mLSE,
+ ignore_index,
+ tv_layout,
+ tiler_mn,
+ )
+ if _KERNEL_ACCEPTS_LAYOUT_ARGS
+ else self.kernel(
+ mX,
+ mTarget,
+ mDLoss,
+ mdX,
+ mLSE,
+ ignore_index,
+ )
+ )
+ kernel.launch(
+ grid=[
+ cute.ceil_div(mX.shape[0], tiler_mn[0]),
+ cute.ceil_div(mX.shape[1], tiler_mn[1]),
+ 1,
+ ],
+ block=[num_threads, 1, 1],
+ smem=smem_size,
+ stream=stream,
+ )
+
+ @cute.jit
+ def launch_from_ptrs(
+ self,
+ ptr_logits: cute.Pointer,
+ ptr_target: cute.Pointer,
+ ptr_dloss: cute.Pointer,
+ ptr_dx: cute.Pointer,
+ ptr_lse: cute.Pointer,
+ M: Int32,
+ ld: Int32,
+ ignore_index: Int32,
+ stream: cuda.CUstream,
+ ) -> None:
+ """Pointer-based entrypoint that bypasses DLPack conversions."""
+ ld_assumed = cute.assume(ld, divby=128 // self.dtype.width)
+ layout_mn = cute.make_layout((M, self.N), stride=(ld_assumed, 1))
+ layout_m = cute.make_layout((M,), stride=(1,))
+ mX = cute.make_tensor(ptr_logits, layout_mn)
+ mdX = cute.make_tensor(ptr_dx, layout_mn)
+ mTarget = cute.make_tensor(ptr_target, layout_m)
+ mDLoss = cute.make_tensor(ptr_dloss, layout_m)
+ mLSE = cute.make_tensor(ptr_lse, layout_m)
+ self.__call__(mX, mTarget, mDLoss, mdX, mLSE, ignore_index, stream)
+
+ @cute.jit
+ def _kernel_impl(
+ self,
+ mX: cute.Tensor, # (M, N)
+ mTarget: cute.Tensor, # (M,)
+ mDLoss: cute.Tensor, # (M,)
+ mdX: cute.Tensor, # (M, N)
+ mLSE: cute.Tensor, # (M,)
+ ignore_index: Int32, # Index to ignore in gradient computation
+ tv_layout: cute.Layout,
+ tiler_mn: cute.Shape,
+ ) -> None:
+ tidx, _, _ = cute.arch.thread_idx()
+ bidx, bidy, _ = cute.arch.block_idx()
+ shape = mX.shape
+
+ smem = cutlass.utils.SmemAllocator()
+ sX = smem.allocate_tensor(
+ mX.element_type,
+ cute.make_ordered_layout(tiler_mn, order=(1, 0)),
+ byte_alignment=16,
+ )
+
+ idX = cute.make_identity_tensor(shape)
+ mX_off, mdX_off = [
+ domain_offset_i64((bidx * tiler_mn[0], 0), mT) for mT in (mX, mdX)
+ ]
+ gX, gdX = [cute.local_tile(mT, tiler_mn, (0, bidy)) for mT in (mX_off, mdX_off)]
+ cX = cute.local_tile(idX, tiler_mn, (bidx, bidy))
+
+ num_copy_elems_X = tv_layout.shape[1][0]
+ num_copy_bits_X = mX.element_type.width * num_copy_elems_X
+ copy_atom_load_X = cute.make_copy_atom(
+ cute.nvgpu.cpasync.CopyG2SOp(),
+ gX.element_type,
+ num_bits_per_copy=num_copy_bits_X,
+ )
+ copy_atom_store_dX = cute.make_copy_atom(
+ cute.nvgpu.CopyUniversalOp(),
+ gdX.element_type,
+ num_bits_per_copy=num_copy_bits_X,
+ )
+ thr_copy_X = cute.make_tiled_copy(copy_atom_load_X, tv_layout, tiler_mn).get_slice(tidx)
+ thr_copy_dX = cute.make_tiled_copy(copy_atom_store_dX, tv_layout, tiler_mn).get_slice(tidx)
+
+ tXgX = thr_copy_X.partition_S(gX)
+ tXsX = thr_copy_X.partition_D(sX)
+ tXcX = thr_copy_X.partition_S(cX)[(0, None), None, None]
+ tXcFull = thr_copy_X.partition_S(cX)
+ tXgdX = thr_copy_dX.partition_D(gdX)
+
+ tXrX, tXrdX = [cute.make_fragment_like(thr) for thr in (tXgX, tXgdX)]
+
+ is_even_N = const_expr(shape[1] % tiler_mn[1] == 0)
+ row = tXcX[0][0]
+ tXpX = (
+ predicate_k(thr_copy_X.partition_S(cX), limit=shape[1])
+ if not is_even_N
+ else None
+ )
+ if row < shape[0]:
+ cute.copy(copy_atom_load_X, tXgX, tXsX, pred=tXpX)
+ cute.arch.cp_async_commit_group()
+ cute.arch.cp_async_wait_group(0)
+ if const_expr(not is_even_N):
+ fill_oob(tXsX, tXpX, -tXsX.element_type.inf)
+ cute.autovec_copy(tXsX, tXrX)
+ x = tXrX.load().to(Float32)
+
+ target = Int32.zero
+ dloss = Float32.zero
+ lse = Float32.zero
+ if row < shape[0]:
+ target = Int32(mTarget[row])
+ should_ignore = Boolean(target == ignore_index)
+ dloss = Float32(mDLoss[row]) if not should_ignore else Float32.zero
+ lse = Float32(mLSE[row])
+
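+        # dCE/dx_j = (softmax(x)_j - 1[j == target]) * dloss; softmax is
+        # recomputed from the saved lse as exp(x - lse), using exp2 with a
+        # log2(e) scale for the fast hardware path.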
+ log2_e = math.log2(math.e)
+ probs = cute.math.exp2(x * log2_e - (lse * log2_e), fastmath=True)
+ prob_shifted = probs - 1.0
+ mask = cute.make_fragment_like(tXrX, cutlass.Boolean)
+ for i in cutlass.range(cute.size(tXcFull), unroll_full=True):
+ mask[i] = tXcFull[i][1] == target
+ grad = cute.where(mask.load(), prob_shifted, probs)
+ grad = grad * dloss
+
+ tXrdX.store(grad.to(tXrdX.element_type))
+ tXpdX = (
+ predicate_k(thr_copy_dX.partition_S(cX), limit=shape[1])
+ if not is_even_N
+ else None
+ )
+ if row < shape[0]:
+ cute.copy(copy_atom_store_dX, tXrdX, tXgdX, pred=tXpdX)
+
+ if _KERNEL_ACCEPTS_LAYOUT_ARGS:
+
+ @cute.kernel
+ def kernel(
+ self,
+ mX: cute.Tensor, # (M, N)
+ mTarget: cute.Tensor, # (M,)
+ mDLoss: cute.Tensor, # (M,)
+ mdX: cute.Tensor, # (M, N)
+ mLSE: cute.Tensor, # (M,)
+ ignore_index: Int32, # Index to ignore in gradient computation
+ tv_layout: cute.Layout,
+ tiler_mn: cute.Shape,
+ ) -> None:
+ self._kernel_impl(
+ mX,
+ mTarget,
+ mDLoss,
+ mdX,
+ mLSE,
+ ignore_index,
+ tv_layout,
+ tiler_mn,
+ )
+ else:
+
+ @cute.kernel
+ def kernel(
+ self,
+ mX: cute.Tensor, # (M, N)
+ mTarget: cute.Tensor, # (M,)
+ mDLoss: cute.Tensor, # (M,)
+ mdX: cute.Tensor, # (M, N)
+ mLSE: cute.Tensor, # (M,)
+ ignore_index: Int32, # Index to ignore in gradient computation
+ ) -> None:
+ num_copy_bits = math.gcd(self.N, 128 // self.dtype.width) * self.dtype.width
+ tiler_mn, tv_layout = self._get_tv_layout(num_copy_bits=num_copy_bits)
+ self._kernel_impl(
+ mX,
+ mTarget,
+ mDLoss,
+ mdX,
+ mLSE,
+ ignore_index,
+ tv_layout,
+ tiler_mn,
+ )
+
+
+def cross_entropy_forward(
+ logits: Tensor,
+ target: Tensor,
+ ignore_index: int = -100,
+ reduction: Literal["none", "mean", "sum"] = "none",
+) -> tuple[Tensor, Tensor]:
+ """SM100 CuteDSL cross-entropy forward pass.
+
+ Args:
+ logits: Tensor of shape ``(M, N)`` on CUDA.
+ target: Tensor of shape ``(M,)`` with integer class indices.
+ ignore_index: Target value to ignore when computing the loss.
+ reduction: One of ``"none"``, ``"mean"``, or ``"sum"`` following
+ ``torch.nn.functional.cross_entropy`` semantics.
+
+ Returns:
+ A tuple ``(loss, lse)`` where:
+        - ``loss`` has shape ``(M,)`` if ``reduction="none"`` (rows whose
+          target equals ``ignore_index`` get a loss of 0) or is a scalar
+          otherwise.
+ - ``lse`` is the per-example log-sum-exp with shape ``(M,)``.
+ """
+ assert logits.dim() == 2, "logits must be 2D (M, N)"
+ assert target.dim() == 1, "target must be 1D (M,)"
+ assert logits.shape[0] == target.shape[0], "Batch dimensions must match"
+ assert logits.is_cuda and target.is_cuda, "logits and target must be on CUDA device"
+ assert logits.dtype in TORCH2CUTE_DTYPE, "Unsupported logits dtype"
+ assert target.dtype in (torch.int32, torch.int64), "target must be int32 or int64"
+
+ M, N = logits.shape
+ device = logits.device
+ dtype_cute = TORCH2CUTE_DTYPE[logits.dtype]
+
+ loss = torch.empty(M, device=device, dtype=torch.float32)
+ lse = torch.empty(M, device=device, dtype=torch.float32)
+
+ if _can_use_ptr_path_logits(logits) and _can_use_ptr_path_target(target):
+ _cross_entropy_forward_ptr_into(
+ logits=logits,
+ target=target,
+ loss=loss,
+ lse=lse,
+ ignore_index=int(ignore_index),
+ )
+ if reduction == "none":
+ return loss, lse
+ with torch.no_grad():
+ mask = target != ignore_index
+ if reduction == "sum":
+ reduced = loss.sum()
+ elif reduction == "mean":
+ valid = mask.sum()
+ if valid > 0:
+ reduced = loss[mask].sum() / valid.to(loss.dtype)
+ else:
+ reduced = loss.sum() * 0.0
+ else:
+ raise ValueError(
+ f"Invalid reduction mode: {reduction}. Expected 'none', 'mean', or 'sum'."
+ )
+ return reduced, lse
+
+ mX = _convert_logits_2d(logits)
+ mTarget = _convert_1d(target.to(torch.int64), assumed_align=8)
+ mLoss = _convert_1d(loss, assumed_align=4)
+ mLSE = _convert_1d(lse, assumed_align=4)
+
+ current_stream = cuda.CUstream(torch.cuda.current_stream().cuda_stream)
+
+ compile_key = (dtype_cute, N)
+ kernel = _FWD_COMPILE_CACHE.get(compile_key)
+ if kernel is None:
+ op = CrossEntropyFwdSM100(dtype_cute, N)
+ kernel = cute.compile(
+ op,
+ mX,
+ mTarget,
+ mLoss,
+ mLSE,
+ Int32(ignore_index),
+ current_stream,
+ )
+ _FWD_COMPILE_CACHE[compile_key] = kernel
+
+ kernel(mX, mTarget, mLoss, mLSE, Int32(ignore_index), current_stream)
+
+ if reduction == "none":
+ return loss, lse
+
+ with torch.no_grad():
+ mask = target != ignore_index
+ if reduction == "sum":
+ reduced = loss.sum()
+ elif reduction == "mean":
+ valid = mask.sum()
+ if valid > 0:
+ reduced = loss[mask].sum() / valid.to(loss.dtype)
+ else:
+ reduced = loss.sum() * 0.0
+ else:
+ raise ValueError(
+ f"Invalid reduction mode: {reduction}. Expected 'none', 'mean', or 'sum'."
+ )
+ return reduced, lse
+
+
+def _cross_entropy_backward_sm100(
+ logits: Tensor,
+ target: Tensor,
+ dloss: Tensor,
+ lse: Tensor,
+ dx: Tensor,
+ ignore_index: int = -100,
+) -> None:
+ """Internal SM100 cross-entropy backward dispatch using CuteDSL."""
+ assert logits.dim() == 2, "logits must be 2D (M, N)"
+ assert target.dim() == 1, "target must be 1D (M,)"
+ assert dloss.dim() == 1, "dloss must be 1D (M,)"
+ assert lse.dim() == 1, "lse must be 1D (M,)"
+ assert logits.shape[0] == target.shape[0] == dloss.shape[0] == lse.shape[0], (
+ "Batch dimensions must match"
+ )
+ assert logits.is_cuda and target.is_cuda and dloss.is_cuda and lse.is_cuda, (
+ "All tensors must be on CUDA device"
+ )
+ assert logits.dtype in TORCH2CUTE_DTYPE, "Unsupported logits dtype"
+ assert target.dtype in (torch.int32, torch.int64), "target must be int32 or int64"
+
+ M, N = logits.shape
+ dtype_cute = TORCH2CUTE_DTYPE[logits.dtype]
+
+ if (
+ _can_use_ptr_path_logits(logits)
+ and _can_use_ptr_path_logits(dx)
+ and _can_use_ptr_path_target(target)
+ and _can_use_ptr_path_f32_1d(dloss)
+ and _can_use_ptr_path_f32_1d(lse)
+ and logits.stride() == dx.stride()
+ ):
+ _cross_entropy_backward_ptr_into(
+ logits=logits,
+ target=target,
+ dloss=dloss,
+ lse=lse,
+ dx=dx,
+ ignore_index=int(ignore_index),
+ )
+ return
+
+ mX = _convert_logits_2d(logits)
+ mdX = _convert_logits_2d(dx)
+ mTarget = _convert_1d(target.to(torch.int64), assumed_align=8)
+ mDLoss = _convert_1d(dloss.to(torch.float32), assumed_align=4)
+ mLSE = _convert_1d(lse.to(torch.float32), assumed_align=4)
+
+ current_stream = cuda.CUstream(torch.cuda.current_stream().cuda_stream)
+
+ compile_key = (dtype_cute, N)
+ kernel = _BWD_COMPILE_CACHE.get(compile_key)
+ if kernel is None:
+ op = CrossEntropyBackwardSM100(dtype_cute, N)
+ kernel = cute.compile(
+ op,
+ mX,
+ mTarget,
+ mDLoss,
+ mdX,
+ mLSE,
+ Int32(ignore_index),
+ current_stream,
+ )
+ _BWD_COMPILE_CACHE[compile_key] = kernel
+
+ kernel(mX, mTarget, mDLoss, mdX, mLSE, Int32(ignore_index), current_stream)
+
+
+def _can_use_ptr_path_logits(x: Tensor) -> bool:
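+    # Accepts 2D row-major tensors (stride(1) == 1) whose rows stay 16B-aligned.
+    # Illustrative example: a bf16 view with stride (N + 64, 1) passes, since
+    # divby = 128 // 16 = 8 and (N + 64) % 8 == 0 whenever N % 8 == 0.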
+ if not x.is_cuda or x.dim() != 2:
+ return False
+ if x.dtype not in TORCH2CUTE_DTYPE:
+ return False
+ if x.stride(1) != 1:
+ return False
+ if (x.data_ptr() % 16) != 0:
+ return False
+ dtype_x = TORCH2CUTE_DTYPE[x.dtype]
+ divby = 128 // dtype_x.width
+ if (x.stride(0) % divby) != 0:
+ return False
+ return True
+
+
+def _can_use_ptr_path_target(t: Tensor) -> bool:
+ if not t.is_cuda or t.dim() != 1:
+ return False
+ if t.dtype is not torch.int64:
+ return False
+ if not t.is_contiguous():
+ return False
+ if t.stride(0) != 1:
+ return False
+ if (t.data_ptr() % 8) != 0:
+ return False
+ return True
+
+
+def _can_use_ptr_path_f32_1d(t: Tensor) -> bool:
+ if not t.is_cuda or t.dim() != 1:
+ return False
+ if t.dtype is not torch.float32:
+ return False
+ if not t.is_contiguous():
+ return False
+ if t.stride(0) != 1:
+ return False
+ if (t.data_ptr() % 4) != 0:
+ return False
+ return True
+
+
+def _cross_entropy_forward_ptr_into(
+ *,
+ logits: Tensor,
+ target: Tensor,
+ loss: Tensor,
+ lse: Tensor,
+ ignore_index: int,
+) -> None:
+ assert logits.is_cuda and logits.dim() == 2
+ assert target.is_cuda and target.dim() == 1 and target.shape[0] == logits.shape[0]
+ assert target.dtype is torch.int64
+ assert loss.is_cuda and loss.shape == (logits.shape[0],) and loss.dtype is torch.float32
+ assert lse.is_cuda and lse.shape == (logits.shape[0],) and lse.dtype is torch.float32
+
+ M, N = logits.shape
+ device_index = logits.get_device()
+ if torch.cuda.current_device() != device_index:
+ torch.cuda.set_device(device_index)
+ stream = cuda.CUstream(int(torch.cuda.current_stream().cuda_stream))
+
+ dtype_x = TORCH2CUTE_DTYPE[logits.dtype]
+ key = ("ptr_fwd", int(N), dtype_x, int(device_index))
+ compiled = _PTR_FWD_COMPILE_CACHE.get(key)
+ if compiled is None:
+ op = CrossEntropyFwdSM100(dtype_x, int(N))
+ ptr_logits = rt.make_ptr(
+ dtype_x, logits.data_ptr(), mem_space=rt.AddressSpace.gmem, assumed_align=16
+ )
+ ptr_target = rt.make_ptr(
+ cutlass.Int64,
+ target.data_ptr(),
+ mem_space=rt.AddressSpace.gmem,
+ assumed_align=8,
+ )
+ ptr_loss = rt.make_ptr(
+ cutlass.Float32,
+ loss.data_ptr(),
+ mem_space=rt.AddressSpace.gmem,
+ assumed_align=4,
+ )
+ ptr_lse = rt.make_ptr(
+ cutlass.Float32,
+ lse.data_ptr(),
+ mem_space=rt.AddressSpace.gmem,
+ assumed_align=4,
+ )
+ compiled = cute.compile(
+ op.launch_from_ptrs,
+ ptr_logits,
+ ptr_target,
+ ptr_loss,
+ ptr_lse,
+ Int32(int(M)),
+ Int32(int(logits.stride(0))),
+ Int32(int(ignore_index)),
+ stream,
+ )
+ _PTR_FWD_COMPILE_CACHE[key] = compiled
+
+ ptr_logits = rt.make_ptr(
+ dtype_x, logits.data_ptr(), mem_space=rt.AddressSpace.gmem, assumed_align=16
+ )
+ ptr_target = rt.make_ptr(
+ cutlass.Int64,
+ target.data_ptr(),
+ mem_space=rt.AddressSpace.gmem,
+ assumed_align=8,
+ )
+ ptr_loss = rt.make_ptr(
+ cutlass.Float32,
+ loss.data_ptr(),
+ mem_space=rt.AddressSpace.gmem,
+ assumed_align=4,
+ )
+ ptr_lse = rt.make_ptr(
+ cutlass.Float32,
+ lse.data_ptr(),
+ mem_space=rt.AddressSpace.gmem,
+ assumed_align=4,
+ )
+ compiled(
+ ptr_logits,
+ ptr_target,
+ ptr_loss,
+ ptr_lse,
+ Int32(int(M)),
+ Int32(int(logits.stride(0))),
+ Int32(int(ignore_index)),
+ stream,
+ )
+
+
+def _cross_entropy_backward_ptr_into(
+ *,
+ logits: Tensor,
+ target: Tensor,
+ dloss: Tensor,
+ lse: Tensor,
+ dx: Tensor,
+ ignore_index: int,
+) -> None:
+ assert logits.is_cuda and logits.dim() == 2
+ assert target.is_cuda and target.dim() == 1 and target.shape[0] == logits.shape[0]
+ assert target.dtype is torch.int64
+ assert dloss.is_cuda and dloss.shape == (logits.shape[0],) and dloss.dtype is torch.float32
+ assert lse.is_cuda and lse.shape == (logits.shape[0],) and lse.dtype is torch.float32
+ assert dx.is_cuda and dx.shape == logits.shape and dx.dtype == logits.dtype
+ assert dx.stride() == logits.stride(), "Pointer path expects dx to match logits strides"
+
+ M, N = logits.shape
+ device_index = logits.get_device()
+ if torch.cuda.current_device() != device_index:
+ torch.cuda.set_device(device_index)
+ stream = cuda.CUstream(int(torch.cuda.current_stream().cuda_stream))
+
+ dtype_x = TORCH2CUTE_DTYPE[logits.dtype]
+ key = ("ptr_bwd", int(N), dtype_x, int(device_index))
+ compiled = _PTR_BWD_COMPILE_CACHE.get(key)
+ if compiled is None:
+ op = CrossEntropyBackwardSM100(dtype_x, int(N))
+ ptr_logits = rt.make_ptr(
+ dtype_x, logits.data_ptr(), mem_space=rt.AddressSpace.gmem, assumed_align=16
+ )
+ ptr_target = rt.make_ptr(
+ cutlass.Int64,
+ target.data_ptr(),
+ mem_space=rt.AddressSpace.gmem,
+ assumed_align=8,
+ )
+ ptr_dloss = rt.make_ptr(
+ cutlass.Float32,
+ dloss.data_ptr(),
+ mem_space=rt.AddressSpace.gmem,
+ assumed_align=4,
+ )
+ ptr_dx = rt.make_ptr(
+ dtype_x, dx.data_ptr(), mem_space=rt.AddressSpace.gmem, assumed_align=16
+ )
+ ptr_lse = rt.make_ptr(
+ cutlass.Float32,
+ lse.data_ptr(),
+ mem_space=rt.AddressSpace.gmem,
+ assumed_align=4,
+ )
+ compiled = cute.compile(
+ op.launch_from_ptrs,
+ ptr_logits,
+ ptr_target,
+ ptr_dloss,
+ ptr_dx,
+ ptr_lse,
+ Int32(int(M)),
+ Int32(int(logits.stride(0))),
+ Int32(int(ignore_index)),
+ stream,
+ )
+ _PTR_BWD_COMPILE_CACHE[key] = compiled
+
+ ptr_logits = rt.make_ptr(
+ dtype_x, logits.data_ptr(), mem_space=rt.AddressSpace.gmem, assumed_align=16
+ )
+ ptr_target = rt.make_ptr(
+ cutlass.Int64,
+ target.data_ptr(),
+ mem_space=rt.AddressSpace.gmem,
+ assumed_align=8,
+ )
+ ptr_dloss = rt.make_ptr(
+ cutlass.Float32,
+ dloss.data_ptr(),
+ mem_space=rt.AddressSpace.gmem,
+ assumed_align=4,
+ )
+ ptr_dx = rt.make_ptr(dtype_x, dx.data_ptr(), mem_space=rt.AddressSpace.gmem, assumed_align=16)
+ ptr_lse = rt.make_ptr(
+ cutlass.Float32,
+ lse.data_ptr(),
+ mem_space=rt.AddressSpace.gmem,
+ assumed_align=4,
+ )
+ compiled(
+ ptr_logits,
+ ptr_target,
+ ptr_dloss,
+ ptr_dx,
+ ptr_lse,
+ Int32(int(M)),
+ Int32(int(logits.stride(0))),
+ Int32(int(ignore_index)),
+ stream,
+ )
+
+
+def cross_entropy_backward(
+ dloss: Tensor,
+ logits: Tensor,
+ target: Tensor,
+ lse: Tensor,
+ ignore_index: int = -100,
+) -> Tensor:
+ """SM100 CuteDSL cross-entropy backward pass.
+
+ Args:
+ dloss: Upstream gradient of shape ``(M,)`` corresponding to
+ ``reduction="none"``.
+ logits: Input logits tensor of shape ``(M, N)``.
+ target: Integer class indices of shape ``(M,)``.
+ lse: Per-example log-sum-exp tensor of shape ``(M,)`` as returned
+ by :func:`cross_entropy_forward`.
+ ignore_index: Target value to ignore in gradient computation.
+
+ Returns:
+ ``dlogits`` of shape ``(M, N)`` with the same dtype as ``logits``.
+ """
+ assert logits.dim() == 2, "logits must be 2D (M, N)"
+ assert dloss.dim() == 1, "dloss must be 1D (M,)"
+ assert logits.size(0) == dloss.size(0), "Batch dimensions must match"
+ assert logits.is_cuda and dloss.is_cuda, "logits and dloss must be on CUDA device"
+
+ dx = torch.empty_like(logits)
+ _cross_entropy_backward_sm100(
+ logits,
+ target,
+ dloss,
+ lse,
+ dx,
+ ignore_index=ignore_index,
+ )
+ return dx
+
+
+def cross_entropy(
+ logits: Tensor,
+ target: Tensor,
+ ignore_index: int = -100,
+ reduction: Literal["none", "mean", "sum"] = "mean",
+) -> Tensor:
+ """Convenience wrapper mirroring ``torch.nn.functional.cross_entropy`` reductions.
+
+ This uses :func:`cross_entropy_forward` under the hood but returns only
+ the reduced loss tensor.
+ """
+ loss, _lse = cross_entropy_forward(
+ logits,
+ target,
+ ignore_index=ignore_index,
+ reduction="none",
+ )
+ if reduction == "none":
+ return loss
+ mask = target != ignore_index
+ if reduction == "sum":
+ return loss.sum()
+ if reduction == "mean":
+ valid = mask.sum()
+ if valid > 0:
+ return loss[mask].sum() / valid.to(loss.dtype)
+ return loss.sum() * 0.0
+ raise ValueError(
+ f"Invalid reduction mode: {reduction}. Expected one of 'none', 'mean', or 'sum'."
+ )
+
+
+def verify_cross_entropy_parity(
+ M: int,
+ N: int,
+ dtype: torch.dtype = torch.bfloat16,
+ ignore_index: int = -100,
+) -> None:
+ """Compare SM100 CuteDSL cross-entropy against PyTorch for a single shape."""
+ device = torch.device("cuda")
+ torch.manual_seed(0)
+
+ logits = 0.1 * torch.randn(M, N, device=device, dtype=dtype)
+ logits.requires_grad_(True)
+ target = torch.randint(0, N, (M,), device=device, dtype=torch.int64)
+
+ # Optionally sprinkle some ignore_index entries for robustness.
+ if ignore_index != -100:
+ mask = torch.rand(M, device=device) < 0.1
+ target[mask] = ignore_index
+
+ loss, lse = cross_entropy_forward(logits, target, ignore_index=ignore_index, reduction="none")
+
+ logits_ref = logits.detach().clone().requires_grad_()
+ target_ref = target.detach().clone()
+ loss_ref = torch.nn.functional.cross_entropy(
+ logits_ref.float(),
+ target_ref,
+ ignore_index=ignore_index,
+ reduction="none",
+ )
+
+ # Forward parity
+ if dtype in (torch.float16, torch.bfloat16):
+ atol = 5e-2
+ rtol = 5e-2
+ else:
+ atol = 1e-4
+ rtol = 1e-4
+ torch.testing.assert_close(loss, loss_ref, atol=atol, rtol=rtol)
+
+ # Backward parity
+ dloss = torch.randn_like(loss_ref)
+ (dx_ref,) = torch.autograd.grad(loss_ref, logits_ref, grad_outputs=dloss)
+ dx = cross_entropy_backward(dloss, logits, target, lse, ignore_index=ignore_index)
+ torch.testing.assert_close(dx, dx_ref.to(logits.dtype), atol=atol, rtol=rtol)
+
+
+if __name__ == "__main__":
+ # Minimal functional check when executed directly. For performance
+ # comparisons and detailed tuning, use the dedicated benchmark harness.
+ if not torch.cuda.is_available():
+ print("CUDA not available; cross-entropy parity check skipped.")
+ raise SystemExit(0)
+
+ M, N = 1024, 8192
+ dtype = torch.bfloat16
+ verify_cross_entropy_parity(M, N, dtype=dtype, ignore_index=-100)
+ print("SM100 cross-entropy CuteDSL parity check passed.")
diff --git a/oink/src/kernelagent_oink/blackwell/layernorm.py b/oink/src/kernelagent_oink/blackwell/layernorm.py
new file mode 100644
index 0000000..05b11de
--- /dev/null
+++ b/oink/src/kernelagent_oink/blackwell/layernorm.py
@@ -0,0 +1,1368 @@
+# Copyright (c) Meta Platforms, Inc. and affiliates.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+"""
+LayerNorm kernel for SM100 (Blackwell) in CuteDSL.
+
+This implementation:
+- Mirrors Quack's LayerNorm tiling / cluster policy / cp.async pipeline
+ but uses only local helpers so that it does not depend on the external
+ `quack` package at runtime.
+- Supports fp16 / bf16 / fp32 inputs with fp32 accumulation.
+- Optionally writes out per-row `rstd` and `mean` buffers for reuse in
+ backward or fused kernels.
+
+Backward is implemented with dedicated CuteDSL kernels for input and
+parameter gradients (dx, dweight, dbias), avoiding PyTorch autograd
+while matching `torch.nn.functional.layer_norm`'s gradients numerically.
+"""
+
+from __future__ import annotations
+
+import importlib.metadata
+import os
+import re
+import operator
+from typing import Optional, Tuple, Type
+
+import torch
+from torch import Tensor
+
+import cuda.bindings.driver as cuda # provided by NVIDIA cuda-python
+
+# CuTeDSL caches generated MLIR into a tempdir under a global default
+# (`/tmp/$USER/cutlass_python_cache`). The cache bytecode format can differ across
+# `nvidia-cutlass-dsl` versions, and cross-version cache sharing causes noisy
+# warnings (and disables cache reuse).
+if "CUTE_DSL_CACHE_DIR" not in os.environ:
+ try:
+ _dsl_ver = importlib.metadata.version("nvidia-cutlass-dsl")
+ except Exception:
+ _dsl_ver = "unknown"
+ _dsl_ver = re.sub(r"[^0-9A-Za-z]+", "_", _dsl_ver)
+ _user = os.environ.get("USER") or os.environ.get("USERNAME") or "user"
+ _tmp = os.environ.get("TMPDIR") or "/tmp"
+ os.environ["CUTE_DSL_CACHE_DIR"] = os.path.join(
+ _tmp, _user, f"cutlass_python_cache_{_dsl_ver}"
+ )
+
+try:
+ import cutlass # type: ignore # noqa: F401
+except Exception as e:
+ raise ImportError(
+ "kernelagent_oink.blackwell.layernorm requires CuTeDSL's Python package "
+ "(`cutlass`, typically provided by `nvidia-cutlass-dsl`)."
+ ) from e
+
+import cutlass.cute as cute
+from cutlass import Float32, Int32, const_expr
+from cutlass.cute import runtime as rt
+from cutlass.cute.runtime import from_dlpack
+
+# Simple compile cache for the forward kernel
+_COMPILE_CACHE: dict[Tuple[int, type[cutlass.Numeric], bool, bool, bool], object] = {}
+_PTR_COMPILE_CACHE: dict[Tuple[object, ...], object] = {}
+
+# Backward compile caches: one for dx, one for parameter gradients.
+_BWD_DX_COMPILE_CACHE: dict[Tuple[int, Type[cutlass.Numeric]], object] = {}
+_BWD_PARAM_COMPILE_CACHE: dict[Tuple[int, Type[cutlass.Numeric], bool], object] = {}
+
+# Local helpers cloned from Quack via lite_quack so that this kernel does
+# not depend on `quack` at runtime.
+from kernelagent_oink.blackwell.lite_quack import (
+ _KERNEL_ACCEPTS_LAYOUT_ARGS,
+ TORCH2CUTE_DTYPE,
+ ReductionBase as _ReductionBase,
+ convert_from_dlpack as convert_from_dlpack_cute,
+ domain_offset_i64,
+ get_sm_count,
+ predicate_k,
+ row_reduce,
+ warp_reduce,
+)
+
+
+def _convert_row_major(t: Tensor) -> cute.Tensor:
+ """
+ Convert a 2D row-major torch.Tensor to a CuTeDSL tensor with a compact,
+ dynamic layout on the leading dimension.
+ """
+ return from_dlpack(t.detach(), assumed_align=16).mark_compact_shape_dynamic(
+ mode=0,
+ stride_order=(0, 1),
+ )
+
+
+class LayerNormSM100(_ReductionBase):
+ """
+ SM100 LayerNorm forward kernel.
+
+ This mirrors `quack.layernorm.LayerNorm`'s schedule:
+ - Stage=2 pipeline: first pass computes mean, second pass computes
+ variance / rstd and normalization.
+ - Threads-per-row and cluster_n policy follow Quack's LayerNorm
+ heuristics to keep tensor-core friendly tiles across N.
+ - Optional `reload_from` hint enables reloading X from SMEM for large-N
+ shapes to shorten register lifetimes.
+
+ Differences vs Quack:
+ - Bias is optional and supported directly in the kernel.
+ - Dtype mapping and reduction helpers come from `lite_quack`.
+ """
+
+ def __init__(self, dtype: type[cutlass.Numeric], N: int):
+ super().__init__(dtype, N, stage=2) # 2 stages for mean and var
+ # Default reload policy mirrors Quack: use SMEM reload only for
+ # very large hidden sizes. We keep this conservative for LayerNorm
+ # and tune primarily via threads-per-block / cluster_n.
+ self.reload_from: Optional[str] = None if N <= 16384 else "smem"
+ self.delay_w_load: bool = False
+
+ def _calculate_threads_per_row(self) -> int:
+ # Match Quack's LayerNorm threads-per-row buckets.
+ N = self.N
+ return (
+ 8
+ if N <= 64
+ else (
+ 16
+ if N <= 128
+ else (32 if N <= 3072 else (64 if N <= 6144 else (128 if N <= 16384 else 256)))
+ )
+ )
+
+ def _set_cluster_n(self) -> None:
+ # Cluster_n policy mirrors quack.layernorm.LayerNorm._set_cluster_n.
+ N = self.N
+ if const_expr(self.dtype.width == 16):
+ cluster_n = (
+ 1
+ if N <= 16 * 1024
+ else (
+ 2
+ if N <= 32 * 1024
+ else (4 if N <= 64 * 1024 else (8 if N <= 128 * 1024 else 16))
+ )
+ )
+ else:
+ cluster_n = (
+ 1
+ if N <= 32 * 1024
+ else (
+ 2
+ if N <= 64 * 1024
+ else (4 if N <= 128 * 1024 else (8 if N <= 256 * 1024 else 16))
+ )
+ )
+ self.cluster_n = cluster_n
+
+ @cute.jit
+ def __call__(
+ self,
+ mX: cute.Tensor,
+ mW: cute.Tensor,
+ mB: Optional[cute.Tensor],
+ mO: cute.Tensor,
+ mRstd: Optional[cute.Tensor],
+ mMean: Optional[cute.Tensor],
+ stream: cuda.CUstream,
+ eps: Float32 = 1e-6,
+ ):
+ assert mX.element_type == self.dtype
+ assert mO.element_type == self.dtype
+
+ # Tiling and cluster policy (mirrors Quack LayerNorm).
+ self._set_cluster_n()
+ tiler_mn, tv_layout = self._get_tv_layout()
+ num_threads = (
+ cute.size(tv_layout, mode=[0]) if _KERNEL_ACCEPTS_LAYOUT_ARGS else self._get_num_threads()
+ )
+ num_warps = num_threads // cute.arch.WARP_SIZE
+
+ # Expand weight / bias to match tiler_mn[0] rows per CTA.
+ mW = cute.make_tensor(
+ mW.iterator,
+ cute.prepend(mW.layout, cute.make_layout((tiler_mn[0],), stride=(0,))),
+ )
+ if const_expr(mB is not None):
+ mB = cute.make_tensor(
+ mB.iterator,
+ cute.prepend(mB.layout, cute.make_layout((tiler_mn[0],), stride=(0,))),
+ )
+ if const_expr(mRstd is not None):
+ mRstd = cute.make_tensor(
+ mRstd.iterator,
+ cute.append(mRstd.layout, cute.make_layout((self.N,), stride=(0,))),
+ )
+ if const_expr(mMean is not None):
+ mMean = cute.make_tensor(
+ mMean.iterator,
+ cute.append(mMean.layout, cute.make_layout((self.N,), stride=(0,))),
+ )
+
+ kernel = (
+ self.kernel(
+ mX,
+ mW,
+ mB,
+ mO,
+ mRstd,
+ mMean,
+ eps,
+ tv_layout,
+ tiler_mn,
+ const_expr(self.reload_from),
+ const_expr(self.delay_w_load),
+ )
+ if _KERNEL_ACCEPTS_LAYOUT_ARGS
+ else self.kernel(
+ mX,
+ mW,
+ mB,
+ mO,
+ mRstd,
+ mMean,
+ eps,
+ )
+ )
+ kernel.launch(
+ grid=[cute.ceil_div(mX.shape[0], tiler_mn[0]), self.cluster_n, 1],
+ block=[num_threads, 1, 1],
+ cluster=[
+ 1,
+ self.cluster_n,
+ 1,
+ ]
+ if const_expr(self.cluster_n > 1)
+ else None,
+ smem=self._smem_size_in_bytes(tiler_mn, num_warps),
+ stream=stream,
+ )
+
+ @cute.jit
+ def launch_from_ptrs(
+ self,
+ ptr_x: cute.Pointer,
+ ptr_w: cute.Pointer,
+ ptr_b: Optional[cute.Pointer],
+ ptr_out: cute.Pointer,
+ ptr_rstd: Optional[cute.Pointer],
+ ptr_mean: Optional[cute.Pointer],
+ M: Int32,
+ ld: Int32,
+ stream: cuda.CUstream,
+ eps: Float32 = 1e-6,
+ ) -> None:
+ """Pointer-based entrypoint that bypasses DLPack conversions.
+
+ This reconstructs cute.Tensor views from raw device pointers + explicit
+ layouts inside the JIT graph, reusing the tuned LayerNormSM100 schedule.
+ """
+ # The kernel uses 128-bit vectorized copies for X. Mirror Quack's
+ # `divisibility=128 // dtype.width` contract so the compiler can
+ # prove alignment for cp.async.
+ ld_assumed = cute.assume(ld, divby=128 // self.dtype.width)
+ # Match `mark_compact_shape_dynamic(mode=0, ...)`: M is dynamic, N is static.
+ layout_mn = cute.make_layout((M, self.N), stride=(ld_assumed, 1))
+ layout_n = cute.make_layout((self.N,), stride=(1,))
+ layout_m = cute.make_layout((M,), stride=(1,))
+
+ mX = cute.make_tensor(ptr_x, layout_mn)
+ mO = cute.make_tensor(ptr_out, layout_mn)
+ mW = cute.make_tensor(ptr_w, layout_n)
+ mB = cute.make_tensor(ptr_b, layout_n) if const_expr(ptr_b is not None) else None
+ mRstd = (
+ cute.make_tensor(ptr_rstd, layout_m)
+ if const_expr(ptr_rstd is not None)
+ else None
+ )
+ mMean = (
+ cute.make_tensor(ptr_mean, layout_m)
+ if const_expr(ptr_mean is not None)
+ else None
+ )
+
+ self.__call__(mX, mW, mB, mO, mRstd, mMean, stream, eps)
+
+ @cute.jit
+ def _kernel_impl(
+ self,
+ mX: cute.Tensor,
+ mW: cute.Tensor,
+ mB: Optional[cute.Tensor],
+ mO: cute.Tensor,
+ mRstd: Optional[cute.Tensor],
+ mMean: Optional[cute.Tensor],
+ eps: Float32,
+ tv_layout: cute.Layout,
+ tiler_mn: cute.Shape,
+ reload_from: cutlass.Constexpr,
+ delay_w_load: cutlass.Constexpr,
+ ):
+ tidx, _, _ = cute.arch.thread_idx()
+ bidx, _, _ = cute.arch.block_idx()
+ if const_expr(self.cluster_n > 1):
+ cluster_y = cute.arch.block_idx()[1]
+ else:
+ cluster_y = const_expr(0)
+
+ smem = cutlass.utils.SmemAllocator()
+ sX = smem.allocate_tensor(
+ mX.element_type,
+ cute.make_ordered_layout(tiler_mn, order=(1, 0)),
+ byte_alignment=16,
+ )
+ reduction_buffer, mbar_ptr = self._allocate_reduction_buffer_and_mbar(smem, tv_layout)
+
+ shape = mX.shape
+ idX = cute.make_identity_tensor(shape)
+
+ # Slice for CTAs: use domain_offset_i64 to handle >2^31 elements.
+ mX, mO = [
+ domain_offset_i64((bidx * tiler_mn[0], 0), mT) for mT in (mX, mO)
+ ]
+ gX, gO = [cute.local_tile(mT, tiler_mn, (0, cluster_y)) for mT in (mX, mO)]
+ cX = cute.local_tile(idX, tiler_mn, (bidx, cluster_y))
+ gW = cute.local_tile(mW, tiler_mn, (0, cluster_y))
+ gB = (
+ cute.local_tile(mB, tiler_mn, (0, cluster_y))
+ if const_expr(mB is not None)
+ else None
+ )
+ gRstd = (
+ cute.local_tile(mRstd, tiler_mn, (bidx, cluster_y))
+ if const_expr(mRstd is not None)
+ else None
+ )
+ gMean = (
+ cute.local_tile(mMean, tiler_mn, (bidx, cluster_y))
+ if const_expr(mMean is not None)
+ else None
+ )
+
+ # Copy atoms for X / W / B / O.
+ copy_atom_load_X = cute.make_copy_atom(
+ cute.nvgpu.CopyUniversalOp(),
+ mX.element_type,
+ num_bits_per_copy=128,
+ )
+ copy_atom_load_X_async = cute.make_copy_atom(
+ cute.nvgpu.cpasync.CopyG2SOp(),
+ mX.element_type,
+ num_bits_per_copy=128,
+ )
+ copy_atom_load_WB = cute.make_copy_atom(
+ cute.nvgpu.CopyUniversalOp(),
+ mW.element_type,
+ num_bits_per_copy=128,
+ )
+ copy_atom_store_O = cute.make_copy_atom(
+ cute.nvgpu.CopyUniversalOp(),
+ mO.element_type,
+ num_bits_per_copy=128,
+ )
+
+ thr_copy_X = cute.make_tiled_copy(
+ copy_atom_load_X_async,
+ tv_layout,
+ tiler_mn,
+ ).get_slice(tidx)
+ thr_copy_WB = cute.make_tiled_copy(
+ copy_atom_load_WB,
+ tv_layout,
+ tiler_mn,
+ ).get_slice(tidx)
+ thr_copy_O = cute.make_tiled_copy(
+ copy_atom_store_O,
+ tv_layout,
+ tiler_mn,
+ ).get_slice(tidx)
+
+ tWgW = thr_copy_WB.partition_S(gW)
+ tBgB = (
+ thr_copy_WB.partition_S(gB)
+ if const_expr(gB is not None)
+ else None
+ )
+ tXgX = thr_copy_X.partition_S(gX)
+ tXsX = thr_copy_X.partition_D(sX)
+ tXgO = thr_copy_O.partition_D(gO)
+ tXrRstd = (
+ thr_copy_O.partition_D(gRstd)
+ if const_expr(mRstd is not None)
+ else None
+ )
+ tXrMean = (
+ thr_copy_O.partition_D(gMean)
+ if const_expr(mMean is not None)
+ else None
+ )
+ tXcX = thr_copy_X.partition_S(cX)[(0, None), None, None]
+
+ # Fragments for gmem->rmem.
+ tWrW = cute.make_fragment_like(tWgW)
+ tBrB = (
+ cute.make_fragment_like(tBgB)
+ if const_expr(mB is not None)
+ else None
+ )
+ tXrW = thr_copy_X.retile(tWrW)
+ tXrB = (
+ thr_copy_X.retile(tBrB)
+ if const_expr(mB is not None)
+ else None
+ )
+ tXrX, tXrO = [cute.make_fragment_like(thr) for thr in (tXgX, tXgO)]
+
+ num_warps = cute.size(tv_layout, mode=[0]) // cute.arch.WARP_SIZE
+ self._initialize_cluster(tidx, mbar_ptr, num_warps, is_persistent=False)
+
+ tXpX = predicate_k(
+ thr_copy_X.partition_S(cX),
+ limit=shape[1],
+ )
+ row = tXcX[0][0]
+ if row < shape[0]:
+ cute.copy(copy_atom_load_X_async, tXgX, tXsX, pred=tXpX)
+ cute.arch.cp_async_commit_group()
+
+ tWpW = predicate_k(
+ thr_copy_WB.partition_S(cX),
+ limit=shape[1],
+ )
+ if const_expr(not delay_w_load):
+ cute.copy(copy_atom_load_WB, tWgW, tWrW, pred=tWpW)
+ if const_expr(mB is not None):
+ cute.copy(copy_atom_load_WB, tBgB, tBrB, pred=tWpW)
+
+ cute.arch.cp_async_wait_group(0)
+ cute.autovec_copy(tXsX, tXrX)
+ x = tXrX.load().to(Float32)
+ threads_per_row = tv_layout.shape[0][0]
+ sum_x = row_reduce(
+ x,
+ cute.ReductionOp.ADD,
+ threads_per_row,
+ reduction_buffer[None, None, 0],
+ mbar_ptr + 0 if const_expr(self.cluster_n > 1) else None,
+ init_val=0.0,
+ hook_fn=(
+ cute.arch.cluster_wait
+ if const_expr(self.cluster_n > 1)
+ else None
+ ),
+ )
+ mean = sum_x / shape[1]
+
+ if const_expr(reload_from == "smem"):
+ cute.autovec_copy(tXsX, tXrX)
+ x = tXrX.load().to(Float32)
+ elif const_expr(reload_from == "gmem"):
+ cute.copy(copy_atom_load_X, tXgX, tXrX, pred=tXpX)
+ x = tXrX.load().to(Float32)
+
+ sum_sq_x_sub_mean = row_reduce(
+ (x - mean) * (x - mean),
+ cute.ReductionOp.ADD,
+ threads_per_row,
+ reduction_buffer[None, None, 1],
+ mbar_ptr + 1 if const_expr(self.cluster_n > 1) else None,
+ init_val=0.0,
+ )
+ rstd = cute.math.rsqrt(sum_sq_x_sub_mean / shape[1] + eps, fastmath=True)
+
+ if const_expr(mRstd is not None):
+ if (
+ tXcX[0][1] == 0
+ and row < shape[0]
+ and (
+ self.cluster_n == 1
+ or cute.arch.block_idx_in_cluster() == 0
+ )
+ ):
+ tXrRstd[0] = rstd
+
+ if const_expr(mMean is not None):
+ if (
+ tXcX[0][1] == 0
+ and row < shape[0]
+ and (
+ self.cluster_n == 1
+ or cute.arch.block_idx_in_cluster() == 0
+ )
+ ):
+ tXrMean[0] = mean
+
+ if const_expr(delay_w_load):
+ cute.copy(copy_atom_load_WB, tWgW, tWrW, pred=tWpW)
+ if const_expr(mB is not None):
+ cute.copy(copy_atom_load_WB, tBgB, tBrB, pred=tWpW)
+
+ if const_expr(reload_from == "smem"):
+ cute.autovec_copy(tXsX, tXrX)
+ x = tXrX.load().to(Float32)
+ elif const_expr(reload_from == "gmem"):
+ cute.copy(copy_atom_load_X, tXgX, tXrX, pred=tXpX)
+ x = tXrX.load().to(Float32)
+
+ x_hat = (x - mean) * rstd
+ w = tXrW.load().to(Float32)
+ y = x_hat * w
+ if const_expr(mB is not None):
+ b = tXrB.load().to(Float32)
+ y = y + b
+
+ tXrO.store(y.to(tXrO.element_type))
+ tOpO = predicate_k(
+ thr_copy_O.partition_S(cX),
+ limit=shape[1],
+ )
+ if row < shape[0]:
+ cute.copy(copy_atom_store_O, tXrO, tXgO, pred=tOpO)
+
+ if _KERNEL_ACCEPTS_LAYOUT_ARGS:
+
+ @cute.kernel
+ def kernel(
+ self,
+ mX: cute.Tensor,
+ mW: cute.Tensor,
+ mB: Optional[cute.Tensor],
+ mO: cute.Tensor,
+ mRstd: Optional[cute.Tensor],
+ mMean: Optional[cute.Tensor],
+ eps: Float32,
+ tv_layout: cute.Layout,
+ tiler_mn: cute.Shape,
+ reload_from: cutlass.Constexpr,
+ delay_w_load: cutlass.Constexpr,
+ ):
+ self._kernel_impl(
+ mX,
+ mW,
+ mB,
+ mO,
+ mRstd,
+ mMean,
+ eps,
+ tv_layout,
+ tiler_mn,
+ reload_from,
+ delay_w_load,
+ )
+ else:
+
+ @cute.kernel
+ def kernel(
+ self,
+ mX: cute.Tensor,
+ mW: cute.Tensor,
+ mB: Optional[cute.Tensor],
+ mO: cute.Tensor,
+ mRstd: Optional[cute.Tensor],
+ mMean: Optional[cute.Tensor],
+ eps: Float32,
+ ):
+ tiler_mn, tv_layout = self._get_tv_layout()
+ self._kernel_impl(
+ mX,
+ mW,
+ mB,
+ mO,
+ mRstd,
+ mMean,
+ eps,
+ tv_layout,
+ tiler_mn,
+ const_expr(self.reload_from),
+ const_expr(self.delay_w_load),
+ )
+
+
+# -----------------------------------------------------------------------------
+# Public Python API
+# -----------------------------------------------------------------------------
+
+
+def layernorm(
+ x: Tensor,
+ weight: Tensor,
+ bias: Optional[Tensor] = None,
+ eps: float = 1e-6,
+ return_rstd: bool = False,
+ return_mean: bool = False,
+):
+ """
+ LayerNorm forward pass using the SM100 CuteDSL kernel.
+
+ Args:
+ x: Input tensor of shape (M, N).
+ weight: Scale parameter of shape (N,), typically fp32.
+ bias: Optional bias parameter of shape (N,).
+ eps: Small value for numerical stability.
+ return_rstd: Whether to return per-row reciprocal std (shape (M,)).
+        return_mean: Whether to return per-row mean (shape (M,)).
+
+    Returns:
+        out, optionally followed by rstd and/or mean (in that order) when the
+        corresponding return_* flags are set.
+    """
+ assert x.is_cuda and weight.is_cuda, "x and weight must be CUDA tensors"
+ assert x.dim() == 2, "Use (M, N) tensor; flatten batch/seq beforehand."
+ assert weight.dim() == 1, "weight must be 1D"
+ assert x.shape[1] == weight.shape[0], "Last dim of x must match weight.size(0)"
+ if bias is not None:
+ assert bias.is_cuda, "bias must be on CUDA"
+ assert bias.dim() == 1 and bias.shape[0] == weight.shape[0], (
+ "bias must be 1D and match weight"
+ )
+
+ M, N = x.shape
+ dtype = TORCH2CUTE_DTYPE[x.dtype]
+
+ rstd = torch.empty(M, device=x.device, dtype=torch.float32) if return_rstd else None
+ mean = torch.empty(M, device=x.device, dtype=torch.float32) if return_mean else None
+
+    # Fast path: bypass DLPack conversions when x is 2D row-major
+    # (stride(1) == 1, rows 16B-aligned, possibly padded) and weight/bias
+    # are fp32 (Quack-style).
+ if _can_use_ptr_path(x, weight, bias):
+ out = torch.empty_strided(x.shape, x.stride(), device=x.device, dtype=x.dtype)
+ _layernorm_forward_ptr_into(
+ x=x,
+ weight=weight,
+ bias=bias,
+ out=out,
+ rstd=rstd,
+ mean=mean,
+ eps=eps,
+ )
+ if return_mean and return_rstd:
+ return out, rstd, mean
+ if return_rstd and not return_mean:
+ return out, rstd
+ if return_mean and not return_rstd:
+ return out, mean
+ return out
+
+ out = torch.empty_like(x)
+ mX = _convert_row_major(x)
+ mO = _convert_row_major(out)
+
+ # Weight/bias live in feature dimension (N).
+ mW = convert_from_dlpack_cute(
+ weight.detach(),
+ leading_dim=0,
+ alignment=16,
+ divisibility=128 // cutlass.Float32.width,
+ )
+ mB = (
+ convert_from_dlpack_cute(
+ bias.detach(),
+ leading_dim=0,
+ alignment=16,
+ divisibility=128 // cutlass.Float32.width,
+ )
+ if bias is not None
+ else None
+ )
+
+ mRstd = (
+ from_dlpack(rstd.detach(), assumed_align=4).mark_layout_dynamic(leading_dim=0)
+ if rstd is not None
+ else None
+ )
+ mMean = (
+ from_dlpack(mean.detach(), assumed_align=4).mark_layout_dynamic(leading_dim=0)
+ if mean is not None
+ else None
+ )
+
+ stream = cuda.CUstream(torch.cuda.current_stream().cuda_stream)
+ key = (N, dtype, mB is not None, mRstd is not None, mMean is not None)
+ compiled = _COMPILE_CACHE.get(key)
+ if compiled is None:
+ op = LayerNormSM100(dtype, N)
+ compiled = cute.compile(
+ op,
+ mX,
+ mW,
+ mB,
+ mO,
+ mRstd,
+ mMean,
+ stream,
+ Float32(eps),
+ )
+ _COMPILE_CACHE[key] = compiled
+
+ compiled(
+ mX,
+ mW,
+ mB,
+ mO,
+ mRstd,
+ mMean,
+ stream,
+ Float32(eps),
+ )
+
+ if return_mean and return_rstd:
+ return out, rstd, mean
+ if return_rstd and not return_mean:
+ return out, rstd
+ if return_mean and not return_rstd:
+ return out, mean
+ return out
+
+
+def _can_use_ptr_path(x: Tensor, weight: Tensor, bias: Optional[Tensor]) -> bool:
+ """Return True if we can safely use the pointer-based fast path.
+
+ This is intentionally conservative: we target the common inference-like
+ layout (2D row-major with stride(1)==1) and Quack-style fp32 weights.
+ """
+ if not x.is_cuda or x.dim() != 2:
+ return False
+ if x.stride(1) != 1:
+ return False
+ if not weight.is_cuda or weight.dim() != 1:
+ return False
+ if weight.dtype != torch.float32:
+ return False
+ if not weight.is_contiguous():
+ return False
+ if bias is not None:
+ if not bias.is_cuda or bias.dim() != 1:
+ return False
+ if bias.dtype != torch.float32:
+ return False
+ if not bias.is_contiguous():
+ return False
+ # Require 16B alignment for 128-bit vector copies (matches Quack's assumed_align=16).
+ if (x.data_ptr() % 16) != 0:
+ return False
+ if (weight.data_ptr() % 16) != 0:
+ return False
+ if bias is not None and (bias.data_ptr() % 16) != 0:
+ return False
+ # The kernel uses 128-bit vectorized loads; require the leading dimension
+ # to preserve 16B alignment for every row start.
+ dtype_x = TORCH2CUTE_DTYPE[x.dtype]
+ divby = 128 // dtype_x.width
+ if (x.stride(0) % divby) != 0:
+ return False
+ return True
+
+
+def _layernorm_forward_ptr_into(
+ *,
+ x: Tensor,
+ weight: Tensor,
+ bias: Optional[Tensor],
+ out: Tensor,
+ rstd: Optional[Tensor],
+ mean: Optional[Tensor],
+ eps: float,
+) -> None:
+ """Launch the pointer-based LayerNorm kernel into preallocated outputs."""
+ assert x.is_cuda and x.dim() == 2
+ M, N = x.shape
+ assert weight.is_cuda and weight.dim() == 1 and weight.shape[0] == N
+ if bias is not None:
+ assert bias.is_cuda and bias.dim() == 1 and bias.shape[0] == N
+ assert out.is_cuda and out.shape == x.shape and out.dtype == x.dtype
+ assert out.stride() == x.stride(), "Pointer path expects out to match x strides"
+ if rstd is not None:
+ assert rstd.is_cuda and rstd.shape == (M,) and rstd.dtype == torch.float32
+ if mean is not None:
+ assert mean.is_cuda and mean.shape == (M,) and mean.dtype == torch.float32
+
+ device_index = x.get_device()
+ if torch.cuda.current_device() != device_index:
+ torch.cuda.set_device(device_index)
+ stream_handle = int(torch.cuda.current_stream().cuda_stream)
+ stream = cuda.CUstream(stream_handle)
+
+ dtype_x = TORCH2CUTE_DTYPE[x.dtype]
+ key = (
+ "ptr",
+ int(N),
+ dtype_x,
+ bias is not None,
+ rstd is not None,
+ mean is not None,
+ int(device_index),
+ )
+ compiled = _PTR_COMPILE_CACHE.get(key)
+ if compiled is None:
+ op = LayerNormSM100(dtype_x, int(N))
+ ptr_x = rt.make_ptr(
+ dtype_x, x.data_ptr(), mem_space=rt.AddressSpace.gmem, assumed_align=16
+ )
+ ptr_out = rt.make_ptr(
+ dtype_x, out.data_ptr(), mem_space=rt.AddressSpace.gmem, assumed_align=16
+ )
+ ptr_w = rt.make_ptr(
+ cutlass.Float32,
+ weight.data_ptr(),
+ mem_space=rt.AddressSpace.gmem,
+ assumed_align=16,
+ )
+ ptr_b = (
+ rt.make_ptr(
+ cutlass.Float32,
+ bias.data_ptr(),
+ mem_space=rt.AddressSpace.gmem,
+ assumed_align=16,
+ )
+ if bias is not None
+ else None
+ )
+ ptr_rstd = (
+ rt.make_ptr(
+ cutlass.Float32,
+ rstd.data_ptr(),
+ mem_space=rt.AddressSpace.gmem,
+ assumed_align=4,
+ )
+ if rstd is not None
+ else None
+ )
+ ptr_mean = (
+ rt.make_ptr(
+ cutlass.Float32,
+ mean.data_ptr(),
+ mem_space=rt.AddressSpace.gmem,
+ assumed_align=4,
+ )
+ if mean is not None
+ else None
+ )
+ ld = Int32(int(x.stride(0)))
+ compiled = cute.compile(
+ op.launch_from_ptrs,
+ ptr_x,
+ ptr_w,
+ ptr_b,
+ ptr_out,
+ ptr_rstd,
+ ptr_mean,
+ Int32(int(M)),
+ ld,
+ stream,
+ Float32(float(eps)),
+ )
+ _PTR_COMPILE_CACHE[key] = compiled
+
+ ptr_x = rt.make_ptr(dtype_x, x.data_ptr(), mem_space=rt.AddressSpace.gmem, assumed_align=16)
+ ptr_out = rt.make_ptr(
+ dtype_x, out.data_ptr(), mem_space=rt.AddressSpace.gmem, assumed_align=16
+ )
+ ptr_w = rt.make_ptr(
+ cutlass.Float32,
+ weight.data_ptr(),
+ mem_space=rt.AddressSpace.gmem,
+ assumed_align=16,
+ )
+ ptr_b = (
+ rt.make_ptr(
+ cutlass.Float32,
+ bias.data_ptr(),
+ mem_space=rt.AddressSpace.gmem,
+ assumed_align=16,
+ )
+ if bias is not None
+ else None
+ )
+ ptr_rstd = (
+ rt.make_ptr(
+ cutlass.Float32,
+ rstd.data_ptr(),
+ mem_space=rt.AddressSpace.gmem,
+ assumed_align=4,
+ )
+ if rstd is not None
+ else None
+ )
+ ptr_mean = (
+ rt.make_ptr(
+ cutlass.Float32,
+ mean.data_ptr(),
+ mem_space=rt.AddressSpace.gmem,
+ assumed_align=4,
+ )
+ if mean is not None
+ else None
+ )
+ ld = Int32(int(x.stride(0)))
+ compiled(
+ ptr_x,
+ ptr_w,
+ ptr_b,
+ ptr_out,
+ ptr_rstd,
+ ptr_mean,
+ Int32(int(M)),
+ ld,
+ stream,
+ Float32(float(eps)),
+ )
+
+
+def layernorm_ref(
+ x: Tensor,
+ weight: Tensor,
+ bias: Optional[Tensor] = None,
+ eps: float = 1e-6,
+) -> Tensor:
+ """
+ Reference LayerNorm implemented via torch.nn.functional.layer_norm.
+ """
+ x_f32 = x.float()
+ w = weight.float()
+ b = bias.float() if bias is not None else None
+ y = torch.nn.functional.layer_norm(x_f32, (x.shape[-1],), w, b, eps)
+ return y.to(x.dtype)
+
+
+def _as_2d(x: Tensor) -> Tuple[Tensor, Tuple[int, ...]]:
+ if x.dim() == 2:
+ return x, x.shape
+ original_shape = x.shape
+ M = int(torch.prod(torch.tensor(original_shape[:-1])).item())
+ N = original_shape[-1]
+ return x.reshape(M, N), original_shape
+
+
+def _restore_shape(x: Tensor, shape: Tuple[int, ...]) -> Tensor:
+ return x.reshape(shape)
+
+
+@cute.kernel
+def _layernorm_backward_dx_kernel(
+ mX: cute.Tensor,
+ mW: cute.Tensor,
+ mdO: cute.Tensor,
+ mRstd: cute.Tensor,
+ mMean: cute.Tensor,
+ mdX: cute.Tensor,
+):
+ """
+ Simple CTA-per-row LayerNorm backward kernel for dx only.
+
+ Each block processes one row of shape (N,), using block_threads threads.
+ It performs two passes over the row:
+ 1) Compute mean_wdy and mean_xhat_wdy in fp32.
+ 2) Compute dx using the standard LayerNorm backward formula:
+ dx = rstd * (wdy - mean_wdy - x_hat * mean_xhat_wdy),
+ where wdy = dy * gamma and x_hat = (x - mean) * rstd.
+ """
+ tidx, _, _ = cute.arch.thread_idx()
+ bidx, _, _ = cute.arch.block_idx()
+
+ block_threads = const_expr(256)
+ shape = mX.shape
+ M = shape[0]
+ N = shape[1]
+
+ row = bidx
+ if row < M:
+ # Shared buffers for warp-level reductions across the block.
+ smem = cutlass.utils.SmemAllocator()
+ num_warps = const_expr(block_threads // cute.arch.WARP_SIZE)
+ warp_sums_layout = cute.make_layout((num_warps,), stride=(1,))
+ warp_sums_wdy = smem.allocate_tensor(Float32, warp_sums_layout, byte_alignment=4)
+ warp_sums_xhatwdy = smem.allocate_tensor(Float32, warp_sums_layout, byte_alignment=4)
+
+ lane = cute.arch.lane_idx()
+ warp_idx = cute.arch.warp_idx()
+
+ rstd_val = mRstd[row].to(Float32)
+ mean_val = mMean[row].to(Float32)
+
+ # Pass 1: compute local partial sums of wdy and x_hat*wdy.
+ local_wdy = Float32(0.0)
+ local_xhatwdy = Float32(0.0)
+ for col in cutlass.range(tidx, N, block_threads):
+ x_val = mX[row, col].to(Float32)
+ dy_val = mdO[row, col].to(Float32)
+ gamma = mW[col].to(Float32)
+ x_mu = x_val - mean_val
+ x_hat = x_mu * rstd_val
+ wdy = dy_val * gamma
+ local_wdy += wdy
+ local_xhatwdy += x_hat * wdy
+
+ # Warp-level reduction, then block-level reduction via shared memory.
+ red_op = operator.add # type: ignore[assignment]
+ local_wdy = warp_reduce(local_wdy, red_op)
+ local_xhatwdy = warp_reduce(local_xhatwdy, red_op)
+
+ if lane == 0:
+ warp_sums_wdy[warp_idx] = local_wdy
+ warp_sums_xhatwdy[warp_idx] = local_xhatwdy
+
+ cute.arch.barrier()
+
+ total_wdy = Float32(0.0)
+ total_xhatwdy = Float32(0.0)
+ if warp_idx == 0 and lane == 0:
+ for wi in cutlass.range_constexpr(num_warps):
+ total_wdy += warp_sums_wdy[wi]
+ total_xhatwdy += warp_sums_xhatwdy[wi]
+ # Store totals back into first slots for broadcast.
+ warp_sums_wdy[0] = total_wdy
+ warp_sums_xhatwdy[0] = total_xhatwdy
+
+ cute.arch.barrier()
+
+ total_wdy = warp_sums_wdy[0]
+ total_xhatwdy = warp_sums_xhatwdy[0]
+ inv_N = Float32(1.0 / float(N))
+ mean_wdy = total_wdy * inv_N
+ mean_xhatwdy = total_xhatwdy * inv_N
+
+ # Pass 2: compute dx and write back.
+ for col in cutlass.range(tidx, N, block_threads):
+ x_val = mX[row, col].to(Float32)
+ dy_val = mdO[row, col].to(Float32)
+ gamma = mW[col].to(Float32)
+ x_mu = x_val - mean_val
+ x_hat = x_mu * rstd_val
+ wdy = dy_val * gamma
+ dx_val = (wdy - mean_wdy - x_hat * mean_xhatwdy) * rstd_val
+ mdX[row, col] = dx_val.to(mdX.element_type)
+
+
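+# For reference (illustrative only; not used by the kernel above): the same dx
+# formula expressed with plain torch ops, taking the row means over the last
+# dimension in fp32.
+def _layernorm_backward_dx_ref(
+    dout: Tensor, x: Tensor, weight: Tensor, rstd: Tensor, mean: Tensor
+) -> Tensor:
+    x_hat = (x.float() - mean[:, None].float()) * rstd[:, None].float()
+    wdy = dout.float() * weight.float()
+    mean_wdy = wdy.mean(dim=-1, keepdim=True)
+    mean_xhat_wdy = (x_hat * wdy).mean(dim=-1, keepdim=True)
+    dx = (wdy - mean_wdy - x_hat * mean_xhat_wdy) * rstd[:, None].float()
+    return dx.to(x.dtype)
+
+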
+@cute.jit
+def _layernorm_backward_dx(
+ mX: cute.Tensor,
+ mW: cute.Tensor,
+ mdO: cute.Tensor,
+ mRstd: cute.Tensor,
+ mMean: cute.Tensor,
+ mdX: cute.Tensor,
+ stream: cuda.CUstream,
+) -> None:
+ """
+ JIT wrapper that launches the dx-only LayerNorm backward kernel.
+ One CTA processes one row of length N with 256 threads.
+ """
+ M = mX.shape[0]
+ _layernorm_backward_dx_kernel(
+ mX,
+ mW,
+ mdO,
+ mRstd,
+ mMean,
+ mdX,
+ ).launch(
+ grid=[M, 1, 1],
+ block=[256, 1, 1],
+ stream=stream,
+ )
+
+
+@cute.kernel
+def _layernorm_backward_param_kernel(
+ mX: cute.Tensor,
+ mdO: cute.Tensor,
+ mRstd: cute.Tensor,
+ mMean: cute.Tensor,
+ mdW_partial: Optional[cute.Tensor],
+ mdB_partial: Optional[cute.Tensor],
+ num_blocks: Int32,
+) -> None:
+ """
+ Parameter-gradient kernel for LayerNorm.
+
+ Each CTA accumulates partial dweight/dbias over a stripe of rows:
+ - Grid dim X: num_blocks (sm_count-style persistent CTAs).
+ - Threads in a CTA partition the N dimension.
+ - For each assigned column, a thread streams over rows
+ row = blockIdx.x, blockIdx.x + num_blocks, ...
+
+ This mirrors the persistent-CTA pattern used by RMSNorm backward,
+ but uses a simpler per-thread accumulation since columns are
+ independent.
+ """
+ tidx, _, _ = cute.arch.thread_idx()
+ bidx, _, _ = cute.arch.block_idx()
+
+ block_threads = const_expr(256)
+ M = mX.shape[0]
+ N = mX.shape[1]
+
+ if bidx < num_blocks:
+ for col in cutlass.range(tidx, N, block_threads):
+ dw_local = Float32(0.0)
+ db_local = Float32(0.0)
+ for row in cutlass.range(bidx, M, num_blocks):
+ x_val = mX[row, col].to(Float32)
+ dy_val = mdO[row, col].to(Float32)
+ rstd_val = mRstd[row].to(Float32)
+ mean_val = mMean[row].to(Float32)
+ x_mu = x_val - mean_val
+ x_hat = x_mu * rstd_val
+ dw_local += dy_val * x_hat
+ db_local += dy_val
+
+ if const_expr(mdW_partial is not None):
+ mdW_partial[bidx, col] = dw_local
+ if const_expr(mdB_partial is not None):
+ mdB_partial[bidx, col] = db_local
+
+
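+# For reference (illustrative only): the partial sums the kernel above writes
+# into the (num_blocks, N) buffers correspond, in plain torch, to
+#
+#   x_hat   = (x - mean[:, None]) * rstd[:, None]
+#   dweight = (dout * x_hat).sum(dim=0)
+#   dbias   = dout.sum(dim=0)
+#
+# with the final reduction over the partial rows done on the host
+# (see layernorm_backward below).
+
+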
+@cute.jit
+def _layernorm_backward_param(
+ mX: cute.Tensor,
+ mdO: cute.Tensor,
+ mRstd: cute.Tensor,
+ mMean: cute.Tensor,
+ mdW_partial: Optional[cute.Tensor],
+ mdB_partial: Optional[cute.Tensor],
+ num_blocks: Int32,
+ stream: cuda.CUstream,
+) -> None:
+ """
+ JIT wrapper that launches the parameter-gradient kernel.
+ """
+ _layernorm_backward_param_kernel(
+ mX,
+ mdO,
+ mRstd,
+ mMean,
+ mdW_partial,
+ mdB_partial,
+ num_blocks,
+ ).launch(
+ grid=[num_blocks, 1, 1],
+ block=[256, 1, 1],
+ stream=stream,
+ )
+
+
+def _layernorm_backward_dx_sm100(
+ dout_2d: Tensor,
+ x_2d: Tensor,
+ weight: Tensor,
+ rstd_1d: Tensor,
+ mean_1d: Tensor,
+ dx_2d: Tensor,
+) -> None:
+ """
+ Host-side helper to run the dx-only LayerNorm backward kernel.
+ """
+ M, N = x_2d.shape
+ assert dout_2d.shape == (M, N)
+ assert rstd_1d.numel() == M
+ assert mean_1d.numel() == M
+
+ dtype = TORCH2CUTE_DTYPE[x_2d.dtype]
+
+ mX = _convert_row_major(x_2d)
+ mdO = _convert_row_major(dout_2d)
+ mdX = _convert_row_major(dx_2d)
+
+ mW = convert_from_dlpack_cute(
+ weight.detach(),
+ leading_dim=0,
+ alignment=16,
+ divisibility=128 // cutlass.Float32.width,
+ )
+ mRstd = from_dlpack(rstd_1d.detach(), assumed_align=4).mark_layout_dynamic(leading_dim=0)
+ mMean = from_dlpack(mean_1d.detach(), assumed_align=4).mark_layout_dynamic(leading_dim=0)
+
+ stream = cuda.CUstream(torch.cuda.current_stream().cuda_stream)
+ key = (N, dtype)
+ compiled = _BWD_DX_COMPILE_CACHE.get(key)
+ if compiled is None:
+ compiled = cute.compile(
+ _layernorm_backward_dx,
+ mX,
+ mW,
+ mdO,
+ mRstd,
+ mMean,
+ mdX,
+ stream,
+ )
+ _BWD_DX_COMPILE_CACHE[key] = compiled
+
+ compiled(
+ mX,
+ mW,
+ mdO,
+ mRstd,
+ mMean,
+ mdX,
+ stream,
+ )
+
+
+def _layernorm_backward_params_sm100(
+ dout_2d: Tensor,
+ x_2d: Tensor,
+ rstd_1d: Tensor,
+ mean_1d: Tensor,
+ dw_partial: Optional[Tensor],
+ db_partial: Optional[Tensor],
+ sm_count: int,
+) -> None:
+ """
+ Host-side helper to run the parameter-gradient kernel that populates
+ dw_partial / db_partial of shape (sm_count, N).
+ """
+ M, N = x_2d.shape
+ assert dout_2d.shape == (M, N)
+ assert rstd_1d.numel() == M
+ assert mean_1d.numel() == M
+ if dw_partial is None and db_partial is None:
+ return
+
+ dtype = TORCH2CUTE_DTYPE[x_2d.dtype]
+
+ mX = _convert_row_major(x_2d)
+ mdO = _convert_row_major(dout_2d)
+ mRstd = from_dlpack(rstd_1d.detach(), assumed_align=4).mark_layout_dynamic(leading_dim=0)
+ mMean = from_dlpack(mean_1d.detach(), assumed_align=4).mark_layout_dynamic(leading_dim=0)
+
+ mdW_partial = (
+ from_dlpack(dw_partial, assumed_align=16).mark_compact_shape_dynamic(mode=0)
+ if dw_partial is not None
+ else None
+ )
+ mdB_partial = (
+ from_dlpack(db_partial, assumed_align=16).mark_compact_shape_dynamic(mode=0)
+ if db_partial is not None
+ else None
+ )
+
+ stream = cuda.CUstream(torch.cuda.current_stream().cuda_stream)
+ has_bias = db_partial is not None
+ key = (N, dtype, has_bias)
+ compiled = _BWD_PARAM_COMPILE_CACHE.get(key)
+ if compiled is None:
+ compiled = cute.compile(
+ _layernorm_backward_param,
+ mX,
+ mdO,
+ mRstd,
+ mMean,
+ mdW_partial,
+ mdB_partial,
+ Int32(sm_count),
+ stream,
+ )
+ _BWD_PARAM_COMPILE_CACHE[key] = compiled
+
+ compiled(
+ mX,
+ mdO,
+ mRstd,
+ mMean,
+ mdW_partial,
+ mdB_partial,
+ Int32(sm_count),
+ stream,
+ )
+
+
+def layernorm_backward(
+ dout: Tensor,
+ x: Tensor,
+ weight: Tensor,
+ rstd: Tensor,
+ mean: Tensor,
+ bias: Optional[Tensor] = None,
+) -> Tuple[Tensor, Optional[Tensor], Optional[Tensor]]:
+ """
+ LayerNorm backward implemented in CuteDSL / CUTLASS.
+
+ Computes gradients w.r.t. input, weight, and optional bias using
+ two kernels:
+ - A dx kernel (CTA-per-row) that streams over N.
+ - A parameter-gradient kernel that accumulates dw/db over a
+ persistent grid of CTAs across the M dimension.
+ """
+ assert x.shape == dout.shape, "x and dout must have the same shape"
+ assert x.is_cuda and dout.is_cuda, "x and dout must be CUDA tensors"
+ assert weight.dim() == 1, "weight must be 1D"
+ if bias is not None:
+ assert bias.dim() == 1, "bias must be 1D"
+
+ x_2d, orig_shape = _as_2d(x)
+ dout_2d, _ = _as_2d(dout)
+ M, N = x_2d.shape
+
+ # Flatten to 2D for the kernels.
+ mean_flat = mean.view(M)
+ rstd_flat = rstd.view(M)
+
+ dx_2d = torch.empty_like(x_2d)
+ _layernorm_backward_dx_sm100(
+ dout_2d,
+ x_2d,
+ weight,
+ rstd_flat,
+ mean_flat,
+ dx_2d,
+ )
+
+ device = x.device
+ sm_count = get_sm_count(N, device, M=M, dtype=x.dtype)
+
+ dw_partial = torch.empty(sm_count, N, device=device, dtype=torch.float32)
+ db_partial = (
+ torch.empty(sm_count, N, device=device, dtype=torch.float32)
+ if bias is not None
+ else None
+ )
+
+ _layernorm_backward_params_sm100(
+ dout_2d,
+ x_2d,
+ rstd_flat,
+ mean_flat,
+ dw_partial,
+ db_partial,
+ sm_count,
+ )
+
+ dweight = dw_partial.sum(dim=0).to(weight.dtype)
+ dbias = db_partial.sum(dim=0).to(bias.dtype) if bias is not None else None
+
+ dx = _restore_shape(dx_2d, orig_shape)
+ return dx, dweight, dbias
+
+
+if __name__ == "__main__":
+ # Allow direct execution for a quick functional check.
+ if not torch.cuda.is_available():
+ print("CUDA not available; LayerNormSM100 test skipped.")
+ raise SystemExit(0)
+
+ device = "cuda"
+ M, N = 2048, 4096
+ dtype = torch.bfloat16
+ x = torch.randn(M, N, device=device, dtype=dtype)
+ w = torch.randn(N, device=device, dtype=torch.float32)
+ b = torch.randn(N, device=device, dtype=torch.float32)
+
+ y_ref = layernorm_ref(x, w, b)
+ y, rstd, mean = layernorm(x, w, b, return_rstd=True, return_mean=True)
+ torch.testing.assert_close(
+ y,
+ y_ref,
+ atol=5e-2 if dtype != torch.float32 else 1e-5,
+ rtol=5e-2 if dtype != torch.float32 else 1e-5,
+ )
+
+ print("LayerNormSM100 forward correctness check passed.")
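+
+    # Optional backward check (a minimal sketch, not part of the kernel API):
+    # compare the CuTeDSL backward against torch.autograd on the same inputs.
+    # Tolerances are loose because the activations are bf16.
+    dout = torch.randn_like(x)
+    dx, dw, db = layernorm_backward(dout, x, w, rstd, mean, bias=b)
+    x_ref = x.detach().clone().requires_grad_(True)
+    w_ref = w.detach().clone().requires_grad_(True)
+    b_ref = b.detach().clone().requires_grad_(True)
+    y_autograd = torch.nn.functional.layer_norm(
+        x_ref.float(), (N,), w_ref, b_ref, 1e-6
+    ).to(dtype)
+    y_autograd.backward(dout)
+    torch.testing.assert_close(dx, x_ref.grad, atol=5e-2, rtol=5e-2)
+    torch.testing.assert_close(dw, w_ref.grad, atol=5e-2, rtol=5e-2)
+    torch.testing.assert_close(db, b_ref.grad, atol=5e-2, rtol=5e-2)
+    print("LayerNormSM100 backward correctness check passed (vs. torch.autograd).")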
diff --git a/oink/src/kernelagent_oink/blackwell/lite_quack.py b/oink/src/kernelagent_oink/blackwell/lite_quack.py
index 14ae723..1bc15b1 100644
--- a/oink/src/kernelagent_oink/blackwell/lite_quack.py
+++ b/oink/src/kernelagent_oink/blackwell/lite_quack.py
@@ -13,21 +13,24 @@
# limitations under the License.
"""
-Lightweight local clone of the small subset of helpers that the SM100
+Lightweight local clone of the small subset of Quack helpers that the SM100
RMSNorm CuteDSL kernels depend on.
This module intentionally avoids importing the `quack` package so that
-Oink Blackwell kernels can run without Quack installed, while keeping
-numerical behaviour and performance close to the original reference
-implementations.
+KernelAgent Oink SM100 kernels can run without Quack installed, while keeping
+numerical behaviour and performance identical to the reference kernels.
"""
from __future__ import annotations
import math
import operator
-from typing import Callable, Optional
+import importlib.metadata
+import re
+from functools import partial
+from typing import Callable, Optional, Tuple, Type
+import cuda.bindings.driver as cuda # type: ignore
import torch
from torch import Tensor
@@ -36,11 +39,39 @@
from cutlass import Float32, Int32, const_expr
from cutlass.cute.runtime import from_dlpack
from cutlass.cutlass_dsl import T, dsl_user_op
-from cutlass._mlir.dialects import llvm
+from cutlass._mlir.dialects import llvm, nvvm, vector
+
+
+def _parse_version_tuple(version: str) -> tuple[int, int, int]:
+ parts = version.split(".")
+ nums: list[int] = []
+ for part in parts[:3]:
+ match = re.match(r"^(\d+)", part)
+ nums.append(int(match.group(1)) if match is not None else 0)
+ while len(nums) < 3:
+ nums.append(0)
+ return nums[0], nums[1], nums[2]
+
+
+def _cutlass_dsl_version() -> Optional[tuple[int, int, int]]:
+ try:
+ return _parse_version_tuple(importlib.metadata.version("nvidia-cutlass-dsl"))
+ except Exception:
+ return None
+
+
+_CUTLASS_DSL_VERSION = _cutlass_dsl_version()
+# CuTeDSL 4.3.4 tightened some kernel argument expectations (notably around
+# passing Layout/Shape/Constexpr objects into @cute.kernel functions). Keep the
+# older signature for <4.3.4, but switch to a 4.3.4+ compatible signature when
+# we detect 4.3.4+ (or when version detection is unavailable).
+_KERNEL_ACCEPTS_LAYOUT_ARGS = (
+ _CUTLASS_DSL_VERSION is not None and _CUTLASS_DSL_VERSION < (4, 3, 4)
+)
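+# For illustration: _parse_version_tuple("4.3.4") == (4, 3, 4) and
+# _parse_version_tuple("4.2.0.dev1") == (4, 2, 0), so a pre-release suffix
+# never bumps the detected version.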
# -------------------------
-# Dtype mapping
+# Dtype mapping (from quack.cute_dsl_utils)
# -------------------------
TORCH2CUTE_DTYPE = {
@@ -51,21 +82,15 @@
# -------------------------
-# Tensor conversion helpers
+# Tensor conversion helpers (from quack.utils)
# -------------------------
-
def convert_from_dlpack(
x: Tensor,
leading_dim: int,
alignment: int = 16,
divisibility: int = 1,
) -> cute.Tensor:
- """
- Wrap a torch.Tensor in a CuteDSL tensor with layout metadata that
- matches the logical leading dimension and alignment/divisibility
- constraints expected by SM100 kernels.
- """
return (
from_dlpack(x, assumed_align=alignment)
.mark_layout_dynamic(leading_dim=leading_dim)
@@ -78,14 +103,12 @@ def convert_from_dlpack(
# -------------------------
-# SM90/SM100 cluster helpers
+# SM90/SM100 cluster helpers (from quack.utils)
# -------------------------
@dsl_user_op
-def elem_pointer(
- x: cute.Tensor, coord: cute.Coord, *, loc=None, ip=None
-) -> cute.Pointer:
+def elem_pointer(x: cute.Tensor, coord: cute.Coord, *, loc=None, ip=None) -> cute.Pointer:
return x.iterator + cute.crd2idx(coord, x.layout, loc=loc, ip=ip)
@@ -136,9 +159,7 @@ def store_shared_remote(
).ir_value()
if const_expr(isinstance(val, float)):
val = Float32(val)
- assert isinstance(val, (Float32, Int32, cutlass.Int64)), (
- "val must be Float32, Int32, or Int64"
- )
+ assert isinstance(val, (Float32, Int32, cutlass.Int64)), "val must be Float32, Int32, or Int64"
suffix = {Float32: "f32", Int32: "s32", cutlass.Int64: "s64"}[type(val)]
constraint = {Float32: "f", Int32: "r", cutlass.Int64: "l"}[type(val)]
llvm.inline_asm(
@@ -154,37 +175,22 @@ def store_shared_remote(
@cute.jit
def predicate_k(tAcA: cute.Tensor, limit: cutlass.Int32) -> cute.Tensor:
- """
- Build a predicate tensor for the K dimension only. Values beyond
- `limit` are masked out.
- """
+ # Only compute predicates for the "k" dimension. For the mn dimension, we will use "if".
tApA = cute.make_fragment(
cute.make_layout(
- (
- cute.size(tAcA, mode=[0, 1]),
- cute.size(tAcA, mode=[1]),
- cute.size(tAcA, mode=[2]),
- ),
+ (cute.size(tAcA, mode=[0, 1]), cute.size(tAcA, mode=[1]), cute.size(tAcA, mode=[2])),
stride=(cute.size(tAcA, mode=[2]), 0, 1),
),
cutlass.Boolean,
)
for rest_v in cutlass.range_constexpr(tApA.shape[0]):
for rest_k in cutlass.range_constexpr(tApA.shape[2]):
- tApA[rest_v, 0, rest_k] = cute.elem_less(
- tAcA[(0, rest_v), 0, rest_k][1], limit
- )
+ tApA[rest_v, 0, rest_k] = cute.elem_less(tAcA[(0, rest_v), 0, rest_k][1], limit)
return tApA
@dsl_user_op
-def domain_offset_i64(
- coord: cute.Coord, tensor: cute.Tensor, *, loc=None, ip=None
-) -> cute.Tensor:
- """
- Return a tensor whose iterator is offset by an Int64 byte offset
- computed from `coord` and the tensor's strides.
- """
+def domain_offset_i64(coord: cute.Coord, tensor: cute.Tensor, *, loc=None, ip=None) -> cute.Tensor:
flat_coord_i64 = tuple(cutlass.Int64(c) for c in cute.flatten(coord))
flat_stride = cute.flatten_to_tuple(tensor.stride)
assert len(flat_coord_i64) == len(flat_stride), (
@@ -201,8 +207,81 @@ def domain_offset_i64(
return cute.make_tensor(new_ptr, tensor.layout)
+@dsl_user_op
+def coord_offset_i64(
+ idx: cute.typing.Int,
+ tensor: cute.Tensor,
+ dim: int,
+ *,
+ loc=None,
+ ip=None,
+) -> cute.Tensor:
+ offset = cutlass.Int64(idx) * cute.size(tensor.stride[dim])
+ assert isinstance(tensor.iterator, cute.Pointer)
+ new_ptr = cute.make_ptr(
+ tensor.element_type,
+ tensor.iterator.toint() + offset * tensor.element_type.width // 8,
+ tensor.memspace,
+ assumed_align=tensor.iterator.max_alignment,
+ )
+ return cute.make_tensor(new_ptr, tensor.layout)
+
+
+@cute.jit
+def fill_oob(tXsX: cute.Tensor, tXpX: Optional[cute.Tensor], fill_value: cutlass.Numeric) -> None:
+ """Fill out-of-bounds values in shared memory tensor."""
+ tXrX_fill = cute.make_fragment_like(tXsX[(None, 0), None, 0])
+ tXrX_fill.fill(fill_value)
+ for rest_v in cutlass.range_constexpr(const_expr(tXsX.shape[0][1])):
+ for rest_k in cutlass.range_constexpr(const_expr(tXsX.shape[2])):
+ if const_expr(tXpX is not None):
+ if not tXpX[rest_v, 0, rest_k]:
+ cute.autovec_copy(tXrX_fill, tXsX[(None, rest_v), None, rest_k])
+ else:
+ cute.autovec_copy(tXrX_fill, tXsX[(None, rest_v), None, rest_k])
+
+
+@dsl_user_op
+def f32x2_to_i64(a: Float32, b: Float32, *, loc=None, ip=None) -> cutlass.Int64:
+ """Pack two f32 values into a single i64.
+
+ This mirrors quack.utils.f32x2_to_i64 and is used by online_softmax_reduce
+ to store (max, sum_exp) pairs in an Int64 reduction buffer.
+ """
+ vec_f32x2 = vector.from_elements(
+ T.vector(2, T.f32()),
+ (a.ir_value(loc=loc, ip=ip), b.ir_value(loc=loc, ip=ip)),
+ loc=loc,
+ ip=ip,
+ )
+ vec_i64x1 = vector.bitcast(T.vector(1, T.i64()), vec_f32x2, loc=loc, ip=ip)
+ res = cutlass.Int64(
+ vector.extract(vec_i64x1, dynamic_position=[], static_position=[0], loc=loc, ip=ip)
+ )
+ return res
+
+
+@dsl_user_op
+def i64_to_f32x2(c: cutlass.Int64, *, loc=None, ip=None) -> Tuple[Float32, Float32]:
+ """Unpack a single i64 into two f32 values, inverse of f32x2_to_i64."""
+ vec_i64x1 = vector.from_elements(
+ T.vector(1, T.i64()),
+ (c.ir_value(loc=loc, ip=ip),),
+ loc=loc,
+ ip=ip,
+ )
+ vec_f32x2 = vector.bitcast(T.vector(2, T.f32()), vec_i64x1, loc=loc, ip=ip)
+ res0 = Float32(
+ vector.extract(vec_f32x2, dynamic_position=[], static_position=[0], loc=loc, ip=ip)
+ )
+ res1 = Float32(
+ vector.extract(vec_f32x2, dynamic_position=[], static_position=[1], loc=loc, ip=ip)
+ )
+ return res0, res1
+
+
# -------------------------
-# Reduction helpers
+# Reduction helpers (from quack.reduce)
# -------------------------
@@ -212,10 +291,6 @@ def warp_reduce(
op: Callable,
width: cutlass.Constexpr[int] = cute.arch.WARP_SIZE,
) -> cute.TensorSSA | cute.Numeric:
- """
- Warp-level reduction for either scalar values or small TensorSSA
- fragments.
- """
if cutlass.const_expr(isinstance(val, cute.TensorSSA)):
res = cute.make_fragment(val.shape, val.dtype)
res.store(val)
@@ -234,7 +309,7 @@ def block_reduce(
reduction_buffer: cute.Tensor,
init_val: cute.Numeric = 0.0,
) -> cute.Numeric:
- """Block-level reduction across warps."""
+    """reduction_buffer has shape (num_warps / warps_per_row, warps_per_row)."""
lane_idx, warp_idx = cute.arch.lane_idx(), cute.arch.warp_idx()
warps_per_row = cute.size(reduction_buffer.shape[1])
row_idx, col_idx = warp_idx // warps_per_row, warp_idx % warps_per_row
@@ -256,10 +331,7 @@ def cluster_reduce(
init_val: cute.Numeric = 0.0,
phase: Optional[cutlass.Int32] = None,
) -> cute.Numeric:
- """
- Cluster-wide reduction using shared memory and mbarrier. The
- reduction_buffer has shape (rows_per_block, (warps_per_row, cluster_n)).
- """
+ """reduction_buffer has shape (num_warps / warps_per_row, (warps_per_row, cluster_n))."""
cta_rank_in_cluster = cute.arch.block_idx_in_cluster()
lane_idx, warp_idx = cute.arch.lane_idx(), cute.arch.warp_idx()
rows_per_block, (warps_per_row, cluster_n) = reduction_buffer.shape
@@ -297,12 +369,10 @@ def block_or_cluster_reduce(
phase: Optional[cutlass.Int32] = None,
init_val: cute.Numeric = 0.0,
) -> cute.Numeric:
- """Dispatch between block or cluster reduction depending on mbar_ptr."""
+ """Perform either block or cluster reduction based on whether mbar_ptr is provided."""
if cutlass.const_expr(mbar_ptr is None):
return block_reduce(val, op, reduction_buffer, init_val=init_val)
- return cluster_reduce(
- val, op, reduction_buffer, mbar_ptr, init_val=init_val, phase=phase
- )
+ return cluster_reduce(val, op, reduction_buffer, mbar_ptr, init_val=init_val, phase=phase)
@cute.jit
@@ -316,21 +386,14 @@ def row_reduce(
init_val: cute.Numeric = 0.0,
hook_fn: Optional[Callable] = None,
) -> cute.Numeric:
- """
- Row-wise reduction used by RMSNorm and similar kernels.
-
- reduction_buffer must have shape
- (num_warps / warps_per_row, (warps_per_row, cluster_n)).
- """
+ """reduction_buffer must have shape (num_warps / warps_per_row, (warps_per_row, cluster_n))."""
if cutlass.const_expr(isinstance(x, cute.TensorSSA)):
val = x.reduce(op, init_val=init_val, reduction_profile=0)
else:
val = x
warp_op = {
cute.ReductionOp.ADD: operator.add,
- cute.ReductionOp.MAX: cute.arch.fmax
- if cutlass.const_expr(x.dtype == Float32)
- else max,
+ cute.ReductionOp.MAX: cute.arch.fmax if cutlass.const_expr(x.dtype == Float32) else max,
cute.ReductionOp.MIN: min,
cute.ReductionOp.MUL: operator.mul,
}[op]
@@ -358,28 +421,808 @@ def row_reduce(
return val
+@cute.jit
+def row_reduce_add(
+ x: cute.TensorSSA | cute.Numeric,
+ threads_per_row: cutlass.Constexpr[int],
+ reduction_buffer: Optional[cute.Tensor] = None,
+ mbar_ptr: Optional[cute.Pointer] = None,
+ phase: Optional[cutlass.Int32] = None,
+ init_val: cute.Numeric = 0.0,
+ hook_fn: Optional[Callable] = None,
+) -> cute.Numeric:
+ """Specialized row_reduce for ADD reductions.
+
+ This mirrors row_reduce but hardcodes the ADD operation so we avoid
+ dynamic dispatch on the reduction op. It is used by bandwidth-bound
+ kernels like RMSNorm backward where the reduction is always ADD in
+ Float32.
+ """
+ if cutlass.const_expr(isinstance(x, cute.TensorSSA)):
+ val = x.reduce(cute.ReductionOp.ADD, init_val=init_val, reduction_profile=0)
+ else:
+ val = x
+ val = warp_reduce(
+ val,
+ operator.add,
+ width=min(threads_per_row, cute.arch.WARP_SIZE),
+ )
+ if cutlass.const_expr(hook_fn is not None):
+ hook_fn()
+ if cutlass.const_expr(reduction_buffer is not None):
+ warps_per_row, cluster_n = reduction_buffer.shape[1]
+ assert cluster_n == 1 or mbar_ptr is not None, (
+ "mbar_ptr must be provided for cluster reduction"
+ )
+ if cutlass.const_expr(warps_per_row > 1 or cluster_n > 1):
+ val = block_or_cluster_reduce(
+ val,
+ operator.add,
+ reduction_buffer,
+ mbar_ptr,
+ phase=phase,
+ init_val=init_val,
+ )
+ return val
+
+
+@cute.jit
+def online_softmax_reduce(
+ x: cute.TensorSSA,
+ threads_per_row: cutlass.Constexpr[int],
+ reduction_buffer: Optional[cute.Tensor] = None,
+ mbar_ptr: Optional[cute.Pointer] = None,
+ hook_fn: Optional[Callable] = None,
+ phase: Optional[cutlass.Int32] = None,
+ return_exp_x: bool = False,
+) -> tuple[Float32, Float32, Optional[cute.TensorSSA]]:
+ """Online softmax reduction over a row.
+
+ This mirrors quack.reduce.online_softmax_reduce and computes:
+ - max_x: row-wise maximum of x
+ - sum_exp_x: row-wise sum of exp(x - max_x)
+ - exp_x (optional): per-element exp(x - max_x_final) if return_exp_x is True
+ """
+ assert x.dtype == Float32, "x must be of type Float32"
+    # reduction_buffer must have shape (num_warps / warps_per_row, (warps_per_row, cluster_n))
+    # with Int64 elements, each packing a (max, sum_exp) f32 pair.
+ max_x = warp_reduce(
+ x.reduce(cute.ReductionOp.MAX, init_val=-Float32.inf, reduction_profile=0),
+ cute.arch.fmax,
+ width=min(threads_per_row, cute.arch.WARP_SIZE),
+ )
+ log2_e = math.log2(math.e)
+ exp_x = cute.math.exp2(x * log2_e - (max_x * log2_e), fastmath=True)
+ sum_exp_x = warp_reduce(
+ exp_x.reduce(cute.ReductionOp.ADD, init_val=0.0, reduction_profile=0),
+ operator.add,
+ width=min(threads_per_row, cute.arch.WARP_SIZE),
+ )
+ if cutlass.const_expr(hook_fn is not None):
+ hook_fn()
+ if cutlass.const_expr(reduction_buffer is not None):
+ rows_per_block, (warps_per_row, cluster_n) = reduction_buffer.shape
+ assert cluster_n == 1 or mbar_ptr is not None, (
+ "mbar_ptr must be provided for cluster reduction"
+ )
+ if cutlass.const_expr(warps_per_row > 1 or cluster_n > 1):
+ assert reduction_buffer.element_type == cutlass.Int64, (
+ "reduction_buffer must be of type Int64"
+ )
+ lane_idx, warp_idx = cute.arch.lane_idx(), cute.arch.warp_idx()
+ row_idx, col_idx = warp_idx // warps_per_row, warp_idx % warps_per_row
+ if cutlass.const_expr(mbar_ptr is None):
+ if lane_idx == 0:
+ reduction_buffer[row_idx, col_idx] = f32x2_to_i64(max_x, sum_exp_x)
+ cute.arch.barrier()
+ max_x_single_warp = -Float32.inf
+ sum_exp_x = 0.0
+ if lane_idx < warps_per_row:
+ max_x_single_warp, sum_exp_x = i64_to_f32x2(
+ reduction_buffer[row_idx, lane_idx]
+ )
+ max_x_final = warp_reduce(max_x_single_warp, cute.arch.fmax)
+ sum_exp_x *= cute.math.exp(max_x_single_warp - max_x_final, fastmath=True)
+ sum_exp_x = warp_reduce(sum_exp_x, operator.add)
+ if cutlass.const_expr(return_exp_x):
+ exp_x *= cute.math.exp(max_x - max_x_final, fastmath=True)
+ max_x = max_x_final
+ else:
+ cta_rank_in_cluster = cute.arch.block_idx_in_cluster()
+ if warp_idx == 0:
+ with cute.arch.elect_one():
+ num_warps = rows_per_block * warps_per_row
+ cute.arch.mbarrier_arrive_and_expect_tx(
+ mbar_ptr,
+ num_warps * cluster_n * reduction_buffer.element_type.width // 8,
+ )
+ if lane_idx < cluster_n:
+ store_shared_remote(
+ f32x2_to_i64(max_x, sum_exp_x),
+ elem_pointer(reduction_buffer, (row_idx, (col_idx, cta_rank_in_cluster))),
+ mbar_ptr,
+ peer_cta_rank_in_cluster=lane_idx,
+ )
+ cute.arch.mbarrier_wait(mbar_ptr, phase=phase if phase is not None else 0)
+ num_iter = cute.ceil_div(warps_per_row * cluster_n, cute.arch.WARP_SIZE)
+ max_x_single_warp = cute.make_fragment(num_iter, Float32)
+ max_x_single_warp.fill(-Float32.inf)
+ sum_exp_x_single_warp = cute.make_fragment(num_iter, Float32)
+ sum_exp_x_single_warp.fill(0.0)
+ for i in cutlass.range_constexpr(num_iter):
+ idx = lane_idx + i * cute.arch.WARP_SIZE
+ if idx < cute.size(reduction_buffer, mode=[1]):
+ max_x_single_warp[i], sum_exp_x_single_warp[i] = i64_to_f32x2(
+ reduction_buffer[row_idx, idx]
+ )
+ max_x_final = max_x_single_warp.load().reduce(
+ cute.ReductionOp.MAX,
+ init_val=-Float32.inf,
+ reduction_profile=0,
+ )
+ max_x_final = warp_reduce(max_x_final, cute.arch.fmax)
+ sum_exp_x = 0.0
+ for i in cutlass.range_constexpr(num_iter):
+ sum_exp_x += sum_exp_x_single_warp[i] * cute.math.exp(
+ max_x_single_warp[i] - max_x_final,
+ fastmath=True,
+ )
+ sum_exp_x = warp_reduce(sum_exp_x, operator.add)
+ if cutlass.const_expr(return_exp_x):
+ exp_x *= cute.math.exp(max_x - max_x_final, fastmath=True)
+ max_x = max_x_final
+ return max_x, sum_exp_x, (exp_x if cutlass.const_expr(return_exp_x) else None)
+
+
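+# For reference (illustrative only): the per-warp (max, sum_exp) pairs packed
+# into the Int64 reduction buffer above are combined with the standard online
+# softmax merge rule; in plain Python,
+#
+#   def merge(m1, s1, m2, s2):
+#       m = max(m1, m2)
+#       return m, s1 * math.exp(m1 - m) + s2 * math.exp(m2 - m)
+#
+# applied across warps (and, for cluster_n > 1, across CTAs in the cluster).
+
+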
+# -------------------------
+# Copy helpers (minimal subset of quack.copy_utils)
+# -------------------------
+
+
+@dsl_user_op
+def get_copy_atom(
+ dtype: Type[cutlass.Numeric],
+ num_copy_elems: int,
+ is_async: bool = False,
+ *,
+ loc=None,
+ ip=None,
+) -> cute.CopyAtom:
+ from cutlass.cute.nvgpu import cpasync
+
+ num_copy_bits = const_expr(min(128, num_copy_elems * dtype.width))
+ copy_op = cpasync.CopyG2SOp() if is_async else cute.nvgpu.CopyUniversalOp()
+ return cute.make_copy_atom(copy_op, dtype, num_bits_per_copy=num_copy_bits, loc=loc, ip=ip)
+
+
+@dsl_user_op
+def copy(
+ src: cute.Tensor,
+ dst: cute.Tensor,
+ *,
+ pred: Optional[cute.Tensor] = None,
+ num_copy_elems: int = 1,
+ is_async: bool = False,
+ loc=None,
+ ip=None,
+ **kwargs,
+) -> None:
+ copy_atom = get_copy_atom(src.element_type, num_copy_elems, is_async, loc=loc, ip=ip)
+ cute.copy(copy_atom, src, dst, pred=pred, loc=loc, ip=ip, **kwargs)
+
+
+# -------------------------
+# Reduction base (from quack.reduction_base)
+# -------------------------
+
+
+class ReductionBase:
+ def __init__(
+ self,
+ dtype: Type[cutlass.Numeric],
+ N: int,
+ stage: int,
+ reduction_dtype: Type[cutlass.Numeric] = cutlass.Float32,
+ ):
+ self.dtype = dtype
+ self.N = N
+ self.stage = stage
+ self.reduction_dtype = reduction_dtype
+
+ def _calculate_threads_per_row(self) -> int:
+ raise NotImplementedError()
+
+ def _set_cluster_n(self) -> None:
+ self.cluster_n = 1
+
+ def _get_num_threads(self) -> int:
+ return 128 if self.N <= 16384 else 256
+
+ def _get_tv_layout(self, num_copy_bits: int = 128) -> Tuple[cute.Shape, cute.Layout]:
+ vecsize = num_copy_bits // self.dtype.width
+ assert self.N % vecsize == 0, f"Input N {self.N} is not divisible by vector size {vecsize}"
+ num_threads = self._get_num_threads()
+ assert num_threads % cute.arch.WARP_SIZE == 0
+
+ threads_per_row = self._calculate_threads_per_row()
+ self._set_cluster_n()
+ num_blocks_N = cute.ceil_div(self.N // vecsize, threads_per_row * self.cluster_n)
+ cols_per_block = num_threads // threads_per_row
+ tiler_mn = (cols_per_block, vecsize * num_blocks_N * threads_per_row)
+ tv_layout = cute.make_layout(
+ ((threads_per_row, cols_per_block), (vecsize, num_blocks_N)),
+ stride=(
+ (vecsize * cols_per_block, 1),
+ (cols_per_block, cols_per_block * vecsize * threads_per_row),
+ ),
+ )
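+        # Worked example (illustrative): for bf16 (width 16) with
+        # num_copy_bits=128, vecsize = 8. Under RMSNormBackward's policy at
+        # N=8192 (threads_per_row=256, num_threads=256, cluster_n=1):
+        #   num_blocks_N   = ceil_div(8192 // 8, 256) = 4
+        #   cols_per_block = 256 // 256 = 1
+        #   tiler_mn       = (1, 8 * 4 * 256) = (1, 8192)
+        # i.e. one CTA covers a full row, each thread copying 4 vectors of
+        # 8 elements.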
+ return tiler_mn, tv_layout
+
+ def _smem_size_in_bytes(self, tiler_mn, num_warps: int) -> int:
+ return (
+ cute.size_in_bytes(self.dtype, cute.make_layout(tiler_mn))
+ + self.stage * num_warps * self.cluster_n * (self.reduction_dtype.width // 8)
+ + self.stage * (cutlass.Int64.width // 8)
+ )
+
+ def _get_reduction_buffer_layout(self, tv_layout: cute.Layout, cluster_n: int) -> cute.Layout:
+ num_warps = cute.size(tv_layout, mode=[0]) // cute.arch.WARP_SIZE
+ warps_per_row = max(tv_layout.shape[0][0] // cute.arch.WARP_SIZE, 1)
+ return cute.make_ordered_layout(
+ (num_warps // warps_per_row, (warps_per_row, cluster_n), self.stage),
+ order=(1, 0, 2),
+ )
+
+ def _allocate_reduction_buffer_and_mbar(
+ self,
+ smem: cutlass.utils.SmemAllocator,
+ tv_layout: cute.Layout,
+ is_persistent: bool = False,
+ ) -> Tuple[cute.Tensor, Optional[cute.Pointer]]:
+ reduction_buffer = smem.allocate_tensor(
+ self.reduction_dtype,
+ self._get_reduction_buffer_layout(tv_layout, self.cluster_n),
+ byte_alignment=4,
+ )
+ if cutlass.const_expr(self.cluster_n > 1):
+ mbar_ptr = smem.allocate_array(
+ cutlass.Int64,
+ num_elems=self.stage if not is_persistent else self.stage * 2,
+ )
+ else:
+ mbar_ptr = None
+ return reduction_buffer, mbar_ptr
+
+ @cute.jit
+ def _initialize_cluster(
+ self,
+ tidx: cutlass.Int32,
+ mbar_ptr: Optional[cute.Pointer],
+ num_warps: int,
+ is_persistent: bool = False,
+ ) -> None:
+ if cutlass.const_expr(self.cluster_n > 1 and mbar_ptr is not None):
+ if tidx < self.stage:
+ cute.arch.mbarrier_init(mbar_ptr + tidx, 1)
+ if cutlass.const_expr(is_persistent):
+ cute.arch.mbarrier_init(
+ mbar_ptr + self.stage + tidx,
+ num_warps * self.cluster_n,
+ )
+ cute.arch.mbarrier_init_fence()
+ cute.arch.cluster_arrive_relaxed()
+
+
# -------------------------
-# SM count helper
+# RMSNorm backward base (from quack.rmsnorm.RMSNormBackward)
# -------------------------
-def get_sm_count(N: int, device: torch.device) -> int:
+class RMSNormBackward(ReductionBase):
+ def __init__(self, dtype: cutlass.Numeric, N: int):
+ # 2 stages for double buffering when computing mean of x_hat * wdy
+ super().__init__(dtype, N, stage=2, reduction_dtype=Float32)
+ self.reload_wdy = None if N <= 16 * 1024 else "smem"
+ if self.N > 128 * 1024 and self.dtype.width >= 32:
+ raise ValueError("RMSNormBackward does not support N > 128k with dtype >= 32 bits")
+
+ def _get_num_threads(self) -> int:
+ return 128 if self.N <= 4096 else 256
+
+ def _calculate_threads_per_row(self) -> int:
+ N = self.N
+ return (
+ 8
+ if N <= 64
+ else (
+ 16
+ if N <= 128
+ else (32 if N <= 256 else (64 if N <= 512 else (128 if N <= 4096 else 256)))
+ )
+ )
+
+ def _set_cluster_n(self) -> None:
+ N = self.N
+ cluster_n = (
+ 1
+ if N <= 8 * 1024
+ else (2 if N <= 16 * 1024 else (4 if N <= 32 * 1024 else (8 if N <= 64 * 1024 else 16)))
+ )
+ self.cluster_n = cluster_n
+
+ def _smem_size_in_bytes(self, tiler_mn, num_warps: int, do_dtype=None) -> int:
+ if do_dtype is None:
+ do_dtype = self.dtype
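+        # Illustrative sizing (bf16, N=8192, tiler_mn=(1, 8192), 8 warps,
+        # cluster_n=1): 2 x 16 KiB double-buffered X tiles, 2 x 16 KiB dO
+        # tiles, 64 B of reduction buffer and 32 B of mbarriers, i.e. roughly
+        # 64 KiB of SMEM.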
+ return (
+ cute.size_in_bytes(self.dtype, cute.make_layout(tiler_mn)) * 2
+ + cute.size_in_bytes(do_dtype, cute.make_layout(tiler_mn)) * 2
+ + self.stage * num_warps * self.cluster_n * (self.reduction_dtype.width // 8)
+ + self.stage * (cutlass.Int64.width // 8) * 2
+ )
+
+ @cute.jit
+ def __call__(
+ self,
+ mX: cute.Tensor,
+ mW: Optional[cute.Tensor],
+ mdO: cute.Tensor,
+ mdResO: Optional[cute.Tensor],
+ mRstd: cute.Tensor,
+ mdX: cute.Tensor,
+ mdW: Optional[cute.Tensor],
+ mdRes: Optional[cute.Tensor],
+ mdB: Optional[cute.Tensor],
+ sm_count: Int32,
+ stream: cuda.CUstream,
+ ):
+ semistatic_shape = (*mX.shape[:-1], self.N)
+
+ def new_stride(t):
+ return (
+ cute.assume(t.stride[0], divby=128 // t.element_type.width),
+ t.stride[1],
+ )
+
+ mX, mdO, mdResO, mdX, mdRes = [
+ cute.make_tensor(t.iterator, cute.make_layout(semistatic_shape, stride=new_stride(t)))
+ if const_expr(t is not None)
+ else None
+ for t in (mX, mdO, mdResO, mdX, mdRes)
+ ]
+ self._set_cluster_n()
+ largest_dtype_width = const_expr(
+ max(
+ mX.element_type.width,
+ mdO.element_type.width,
+ mdX.element_type.width,
+ mdResO.element_type.width if mdResO is not None else 0,
+ mdRes.element_type.width if mdRes is not None else 0,
+ )
+ )
+ tiler_mn, tv_layout = self._get_tv_layout(
+ num_copy_bits=128 // largest_dtype_width * mX.element_type.width
+ )
+ num_threads = (
+ cute.size(tv_layout, mode=[0]) if _KERNEL_ACCEPTS_LAYOUT_ARGS else self._get_num_threads()
+ )
+ num_warps = num_threads // cute.arch.WARP_SIZE
+ if const_expr(mW is not None):
+ mW_expanded_layout = cute.prepend(
+ mW.layout,
+ cute.make_layout((tiler_mn[0],), stride=(0,)),
+ )
+ mW = cute.make_tensor(mW.iterator, mW_expanded_layout)
+
+ num_blocks = sm_count
+ kernel = (
+ self.kernel(mX, mW, mdO, mdResO, mRstd, mdX, mdW, mdB, mdRes, tv_layout, tiler_mn)
+ if _KERNEL_ACCEPTS_LAYOUT_ARGS
+ else self.kernel(mX, mW, mdO, mdResO, mRstd, mdX, mdW, mdB, mdRes)
+ )
+ kernel.launch(
+ grid=[num_blocks, self.cluster_n, 1],
+ block=[num_threads, 1, 1],
+ cluster=[1, self.cluster_n, 1] if self.cluster_n > 1 else None,
+ smem=self._smem_size_in_bytes(tiler_mn, num_warps, do_dtype=mdO.element_type),
+ stream=stream,
+ )
+
+ @cute.jit
+ def _kernel_impl(
+ self,
+ mX: cute.Tensor,
+ mW: Optional[cute.Tensor],
+ mdO: cute.Tensor,
+ mdResO: Optional[cute.Tensor],
+ mRstd: cute.Tensor,
+ mdX: cute.Tensor,
+ mdW: Optional[cute.Tensor],
+ mdB: Optional[cute.Tensor],
+ mdRes: Optional[cute.Tensor],
+ tv_layout: cute.Layout,
+ tiler_mn: cute.Shape,
+ ):
+ tidx, _, _ = cute.arch.thread_idx()
+ bidx_start, _, _ = cute.arch.block_idx()
+ gdim, _, _ = cute.arch.grid_dim()
+ if const_expr(self.cluster_n > 1):
+ cluster_y = cute.arch.block_idx()[1]
+ else:
+ cluster_y = const_expr(0)
+
+ shape = mX.shape
+ M = shape[0]
+ is_even_N = const_expr(shape[1] == tiler_mn[1] * self.cluster_n)
+
+ idX = cute.make_identity_tensor(shape)
+
+ smem = cutlass.utils.SmemAllocator()
+ smem_layout = cute.make_ordered_layout((tiler_mn[0], tiler_mn[1], 2), order=(1, 0, 2))
+ sX = smem.allocate_tensor(mX.element_type, smem_layout, byte_alignment=16)
+ sdO = smem.allocate_tensor(mdO.element_type, smem_layout, byte_alignment=16)
+ reduction_buffer, mbar_ptr = self._allocate_reduction_buffer_and_mbar(
+ smem,
+ tv_layout,
+ is_persistent=True,
+ )
+ if const_expr(mbar_ptr is not None):
+ mbar_full_ptr, mbar_empty_ptr = mbar_ptr, mbar_ptr + 2
+ else:
+ mbar_full_ptr, mbar_empty_ptr = None, None
+
+ num_copy_elems_X = tv_layout.shape[1][0]
+ copy_atom_load_X = get_copy_atom(mX.element_type, num_copy_elems_X, is_async=False)
+ thr_copy_X = cute.make_tiled_copy(copy_atom_load_X, tv_layout, tiler_mn).get_slice(tidx)
+ copy_fn = partial(copy, num_copy_elems=num_copy_elems_X)
+
+ gX, gdO, gdResO, gdX, gdRes, cX = [
+ cute.local_tile(mT, tiler_mn, (None, cluster_y)) if mT is not None else None
+ for mT in (mX, mdO, mdResO, mdX, mdRes, idX)
+ ]
+ gW = cute.local_tile(mW, tiler_mn, (0, cluster_y)) if mW is not None else None
+ gdW, gdB = [
+ cute.local_tile(mT, (1, tiler_mn[1]), (bidx_start, cluster_y))
+ if const_expr(mT is not None)
+ else None
+ for mT in (mdW, mdB)
+ ]
+
+ tXgX = thr_copy_X.partition_S(gX)
+ tXsX = thr_copy_X.partition_D(sX)
+ tXgdO = thr_copy_X.partition_S(gdO)
+ tXsdO = thr_copy_X.partition_D(sdO)
+ tXgdX = thr_copy_X.partition_D(gdX)
+ if const_expr(mdResO is not None):
+ tXgdResO = thr_copy_X.partition_S(gdResO)
+ if const_expr(mdRes is not None):
+ tXgdRes = thr_copy_X.partition_D(gdRes)
+ tXcX = thr_copy_X.partition_S(cX)[(0, None), None, None, None]
+
+ tXrX, tXrdO, tXrdX = [
+ cute.make_fragment_like(thr[None, None, None, 0]) for thr in (tXgX, tXgdO, tXgdX)
+ ]
+ tXrdResO = None
+ if const_expr(mdResO is not None):
+ tXrdResO = cute.make_fragment_like(tXgdResO[None, None, None, 0])
+ tXrdRes = None
+ if const_expr(mdRes is not None):
+ tXrdRes = cute.make_fragment_like(tXgdRes[None, None, None, 0])
+
+ tXpX = (
+ predicate_k(thr_copy_X.partition_S(cX[None, None, 0]), limit=shape[1])
+ if not is_even_N
+ else None
+ )
+
+ tXgdW, tXrdW = None, None
+ tXgdB, tXrdB = None, None
+ if const_expr(mdW is not None):
+ tXgdW = thr_copy_X.partition_S(gdW)
+ tXrdW = cute.make_fragment_like(tXgdW, Float32)
+ if const_expr(mdB is not None):
+ tXgdB = thr_copy_X.partition_S(gdB)
+ tXrdB = cute.make_fragment_like(tXgdB, Float32)
+
+ num_warps = cute.size(tv_layout, mode=[0]) // cute.arch.WARP_SIZE
+
+ self._initialize_cluster(tidx, mbar_ptr, num_warps, is_persistent=True)
+
+ tXrW = None
+ if const_expr(mW is not None):
+ tXgW = thr_copy_X.partition_S(gW)
+ tXrW = cute.make_fragment_like(tXgW)
+ if not is_even_N:
+ tXrW.fill(0.0)
+ copy_fn(tXgW, tXrW, pred=tXpX)
+
+ row = tXcX[None, None, None, bidx_start][0][0]
+ if row < M:
+ tXgX_cur = coord_offset_i64(bidx_start, tXgX, dim=3)[None, None, None, 0]
+ tXgdO_cur = coord_offset_i64(bidx_start, tXgdO, dim=3)[None, None, None, 0]
+ copy_fn(tXgX_cur, tXsX[None, None, None, 0], pred=tXpX, is_async=True)
+ copy_fn(tXgdO_cur, tXsdO[None, None, None, 0], pred=tXpX, is_async=True)
+ elif tiler_mn[0] > 1:
+ fill_oob(tXsX[None, None, None, 0], None, fill_value=mX.element_type.zero)
+ fill_oob(tXsdO[None, None, None, 0], None, fill_value=mdO.element_type.zero)
+ cute.arch.cp_async_commit_group()
+
+ if const_expr(self.cluster_n > 1):
+ cute.arch.cluster_wait()
+
+ threads_per_row = tv_layout.shape[0][0]
+ if const_expr(mdW is not None):
+ tXrdW.fill(0.0)
+ if const_expr(mdB is not None):
+ tXrdB.fill(0.0)
+ stage = Int32(0)
+ producer_phase = Int32(1)
+ consumer_phase = Int32(0)
+ for bidx in cutlass.range(bidx_start, cute.ceil_div(M, tiler_mn[0]), gdim):
+ row = tXcX[None, None, None, bidx][0][0]
+ if row + gdim * tiler_mn[0] < M:
+ tXgX_cur = coord_offset_i64(bidx + gdim, tXgX, dim=3)[None, None, None, 0]
+ tXgdO_cur = coord_offset_i64(bidx + gdim, tXgdO, dim=3)[None, None, None, 0]
+ copy_fn(tXgX_cur, tXsX[None, None, None, stage ^ 1], pred=tXpX, is_async=True)
+ copy_fn(tXgdO_cur, tXsdO[None, None, None, stage ^ 1], pred=tXpX, is_async=True)
+ elif tiler_mn[0] > 1:
+ fill_oob(
+ tXsX[None, None, None, stage ^ 1],
+ None,
+ fill_value=mX.element_type.zero,
+ )
+ fill_oob(
+ tXsdO[None, None, None, stage ^ 1],
+ None,
+ fill_value=mdO.element_type.zero,
+ )
+ cute.arch.cp_async_commit_group()
+            rstd_val = Float32.zero
+ if row < M or tiler_mn[0] == 1:
+ rstd_val = mRstd[row]
+ if const_expr(mdResO is not None):
+ tXgdResO_cur = coord_offset_i64(bidx, tXgdResO, dim=3)[None, None, None, 0]
+ if row < M or tiler_mn[0] == 1:
+ copy_fn(tXgdResO_cur, tXrdResO, pred=tXpX)
+ elif tiler_mn[0] > 1:
+ tXrdResO.fill(0.0)
+ cute.arch.cp_async_wait_group(1)
+ cute.autovec_copy(tXsX[None, None, None, stage], tXrX)
+ x = tXrX.load().to(cute.Float32)
+ cute.autovec_copy(tXsdO[None, None, None, stage], tXrdO)
+ dout = tXrdO.load().to(cute.Float32)
+ x_hat = x * rstd_val
+ wdy = dout
+ if const_expr(mW is not None):
+ wdy *= tXrW.load().to(Float32)
+ if const_expr(self.cluster_n > 1):
+ cute.arch.mbarrier_wait(mbar_empty_ptr + stage, producer_phase)
+ mean_xhat_wdy = (
+ row_reduce_add(
+ x_hat * wdy,
+ threads_per_row,
+ reduction_buffer[None, None, stage],
+ (mbar_full_ptr + stage if const_expr(self.cluster_n > 1) else None),
+ phase=consumer_phase,
+ init_val=0.0,
+ )
+ / shape[1]
+ )
+
+ if const_expr(self.cluster_n > 1):
+ cute.arch.fence_proxy(
+ cute.arch.ProxyKind.async_shared,
+ space=cute.arch.SharedSpace.shared_cta,
+ )
+ cute.arch.sync_warp()
+ lane_idx = cute.arch.lane_idx()
+ if lane_idx < self.cluster_n:
+ cute.arch.mbarrier_arrive(
+ mbar_empty_ptr + stage,
+ peer_cta_rank_in_cluster=lane_idx,
+ )
+
+ if const_expr(self.reload_wdy == "smem"):
+ cute.autovec_copy(tXsdO[None, None, None, stage], tXrdO)
+ dout = tXrdO.load().to(cute.Float32)
+ wdy = dout
+ if const_expr(mW is not None):
+ wdy *= tXrW.load().to(Float32)
+
+ dx = (wdy - x_hat * mean_xhat_wdy) * rstd_val
+ if const_expr(mdResO is not None):
+ dx += tXrdResO.load().to(cute.Float32)
+ tXrdX.store(dx.to(tXrdX.element_type))
+ if row < M or tiler_mn[0] == 1:
+ tXgdX_cur = coord_offset_i64(bidx, tXgdX, dim=3)[None, None, None, 0]
+ copy_fn(tXrdX, tXgdX_cur, pred=tXpX)
+ if const_expr(mdRes is not None):
+ tXrdRes.store(dx.to(tXrdRes.element_type))
+ tXgdRes_cur = coord_offset_i64(bidx, tXgdRes, dim=3)[None, None, None, 0]
+ if row < M or tiler_mn[0] == 1:
+ copy_fn(tXrdRes, tXgdRes_cur, pred=tXpX)
+ if const_expr(mdW is not None):
+ tXrdW.store(tXrdW.load() + dout * x_hat)
+ if const_expr(mdB is not None):
+ tXrdB.store(tXrdB.load() + dout)
+
+ stage ^= 1
+ if stage == 0:
+ consumer_phase ^= 1
+ producer_phase ^= 1
+
+ if const_expr(tiler_mn[0] > 1):
+ if const_expr(mdW is not None):
+ sdW = cute.make_tensor(
+ cute.recast_ptr(sX.iterator, dtype=cute.Float32),
+ cute.make_ordered_layout(tiler_mn, order=(1, 0)),
+ )
+ tXsdW = thr_copy_X.partition_D(sdW)
+ cute.arch.barrier()
+ row0 = tXcX[None, None, None, 0][0][0]
+ if row0 > 0:
+ cute.autovec_copy(tXrdW, tXsdW)
+ cute.arch.barrier()
+ if row0 == 0:
+ for i in cutlass.range_constexpr(1, const_expr(tiler_mn[0])):
+ tXrdW_other = cute.make_fragment_like(tXrdW)
+ tXsdW_other = cute.make_tensor(
+ tXsdW.iterator + i * sdW.stride[0],
+ tXsdW.layout,
+ )
+ cute.autovec_copy(tXsdW_other, tXrdW_other)
+ tXrdW.store(tXrdW.load() + tXrdW_other.load())
+ copy_fn(tXrdW, tXgdW, pred=tXpX)
+ cute.arch.barrier()
+ if const_expr(mdB is not None):
+ sdB = cute.make_tensor(
+ cute.recast_ptr(sX.iterator, dtype=cute.Float32),
+ cute.make_ordered_layout(tiler_mn, order=(1, 0)),
+ )
+ tXsdB = thr_copy_X.partition_D(sdB)
+ cute.arch.barrier()
+ row0 = tXcX[None, None, None, 0][0][0]
+ if row0 > 0:
+ cute.autovec_copy(tXrdB, tXsdB)
+ cute.arch.barrier()
+ if row0 == 0:
+ for i in cutlass.range_constexpr(1, const_expr(tiler_mn[0])):
+ tXrdB_other = cute.make_fragment_like(tXrdB)
+ tXsdB_other = cute.make_tensor(
+ tXsdB.iterator + i * sdB.stride[0],
+ tXsdB.layout,
+ )
+ cute.autovec_copy(tXsdB_other, tXrdB_other)
+ tXrdB.store(tXrdB.load() + tXrdB_other.load())
+ copy_fn(tXrdB, tXgdB, pred=tXpX)
+ else:
+ if const_expr(mdW is not None):
+ copy_fn(tXrdW, tXgdW, pred=tXpX)
+ if const_expr(mdB is not None):
+ copy_fn(tXrdB, tXgdB, pred=tXpX)
+
+ if const_expr(self.cluster_n > 1):
+ stage ^= 1
+ if stage == 0:
+ producer_phase ^= 1
+ cute.arch.mbarrier_wait(mbar_empty_ptr + stage, producer_phase)
+
+ if _KERNEL_ACCEPTS_LAYOUT_ARGS:
+
+ @cute.kernel
+ def kernel(
+ self,
+ mX: cute.Tensor,
+ mW: Optional[cute.Tensor],
+ mdO: cute.Tensor,
+ mdResO: Optional[cute.Tensor],
+ mRstd: cute.Tensor,
+ mdX: cute.Tensor,
+ mdW: Optional[cute.Tensor],
+ mdB: Optional[cute.Tensor],
+ mdRes: Optional[cute.Tensor],
+ tv_layout: cute.Layout,
+ tiler_mn: cute.Shape,
+ ):
+ self._kernel_impl(
+ mX,
+ mW,
+ mdO,
+ mdResO,
+ mRstd,
+ mdX,
+ mdW,
+ mdB,
+ mdRes,
+ tv_layout,
+ tiler_mn,
+ )
+ else:
+
+ @cute.kernel
+ def kernel(
+ self,
+ mX: cute.Tensor,
+ mW: Optional[cute.Tensor],
+ mdO: cute.Tensor,
+ mdResO: Optional[cute.Tensor],
+ mRstd: cute.Tensor,
+ mdX: cute.Tensor,
+ mdW: Optional[cute.Tensor],
+ mdB: Optional[cute.Tensor],
+ mdRes: Optional[cute.Tensor],
+ ):
+ largest_dtype_width = const_expr(
+ max(
+ mX.element_type.width,
+ mdO.element_type.width,
+ mdX.element_type.width,
+ mdResO.element_type.width if mdResO is not None else 0,
+ mdRes.element_type.width if mdRes is not None else 0,
+ )
+ )
+ tiler_mn, tv_layout = self._get_tv_layout(
+ num_copy_bits=128 // largest_dtype_width * mX.element_type.width
+ )
+ self._kernel_impl(
+ mX,
+ mW,
+ mdO,
+ mdResO,
+ mRstd,
+ mdX,
+ mdW,
+ mdB,
+ mdRes,
+ tv_layout,
+ tiler_mn,
+ )
+
+
+# -------------------------
+# SM count helper (from quack.rmsnorm._get_sm_count)
+# -------------------------
+
+
+def get_sm_count(
+ N: int,
+ device: torch.device,
+ M: Optional[int] = None,
+ dtype: Optional[torch.dtype] = None,
+) -> int:
"""
- Heuristic for the number of persistent CTAs (sm_count) based on N and
- the GPU's SM count. This mirrors the behaviour used in Quack's
- RMSNorm kernels but lives entirely in this local module.
+ SM count heuristic for reduction-style kernels.
+
+ This starts from Quack's _get_sm_count policy and layers on SM100 /
+ DSv3-specific tuning so that:
+ - For DSv3-style shapes (large-M, N in {6144, 8192}, fp16/bf16),
+ sm_count is reduced for very large M to cut down the number of
+ dw_partial/db_partial rows that ever hit HBM.
+ - For Quack-suite hidden=4096, small-M shapes, sm_count is modestly
+ increased to improve SM occupancy, matching the existing SM100
+ tuning used by both RMSNorm and LayerNorm.
"""
+ props = torch.cuda.get_device_properties(device)
+ num_sms = props.multi_processor_count
+
sm_count_multiple = (
- 16
- if N <= 256
- else (8 if N <= 1024 else (4 if N <= 2048 else (2 if N <= 4096 else 1)))
- )
- sm_count = torch.cuda.get_device_properties(device).multi_processor_count
- sm_count = (
- sm_count * sm_count_multiple
- if N <= 8192
- else sm_count // 2
- if N <= 16384
- else sm_count * 2
+ 16 if N <= 256 else (8 if N <= 1024 else (4 if N <= 2048 else (2 if N <= 4096 else 1)))
)
+ sm_count = num_sms
+ if N <= 8192:
+ sm_count = sm_count * sm_count_multiple
+ elif N <= 16384:
+ sm_count = sm_count // 2
+ else:
+ sm_count = sm_count * 2
+
+ # Quack-suite tuning: for small-M, hidden=4096 shapes (M<=8192) and
+ # 16-bit dtypes, increase sm_count to improve occupancy. This mirrors
+ # the existing SM100 RMSNorm/LayerNorm heuristics.
+ if (
+ dtype in (torch.float16, torch.bfloat16)
+ and M is not None
+ and M <= 8192
+ and N == 4096
+ ):
+ sm_count = min(sm_count * 2, num_sms * 4)
+
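+    # Worked example (illustrative, assuming a hypothetical 148-SM device):
+    #   N=4096  -> 148 * 2 = 296, boosted to 592 for fp16/bf16 when M <= 8192;
+    #   N=8192  -> 148;  N=16384 -> 74;  N=32768 -> 296.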
return sm_count
diff --git a/oink/src/kernelagent_oink/blackwell/rmsnorm.py b/oink/src/kernelagent_oink/blackwell/rmsnorm.py
index c7fc1b3..1e080a3 100644
--- a/oink/src/kernelagent_oink/blackwell/rmsnorm.py
+++ b/oink/src/kernelagent_oink/blackwell/rmsnorm.py
@@ -76,6 +76,7 @@
import cutlass.cute as cute # noqa: E402
from cutlass import Float32, Int32, const_expr # noqa: E402
from cutlass.cute import runtime as rt # noqa: E402
+from cutlass.cute.runtime import from_dlpack # noqa: E402
# Simple compile cache declared early so direct execution works
_PTR_COMPILE_CACHE = {}
@@ -114,6 +115,14 @@ def _env_flag(name: str, default: bool) -> bool:
_ENABLE_TPR256 = _env_flag("OINK_RMSNORM_ENABLE_TPR256", default=False)
_ENABLE_STAGE2 = _env_flag("OINK_RMSNORM_ENABLE_STAGE2", default=False)
+# Forward dispatch control:
+# - Default behavior: use the pointer-based path when safe, otherwise fall back
+# to the stage-2 module (then the torch reference).
+# - If you want to force stage-2 even when the pointer path is available (for
+# experimentation / A-B testing), set this env var **before** importing this
+# module.
+_FORCE_RMSNORM_STAGE2_FWD = _env_flag("KERNELAGENT_OINK_FORCE_RMSNORM_STAGE2", default=False)
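+# Example (illustrative; the script name below is just a placeholder):
+#   KERNELAGENT_OINK_FORCE_RMSNORM_STAGE2=1 python your_vllm_script.py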
+
# CuTeDSL stability probe for the experimental cluster_n>1 + direct-GMEM schedule.
#
# Some CuTeDSL builds segfault during JIT compilation when combining:
@@ -918,7 +927,13 @@ def _get_fast_ptr_fused_add_rmsnorm_launcher(
# NOTE: Avoid `from . import ...` imports here: CuTeDSL's AST preprocessor may
# mishandle that form (module=None in the AST). Use fully-qualified imports.
from kernelagent_oink.blackwell import lite_quack as qutils # noqa: E402
-from kernelagent_oink.blackwell.lite_quack import TORCH2CUTE_DTYPE, row_reduce # noqa: E402
+from kernelagent_oink.blackwell.lite_quack import ( # noqa: E402
+ TORCH2CUTE_DTYPE,
+ RMSNormBackward as BaseRMSNormBackward,
+ convert_from_dlpack as convert_from_dlpack_cute,
+ get_sm_count,
+ row_reduce,
+)
# -------------------------
@@ -2720,52 +2735,57 @@ def rmsnorm_forward(
assert x.dim() == 2, "Use (M, N) tensor; flatten batch/seq beforehand."
M, N = x.shape
- # For DSv3 big-M outliers on SM100, keep using the dedicated
- # stage-2 K-loop implementation, which is already tuned and
- # parity-checked against the reference.
- use_stage2_big_dsv3 = bool(
- M >= 65536 and N in (6144, 8192) and x.dtype in (torch.float16, torch.bfloat16)
- )
- if use_stage2_big_dsv3:
- try:
- import rmsnorm_with_stage2 as rms2 # type: ignore[import-not-found]
- except Exception:
- rms2 = None # type: ignore[assignment]
- if rms2 is not None:
- y, rstd, residual_out = rms2.rmsnorm_forward_with_stage2(
- x,
- weight=weight,
- bias=bias,
- residual=residual,
- eps=eps,
- store_rstd=store_rstd,
- )
- # Preserve stride contracts for torch.compile consistency, even
- # when using the optional stage-2 implementation.
- if y.stride() != x.stride():
- y_strided = torch.empty_strided(
- x.shape, x.stride(), device=x.device, dtype=x.dtype
- )
- y_strided.copy_(y)
- y = y_strided
- if residual is not None and residual_out is not None:
- if residual_out.stride() != residual.stride():
- residual_out_strided = torch.empty_strided(
- residual.shape,
- residual.stride(),
- device=residual.device,
- dtype=residual.dtype,
- )
- residual_out_strided.copy_(residual_out)
- residual_out = residual_out_strided
- return y, rstd, residual_out
-
- # Default: use the pointer-based entry whenever we can represent the
- # inputs as a row-major [M, N] view with stride(1) == 1. For rare layouts
- # we can't safely express without DLPack, fall back to a torch reference.
- if _can_use_ptr_path(x, weight, bias, residual):
+ # Fast path: use the pointer-based entry whenever we can represent the
+ # inputs as a row-major [M, N] view with stride(1) == 1 and dtype contracts
+ # are satisfied (vLLM uses this in inference).
+ #
+ # When the pointer path can't be used (e.g. float32 weights for Quack-style
+ # APIs, or non-standard layouts), fall back to the CuTeDSL stage-2 module
+ # before using the slow torch reference implementation.
+ force_stage2 = _FORCE_RMSNORM_STAGE2_FWD
+
+ if not force_stage2 and _can_use_ptr_path(x, weight, bias, residual):
return _rmsnorm_forward_ptr(x, weight, bias, residual, eps, store_rstd)
+ # CuTeDSL fallback for cases that aren't safe for the pointer path.
+ # Import lazily to keep vLLM plugin startup and common inference fast paths
+ # lightweight.
+ try:
+ import importlib
+
+ rms2 = importlib.import_module(
+ ".rmsnorm_with_stage2",
+ package=__package__ or "kernelagent_oink.blackwell",
+ )
+ except Exception:
+ rms2 = None # type: ignore[assignment]
+ if rms2 is not None:
+ y, rstd, residual_out = rms2.rmsnorm_forward_with_stage2(
+ x,
+ weight=weight,
+ bias=bias,
+ residual=residual,
+ eps=eps,
+ store_rstd=store_rstd,
+ )
+ # Preserve stride contracts for torch.compile consistency, even
+ # when using the optional stage-2 implementation.
+ if y.stride() != x.stride():
+ y_strided = torch.empty_strided(x.shape, x.stride(), device=x.device, dtype=x.dtype)
+ y_strided.copy_(y)
+ y = y_strided
+ if residual is not None and residual_out is not None:
+ if residual_out.stride() != residual.stride():
+ residual_out_strided = torch.empty_strided(
+ residual.shape,
+ residual.stride(),
+ device=residual.device,
+ dtype=residual.dtype,
+ )
+ residual_out_strided.copy_(residual_out)
+ residual_out = residual_out_strided
+ return y, rstd, residual_out
+
# Safe fallback (correctness-first). This is expected to be rare in vLLM.
y = rmsnorm_ref(x, weight, bias, residual, eps)
# Preserve the input stride contract even on the fallback path so
@@ -2910,6 +2930,363 @@ def fused_add_rmsnorm_inplace_(
return None
+# -------------------------
+# Backward kernel (SM100)
+# -------------------------
+
+
+class RMSNormBackwardSM100(BaseRMSNormBackward):
+ """SM100-tuned RMSNorm backward.
+
+ This is a thin wrapper around the generic `lite_quack.RMSNormBackward`
+ base implementation, with SM100-friendly tiling heuristics that mirror
+ the forward policy used by Oink.
+ """
+
+ def __init__(self, dtype: cutlass.Numeric, N: int):
+ super().__init__(dtype, N)
+
+ def _get_num_threads(self) -> int:
+ # Keep 128 threads only up to N=4k; use 256 for larger rows to ensure
+ # threads_per_row <= num_threads across buckets.
+ try:
+ return self._nt_override # type: ignore[attr-defined]
+ except Exception:
+ return 128 if self.N <= 4096 else 256
+
+ def _calculate_threads_per_row(self) -> int:
+ # Mirror RMSNormSM100 forward's tiling.
+ N = self.N
+ if N <= 64:
+ return 8
+ if N <= 128:
+ return 16
+ if N <= 1024:
+ return 32
+ if N <= 4096:
+ return 128
+ if N <= 8192:
+ try:
+ return self._tpr_override # type: ignore[attr-defined]
+ except Exception:
+ return 128
+ if N <= 16384:
+ return 256
+ return 256
+
+ def _set_cluster_n(self) -> None:
+ # Reuse the SM100 forward cluster growth policy so large-N shapes can
+ # fan out across multiple CTAs in the same row.
+ try:
+ self.cluster_n = self._cluster_n_override # type: ignore[attr-defined]
+ return
+ except Exception:
+ pass
+
+ N = self.N
+ if N <= 8192:
+ cluster_n = 1
+ elif self.dtype.width == 16:
+ if N <= 16 * 1024:
+ cluster_n = 2
+ elif N <= 32 * 1024:
+ cluster_n = 2
+ elif N <= 64 * 1024:
+ cluster_n = 4
+ elif N <= 128 * 1024:
+ cluster_n = 8
+ else:
+ cluster_n = 16
+ else:
+ if N <= 32 * 1024:
+ cluster_n = 1
+ elif N <= 64 * 1024:
+ cluster_n = 2
+ elif N <= 128 * 1024:
+ cluster_n = 4
+ elif N <= 256 * 1024:
+ cluster_n = 8
+ else:
+ cluster_n = 16
+ self.cluster_n = cluster_n
+
+ @cute.jit
+ def __call__(
+ self,
+ mX: cute.Tensor,
+ mW: Optional[cute.Tensor],
+ mdO: cute.Tensor,
+ mdResO: Optional[cute.Tensor],
+ mRstd: cute.Tensor,
+ mdX: cute.Tensor,
+ mdW: Optional[cute.Tensor],
+ mdRes: Optional[cute.Tensor],
+ mdB: Optional[cute.Tensor],
+ sm_count: Int32,
+ stream: cuda.CUstream,
+ ):
+ # Match forward's 32B alignment on the leading dimension to unlock
+ # wider vectorization when legal.
+ semistatic_shape = (*mX.shape[:-1], self.N)
+
+ def new_stride(t):
+ return (
+ cute.assume(t.stride[0], divby=256 // t.element_type.width),
+ t.stride[1],
+ )
+
+ mX, mdO, mdResO, mdX, mdRes = [
+ cute.make_tensor(t.iterator, cute.make_layout(semistatic_shape, stride=new_stride(t)))
+ if const_expr(t is not None)
+ else None
+ for t in (mX, mdO, mdResO, mdX, mdRes)
+ ]
+
+ self._set_cluster_n()
+ largest_dtype_width = const_expr(
+ max(
+ mX.element_type.width,
+ mdO.element_type.width,
+ mdX.element_type.width,
+ mdResO.element_type.width if mdResO is not None else 0,
+ mdRes.element_type.width if mdRes is not None else 0,
+ )
+ )
+ tiler_mn, tv_layout = self._get_tv_layout(
+ num_copy_bits=128 // largest_dtype_width * mX.element_type.width
+ )
+ num_threads = (
+ cute.size(tv_layout, mode=[0]) if _KERNEL_ACCEPTS_LAYOUT_ARGS else self._get_num_threads()
+ )
+ num_warps = num_threads // cute.arch.WARP_SIZE
+ if const_expr(mW is not None):
+ mW_expanded_layout = cute.prepend(
+ mW.layout, cute.make_layout((tiler_mn[0],), stride=(0,))
+ )
+ mW = cute.make_tensor(mW.iterator, mW_expanded_layout)
+
+ num_blocks = sm_count
+ kernel = (
+ self.kernel(mX, mW, mdO, mdResO, mRstd, mdX, mdW, mdB, mdRes, tv_layout, tiler_mn)
+ if _KERNEL_ACCEPTS_LAYOUT_ARGS
+ else self.kernel(mX, mW, mdO, mdResO, mRstd, mdX, mdW, mdB, mdRes)
+ )
+ kernel.launch(
+ grid=[num_blocks, self.cluster_n, 1],
+ block=[num_threads, 1, 1],
+ cluster=[1, self.cluster_n, 1] if self.cluster_n > 1 else None,
+ smem=self._smem_size_in_bytes(tiler_mn, num_warps, do_dtype=mdO.element_type),
+ stream=stream,
+ )
+
+
+_BWD_COMPILE_CACHE: dict[tuple[object, ...], object] = {}
+
+
+def _rmsnorm_bwd_sm100(
+ x: Tensor,
+ weight: Optional[Tensor],
+ dout: Tensor,
+ rstd: Tensor,
+ dx: Tensor,
+ dw_partial: Optional[Tensor],
+ db_partial: Optional[Tensor] = None,
+ dresidual_out: Optional[Tensor] = None,
+ dresidual: Optional[Tensor] = None,
+ sm_count: Optional[int] = None,
+) -> None:
+ """SM100-specific RMSNorm backward dispatch.
+
+ Mirrors Quack's `quack.rmsnorm._rmsnorm_bwd`, but instantiates
+ `RMSNormBackwardSM100` (SM100-tuned heuristics).
+ """
+ assert x.dim() == 2, "Input must be 2D"
+ assert x.is_cuda, "Input tensor must be on CUDA device"
+ assert x.dtype in (torch.float16, torch.bfloat16, torch.float32)
+
+ if weight is not None:
+ assert weight.dim() == 1
+ assert x.shape[-1] == weight.shape[0]
+ assert weight.is_cuda
+ assert weight.dtype in (torch.float32, torch.bfloat16, torch.float16)
+ if dresidual_out is not None:
+ assert dresidual_out.shape == x.shape
+ assert dresidual_out.is_cuda
+ assert dresidual_out.dtype in (torch.float16, torch.bfloat16, torch.float32)
+ if dresidual is not None:
+ assert dresidual.shape == x.shape
+ assert dresidual.is_cuda
+ assert dresidual.dtype in (torch.float16, torch.bfloat16, torch.float32)
+
+ M, N = x.size(0), x.size(1)
+ device = x.device
+ if dw_partial is None and db_partial is None:
+ assert sm_count is not None
+ else:
+ sm_count = (
+ dw_partial.shape[0] if dw_partial is not None else db_partial.shape[0]
+ )
+
+ # Match Quack's conversion strategy for activations/gradients: keep the
+ # (M, N) layout dynamic without enforcing additional compact-shape
+ # constraints. This reduces per-call Python overhead for small-M shapes.
+    def convert_from_dlpack(t: Tensor) -> cute.Tensor:
+        return from_dlpack(t.detach(), assumed_align=16).mark_layout_dynamic(
+            leading_dim=1
+        )
+
+ x_tensor, dout_tensor, dres_out_tensor, dx_tensor, dres_tensor = [
+ convert_from_dlpack(t) if t is not None else None
+ for t in (x, dout, dresidual_out, dx, dresidual)
+ ]
+
+ if weight is not None:
+ weight_dtype = TORCH2CUTE_DTYPE[weight.dtype]
+ weight_tensor = convert_from_dlpack_cute(
+ weight.detach(),
+ leading_dim=0,
+ divisibility=128 // weight_dtype.width,
+ )
+ else:
+ weight_tensor = None
+
+ dw_partial_tensor = (
+ from_dlpack(dw_partial, assumed_align=16).mark_compact_shape_dynamic(mode=0)
+ if dw_partial is not None
+ else None
+ )
+ db_partial_tensor = (
+ from_dlpack(db_partial, assumed_align=16).mark_compact_shape_dynamic(mode=0)
+ if db_partial is not None
+ else None
+ )
+ rstd_tensor = (
+ from_dlpack(rstd.detach(), assumed_align=4).mark_layout_dynamic(leading_dim=0)
+ )
+
+ current_stream = cuda.CUstream(torch.cuda.current_stream().cuda_stream)
+
+ compile_key = (
+ M,
+ N,
+ x_tensor.element_type,
+ weight_tensor.element_type if weight is not None else None,
+ db_partial.dtype if db_partial is not None else None,
+ dresidual.dtype if dresidual is not None else None,
+ dresidual_out.dtype if dresidual_out is not None else None,
+ )
+ kernel = _BWD_COMPILE_CACHE.get(compile_key)
+ if kernel is None:
+ op = RMSNormBackwardSM100(x_tensor.element_type, N)
+
+ # Shape-specific tuning overrides for DSv3-style N=8192 rows.
+ if isinstance(op, RMSNormBackwardSM100) and N == 8192:
+ if M >= 65536:
+ op._tpr_override = 256 # type: ignore[attr-defined]
+ op._nt_override = 256 # type: ignore[attr-defined]
+ elif M >= 16384:
+ op._tpr_override = 256 # type: ignore[attr-defined]
+
+ kernel = cute.compile(
+ op,
+ x_tensor,
+ weight_tensor,
+ dout_tensor,
+ dres_out_tensor,
+ rstd_tensor,
+ dx_tensor,
+ dw_partial_tensor,
+ dres_tensor,
+ db_partial_tensor,
+ Int32(sm_count if sm_count is not None else 0),
+ current_stream,
+ )
+ _BWD_COMPILE_CACHE[compile_key] = kernel
+
+ kernel(
+ x_tensor,
+ weight_tensor,
+ dout_tensor,
+ dres_out_tensor,
+ rstd_tensor,
+ dx_tensor,
+ dw_partial_tensor,
+ dres_tensor,
+ db_partial_tensor,
+ Int32(sm_count if sm_count is not None else 0),
+ current_stream,
+ )
+
+
+def rmsnorm_backward(
+ x: Tensor,
+ weight: Optional[Tensor],
+ dout: Tensor,
+ rstd: Tensor,
+ dresidual_out: Optional[Tensor] = None,
+ has_bias: bool = False,
+ has_residual: bool = False,
+) -> Tuple[Tensor, Optional[Tensor], Optional[Tensor], Optional[Tensor]]:
+ """Public SM100 RMSNorm backward entry point.
+
+ Signature mirrors `quack.rmsnorm.rmsnorm_bwd` for easy comparisons.
+ """
+ device = x.device
+ M, N = x.size(0), x.size(1)
+ dx = torch.empty_like(x)
+ if dresidual_out is not None and dresidual_out.dtype != dx.dtype:
+ dresidual = torch.empty_like(x, dtype=dresidual_out.dtype)
+ else:
+ dresidual = None
+
+ # Shared SM100 tuning policy (used by both RMSNorm and LayerNorm).
+ sm_count = get_sm_count(N, device, M=M, dtype=x.dtype)
+
+ # Quack-suite smallest case (M=8192, N=4096) is extremely sensitive to
+ # Python/allocator overhead because the kernel itself is very fast.
+ #
+ # The default `lite_quack.get_sm_count` adds a small-M occupancy boost for
+ # N=4096, which increases `dw_partial` size and can amplify allocator
+ # pressure in benchmark/verify loops. Clamp to Quack's baseline policy
+ # (`sm_count = num_sms * 2` for N=4096) for this regime.
+ if N == 4096 and M <= 8192 and x.dtype in (torch.float16, torch.bfloat16):
+ try:
+ num_sms = torch.cuda.get_device_properties(device).multi_processor_count
+ sm_count = min(int(sm_count), int(num_sms) * 2)
+ except Exception:
+ pass
+
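+    # Size illustration (hedged, assuming a device with 148 SMs): the clamped
+    # N=4096 policy above gives sm_count = 2 * 148 = 296, so dw_partial below
+    # is a [296, 4096] fp32 buffer (~4.6 MiB) reduced once with sum(dim=0).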
+ if weight is not None:
+ dw_partial = torch.empty(sm_count, N, device=device, dtype=torch.float32)
+ else:
+ dw_partial = None
+ db_partial = (
+ torch.empty(sm_count, N, device=device, dtype=torch.float32) if has_bias else None
+ )
+
+ _rmsnorm_bwd_sm100(
+ x,
+ weight,
+ dout,
+ rstd,
+ dx,
+ dw_partial,
+ db_partial,
+ dresidual_out,
+ dresidual,
+ sm_count,
+ )
+
+ dw = dw_partial.sum(dim=0).to(weight.dtype) if weight is not None else None
+ db = db_partial.sum(dim=0).to(weight.dtype) if has_bias else None
+ if has_residual and dresidual is None:
+ dresidual = dx
+ return dx, dw, db, dresidual
+
+
+# Quack-style alias for benchmarks
+rmsnorm_bwd = rmsnorm_backward
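+
+# Minimal usage sketch (hedged; tensor shapes are illustrative). Given the
+# forward activations `x` (M, N), the weight `w` (N,), an upstream gradient
+# `dout`, and the per-row `rstd` saved by the forward pass:
+#
+#   dx, dw, db, dres = rmsnorm_backward(x, w, dout, rstd)
+#
+# `dw` is the fp32 partial-sum reduction cast back to `w.dtype`; `db` and
+# `dres` are returned only when `has_bias` / `has_residual` are set.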
+
+
if __name__ == "__main__":
# Minimal ad-hoc test (functionality only). For performance comparisons, use the benchmark harness.
if not torch.cuda.is_available():
diff --git a/oink/src/kernelagent_oink/blackwell/rmsnorm_with_stage2.py b/oink/src/kernelagent_oink/blackwell/rmsnorm_with_stage2.py
new file mode 100644
index 0000000..b53da12
--- /dev/null
+++ b/oink/src/kernelagent_oink/blackwell/rmsnorm_with_stage2.py
@@ -0,0 +1,805 @@
+# Copyright (c) Meta Platforms, Inc. and affiliates.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+"""
+RMSNorm kernel for SM100 (Blackwell) in CuteDSL, with the experimental
+stage-2 cp.async ping-pong path preserved for N≈6k/8k.
+
+This file is a fork of rmsnorm.py that preserves the K-loop cp.async path
+behind `self.stage > 1`, whereas the main implementation in rmsnorm.py has
+been simplified to a single-stage schedule.
+"""
+
+from __future__ import annotations
+
+import importlib.metadata
+import re
+from typing import Optional, Tuple
+
+import torch
+from torch import Tensor
+
+import cuda.bindings.driver as cuda # provided by NVIDIA cuda-python
+
+import cutlass
+import cutlass.cute as cute
+from cutlass import Float32, const_expr
+from cutlass.cute.runtime import from_dlpack
+
+from kernelagent_oink.blackwell import lite_quack as qutils
+from kernelagent_oink.blackwell.lite_quack import TORCH2CUTE_DTYPE, row_reduce
+
+_COMPILE_CACHE: dict[tuple[object, ...], object] = {}
+
+
+def _parse_version_tuple(version: str) -> tuple[int, int, int]:
+ parts = version.split(".")
+ nums: list[int] = []
+ for part in parts[:3]:
+ match = re.match(r"^(\d+)", part)
+ nums.append(int(match.group(1)) if match is not None else 0)
+ while len(nums) < 3:
+ nums.append(0)
+ return nums[0], nums[1], nums[2]
+
+
+def _cutlass_dsl_version() -> Optional[tuple[int, int, int]]:
+ try:
+ return _parse_version_tuple(importlib.metadata.version("nvidia-cutlass-dsl"))
+ except Exception:
+ return None
+
+
+_CUTLASS_DSL_VERSION = _cutlass_dsl_version()
+# CuTeDSL 4.3.4 tightened some kernel argument expectations (notably around
+# passing Layout/Shape/Constexpr objects into @cute.kernel functions). Keep the
+# older signature for <4.3.4, but switch to a 4.3.4+ compatible signature when
+# we detect 4.3.4+ (or when version detection is unavailable).
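+# For example, with the parsing above: "4.3.3" -> (4, 3, 3) keeps the
+# layout-arg kernel signature, while "4.3.4.dev0" -> (4, 3, 4) (or a missing
+# package) selects the 4.3.4+ compatible one.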
+_KERNEL_ACCEPTS_LAYOUT_ARGS = (
+ _CUTLASS_DSL_VERSION is not None and _CUTLASS_DSL_VERSION < (4, 3, 4)
+)
+
+
+@cute.jit
+def get_copy_atom_bw(
+ dtype: type[cutlass.Numeric], num_copy_elems: int, is_async: bool = False
+) -> cute.CopyAtom:
+ max_bits = const_expr(128 if is_async else 256)
+ num_copy_bits = const_expr(min(max_bits, num_copy_elems * dtype.width))
+ from cutlass.cute.nvgpu import cpasync
+
+ copy_op = (
+ cpasync.CopyG2SOp(cache_mode=cpasync.LoadCacheMode.GLOBAL)
+ if is_async
+ else cute.nvgpu.CopyUniversalOp()
+ )
+ return cute.make_copy_atom(copy_op, dtype, num_bits_per_copy=num_copy_bits)
+
+
+@cute.jit
+def copy_tiled(
+ src: cute.Tensor,
+ dst: cute.Tensor,
+ *,
+ pred: Optional[cute.Tensor] = None,
+ num_copy_elems: int = 1,
+ is_async: bool = False,
+) -> None:
+ atom = get_copy_atom_bw(src.element_type, num_copy_elems, is_async)
+ cute.copy(atom, src, dst, pred=pred)
+
+
+class RMSNormSM100WithStage2:
+ def __init__(self, N: int, dtype: type[cutlass.Numeric], stage: Optional[int] = None):
+ self.N = N
+ self.dtype = dtype
+ self.stage = 1 if stage is None else stage
+ self.reduction_dtype = cutlass.Float32
+
+ def _threads_per_row(self) -> int:
+ N = self.N
+ if N <= 64:
+ return 8
+ elif N <= 128:
+ return 16
+ elif N <= 1024:
+ return 32
+ elif N <= 4096:
+ return 128
+ elif N <= 8192:
+ try:
+ return self._tpr_override # type: ignore[attr-defined]
+ except Exception:
+ return 128
+ elif N <= 16384:
+ return 256
+ else:
+ return 256
+
+ def _cluster_n(self) -> int:
+ N = self.N
+ if N <= 8192:
+ return 1
+ if const_expr(self.dtype.width == 16):
+ if N <= 16 * 1024:
+ return 2
+ elif N <= 32 * 1024:
+ return 2
+ elif N <= 64 * 1024:
+ return 4
+ elif N <= 128 * 1024:
+ return 8
+ else:
+ return 16
+ else:
+ if N <= 32 * 1024:
+ return 1
+ elif N <= 64 * 1024:
+ return 2
+ elif N <= 128 * 1024:
+ return 4
+ elif N <= 256 * 1024:
+ return 8
+ else:
+ return 16
+
+ def _num_threads(self) -> int:
+ try:
+ return self._nt_override # type: ignore[attr-defined]
+ except Exception:
+ return 128 if self.N <= 16384 else 256
+
+ def _tv_layout(self, num_copy_bits: int = 256) -> Tuple[cute.Shape, cute.Layout]:
+ vecsize = num_copy_bits // self.dtype.width
+ num_threads = self._num_threads()
+ assert num_threads % cute.arch.WARP_SIZE == 0
+ tpr = self._threads_per_row()
+ cluster_n = self._cluster_n()
+ num_cols_vec = cute.ceil_div(self.N, vecsize)
+ num_blocks_N = cute.ceil_div(num_cols_vec, tpr * cluster_n)
+ cols_per_block = num_threads // tpr
+ tiler_mn = (cols_per_block, vecsize * num_blocks_N * tpr)
+ tv_layout = cute.make_layout(
+ ((tpr, cols_per_block), (vecsize, num_blocks_N)),
+ stride=((vecsize * cols_per_block, 1), (cols_per_block, cols_per_block * vecsize * tpr)),
+ )
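+        # Default heuristics example (bf16, N=8192, no overrides): vecsize=8,
+        # tpr=128, cluster_n=1, num_blocks_N=8, cols_per_block=1, so
+        # tiler_mn = (1, 8192), i.e. one row per CTA covering the whole row.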
+ return tiler_mn, tv_layout
+
+ @cute.jit
+ def __call__(
+ self,
+ mX: cute.Tensor,
+ mW: Optional[cute.Tensor],
+ mB: Optional[cute.Tensor],
+ mRes: Optional[cute.Tensor],
+ mO: cute.Tensor,
+ mResO: Optional[cute.Tensor],
+ mRstd: Optional[cute.Tensor],
+ stream: cuda.CUstream,
+ eps: Float32 = 1e-6,
+ ):
+ semistatic_shape = (*mX.shape[:-1], self.N)
+
+ def new_stride(t):
+ return (
+ cute.assume(t.stride[0], divby=256 // t.element_type.width),
+ t.stride[1],
+ )
+
+ mX, mRes, mO, mResO = [
+ cute.make_tensor(t.iterator, cute.make_layout(semistatic_shape, stride=new_stride(t)))
+ if const_expr(t is not None)
+ else None
+ for t in (mX, mRes, mO, mResO)
+ ]
+ assert mX.element_type == self.dtype
+ assert mO.element_type == self.dtype
+
+ copy_bits = const_expr(128)
+ tiler_mn, tv_layout = self._tv_layout(num_copy_bits=copy_bits)
+ num_threads = (
+ cute.size(tv_layout, mode=[0]) if _KERNEL_ACCEPTS_LAYOUT_ARGS else self._num_threads()
+ )
+ num_warps = num_threads // cute.arch.WARP_SIZE
+ threads_per_row = (
+ tv_layout.shape[0][0] if _KERNEL_ACCEPTS_LAYOUT_ARGS else self._threads_per_row()
+ )
+ warps_per_row = max(threads_per_row // cute.arch.WARP_SIZE, 1)
+ cluster_n = self._cluster_n()
+
+ if const_expr(mW is not None):
+ mW = cute.make_tensor(
+ mW.iterator, cute.prepend(mW.layout, cute.make_layout((tiler_mn[0],), stride=(0,)))
+ )
+ if const_expr(mB is not None):
+ mB = cute.make_tensor(
+ mB.iterator, cute.prepend(mB.layout, cute.make_layout((tiler_mn[0],), stride=(0,)))
+ )
+ if const_expr(mRstd is not None):
+ mRstd = cute.make_tensor(
+ mRstd.iterator, cute.append(mRstd.layout, cute.make_layout((self.N,), stride=(0,)))
+ )
+
+ stage_bufs = 2 if self.stage > 1 else 1
+ tile_bytes_x = cute.size_in_bytes(self.dtype, cute.make_layout(tiler_mn)) * stage_bufs
+ tile_bytes_res = (
+ cute.size_in_bytes(mRes.element_type, cute.make_layout(tiler_mn)) * stage_bufs
+ if const_expr(mRes is not None)
+ else 0
+ )
+ red_bytes = self.stage * num_warps * cluster_n * (self.reduction_dtype.width // 8)
+ mbar_bytes = self.stage * (cutlass.Int64.width // 8)
+ smem_bytes = tile_bytes_x + tile_bytes_res + red_bytes + mbar_bytes
+
+ kernel = (
+ self.kernel(
+ mX,
+ mW,
+ mB,
+ mRes,
+ mO,
+ mResO,
+ mRstd,
+ eps,
+ tv_layout,
+ tiler_mn,
+ const_expr(num_warps),
+ const_expr(warps_per_row),
+ const_expr(threads_per_row),
+ )
+ if _KERNEL_ACCEPTS_LAYOUT_ARGS
+ else self.kernel(
+ mX,
+ mW,
+ mB,
+ mRes,
+ mO,
+ mResO,
+ mRstd,
+ eps,
+ )
+ )
+
+ kernel.launch(
+ grid=[cute.ceil_div(mX.shape[0], tiler_mn[0]), cluster_n, 1],
+ block=[num_threads, 1, 1],
+ cluster=([1, cluster_n, 1] if const_expr(cluster_n > 1) else None),
+ smem=smem_bytes,
+ stream=stream,
+ )
+
+ @cute.jit
+ def _kernel_impl(
+ self,
+ mX: cute.Tensor,
+ mW: Optional[cute.Tensor],
+ mB: Optional[cute.Tensor],
+ mRes: Optional[cute.Tensor],
+ mO: cute.Tensor,
+ mResO: Optional[cute.Tensor],
+ mRstd: Optional[cute.Tensor],
+ eps: Float32,
+ tv_layout: cute.Layout,
+ tiler_mn: cute.Shape,
+ num_warps: cutlass.Constexpr[int],
+ warps_per_row: cutlass.Constexpr[int],
+ threads_per_row: cutlass.Constexpr[int],
+ ):
+ tidx, _, _ = cute.arch.thread_idx()
+ bidx, _, _ = cute.arch.block_idx()
+ cluster_n = self._cluster_n()
+ cluster_y = const_expr(0) if const_expr(cluster_n == 1) else cute.arch.block_idx()[1]
+
+ smem = cutlass.utils.SmemAllocator()
+ sX0 = smem.allocate_tensor(
+ mX.element_type, cute.make_ordered_layout(tiler_mn, order=(1, 0)), byte_alignment=32
+ )
+ sX1 = (
+ smem.allocate_tensor(
+ mX.element_type,
+ cute.make_ordered_layout(tiler_mn, order=(1, 0)),
+ byte_alignment=32,
+ )
+ if const_expr(self.stage > 1)
+ else None
+ )
+ sRes0 = (
+ smem.allocate_tensor(
+ mRes.element_type, cute.make_ordered_layout(tiler_mn, order=(1, 0)), byte_alignment=32
+ )
+ if const_expr(mRes is not None)
+ else None
+ )
+ sRes1 = (
+ smem.allocate_tensor(
+ mRes.element_type,
+ cute.make_ordered_layout(tiler_mn, order=(1, 0)),
+ byte_alignment=32,
+ )
+ if const_expr(mRes is not None and self.stage > 1)
+ else None
+ )
+
+ reduction_buffer, mbar_ptr = self._alloc_reduction_and_mbar(smem, num_warps, warps_per_row)
+
+ shape = mX.shape
+ idX = cute.make_identity_tensor(shape)
+
+ num_copy_elems_X = tv_layout.shape[1][0]
+ use_async = const_expr(self.N >= 1024)
+ copy_atom = get_copy_atom_bw(mX.element_type, num_copy_elems_X, is_async=use_async)
+ thr_copy = cute.make_tiled_copy(copy_atom, tv_layout, tiler_mn).get_slice(tidx)
+
+ gW, gB = [
+ cute.local_tile(t, tiler_mn, (0, cluster_y)) if const_expr(t is not None) else None
+ for t in (mW, mB)
+ ]
+ tXgW = thr_copy.partition_S(gW) if const_expr(mW is not None) else None
+ tXgB = thr_copy.partition_S(gB) if const_expr(mB is not None) else None
+ tXrW = cute.make_fragment_like(tXgW) if const_expr(mW is not None) else None
+ tXrB = cute.make_fragment_like(tXgB) if const_expr(mB is not None) else None
+ if const_expr(mW is not None):
+ cute.copy(get_copy_atom_bw(mW.element_type, num_copy_elems_X, is_async=False), tXgW, tXrW)
+ if const_expr(mB is not None):
+ cute.copy(get_copy_atom_bw(mB.element_type, num_copy_elems_X, is_async=False), tXgB, tXrB)
+
+ self._init_cluster(tidx, mbar_ptr)
+
+ mX_i, mRes_i, mO_i, mResO_i = [
+ qutils.domain_offset_i64((bidx * tiler_mn[0], 0), t) if t is not None else None
+ for t in (mX, mRes, mO, mResO)
+ ]
+ gX_i = cute.local_tile(mX_i, tiler_mn, (0, cluster_y))
+ gO_i = cute.local_tile(mO_i, tiler_mn, (0, cluster_y))
+ gRes_i = (
+ cute.local_tile(mRes_i, tiler_mn, (0, cluster_y)) if const_expr(mRes is not None) else None
+ )
+ gResO_i = (
+ cute.local_tile(mResO_i, tiler_mn, (0, cluster_y)) if const_expr(mResO is not None) else None
+ )
+ gRstd_i = (
+ cute.local_tile(mRstd, tiler_mn, (bidx, cluster_y)) if const_expr(mRstd is not None) else None
+ )
+ cX_i = cute.local_tile(idX, tiler_mn, (bidx, cluster_y))
+
+ tXcX_i = thr_copy.partition_S(cX_i)[(0, None), None, None]
+ row_i = tXcX_i[0][0]
+ tXgRstd_i = thr_copy.partition_D(gRstd_i) if const_expr(mRstd is not None) else None
+
+ # Intra-row K-loop cp.async ping-pong (two-pass) for N≈6k/8k (stage=2)
+ if const_expr(self.stage > 1 and (shape[1] == 6144 or shape[1] == 8192)):
+ vecsize = tv_layout.shape[1][0]
+ tpr = threads_per_row
+ target_tile_n = const_expr(4096 if shape[1] == 6144 else 8192)
+ tile_factor = const_expr(target_tile_n // (vecsize * tpr))
+ tile_n = vecsize * tpr * tile_factor
+ num_tiles = cute.ceil_div(shape[1], tile_n)
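+            # Worked example (bf16, tpr=128, 128-bit copies => vecsize=8):
+            # N=6144 -> target_tile_n=4096, tile_factor=4, tile_n=4096,
+            # num_tiles=2, so successive tiles ping-pong between sX0 and sX1.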
+
+ tiler_mn_tile = (tiler_mn[0], tile_n)
+ sX0_tile = cute.local_tile(sX0, tiler_mn_tile, (0, 0))
+ sX1_tile = cute.local_tile(sX1, tiler_mn_tile, (0, 0)) if const_expr(self.stage > 1) else None
+ sRes0_tile = (
+ cute.local_tile(sRes0, tiler_mn_tile, (0, 0)) if const_expr(mRes is not None) else None
+ )
+ sRes1_tile = (
+ cute.local_tile(sRes1, tiler_mn_tile, (0, 0)) if const_expr(mRes is not None and self.stage > 1) else None
+ )
+
+ tv_layout_tile = cute.make_layout(
+ ((tpr, tiler_mn[0]), (vecsize, tile_factor)),
+ stride=((vecsize * tiler_mn[0], 1), (tiler_mn[0], tiler_mn[0] * vecsize * tpr)),
+ )
+ thr_copy_tile = cute.make_tiled_copy(copy_atom, tv_layout_tile, tiler_mn_tile).get_slice(tidx)
+
+ sum_sq_acc = cute.Float32(0.0)
+ k_off0 = const_expr(0) * tile_n
+ gX_0 = cute.local_tile(qutils.domain_offset_i64((0, k_off0), mX_i), tiler_mn_tile, (0, cluster_y))
+ tXgX_0 = thr_copy_tile.partition_S(gX_0)
+ tXsX_0 = thr_copy_tile.partition_D(sX0_tile)
+ cX_0 = cute.local_tile(cute.domain_offset((0, k_off0), cX_i), tiler_mn_tile, (0, cluster_y))
+ tXc_0 = thr_copy_tile.partition_S(cX_0)
+ tXp_0 = qutils.predicate_k(tXc_0, limit=shape[1])
+ tXp_ping = tXp_0
+ tXp_pong = tXp_0
+ if row_i < shape[0]:
+ copy_tiled(tXgX_0, tXsX_0, num_copy_elems=vecsize, is_async=use_async, pred=tXp_0)
+ if const_expr(mRes is not None):
+ gRes_0 = cute.local_tile(qutils.domain_offset_i64((0, k_off0), mRes_i), tiler_mn_tile, (0, cluster_y))
+ tXgRes_0 = thr_copy_tile.partition_S(gRes_0)
+ tXsRes_0 = thr_copy_tile.partition_D(sRes0_tile)
+ copy_tiled(tXgRes_0, tXsRes_0, num_copy_elems=vecsize, is_async=use_async, pred=tXp_0)
+ if const_expr(use_async):
+ cute.arch.cp_async_commit_group()
+
+ for t in cutlass.range_constexpr(num_tiles):
+ next_t = t + 1
+ if next_t < num_tiles:
+ k_off_n = next_t * tile_n
+ gX_n = cute.local_tile(qutils.domain_offset_i64((0, k_off_n), mX_i), tiler_mn_tile, (0, cluster_y))
+ tXgX_n = thr_copy_tile.partition_S(gX_n)
+ cX_n = cute.local_tile(cute.domain_offset((0, k_off_n), cX_i), tiler_mn_tile, (0, cluster_y))
+ tXc_n = thr_copy_tile.partition_S(cX_n)
+ tXp_n = qutils.predicate_k(tXc_n, limit=shape[1])
+ if const_expr((t % 2) == 0):
+ tXsX_n = thr_copy_tile.partition_D(sX1_tile)
+ tXsRes_n = (
+ thr_copy_tile.partition_D(sRes1_tile) if const_expr(mRes is not None) else None
+ )
+ tXp_pong = tXp_n
+ else:
+ tXsX_n = thr_copy_tile.partition_D(sX0_tile)
+ tXsRes_n = (
+ thr_copy_tile.partition_D(sRes0_tile) if const_expr(mRes is not None) else None
+ )
+ tXp_ping = tXp_n
+ if row_i < shape[0]:
+ copy_tiled(tXgX_n, tXsX_n, num_copy_elems=vecsize, is_async=use_async, pred=tXp_n)
+ if const_expr(mRes is not None):
+ gRes_n = cute.local_tile(qutils.domain_offset_i64((0, k_off_n), mRes_i), tiler_mn_tile, (0, cluster_y))
+ tXgRes_n = thr_copy_tile.partition_S(gRes_n)
+ copy_tiled(tXgRes_n, tXsRes_n, num_copy_elems=vecsize, is_async=use_async, pred=tXp_n)
+ if const_expr(use_async):
+ cute.arch.cp_async_commit_group()
+ if const_expr(use_async):
+ cute.arch.cp_async_wait_group(1 if next_t < num_tiles else 0)
+
+ if const_expr((t % 2) == 0):
+ tXsX_cur = thr_copy_tile.partition_D(sX0_tile)
+ tXsRes_cur = thr_copy_tile.partition_D(sRes0_tile) if const_expr(mRes is not None) else None
+ pred_cur = tXp_ping
+ else:
+ tXsX_cur = thr_copy_tile.partition_D(sX1_tile)
+ tXsRes_cur = thr_copy_tile.partition_D(sRes1_tile) if const_expr(mRes is not None) else None
+ pred_cur = tXp_pong
+ qutils.fill_oob(tXsX_cur, pred_cur, mX.element_type.zero)
+ if const_expr(mRes is not None):
+ qutils.fill_oob(tXsRes_cur, pred_cur, mRes.element_type.zero)
+
+ k_off = t * tile_n
+ gX_t = cute.local_tile(qutils.domain_offset_i64((0, k_off), mX_i), tiler_mn_tile, (0, cluster_y))
+ tXgX_t = thr_copy_tile.partition_S(gX_t)
+ tXrX = cute.make_fragment_like(tXgX_t)
+ cute.autovec_copy(tXsX_cur, tXrX)
+ x = tXrX.load().to(cute.Float32)
+ if const_expr(mRes is not None):
+ gRes_t = cute.local_tile(qutils.domain_offset_i64((0, k_off), mRes_i), tiler_mn_tile, (0, cluster_y))
+ tXgRes_t = thr_copy_tile.partition_S(gRes_t)
+ tXrRes = cute.make_fragment_like(tXgRes_t)
+ cute.autovec_copy(tXsRes_cur, tXrRes)
+ x += tXrRes.load().to(cute.Float32)
+
+ if const_expr(mResO is not None):
+ gResO_t = cute.local_tile(qutils.domain_offset_i64((0, k_off), mResO_i), tiler_mn_tile, (0, cluster_y))
+ tXgResO_t = thr_copy_tile.partition_D(gResO_t)
+ tXrResO = cute.make_fragment_like(tXgResO_t)
+ tXrResO.store(x.to(tXrResO.element_type))
+ if row_i < shape[0]:
+ copy_tiled(tXrResO, tXgResO_t, num_copy_elems=vecsize, is_async=False, pred=pred_cur)
+
+ sum_sq_tile = row_reduce(
+ x * x,
+ cute.ReductionOp.ADD,
+ threads_per_row,
+ reduction_buffer[None, None, 0],
+ mbar_ptr,
+ init_val=0.0,
+ hook_fn=(cute.arch.cluster_wait if const_expr(cluster_n > 1) else None),
+ )
+ sum_sq_acc = sum_sq_acc + sum_sq_tile
+
+ rstd = cute.math.rsqrt(sum_sq_acc / shape[1] + eps, fastmath=True)
+ if const_expr(mRstd is not None):
+ if (
+ tXcX_i[0][1] == 0
+ and row_i < shape[0]
+ and (cluster_n == 1 or cute.arch.block_idx_in_cluster() == 0)
+ ):
+ tXgRstd_i[0] = rstd
+
+ for t in cutlass.range_constexpr(num_tiles):
+ k_off = t * tile_n
+ cX_t = cute.local_tile(cute.domain_offset((0, k_off), cX_i), tiler_mn_tile, (0, cluster_y))
+ tXc_t = thr_copy_tile.partition_S(cX_t)
+ tXp_t = qutils.predicate_k(tXc_t, limit=shape[1])
+
+ if const_expr((t % 2) == 0):
+ tXsX_cur = thr_copy_tile.partition_D(sX0_tile)
+ tXsRes_cur = (
+ thr_copy_tile.partition_D(sRes0_tile) if const_expr(mRes is not None) else None
+ )
+ else:
+ tXsX_cur = thr_copy_tile.partition_D(sX1_tile)
+ tXsRes_cur = (
+ thr_copy_tile.partition_D(sRes1_tile) if const_expr(mRes is not None) else None
+ )
+
+ qutils.fill_oob(tXsX_cur, tXp_t, mX.element_type.zero)
+ if const_expr(mRes is not None):
+ qutils.fill_oob(tXsRes_cur, tXp_t, mRes.element_type.zero)
+
+ gX_t = cute.local_tile(qutils.domain_offset_i64((0, k_off), mX_i), tiler_mn_tile, (0, cluster_y))
+ tXgX_t = thr_copy_tile.partition_S(gX_t)
+ tXrX = cute.make_fragment_like(tXgX_t)
+ cute.autovec_copy(tXsX_cur, tXrX)
+ x = tXrX.load().to(cute.Float32)
+ if const_expr(mRes is not None):
+ gRes_t = cute.local_tile(qutils.domain_offset_i64((0, k_off), mRes_i), tiler_mn_tile, (0, cluster_y))
+ tXgRes_t = thr_copy_tile.partition_S(gRes_t)
+ tXrRes = cute.make_fragment_like(tXgRes_t)
+ cute.autovec_copy(tXsRes_cur, tXrRes)
+ x += tXrRes.load().to(cute.Float32)
+
+ y = x * rstd
+ if const_expr(mW is not None):
+ gW_t = cute.local_tile(qutils.domain_offset_i64((0, k_off), mW), tiler_mn_tile, (0, cluster_y))
+ tWgW_t = thr_copy_tile.partition_S(gW_t)
+ tWrW_t = cute.make_fragment_like(tWgW_t)
+ copy_tiled(tWgW_t, tWrW_t, num_copy_elems=vecsize, is_async=False, pred=tXp_t)
+ y = y * tWrW_t.load().to(cute.Float32)
+ if const_expr(mB is not None):
+ gB_t = cute.local_tile(qutils.domain_offset_i64((0, k_off), mB), tiler_mn_tile, (0, cluster_y))
+ tWgB_t = thr_copy_tile.partition_S(gB_t)
+ tWrB_t = cute.make_fragment_like(tWgB_t)
+ copy_tiled(tWgB_t, tWrB_t, num_copy_elems=vecsize, is_async=False, pred=tXp_t)
+ y = y + tWrB_t.load().to(cute.Float32)
+
+ gO_t = cute.local_tile(qutils.domain_offset_i64((0, k_off), mO_i), tiler_mn_tile, (0, cluster_y))
+ tXgO_t = thr_copy_tile.partition_D(gO_t)
+ tXrO = cute.make_fragment_like(tXgO_t)
+ tXrO.store(y.to(tXrO.element_type))
+ if row_i < shape[0]:
+ copy_tiled(tXrO, tXgO_t, num_copy_elems=vecsize, is_async=False, pred=tXp_t)
+
+ return
+
+ # Fallback: single-stage path identical to current rmsnorm.py
+ tXgX_i = thr_copy.partition_S(gX_i)
+ tXgRes_i = thr_copy.partition_S(gRes_i) if const_expr(mRes is not None) else None
+ tXgO_i = thr_copy.partition_D(gO_i)
+ tXgResO_i = thr_copy.partition_D(gResO_i) if const_expr(mResO is not None) else None
+ is_even_N_i = const_expr(shape[1] == tiler_mn[1] * cluster_n)
+ tXpX_i = (
+ qutils.predicate_k(thr_copy.partition_S(cX_i), limit=shape[1]) if not is_even_N_i else None
+ )
+
+ if row_i < shape[0]:
+ cute.copy(copy_atom, tXgX_i, thr_copy.partition_D(sX0), pred=tXpX_i)
+ if const_expr(mRes is not None):
+ cute.copy(copy_atom, tXgRes_i, thr_copy.partition_D(sRes0), pred=tXpX_i)
+ if const_expr(use_async):
+ cute.arch.cp_async_commit_group()
+ cute.arch.cp_async_wait_group(0)
+
+ tXrX = cute.make_fragment_like(tXgX_i)
+ cute.autovec_copy(thr_copy.partition_D(sX0), tXrX)
+ x = tXrX.load().to(cute.Float32)
+ if const_expr(mRes is not None):
+ tXrRes = cute.make_fragment_like(tXgRes_i)
+ cute.autovec_copy(thr_copy.partition_D(sRes0), tXrRes)
+ x += tXrRes.load().to(cute.Float32)
+
+ if const_expr(mResO is not None):
+ tXrResO = cute.make_fragment_like(tXgResO_i)
+ tXrResO.store(x.to(tXrResO.element_type))
+ if row_i < shape[0]:
+ cute.copy(
+ get_copy_atom_bw(tXrResO.element_type, num_copy_elems_X, is_async=False),
+ tXrResO,
+ tXgResO_i,
+ )
+
+ sum_sq = row_reduce(
+ x * x,
+ cute.ReductionOp.ADD,
+ threads_per_row,
+ reduction_buffer[None, None, 0],
+ mbar_ptr,
+ init_val=0.0,
+ hook_fn=(cute.arch.cluster_wait if const_expr(cluster_n > 1) else None),
+ )
+ rstd = cute.math.rsqrt(sum_sq / shape[1] + eps, fastmath=True)
+
+ if const_expr(mRstd is not None):
+ if (
+ tXcX_i[0][1] == 0
+ and row_i < shape[0]
+ and (cluster_n == 1 or cute.arch.block_idx_in_cluster() == 0)
+ ):
+ tXgRstd_i[0] = rstd
+
+ y = x * rstd
+ if const_expr(mW is not None):
+ y = y * tXrW.load().to(cute.Float32)
+ if const_expr(mB is not None):
+ y = y + tXrB.load().to(cute.Float32)
+
+ tXrO = cute.make_fragment_like(tXgO_i)
+ tXrO.store(y.to(tXrO.element_type))
+ if row_i < shape[0]:
+ cute.copy(
+ get_copy_atom_bw(tXrO.element_type, num_copy_elems_X, is_async=False),
+ tXrO,
+ tXgO_i,
+ )
+
+ if _KERNEL_ACCEPTS_LAYOUT_ARGS:
+
+ @cute.kernel
+ def kernel(
+ self,
+ mX: cute.Tensor,
+ mW: Optional[cute.Tensor],
+ mB: Optional[cute.Tensor],
+ mRes: Optional[cute.Tensor],
+ mO: cute.Tensor,
+ mResO: Optional[cute.Tensor],
+ mRstd: Optional[cute.Tensor],
+ eps: Float32,
+ tv_layout: cute.Layout,
+ tiler_mn: cute.Shape,
+ num_warps: cutlass.Constexpr[int],
+ warps_per_row: cutlass.Constexpr[int],
+ threads_per_row: cutlass.Constexpr[int],
+ ):
+ self._kernel_impl(
+ mX,
+ mW,
+ mB,
+ mRes,
+ mO,
+ mResO,
+ mRstd,
+ eps,
+ tv_layout,
+ tiler_mn,
+ num_warps,
+ warps_per_row,
+ threads_per_row,
+ )
+ else:
+
+ @cute.kernel
+ def kernel(
+ self,
+ mX: cute.Tensor,
+ mW: Optional[cute.Tensor],
+ mB: Optional[cute.Tensor],
+ mRes: Optional[cute.Tensor],
+ mO: cute.Tensor,
+ mResO: Optional[cute.Tensor],
+ mRstd: Optional[cute.Tensor],
+ eps: Float32,
+ ):
+ copy_bits = const_expr(128)
+ tiler_mn, tv_layout = self._tv_layout(num_copy_bits=copy_bits)
+ num_threads = self._num_threads()
+ num_warps = num_threads // cute.arch.WARP_SIZE
+ threads_per_row = self._threads_per_row()
+ warps_per_row = max(threads_per_row // cute.arch.WARP_SIZE, 1)
+ self._kernel_impl(
+ mX,
+ mW,
+ mB,
+ mRes,
+ mO,
+ mResO,
+ mRstd,
+ eps,
+ tv_layout,
+ tiler_mn,
+ const_expr(num_warps),
+ const_expr(warps_per_row),
+ const_expr(threads_per_row),
+ )
+
+ @cute.jit
+ def _alloc_reduction_and_mbar(
+ self,
+ smem: cutlass.utils.SmemAllocator,
+ num_warps: cutlass.Constexpr[int],
+ warps_per_row: cutlass.Constexpr[int],
+ ) -> Tuple[cute.Tensor, Optional[cute.Pointer]]:
+ cluster_n = self._cluster_n()
+ red_layout = cute.make_ordered_layout(
+ (num_warps // warps_per_row, (warps_per_row, cluster_n), self.stage),
+ order=(1, 0, 2),
+ )
+ reduction_buffer = smem.allocate_tensor(self.reduction_dtype, red_layout, byte_alignment=4)
+ if const_expr(cluster_n > 1):
+ mbar_ptr = smem.allocate_array(cutlass.Int64, num_elems=self.stage)
+ else:
+ mbar_ptr = None
+ return reduction_buffer, mbar_ptr
+
+ @cute.jit
+ def _init_cluster(self, tidx: cutlass.Int32, mbar_ptr: Optional[cute.Pointer]):
+ if const_expr(mbar_ptr is not None):
+ if tidx < self.stage:
+ cute.arch.mbarrier_init(mbar_ptr + tidx, 1)
+ cute.arch.mbarrier_init_fence()
+ cute.arch.cluster_arrive_relaxed()
+
+
+def rmsnorm_forward_with_stage2(
+ x: Tensor,
+ weight: Optional[Tensor] = None,
+ bias: Optional[Tensor] = None,
+ residual: Optional[Tensor] = None,
+ eps: float = 1e-6,
+ store_rstd: bool = False,
+) -> Tuple[Tensor, Optional[Tensor], Optional[Tensor]]:
+ assert x.is_cuda
+ assert x.dim() == 2
+ M, N = x.shape
+ dtype = TORCH2CUTE_DTYPE[x.dtype]
+
+ convert_x = lambda t: from_dlpack(t.detach(), assumed_align=32).mark_layout_dynamic(leading_dim=1)
+ mX = convert_x(x)
+ mRes = convert_x(residual) if residual is not None else None
+ out = torch.empty_like(x, dtype=x.dtype)
+ mO = from_dlpack(out.detach(), assumed_align=32).mark_layout_dynamic(leading_dim=1)
+
+ mW = (
+ from_dlpack(weight.detach(), assumed_align=32).mark_layout_dynamic(leading_dim=0)
+ if weight is not None
+ else None
+ )
+ mB = (
+ from_dlpack(bias.detach(), assumed_align=32).mark_layout_dynamic(leading_dim=0)
+ if bias is not None
+ else None
+ )
+ if store_rstd:
+ rstd = torch.empty(M, device=x.device, dtype=torch.float32)
+ mRstd = from_dlpack(rstd.detach(), assumed_align=4).mark_layout_dynamic(leading_dim=0)
+ else:
+ rstd = None
+ mRstd = None
+
+ residual_out = None
+ mResO = None
+ if residual is not None:
+ residual_out = torch.empty_like(residual)
+ mResO = from_dlpack(residual_out.detach(), assumed_align=32).mark_layout_dynamic(leading_dim=1)
+
+ # Enable the intra-row cp.async K-loop only for DSv3-style large-N rows
+ # with very large M, where there is enough work per row to amortize the
+ # pipeline start-up cost. Mid-size M shapes are better served by the
+ # simpler single-stage schedule.
+ use_kloop = bool(M >= 65536 and N in (6144, 8192))
+ stage = 2 if use_kloop else 1
+ op = RMSNormSM100WithStage2(N, dtype, stage=stage)
+ if use_kloop:
+ op._tpr_override = 128 # type: ignore[attr-defined]
+ # Prefer 1 row/CTA at N=6144; keep 2 rows/CTA at N=8192 to match
+ # the original tuning there.
+ op._nt_override = (128 if N == 6144 else 256) # type: ignore[attr-defined]
+
+ stream = cuda.CUstream(torch.cuda.current_stream().cuda_stream)
+ key = (
+ N,
+ dtype,
+ mRes is not None,
+ mW is not None,
+ mB is not None,
+ mResO is not None,
+ mRstd is not None,
+ stage,
+ )
+ compiled = _COMPILE_CACHE.get(key)
+ if compiled is None:
+ compiled = cute.compile(op, mX, mW, mB, mRes, mO, mResO, mRstd, stream, Float32(eps))
+ _COMPILE_CACHE[key] = compiled
+ compiled(mX, mW, mB, mRes, mO, mResO, mRstd, stream, Float32(eps))
+ return out, rstd, residual_out
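+
+
+# Minimal usage sketch (hedged; shapes are illustrative). The stage-2 K-loop
+# only engages for very large M at N in (6144, 8192); other shapes fall back
+# to the single-stage schedule.
+#
+#   x = torch.randn(65536, 8192, device="cuda", dtype=torch.bfloat16)
+#   w = torch.randn(8192, device="cuda", dtype=torch.bfloat16)
+#   out, rstd, res_out = rmsnorm_forward_with_stage2(x, w, store_rstd=True)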
diff --git a/oink/src/kernelagent_oink/blackwell/softmax.py b/oink/src/kernelagent_oink/blackwell/softmax.py
new file mode 100644
index 0000000..a2f2581
--- /dev/null
+++ b/oink/src/kernelagent_oink/blackwell/softmax.py
@@ -0,0 +1,749 @@
+# Copyright (c) Meta Platforms, Inc. and affiliates.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+"""
+Softmax forward + backward kernels for SM100 (Blackwell) in CuteDSL.
+
+This module implements numerically stable softmax over the last dimension of
+2D tensors (M, N) and its backward pass, targeting SM100 with Quack-style
+tiling, cp.async pipelines, and cluster reductions, but without depending on
+the `quack` package at runtime.
+
+The kernels are self-contained and use only local helpers in
+`kernelagent_oink.blackwell.lite_quack` plus CuTeDSL/CUTLASS.
+"""
+
+from __future__ import annotations
+
+import importlib.metadata
+import math
+import os
+import re
+from typing import Optional, Type
+
+import torch
+from torch import Tensor
+
+import cuda.bindings.driver as cuda # provided by NVIDIA cuda-python
+
+# CuTeDSL caches generated MLIR into a tempdir under a global default
+# (`/tmp/$USER/cutlass_python_cache`). The cache bytecode format can differ
+# across `nvidia-cutlass-dsl` versions, and cross-version cache sharing causes
+# noisy warnings (and disables cache reuse), so unless the user has already
+# set CUTE_DSL_CACHE_DIR we namespace the cache directory by DSL version.
+if "CUTE_DSL_CACHE_DIR" not in os.environ:
+ try:
+ _dsl_ver = importlib.metadata.version("nvidia-cutlass-dsl")
+ except Exception:
+ _dsl_ver = "unknown"
+ _dsl_ver = re.sub(r"[^0-9A-Za-z]+", "_", _dsl_ver)
+ _user = os.environ.get("USER") or os.environ.get("USERNAME") or "user"
+ _tmp = os.environ.get("TMPDIR") or "/tmp"
+ os.environ["CUTE_DSL_CACHE_DIR"] = os.path.join(
+ _tmp, _user, f"cutlass_python_cache_{_dsl_ver}"
+ )
+
+try:
+ import cutlass # type: ignore # noqa: F401
+except Exception as e:
+ raise ImportError(
+ "kernelagent_oink.blackwell.softmax requires CuTeDSL's Python package "
+ "(`cutlass`, typically provided by `nvidia-cutlass-dsl`)."
+ ) from e
+
+import cutlass.cute as cute
+from cutlass import Float32, Int32, const_expr
+from cutlass.cute import runtime as rt
+from cutlass.cute.runtime import from_dlpack
+
+from kernelagent_oink.blackwell.lite_quack import (
+ _KERNEL_ACCEPTS_LAYOUT_ARGS,
+ TORCH2CUTE_DTYPE,
+ ReductionBase,
+ domain_offset_i64,
+ fill_oob,
+ online_softmax_reduce,
+ predicate_k,
+ row_reduce,
+)
+
+_FWD_COMPILE_CACHE: dict[tuple[Type[cutlass.Numeric], int], object] = {}
+_BWD_COMPILE_CACHE: dict[tuple[Type[cutlass.Numeric], int], object] = {}
+_PTR_FWD_COMPILE_CACHE: dict[tuple[object, ...], object] = {}
+_PTR_BWD_COMPILE_CACHE: dict[tuple[object, ...], object] = {}
+
+
+class SoftmaxFwdSM100(ReductionBase):
+ def __init__(self, dtype: Type[cutlass.Numeric], N: int):
+ # One-stage online reduction: pack (max, sum_exp) into Int64 reduction buffer.
+ super().__init__(dtype, N, stage=1, reduction_dtype=cutlass.Int64)
+
+ def _calculate_threads_per_row(self) -> int:
+ # Match Quack's bucketed policy for Softmax.
+ N = self.N
+ if N <= 64:
+ return 8
+ if N <= 128:
+ return 16
+ if N <= 3072:
+ return 32
+ if N <= 6144:
+ return 64
+ if N <= 16384:
+ return 128
+ return 256
+
+ def _set_cluster_n(self) -> None:
+ # Quack-style growth of cluster_n with N and dtype.
+ N = self.N
+ if const_expr(self.dtype.width == 16):
+ cluster_n = (
+ 1
+ if N <= 16 * 1024
+ else (
+ 2
+ if N <= 32 * 1024
+ else (4 if N <= 64 * 1024 else (8 if N <= 128 * 1024 else 16))
+ )
+ )
+ else:
+ cluster_n = (
+ 1
+ if N <= 32 * 1024
+ else (
+ 2
+ if N <= 64 * 1024
+ else (4 if N <= 128 * 1024 else (8 if N <= 256 * 1024 else 16))
+ )
+ )
+ self.cluster_n = cluster_n
+
+ @cute.jit
+ def __call__(self, mX: cute.Tensor, mO: cute.Tensor, stream: cuda.CUstream) -> None:
+ assert mX.element_type == self.dtype
+ assert mO.element_type == self.dtype
+ # Use the generic ReductionBase tiling with 128-bit vectorization.
+ tiler_mn, tv_layout = self._get_tv_layout()
+ num_threads = (
+ cute.size(tv_layout, mode=[0]) if _KERNEL_ACCEPTS_LAYOUT_ARGS else self._get_num_threads()
+ )
+ num_warps = num_threads // cute.arch.WARP_SIZE
+ kernel = (
+ self.kernel(mX, mO, tv_layout, tiler_mn)
+ if _KERNEL_ACCEPTS_LAYOUT_ARGS
+ else self.kernel(mX, mO)
+ )
+ kernel.launch(
+ grid=[cute.ceil_div(mX.shape[0], tiler_mn[0]), self.cluster_n, 1],
+ block=[num_threads, 1, 1],
+ cluster=[1, self.cluster_n, 1] if const_expr(self.cluster_n > 1) else None,
+ smem=self._smem_size_in_bytes(tiler_mn, num_warps),
+ stream=stream,
+ )
+
+ @cute.jit
+ def launch_from_ptrs(
+ self,
+ ptr_x: cute.Pointer,
+ ptr_out: cute.Pointer,
+ M: Int32,
+ ld: Int32,
+ stream: cuda.CUstream,
+ ) -> None:
+ """Pointer-based entrypoint that bypasses DLPack conversions.
+
+ Reconstructs cute.Tensor views from raw pointers + explicit layouts
+ inside the JIT graph, matching the existing SM100 schedule.
+ """
+ # Mirror Quack/LayerNorm contracts: assume 16B alignment and an LD that
+ # preserves 128-bit vectorized copies for every row start.
+ ld_assumed = cute.assume(ld, divby=128 // self.dtype.width)
+ layout_mn = cute.make_layout((M, self.N), stride=(ld_assumed, 1))
+ mX = cute.make_tensor(ptr_x, layout_mn)
+ mO = cute.make_tensor(ptr_out, layout_mn)
+ self.__call__(mX, mO, stream)
+
+ @cute.jit
+ def _kernel_impl(
+ self,
+ mX: cute.Tensor,
+ mO: cute.Tensor,
+ tv_layout: cute.Layout,
+ tiler_mn: cute.Shape,
+ ) -> None:
+ tidx, _, _ = cute.arch.thread_idx()
+ bidx, _, _ = cute.arch.block_idx()
+ if const_expr(self.cluster_n > 1):
+ cluster_y = cute.arch.block_idx()[1]
+ else:
+ cluster_y = const_expr(0)
+
+ shape = mX.shape
+ idX = cute.make_identity_tensor(shape)
+
+ # Slice per-CTA region; use 64-bit indexing for large tensors.
+ mX, mO = [domain_offset_i64((bidx * tiler_mn[0], 0), mT) for mT in (mX, mO)]
+ gX, gO = [cute.local_tile(mT, tiler_mn, (0, cluster_y)) for mT in (mX, mO)]
+ cX = cute.local_tile(idX, tiler_mn, (bidx, cluster_y))
+
+ smem = cutlass.utils.SmemAllocator()
+ sX = smem.allocate_tensor(
+ mX.element_type,
+ cute.make_ordered_layout(tiler_mn, order=(1, 0)),
+ byte_alignment=16,
+ )
+ reduction_buffer, mbar_ptr = self._allocate_reduction_buffer_and_mbar(smem, tv_layout)
+
+        # Copy atoms: 128-bit cp.async for gmem -> smem loads and 128-bit
+        # vectorized universal copies for register -> gmem stores.
+ copy_atom_load = cute.make_copy_atom(
+ cute.nvgpu.cpasync.CopyG2SOp(),
+ mX.element_type,
+ num_bits_per_copy=128,
+ )
+ copy_atom_store = cute.make_copy_atom(
+ cute.nvgpu.CopyUniversalOp(),
+ gO.element_type,
+ num_bits_per_copy=128,
+ )
+
+ thr_copy_load = cute.make_tiled_copy(copy_atom_load, tv_layout, tiler_mn).get_slice(tidx)
+ thr_copy_store = cute.make_tiled_copy(copy_atom_store, tv_layout, tiler_mn).get_slice(tidx)
+
+ tXgX = thr_copy_load.partition_S(gX)
+ tXsX = thr_copy_load.partition_D(sX)
+ tXgO = thr_copy_store.partition_D(gO)
+ tXcX = thr_copy_load.partition_S(cX)[(0, None), None, None]
+
+ # Register fragments.
+ tXrX, tXrO = [cute.make_fragment_like(thr) for thr in (tXgX, tXgO)]
+
+ num_warps = cute.size(tv_layout, mode=[0]) // cute.arch.WARP_SIZE
+ self._initialize_cluster(tidx, mbar_ptr, num_warps)
+
+ # Predicate and cp.async pipeline for potential tail tiles.
+ is_even_N = const_expr(self.N == tiler_mn[1] * self.cluster_n)
+ tXpX = (
+ predicate_k(thr_copy_load.partition_S(cX), limit=shape[1])
+ if const_expr(not is_even_N)
+ else None
+ )
+
+ if tXcX[0][0] < shape[0]:
+ cute.copy(copy_atom_load, tXgX, tXsX, pred=tXpX)
+ cute.arch.cp_async_commit_group()
+ cute.arch.cp_async_wait_group(0)
+
+ if const_expr(not is_even_N):
+ fill_oob(tXsX, tXpX, -tXsX.element_type.inf)
+
+ cute.autovec_copy(tXsX, tXrX)
+ x = tXrX.load().to(Float32)
+ threads_per_row = tv_layout.shape[0][0]
+
+ # Online softmax reduction: compute max and sum_exp in a single pass, with
+ # optional cluster-wide aggregation via an Int64 reduction buffer.
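+        # Per row: y_i = exp(x_i - max_j x_j) / sum_j exp(x_j - max_j x_j);
+        # subtracting the row max keeps exp() in range for fp32 accumulation.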
+ max_x, denom, exp_x = online_softmax_reduce(
+ x,
+ threads_per_row,
+ reduction_buffer[None, None, 0],
+ mbar_ptr,
+ hook_fn=cute.arch.cluster_wait if const_expr(self.cluster_n > 1) else None,
+ phase=None,
+ return_exp_x=True,
+ )
+
+ y = exp_x * cute.arch.rcp_approx(denom)
+ tXrO.store(y.to(tXrO.element_type))
+
+ tOpO = (
+ predicate_k(thr_copy_store.partition_S(cX), limit=shape[1])
+ if const_expr(not is_even_N)
+ else None
+ )
+
+ if tXcX[0][0] < shape[0]:
+ cute.copy(copy_atom_store, tXrO, tXgO, pred=tOpO)
+
+ if _KERNEL_ACCEPTS_LAYOUT_ARGS:
+
+ @cute.kernel
+ def kernel(
+ self,
+ mX: cute.Tensor,
+ mO: cute.Tensor,
+ tv_layout: cute.Layout,
+ tiler_mn: cute.Shape,
+ ) -> None:
+ self._kernel_impl(mX, mO, tv_layout, tiler_mn)
+ else:
+
+ @cute.kernel
+ def kernel(
+ self,
+ mX: cute.Tensor,
+ mO: cute.Tensor,
+ ) -> None:
+ tiler_mn, tv_layout = self._get_tv_layout()
+ self._kernel_impl(mX, mO, tv_layout, tiler_mn)
+
+
+class SoftmaxBwdSM100(ReductionBase):
+ def __init__(self, dtype: Type[cutlass.Numeric], N: int):
+ # One stage for dot(dy, y) per row.
+ super().__init__(dtype, N, stage=1, reduction_dtype=cutlass.Float32)
+
+ def _calculate_threads_per_row(self) -> int:
+ # Match Quack backward softmax buckets.
+ N = self.N
+ if N <= 64:
+ return 8
+ if N <= 128:
+ return 16
+ if N <= 3072:
+ return 32
+ if N <= 6144:
+ return 64
+ if N <= 8192:
+ return 128
+ return 256
+
+ def _set_cluster_n(self) -> None:
+ N = self.N
+ if const_expr(self.dtype.width == 16):
+ cluster_n = (
+ 1
+ if N <= 16 * 1024
+ else (
+ 2
+ if N <= 32 * 1024
+ else (4 if N <= 64 * 1024 else (8 if N <= 128 * 1024 else 16))
+ )
+ )
+ else:
+ cluster_n = (
+ 1
+ if N <= 32 * 1024
+ else (
+ 2
+ if N <= 64 * 1024
+ else (4 if N <= 128 * 1024 else (8 if N <= 256 * 1024 else 16))
+ )
+ )
+ self.cluster_n = cluster_n
+
+ def _get_num_threads(self) -> int:
+ # Slightly more aggressive threading for large N than the base class.
+ return 128 if self.N <= 8192 else 256
+
+ def _smem_size_in_bytes(self, tiler_mn, num_warps: int) -> int:
+ # Store both y and dy tiles plus reduction buffers and mbarriers.
+ return (
+ cute.size_in_bytes(self.dtype, cute.make_layout(tiler_mn)) * 2
+ + self.stage * num_warps * self.cluster_n * (self.reduction_dtype.width // 8)
+ + self.stage * (cutlass.Int64.width // 8)
+ )
+
+ @cute.jit
+ def __call__(
+ self,
+ mdY: cute.Tensor,
+ mY: cute.Tensor,
+ mdX: cute.Tensor,
+ stream: cuda.CUstream,
+ ) -> None:
+ assert mdY.element_type == self.dtype
+ assert mY.element_type == self.dtype
+ assert mdX.element_type == self.dtype
+ # Use the generic ReductionBase tiling with 128-bit vectorization.
+ tiler_mn, tv_layout = self._get_tv_layout()
+ num_threads = (
+ cute.size(tv_layout, mode=[0]) if _KERNEL_ACCEPTS_LAYOUT_ARGS else self._get_num_threads()
+ )
+ num_warps = num_threads // cute.arch.WARP_SIZE
+ kernel = (
+ self.kernel(mdY, mY, mdX, tv_layout, tiler_mn)
+ if _KERNEL_ACCEPTS_LAYOUT_ARGS
+ else self.kernel(mdY, mY, mdX)
+ )
+ kernel.launch(
+ grid=[cute.ceil_div(mdY.shape[0], tiler_mn[0]), self.cluster_n, 1],
+ block=[num_threads, 1, 1],
+ cluster=[1, self.cluster_n, 1] if const_expr(self.cluster_n > 1) else None,
+ smem=self._smem_size_in_bytes(tiler_mn, num_warps),
+ stream=stream,
+ )
+
+ @cute.jit
+ def launch_from_ptrs(
+ self,
+ ptr_dy: cute.Pointer,
+ ptr_y: cute.Pointer,
+ ptr_dx: cute.Pointer,
+ M: Int32,
+ ld: Int32,
+ stream: cuda.CUstream,
+ ) -> None:
+ """Pointer-based entrypoint that bypasses DLPack conversions."""
+ ld_assumed = cute.assume(ld, divby=128 // self.dtype.width)
+ layout_mn = cute.make_layout((M, self.N), stride=(ld_assumed, 1))
+ mdY = cute.make_tensor(ptr_dy, layout_mn)
+ mY = cute.make_tensor(ptr_y, layout_mn)
+ mdX = cute.make_tensor(ptr_dx, layout_mn)
+ self.__call__(mdY, mY, mdX, stream)
+
+ @cute.jit
+ def _kernel_impl(
+ self,
+ mdY: cute.Tensor,
+ mY: cute.Tensor,
+ mdX: cute.Tensor,
+ tv_layout: cute.Layout,
+ tiler_mn: cute.Shape,
+ ) -> None:
+ tidx, _, _ = cute.arch.thread_idx()
+ bidx, _, _ = cute.arch.block_idx()
+ if const_expr(self.cluster_n > 1):
+ cluster_y = cute.arch.block_idx()[1]
+ else:
+ cluster_y = const_expr(0)
+
+ shape = mdY.shape
+ idX = cute.make_identity_tensor(shape)
+
+ mdY, mY, mdX = [
+ domain_offset_i64((bidx * tiler_mn[0], 0), mT) for mT in (mdY, mY, mdX)
+ ]
+ gdY, gY, gdX = [cute.local_tile(mT, tiler_mn, (0, cluster_y)) for mT in (mdY, mY, mdX)]
+ cX = cute.local_tile(idX, tiler_mn, (bidx, cluster_y))
+
+ smem = cutlass.utils.SmemAllocator()
+ sdY = smem.allocate_tensor(
+ mdY.element_type,
+ cute.make_ordered_layout(tiler_mn, order=(1, 0)),
+ byte_alignment=16,
+ )
+ sY = smem.allocate_tensor(
+ mY.element_type,
+ cute.make_ordered_layout(tiler_mn, order=(1, 0)),
+ byte_alignment=16,
+ )
+ reduction_buffer, mbar_ptr = self._allocate_reduction_buffer_and_mbar(smem, tv_layout)
+
+ copy_atom_load = cute.make_copy_atom(
+ cute.nvgpu.cpasync.CopyG2SOp(),
+ mdY.element_type,
+ num_bits_per_copy=128,
+ )
+ copy_atom_store = cute.make_copy_atom(
+ cute.nvgpu.CopyUniversalOp(),
+ gdX.element_type,
+ num_bits_per_copy=128,
+ )
+
+ thr_copy_load = cute.make_tiled_copy(copy_atom_load, tv_layout, tiler_mn).get_slice(tidx)
+ thr_copy_store = cute.make_tiled_copy(copy_atom_store, tv_layout, tiler_mn).get_slice(tidx)
+
+ tdYgdY = thr_copy_load.partition_S(gdY)
+ tdYsdY = thr_copy_load.partition_D(sdY)
+ tYgY = thr_copy_load.partition_S(gY)
+ tYsY = thr_copy_load.partition_D(sY)
+ tdXgdX = thr_copy_store.partition_D(gdX)
+ tXcX = thr_copy_load.partition_S(cX)[(0, None), None, None]
+
+ tdYrdY, tYrY, tdXrdX = [cute.make_fragment_like(thr) for thr in (tdYgdY, tYgY, tdXgdX)]
+
+ num_warps = cute.size(tv_layout, mode=[0]) // cute.arch.WARP_SIZE
+ self._initialize_cluster(tidx, mbar_ptr, num_warps)
+
+ is_even_N = const_expr(self.N == tiler_mn[1] * self.cluster_n)
+ tdYpdY = (
+ predicate_k(thr_copy_load.partition_S(cX), limit=shape[1])
+ if const_expr(not is_even_N)
+ else None
+ )
+
+ if tXcX[0][0] < shape[0]:
+ cute.copy(copy_atom_load, tdYgdY, tdYsdY, pred=tdYpdY)
+ cute.copy(copy_atom_load, tYgY, tYsY, pred=tdYpdY)
+ cute.arch.cp_async_commit_group()
+ cute.arch.cp_async_wait_group(0)
+
+ cute.autovec_copy(tdYsdY, tdYrdY)
+ cute.autovec_copy(tYsY, tYrY)
+ dy = tdYrdY.load().to(Float32)
+ y = tYrY.load().to(Float32)
+
+ threads_per_row = tv_layout.shape[0][0]
+ dot = row_reduce(
+ dy * y,
+ cute.ReductionOp.ADD,
+ threads_per_row,
+ reduction_buffer[None, None, 0],
+ mbar_ptr if const_expr(self.cluster_n > 1) else None,
+ init_val=0.0,
+ hook_fn=cute.arch.cluster_wait if const_expr(self.cluster_n > 1) else None,
+ )
+
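+        # Softmax backward per row: dx_i = y_i * (dy_i - sum_j dy_j * y_j).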
+ dx = y * (dy - dot)
+ tdXrdX.store(dx.to(tdXrdX.element_type))
+
+ tdXpdX = (
+ predicate_k(thr_copy_store.partition_S(cX), limit=shape[1])
+ if const_expr(not is_even_N)
+ else None
+ )
+ if tXcX[0][0] < shape[0]:
+ cute.copy(copy_atom_store, tdXrdX, tdXgdX, pred=tdXpdX)
+
+ if _KERNEL_ACCEPTS_LAYOUT_ARGS:
+
+ @cute.kernel
+ def kernel(
+ self,
+ mdY: cute.Tensor,
+ mY: cute.Tensor,
+ mdX: cute.Tensor,
+ tv_layout: cute.Layout,
+ tiler_mn: cute.Shape,
+ ) -> None:
+ self._kernel_impl(mdY, mY, mdX, tv_layout, tiler_mn)
+ else:
+
+ @cute.kernel
+ def kernel(
+ self,
+ mdY: cute.Tensor,
+ mY: cute.Tensor,
+ mdX: cute.Tensor,
+ ) -> None:
+ tiler_mn, tv_layout = self._get_tv_layout()
+ self._kernel_impl(mdY, mY, mdX, tv_layout, tiler_mn)
+
+
+def _convert_2d_tensor(x: Tensor) -> cute.Tensor:
+ # Match Quack's Softmax conversion exactly: assume 16B alignment and mark
+ # the shape compact with row-major stride order (0, 1), with mode=0 (batch).
+ # We intentionally do not call mark_layout_dynamic here to avoid the
+ # leading_dim stride==1 constraint used in RMSNorm.
+ return (
+ from_dlpack(x.detach(), assumed_align=16)
+ .mark_compact_shape_dynamic(mode=0, stride_order=(0, 1))
+ )
+
+
+def _can_use_ptr_path_2d(x: Tensor) -> bool:
+ """Conservative guard for the pointer-based fast path."""
+ if not x.is_cuda or x.dim() != 2:
+ return False
+ if x.dtype not in TORCH2CUTE_DTYPE:
+ return False
+ # Require row-major last-dim contiguous.
+ if x.stride(1) != 1:
+ return False
+ # Require 16B alignment (matches from_dlpack(..., assumed_align=16)).
+ if (x.data_ptr() % 16) != 0:
+ return False
+ dtype_x = TORCH2CUTE_DTYPE[x.dtype]
+ divby = 128 // dtype_x.width
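+    # e.g. 128-bit copies: bf16/fp16 -> divby = 8 elements, fp32 -> divby = 4.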
+ # Softmax uses ReductionBase default num_copy_bits=128, so N must be divisible.
+ if (x.shape[1] % divby) != 0:
+ return False
+ # Ensure each row start remains aligned for 128-bit vectorized copies.
+ if (x.stride(0) % divby) != 0:
+ return False
+ return True
+
+
+def _softmax_forward_ptr_into(*, x: Tensor, out: Tensor) -> None:
+ """Launch the pointer-based Softmax forward kernel into preallocated `out`."""
+ assert x.is_cuda and x.dim() == 2
+ assert out.is_cuda and out.shape == x.shape and out.dtype == x.dtype
+ assert out.stride() == x.stride(), "Pointer path expects out to match x strides"
+
+ M, N = x.shape
+ device_index = x.get_device()
+ if torch.cuda.current_device() != device_index:
+ torch.cuda.set_device(device_index)
+ stream = cuda.CUstream(int(torch.cuda.current_stream().cuda_stream))
+
+ dtype_x = TORCH2CUTE_DTYPE[x.dtype]
+ key = ("ptr_fwd", int(N), dtype_x, int(device_index))
+ compiled = _PTR_FWD_COMPILE_CACHE.get(key)
+ if compiled is None:
+ op = SoftmaxFwdSM100(dtype_x, int(N))
+ ptr_x = rt.make_ptr(dtype_x, x.data_ptr(), mem_space=rt.AddressSpace.gmem, assumed_align=16)
+ ptr_out = rt.make_ptr(
+ dtype_x, out.data_ptr(), mem_space=rt.AddressSpace.gmem, assumed_align=16
+ )
+ ld = Int32(int(x.stride(0)))
+ compiled = cute.compile(
+ op.launch_from_ptrs,
+ ptr_x,
+ ptr_out,
+ Int32(int(M)),
+ ld,
+ stream,
+ )
+ _PTR_FWD_COMPILE_CACHE[key] = compiled
+
+ ptr_x = rt.make_ptr(dtype_x, x.data_ptr(), mem_space=rt.AddressSpace.gmem, assumed_align=16)
+ ptr_out = rt.make_ptr(dtype_x, out.data_ptr(), mem_space=rt.AddressSpace.gmem, assumed_align=16)
+ compiled(ptr_x, ptr_out, Int32(int(M)), Int32(int(x.stride(0))), stream)
+
+
+def _softmax_backward_ptr_into(*, dy: Tensor, y: Tensor, dx: Tensor) -> None:
+ """Launch the pointer-based Softmax backward kernel into preallocated `dx`."""
+ assert dy.is_cuda and dy.dim() == 2
+ assert y.is_cuda and y.shape == dy.shape and y.dtype == dy.dtype
+ assert dx.is_cuda and dx.shape == dy.shape and dx.dtype == dy.dtype
+ assert dy.stride() == y.stride() == dx.stride(), "Pointer path expects matching strides"
+
+ M, N = dy.shape
+ device_index = dy.get_device()
+ if torch.cuda.current_device() != device_index:
+ torch.cuda.set_device(device_index)
+ stream = cuda.CUstream(int(torch.cuda.current_stream().cuda_stream))
+
+ dtype_x = TORCH2CUTE_DTYPE[dy.dtype]
+ key = ("ptr_bwd", int(N), dtype_x, int(device_index))
+ compiled = _PTR_BWD_COMPILE_CACHE.get(key)
+ if compiled is None:
+ op = SoftmaxBwdSM100(dtype_x, int(N))
+ ptr_dy = rt.make_ptr(dtype_x, dy.data_ptr(), mem_space=rt.AddressSpace.gmem, assumed_align=16)
+ ptr_y = rt.make_ptr(dtype_x, y.data_ptr(), mem_space=rt.AddressSpace.gmem, assumed_align=16)
+ ptr_dx = rt.make_ptr(dtype_x, dx.data_ptr(), mem_space=rt.AddressSpace.gmem, assumed_align=16)
+ ld = Int32(int(dy.stride(0)))
+ compiled = cute.compile(
+ op.launch_from_ptrs,
+ ptr_dy,
+ ptr_y,
+ ptr_dx,
+ Int32(int(M)),
+ ld,
+ stream,
+ )
+ _PTR_BWD_COMPILE_CACHE[key] = compiled
+
+ ptr_dy = rt.make_ptr(dtype_x, dy.data_ptr(), mem_space=rt.AddressSpace.gmem, assumed_align=16)
+ ptr_y = rt.make_ptr(dtype_x, y.data_ptr(), mem_space=rt.AddressSpace.gmem, assumed_align=16)
+ ptr_dx = rt.make_ptr(dtype_x, dx.data_ptr(), mem_space=rt.AddressSpace.gmem, assumed_align=16)
+ compiled(ptr_dy, ptr_y, ptr_dx, Int32(int(M)), Int32(int(dy.stride(0))), stream)
+
+
+def softmax_forward(x: Tensor) -> Tensor:
+ """SM100 CuteDSL softmax forward pass: y = softmax(x, dim=-1)."""
+ assert x.dim() == 2, "Input must be 2D (M, N)"
+ assert x.is_cuda, "Input must be on CUDA device"
+ assert x.dtype in TORCH2CUTE_DTYPE, "Unsupported dtype"
+
+ N = x.size(1)
+ dtype = TORCH2CUTE_DTYPE[x.dtype]
+ if _can_use_ptr_path_2d(x):
+ out = torch.empty_strided(x.shape, x.stride(), device=x.device, dtype=x.dtype)
+ _softmax_forward_ptr_into(x=x, out=out)
+ return out
+
+ out = torch.empty_like(x)
+
+ x_tensor = _convert_2d_tensor(x)
+ out_tensor = _convert_2d_tensor(out)
+ current_stream = cuda.CUstream(torch.cuda.current_stream().cuda_stream)
+
+ compile_key = (dtype, N)
+ kernel = _FWD_COMPILE_CACHE.get(compile_key)
+ if kernel is None:
+ op = SoftmaxFwdSM100(dtype, N)
+ kernel = cute.compile(op, x_tensor, out_tensor, current_stream)
+ _FWD_COMPILE_CACHE[compile_key] = kernel
+ kernel(x_tensor, out_tensor, current_stream)
+ return out
+
+
+def softmax_backward(dy: Tensor, y: Tensor) -> Tensor:
+ """SM100 CuteDSL softmax backward pass."""
+ assert dy.dim() == 2 and y.dim() == 2, "dy and y must be 2D (M, N)"
+ assert dy.shape == y.shape, "dy and y must have the same shape"
+ assert dy.is_cuda and y.is_cuda, "dy and y must be on CUDA device"
+ assert dy.dtype in TORCH2CUTE_DTYPE, "Unsupported dtype"
+ assert y.dtype == dy.dtype, "dy and y must have the same dtype"
+
+ N = dy.size(1)
+ dtype = TORCH2CUTE_DTYPE[dy.dtype]
+ if _can_use_ptr_path_2d(dy) and _can_use_ptr_path_2d(y) and dy.stride() == y.stride():
+ dx = torch.empty_strided(dy.shape, dy.stride(), device=dy.device, dtype=dy.dtype)
+ _softmax_backward_ptr_into(dy=dy, y=y, dx=dx)
+ return dx
+
+ dx = torch.empty_like(dy)
+
+ dy_tensor = _convert_2d_tensor(dy)
+ y_tensor = _convert_2d_tensor(y)
+ dx_tensor = _convert_2d_tensor(dx)
+ current_stream = cuda.CUstream(torch.cuda.current_stream().cuda_stream)
+
+ compile_key = (dtype, N)
+ kernel = _BWD_COMPILE_CACHE.get(compile_key)
+ if kernel is None:
+ op = SoftmaxBwdSM100(dtype, N)
+ kernel = cute.compile(op, dy_tensor, y_tensor, dx_tensor, current_stream)
+ _BWD_COMPILE_CACHE[compile_key] = kernel
+ kernel(dy_tensor, y_tensor, dx_tensor, current_stream)
+ return dx
+
+
+class SoftmaxFunction(torch.autograd.Function):
+ @staticmethod
+ def forward(ctx, x: Tensor) -> Tensor:
+ y = softmax_forward(x)
+ ctx.save_for_backward(y)
+ return y
+
+ @staticmethod
+    def backward(ctx, dy: Tensor) -> Tensor:
+ (y,) = ctx.saved_tensors
+ dx = softmax_backward(dy, y)
+ return dx
+
+
+def softmax(x: Tensor) -> Tensor:
+ """Autograd-friendly softmax using the SM100 CuteDSL kernel."""
+ return SoftmaxFunction.apply(x)
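+
+
+# Minimal usage sketch (hedged; shapes are illustrative only):
+#
+#   x = torch.randn(4096, 32768, device="cuda", dtype=torch.bfloat16,
+#                   requires_grad=True)
+#   y = softmax(x)                   # forward via SoftmaxFwdSM100
+#   y.backward(torch.ones_like(y))   # backward via SoftmaxBwdSM100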
+
+
+def _torch_softmax_reference(x: Tensor) -> Tensor:
+ return torch.nn.functional.softmax(x, dim=-1)
+
+
+def verify_softmax_parity(
+ M: int,
+ N: int,
+ dtype: torch.dtype = torch.bfloat16,
+ atol: float = 5e-2,
+ rtol: float = 5e-2,
+) -> None:
+ """Compare SM100 CuteDSL softmax against PyTorch for a single shape."""
+ device = torch.device("cuda")
+ x = torch.randn(M, N, device=device, dtype=dtype)
+ x.requires_grad_(True)
+
+ # Forward parity
+ y_ref = _torch_softmax_reference(x)
+ y = softmax(x)
+ torch.testing.assert_close(y, y_ref, atol=atol, rtol=rtol)
+
+ # Backward parity
+ dy = torch.randn_like(y)
+ (dx_ref,) = torch.autograd.grad(y_ref, x, dy, retain_graph=False)
+ dx = softmax_backward(dy, y)
+ torch.testing.assert_close(dx, dx_ref, atol=atol, rtol=rtol)
From 9b29732bb333a98ddbb5f750c1ff407d9050c5e7 Mon Sep 17 00:00:00 2001
From: Laura Wang <3700467+Laurawly@users.noreply.github.com>
Date: Wed, 21 Jan 2026 20:06:57 -0800
Subject: [PATCH 5/8] oink: fix ruff lint
---
.../benchmark_cross_entropy_sm100.py | 89 ++++++++++---------
.../benchmark_fused_add_rmsnorm_sm100.py | 36 ++++----
.../benchmark/benchmark_hbm_roofline_sm100.py | 42 +++++----
.../benchmark/benchmark_layernorm_sm100.py | 34 +++----
.../benchmark/benchmark_rmsnorm_bwd_sm100.py | 39 ++++----
.../benchmark/benchmark_rmsnorm_sm100.py | 40 +++++----
.../benchmark/benchmark_softmax_sm100.py | 42 ++++++---
.../benchmarks/readme/plot_quack_style_svg.py | 4 +-
.../kernelagent_oink/blackwell/layernorm.py | 2 +-
.../kernelagent_oink/blackwell/lite_quack.py | 2 +-
.../src/kernelagent_oink/blackwell/rmsnorm.py | 13 +--
.../blackwell/rmsnorm_with_stage2.py | 10 ++-
.../src/kernelagent_oink/blackwell/softmax.py | 3 +-
13 files changed, 199 insertions(+), 157 deletions(-)
diff --git a/oink/benchmarks/benchmark/benchmark_cross_entropy_sm100.py b/oink/benchmarks/benchmark/benchmark_cross_entropy_sm100.py
index 8bcac15..18399c7 100644
--- a/oink/benchmarks/benchmark/benchmark_cross_entropy_sm100.py
+++ b/oink/benchmarks/benchmark/benchmark_cross_entropy_sm100.py
@@ -219,14 +219,16 @@ def bench_single(
bytes_io = bytes_io_model_ce(M, N, dtype, target_dtype=target.dtype, mode=mode)
if mode == "fwd":
- fn_oink = lambda: oink_ce.cross_entropy_forward(
- logits, target, ignore_index=int(ignore_index), reduction="none"
- )
- fn_quack = (
- None
- if quack_ce_fwd is None
- else (
- lambda: quack_ce_fwd(
+ def fn_oink():
+ return oink_ce.cross_entropy_forward(
+ logits, target, ignore_index=int(ignore_index), reduction="none"
+ )
+
+ fn_quack = None
+ if quack_ce_fwd is not None:
+
+ def fn_quack():
+ return quack_ce_fwd(
logits,
target,
target_logit=None,
@@ -235,8 +237,7 @@ def bench_single(
return_dx=False,
inplace_backward=False,
)
- )
- )
+
elif mode == "bwd":
with torch.no_grad():
_loss_o, lse_o = oink_ce.cross_entropy_forward(
@@ -254,14 +255,17 @@ def bench_single(
)
else:
lse_q = None
- fn_oink = lambda: oink_ce.cross_entropy_backward(
- dloss, logits, target, lse_o, ignore_index=int(ignore_index)
- )
- fn_quack = (
- None
- if (quack_ce_bwd is None or lse_q is None)
- else (
- lambda: quack_ce_bwd(
+
+ def fn_oink():
+ return oink_ce.cross_entropy_backward(
+ dloss, logits, target, lse_o, ignore_index=int(ignore_index)
+ )
+
+ fn_quack = None
+ if quack_ce_bwd is not None and lse_q is not None:
+
+ def fn_quack():
+ return quack_ce_bwd(
logits,
target,
dloss,
@@ -269,37 +273,38 @@ def bench_single(
ignore_index=int(ignore_index),
inplace_backward=False,
)
- )
- )
+
elif mode == "fwd_bwd":
- fn_oink = lambda: oink_ce.cross_entropy_fwd_bwd(
- dloss,
- logits,
- target,
- ignore_index=int(ignore_index),
- )
- fn_quack = (
- None
- if (quack_ce_fwd is None or quack_ce_bwd is None)
- else (
- lambda: quack_ce_bwd(
+ def fn_oink():
+ return oink_ce.cross_entropy_fwd_bwd(
+ dloss,
+ logits,
+ target,
+ ignore_index=int(ignore_index),
+ )
+
+ fn_quack = None
+ if quack_ce_fwd is not None and quack_ce_bwd is not None:
+
+ def fn_quack():
+ _loss_q, lse_q = quack_ce_fwd(
+ logits,
+ target,
+ target_logit=None,
+ ignore_index=int(ignore_index),
+ return_lse=True,
+ return_dx=False,
+ inplace_backward=False,
+ )
+ return quack_ce_bwd(
logits,
target,
dloss,
- quack_ce_fwd(
- logits,
- target,
- target_logit=None,
- ignore_index=int(ignore_index),
- return_lse=True,
- return_dx=False,
- inplace_backward=False,
- )[1],
+ lse_q,
ignore_index=int(ignore_index),
inplace_backward=False,
)
- )
- )
+
else:
raise ValueError(f"Unsupported mode: {mode}")
diff --git a/oink/benchmarks/benchmark/benchmark_fused_add_rmsnorm_sm100.py b/oink/benchmarks/benchmark/benchmark_fused_add_rmsnorm_sm100.py
index b75f892..6418e61 100644
--- a/oink/benchmarks/benchmark/benchmark_fused_add_rmsnorm_sm100.py
+++ b/oink/benchmarks/benchmark/benchmark_fused_add_rmsnorm_sm100.py
@@ -1,5 +1,3 @@
-from __future__ import annotations
-
"""
Benchmark fused_add_rmsnorm (in-place) on SM100.
@@ -21,9 +19,11 @@
--json /tmp/kernelagent_oink_sm100_suite_bf16/fused_add_rmsnorm_dsv3.json
"""
+from __future__ import annotations
+
import argparse
import os
-from typing import Any, Dict, List, Optional, Tuple
+from typing import Any, Dict, List, Tuple
import torch
@@ -165,7 +165,9 @@ def bench_one(
bytes_io = bytes_io_model_fused_add_rmsnorm_inplace(M, N, dtype)
- fn = lambda: oink_rmsnorm.fused_add_rmsnorm_inplace_(x, residual, w, eps=1e-6)
+ def fn():
+ oink_rmsnorm.fused_add_rmsnorm_inplace_(x, residual, w, eps=1e-6)
+
ms = do_bench_triton(fn, warmup_ms=warmup_ms, rep_ms=iters_ms)
gbps = bytes_io / (ms * 1e-3) / 1e9
@@ -187,18 +189,20 @@ def bench_one(
out_q = torch.empty_like(x)
res_out_q = torch.empty_like(residual)
- fn_q = lambda: quack_rmsnorm_fwd_mut(
- x,
- w,
- out_q,
- None, # bias
- None, # rstd
- None, # mean
- residual,
- res_out_q,
- 1e-6,
- False, # is_layernorm
- )
+ def fn_q():
+ quack_rmsnorm_fwd_mut(
+ x,
+ w,
+ out_q,
+ None, # bias
+ None, # rstd
+ None, # mean
+ residual,
+ res_out_q,
+ 1e-6,
+ False, # is_layernorm
+ )
+
ms_q = do_bench_triton(fn_q, warmup_ms=warmup_ms, rep_ms=iters_ms)
gbps_q = bytes_io / (ms_q * 1e-3) / 1e9
row.update(
diff --git a/oink/benchmarks/benchmark/benchmark_hbm_roofline_sm100.py b/oink/benchmarks/benchmark/benchmark_hbm_roofline_sm100.py
index 971a03c..8ec4bfd 100644
--- a/oink/benchmarks/benchmark/benchmark_hbm_roofline_sm100.py
+++ b/oink/benchmarks/benchmark/benchmark_hbm_roofline_sm100.py
@@ -1,5 +1,3 @@
-from __future__ import annotations
-
"""
HBM roofline microbenchmark for SM100 (GB200 / Blackwell).
@@ -17,9 +15,11 @@
CUDA_VISIBLE_DEVICES=0 python oink/benchmarks/benchmark/benchmark_hbm_roofline_sm100.py --dtype fp16 --op triad --gb 2
"""
+from __future__ import annotations
+
import argparse
import os
-from typing import Any, Dict, List, Optional, Tuple
+from typing import Any, Dict, List, Tuple
import torch
import triton
@@ -95,23 +95,27 @@ def bench_one(
grid = (triton.cdiv(n_elements, block),)
if op == "copy":
- launch = lambda: _copy_kernel[grid](
- x,
- y,
- n_elements,
- BLOCK=block,
- num_warps=num_warps,
- num_stages=4,
- )
+ def launch():
+ _copy_kernel[grid](
+ x,
+ y,
+ n_elements,
+ BLOCK=block,
+ num_warps=num_warps,
+ num_stages=4,
+ )
+
elif op == "triad":
- launch = lambda: _triad_kernel[grid](
- x,
- y,
- n_elements,
- BLOCK=block,
- num_warps=num_warps,
- num_stages=4,
- )
+ def launch():
+ _triad_kernel[grid](
+ x,
+ y,
+ n_elements,
+ BLOCK=block,
+ num_warps=num_warps,
+ num_stages=4,
+ )
+
else:
raise ValueError(f"Unsupported op: {op}")
diff --git a/oink/benchmarks/benchmark/benchmark_layernorm_sm100.py b/oink/benchmarks/benchmark/benchmark_layernorm_sm100.py
index 778e3e2..a9865d1 100644
--- a/oink/benchmarks/benchmark/benchmark_layernorm_sm100.py
+++ b/oink/benchmarks/benchmark/benchmark_layernorm_sm100.py
@@ -244,27 +244,31 @@ def bench_single(
weight_dtype=w.dtype,
)
- fn_oink = lambda: oink_ln.layernorm(
- x,
- w,
- bias=b,
- eps=eps,
- return_rstd=return_rstd,
- return_mean=return_mean,
- )
+ def fn_oink():
+ return oink_ln.layernorm(
+ x,
+ w,
+ bias=b,
+ eps=eps,
+ return_rstd=return_rstd,
+ return_mean=return_mean,
+ )
+
ms_oink = do_bench_triton(fn_oink, warmup_ms=warmup_ms, rep_ms=iters_ms)
gbps_oink = bytes_io / (ms_oink * 1e-3) / 1e9
if quack_layernorm is None or has_bias:
return (ms_oink, gbps_oink), None, stats
- fn_quack = lambda: quack_layernorm(
- x,
- w,
- eps=eps,
- return_rstd=return_rstd,
- return_mean=return_mean,
- )
+ def fn_quack():
+ return quack_layernorm(
+ x,
+ w,
+ eps=eps,
+ return_rstd=return_rstd,
+ return_mean=return_mean,
+ )
+
ms_quack = do_bench_triton(fn_quack, warmup_ms=warmup_ms, rep_ms=iters_ms)
gbps_quack = bytes_io / (ms_quack * 1e-3) / 1e9
return (ms_oink, gbps_oink), (ms_quack, gbps_quack), stats
diff --git a/oink/benchmarks/benchmark/benchmark_rmsnorm_bwd_sm100.py b/oink/benchmarks/benchmark/benchmark_rmsnorm_bwd_sm100.py
index 01c390d..4ba1c47 100644
--- a/oink/benchmarks/benchmark/benchmark_rmsnorm_bwd_sm100.py
+++ b/oink/benchmarks/benchmark/benchmark_rmsnorm_bwd_sm100.py
@@ -262,15 +262,16 @@ def bench_single(
if verify:
stats = _verify_parity(x, w, dout, rstd, has_bias=False, has_residual=False)
- fn_oink = lambda: oink_rmsnorm.rmsnorm_backward(
- x,
- w,
- dout,
- rstd,
- dresidual_out=None,
- has_bias=False,
- has_residual=False,
- )
+ def fn_oink():
+ return oink_rmsnorm.rmsnorm_backward(
+ x,
+ w,
+ dout,
+ rstd,
+ dresidual_out=None,
+ has_bias=False,
+ has_residual=False,
+ )
ms_oink = do_bench_triton(fn_oink, warmup_ms=warmup_ms, rep_ms=iters_ms)
bytes_io = bytes_io_model_bwd(M, N, dtype, weight_dtype=w.dtype)
@@ -280,15 +281,17 @@ def bench_single(
if quack_rmsnorm_bwd is None:
return ours, None, stats
- fn_quack = lambda: quack_rmsnorm_bwd(
- x,
- w,
- dout,
- rstd,
- dresidual_out=None,
- has_bias=False,
- has_residual=False,
- )
+ def fn_quack():
+ return quack_rmsnorm_bwd(
+ x,
+ w,
+ dout,
+ rstd,
+ dresidual_out=None,
+ has_bias=False,
+ has_residual=False,
+ )
+
ms_quack = do_bench_triton(fn_quack, warmup_ms=warmup_ms, rep_ms=iters_ms)
gbps_quack = bytes_io / (ms_quack * 1e-3) / 1e9
return ours, Result(ms=ms_quack, gbps=gbps_quack), stats
diff --git a/oink/benchmarks/benchmark/benchmark_rmsnorm_sm100.py b/oink/benchmarks/benchmark/benchmark_rmsnorm_sm100.py
index e55e9ff..20ed8ac 100644
--- a/oink/benchmarks/benchmark/benchmark_rmsnorm_sm100.py
+++ b/oink/benchmarks/benchmark/benchmark_rmsnorm_sm100.py
@@ -183,30 +183,34 @@ def bench_single(
bytes_io = bytes_io_model_fwd(M, N, dtype, weight_dtype=w.dtype)
- fn_oink = lambda: oink_rmsnorm.rmsnorm_forward(
- x,
- weight=w,
- bias=None,
- residual=None,
- eps=eps,
- store_rstd=store_rstd,
- )
+ def fn_oink():
+ return oink_rmsnorm.rmsnorm_forward(
+ x,
+ weight=w,
+ bias=None,
+ residual=None,
+ eps=eps,
+ store_rstd=store_rstd,
+ )
+
ms_oink = do_bench_triton(fn_oink, warmup_ms=warmup_ms, rep_ms=iters_ms)
gbps_oink = bytes_io / (ms_oink * 1e-3) / 1e9
if quack_rmsnorm_fwd is None:
return (ms_oink, gbps_oink), None, stats
- fn_quack = lambda: quack_rmsnorm_fwd(
- x,
- w,
- bias=None,
- residual=None,
- out_dtype=None,
- residual_dtype=None,
- eps=eps,
- store_rstd=store_rstd,
- )
+ def fn_quack():
+ return quack_rmsnorm_fwd(
+ x,
+ w,
+ bias=None,
+ residual=None,
+ out_dtype=None,
+ residual_dtype=None,
+ eps=eps,
+ store_rstd=store_rstd,
+ )
+
ms_quack = do_bench_triton(fn_quack, warmup_ms=warmup_ms, rep_ms=iters_ms)
gbps_quack = bytes_io / (ms_quack * 1e-3) / 1e9
return (ms_oink, gbps_oink), (ms_quack, gbps_quack), stats
diff --git a/oink/benchmarks/benchmark/benchmark_softmax_sm100.py b/oink/benchmarks/benchmark/benchmark_softmax_sm100.py
index 93c5af3..7826efc 100644
--- a/oink/benchmarks/benchmark/benchmark_softmax_sm100.py
+++ b/oink/benchmarks/benchmark/benchmark_softmax_sm100.py
@@ -150,25 +150,39 @@ def bench_single(
bytes_io = bytes_io_model_softmax(M, N, dtype, mode=mode)
if mode == "fwd":
- fn_oink = lambda: oink_softmax.softmax_forward(x)
- fn_quack = None if quack_softmax_fwd is None else (lambda: quack_softmax_fwd(x))
+ def fn_oink():
+ return oink_softmax.softmax_forward(x)
+
+ fn_quack = None
+ if quack_softmax_fwd is not None:
+
+ def fn_quack():
+ return quack_softmax_fwd(x)
+
elif mode == "bwd":
with torch.no_grad():
y_o = oink_softmax.softmax_forward(x)
y_q = quack_softmax_fwd(x) if quack_softmax_fwd is not None else None
- fn_oink = lambda: oink_softmax.softmax_backward(dy, y_o)
- fn_quack = (
- None
- if (quack_softmax_bwd is None or y_q is None)
- else (lambda: quack_softmax_bwd(dy, y_q))
- )
+
+ def fn_oink():
+ return oink_softmax.softmax_backward(dy, y_o)
+
+ fn_quack = None
+ if quack_softmax_bwd is not None and y_q is not None:
+
+ def fn_quack():
+ return quack_softmax_bwd(dy, y_q)
+
elif mode == "fwd_bwd":
- fn_oink = lambda: oink_softmax.softmax_fwd_bwd(dy, x)
- fn_quack = (
- None
- if (quack_softmax_fwd is None or quack_softmax_bwd is None)
- else (lambda: quack_softmax_bwd(dy, quack_softmax_fwd(x)))
- )
+ def fn_oink():
+ return oink_softmax.softmax_fwd_bwd(dy, x)
+
+ fn_quack = None
+ if quack_softmax_fwd is not None and quack_softmax_bwd is not None:
+
+ def fn_quack():
+ return quack_softmax_bwd(dy, quack_softmax_fwd(x))
+
else:
raise ValueError(f"Unsupported mode: {mode}")
diff --git a/oink/benchmarks/readme/plot_quack_style_svg.py b/oink/benchmarks/readme/plot_quack_style_svg.py
index 1799f2e..c089b2b 100644
--- a/oink/benchmarks/readme/plot_quack_style_svg.py
+++ b/oink/benchmarks/readme/plot_quack_style_svg.py
@@ -1,5 +1,3 @@
-from __future__ import annotations
-
"""
Generate Quack-style SVG performance plots (Oink vs Quack) from the SM100 suite
JSON artifacts under `/tmp/kernelagent_oink_sm100_suite_{bf16,fp16}`.
@@ -42,6 +40,8 @@
when available: `fused_add_rmsnorm_dsv3.json`.
"""
+from __future__ import annotations
+
import argparse
import json
import math
diff --git a/oink/src/kernelagent_oink/blackwell/layernorm.py b/oink/src/kernelagent_oink/blackwell/layernorm.py
index 05b11de..0e4d640 100644
--- a/oink/src/kernelagent_oink/blackwell/layernorm.py
+++ b/oink/src/kernelagent_oink/blackwell/layernorm.py
@@ -80,7 +80,7 @@
# Local helpers cloned from Quack via lite_quack so that this kernel does
# not depend on `quack` at runtime.
-from kernelagent_oink.blackwell.lite_quack import (
+from kernelagent_oink.blackwell.lite_quack import ( # noqa: E402
_KERNEL_ACCEPTS_LAYOUT_ARGS,
TORCH2CUTE_DTYPE,
ReductionBase as _ReductionBase,
diff --git a/oink/src/kernelagent_oink/blackwell/lite_quack.py b/oink/src/kernelagent_oink/blackwell/lite_quack.py
index 1bc15b1..590d773 100644
--- a/oink/src/kernelagent_oink/blackwell/lite_quack.py
+++ b/oink/src/kernelagent_oink/blackwell/lite_quack.py
@@ -39,7 +39,7 @@
from cutlass import Float32, Int32, const_expr
from cutlass.cute.runtime import from_dlpack
from cutlass.cutlass_dsl import T, dsl_user_op
-from cutlass._mlir.dialects import llvm, nvvm, vector
+from cutlass._mlir.dialects import llvm, vector
def _parse_version_tuple(version: str) -> tuple[int, int, int]:
diff --git a/oink/src/kernelagent_oink/blackwell/rmsnorm.py b/oink/src/kernelagent_oink/blackwell/rmsnorm.py
index 1e080a3..9df9f16 100644
--- a/oink/src/kernelagent_oink/blackwell/rmsnorm.py
+++ b/oink/src/kernelagent_oink/blackwell/rmsnorm.py
@@ -3119,7 +3119,6 @@ def _rmsnorm_bwd_sm100(
assert dresidual.dtype in (torch.float16, torch.bfloat16, torch.float32)
M, N = x.size(0), x.size(1)
- device = x.device
if dw_partial is None and db_partial is None:
assert sm_count is not None
else:
@@ -3130,12 +3129,14 @@ def _rmsnorm_bwd_sm100(
# Match Quack's conversion strategy for activations/gradients: keep the
# (M, N) layout dynamic without enforcing additional compact-shape
# constraints. This reduces per-call Python overhead for small-M shapes.
- convert_from_dlpack = lambda t: from_dlpack( # type: ignore[assignment]
- t.detach(),
- assumed_align=16,
- ).mark_layout_dynamic(leading_dim=1)
+ def _convert_mx(t: Tensor) -> cute.Tensor:
+ return from_dlpack(
+ t.detach(),
+ assumed_align=16,
+ ).mark_layout_dynamic(leading_dim=1)
+
x_tensor, dout_tensor, dres_out_tensor, dx_tensor, dres_tensor = [
- convert_from_dlpack(t) if t is not None else None
+ _convert_mx(t) if t is not None else None
for t in (x, dout, dresidual_out, dx, dresidual)
]
diff --git a/oink/src/kernelagent_oink/blackwell/rmsnorm_with_stage2.py b/oink/src/kernelagent_oink/blackwell/rmsnorm_with_stage2.py
index b53da12..fec5bf4 100644
--- a/oink/src/kernelagent_oink/blackwell/rmsnorm_with_stage2.py
+++ b/oink/src/kernelagent_oink/blackwell/rmsnorm_with_stage2.py
@@ -744,9 +744,13 @@ def rmsnorm_forward_with_stage2(
M, N = x.shape
dtype = TORCH2CUTE_DTYPE[x.dtype]
- convert_x = lambda t: from_dlpack(t.detach(), assumed_align=32).mark_layout_dynamic(leading_dim=1)
- mX = convert_x(x)
- mRes = convert_x(residual) if residual is not None else None
+ def _convert_x(t: Tensor) -> cute.Tensor:
+ return from_dlpack(
+ t.detach(), assumed_align=32
+ ).mark_layout_dynamic(leading_dim=1)
+
+ mX = _convert_x(x)
+ mRes = _convert_x(residual) if residual is not None else None
out = torch.empty_like(x, dtype=x.dtype)
mO = from_dlpack(out.detach(), assumed_align=32).mark_layout_dynamic(leading_dim=1)
diff --git a/oink/src/kernelagent_oink/blackwell/softmax.py b/oink/src/kernelagent_oink/blackwell/softmax.py
index a2f2581..a8a2791 100644
--- a/oink/src/kernelagent_oink/blackwell/softmax.py
+++ b/oink/src/kernelagent_oink/blackwell/softmax.py
@@ -27,10 +27,9 @@
from __future__ import annotations
import importlib.metadata
-import math
import os
import re
-from typing import Optional, Type
+from typing import Type
import torch
from torch import Tensor
From 0543b6f6948c8c110a72d7273ec343f8f897baaa Mon Sep 17 00:00:00 2001
From: Laura Wang <3700467+Laurawly@users.noreply.github.com>
Date: Wed, 21 Jan 2026 20:08:22 -0800
Subject: [PATCH 6/8] oink: ruff format
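This patch only reflows code to satisfy `ruff format`; no behavior changes are
intended. For reference, a minimal sketch of the dominant change pattern, taken
from the benchmark hunks below (long call sites wrapped at the line-length
limit):

```python
import argparse

p = argparse.ArgumentParser()

# before: a single long call site
# p.add_argument("--dtype", type=str, default="bf16", choices=["fp16", "bf16", "fp32"])

# after: ruff format wraps the argument list onto multiple lines
p.add_argument(
    "--dtype", type=str, default="bf16", choices=["fp16", "bf16", "fp32"]
)
```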
---
oink/benchmarks/benchmark/bench_utils.py | 34 +-
.../benchmark_cross_entropy_sm100.py | 97 +++--
.../benchmark_fused_add_rmsnorm_sm100.py | 35 +-
.../benchmark/benchmark_hbm_roofline_sm100.py | 44 ++-
.../benchmark/benchmark_layernorm_sm100.py | 82 +++--
.../benchmark/benchmark_rmsnorm_bwd_sm100.py | 32 +-
.../benchmark/benchmark_rmsnorm_sm100.py | 55 ++-
.../benchmark/benchmark_softmax_sm100.py | 39 +-
.../benchmarks/readme/plot_quack_style_svg.py | 44 ++-
oink/benchmarks/readme/run_sm100_suite.py | 8 +-
oink/benchmarks/readme/summarize_results.py | 61 +++-
.../blackwell/cross_entropy.py | 81 ++++-
.../kernelagent_oink/blackwell/layernorm.py | 90 +++--
.../kernelagent_oink/blackwell/lite_quack.py | 179 ++++++---
.../src/kernelagent_oink/blackwell/rmsnorm.py | 32 +-
.../blackwell/rmsnorm_with_stage2.py | 342 ++++++++++++++----
.../src/kernelagent_oink/blackwell/softmax.py | 100 +++--
17 files changed, 1019 insertions(+), 336 deletions(-)
diff --git a/oink/benchmarks/benchmark/bench_utils.py b/oink/benchmarks/benchmark/bench_utils.py
index 0abb005..0a9ae4b 100644
--- a/oink/benchmarks/benchmark/bench_utils.py
+++ b/oink/benchmarks/benchmark/bench_utils.py
@@ -67,7 +67,9 @@ def detect_hbm_peak_gbps(device: Optional[torch.device] = None) -> float:
return 2000.0
-def do_bench_triton(fn: Callable[[], Any], *, warmup_ms: int = 25, rep_ms: int = 100) -> float:
+def do_bench_triton(
+ fn: Callable[[], Any], *, warmup_ms: int = 25, rep_ms: int = 100
+) -> float:
"""Kernel-only timing consistent with the Oink benchmark harnesses."""
return float(triton_do_bench(fn, warmup=warmup_ms, rep=rep_ms, return_mode="mean"))
@@ -127,7 +129,13 @@ def write_csv(path: str, rows: Sequence[Dict[str, Any]]) -> None:
writer.writerow(row)
-def write_json(path: str, meta: DeviceMeta, rows: Sequence[Dict[str, Any]], *, extra: Dict[str, Any] | None = None) -> None:
+def write_json(
+ path: str,
+ meta: DeviceMeta,
+ rows: Sequence[Dict[str, Any]],
+ *,
+ extra: Dict[str, Any] | None = None,
+) -> None:
os.makedirs(os.path.dirname(os.path.abspath(path)), exist_ok=True)
payload: Dict[str, Any] = {
"meta": {**asdict(meta), **(extra or {})},
@@ -179,7 +187,9 @@ def __init__(self, *, total_elems: int, p99_target_samples: int = 1_000_000):
if total_elems <= 0:
raise ValueError(f"total_elems must be > 0, got {total_elems}")
if p99_target_samples <= 0:
- raise ValueError(f"p99_target_samples must be > 0, got {p99_target_samples}")
+ raise ValueError(
+ f"p99_target_samples must be > 0, got {p99_target_samples}"
+ )
self.total_elems = int(total_elems)
self.p99_target_samples = int(p99_target_samples)
# Deterministic strided sampling across the flattened tensor order.
@@ -193,7 +203,9 @@ def __init__(self, *, total_elems: int, p99_target_samples: int = 1_000_000):
def update(self, out: torch.Tensor, ref: torch.Tensor) -> None:
if out.shape != ref.shape:
- raise ValueError(f"shape mismatch: out={tuple(out.shape)} ref={tuple(ref.shape)}")
+ raise ValueError(
+ f"shape mismatch: out={tuple(out.shape)} ref={tuple(ref.shape)}"
+ )
# Compute error in float32 for stable reductions.
err_f32 = (out - ref).to(torch.float32)
@@ -214,9 +226,13 @@ def update(self, out: torch.Tensor, ref: torch.Tensor) -> None:
stride = int(self.sample_stride)
first = (-int(self._global_offset)) % stride
if first < block_elems:
- idx = torch.arange(first, block_elems, step=stride, device=flat.device, dtype=torch.int64)
+ idx = torch.arange(
+ first, block_elems, step=stride, device=flat.device, dtype=torch.int64
+ )
# Gather a modest number of values (≈ block_elems/stride).
- vals = flat.index_select(0, idx).detach().to(device="cpu", dtype=torch.float32)
+ vals = (
+ flat.index_select(0, idx).detach().to(device="cpu", dtype=torch.float32)
+ )
self._abs_err_samples.append(vals)
self._global_offset += block_elems
@@ -226,7 +242,11 @@ def finalize(self) -> ErrorStats:
samples = torch.cat(self._abs_err_samples, dim=0)
if samples.numel() > self.p99_target_samples:
samples = samples[: self.p99_target_samples]
- p99 = float(torch.quantile(samples, 0.99).item()) if samples.numel() > 0 else 0.0
+ p99 = (
+ float(torch.quantile(samples, 0.99).item())
+ if samples.numel() > 0
+ else 0.0
+ )
sample_elems = int(samples.numel())
else:
p99 = 0.0
diff --git a/oink/benchmarks/benchmark/benchmark_cross_entropy_sm100.py b/oink/benchmarks/benchmark/benchmark_cross_entropy_sm100.py
index 18399c7..ff1a99b 100644
--- a/oink/benchmarks/benchmark/benchmark_cross_entropy_sm100.py
+++ b/oink/benchmarks/benchmark/benchmark_cross_entropy_sm100.py
@@ -84,16 +84,22 @@ def dsv3_configs() -> List[Tuple[int, int]]:
return [(m, n) for m in Ms for n in Ns]
-def _verify_parity(logits: torch.Tensor, target: torch.Tensor, *, ignore_index: int) -> dict[str, object]:
+def _verify_parity(
+ logits: torch.Tensor, target: torch.Tensor, *, ignore_index: int
+) -> dict[str, object]:
dtype = logits.dtype
ref_block_rows = 512
- dloss = torch.randn(logits.size(0), device=logits.device, dtype=torch.float32) # upstream grad
+ dloss = torch.randn(
+ logits.size(0), device=logits.device, dtype=torch.float32
+ ) # upstream grad
with torch.no_grad():
loss_o, lse_o = oink_ce.cross_entropy_forward(
logits, target, ignore_index=ignore_index, reduction="none"
)
- dx_o = oink_ce.cross_entropy_backward(dloss, logits, target, lse_o, ignore_index=ignore_index)
+ dx_o = oink_ce.cross_entropy_backward(
+ dloss, logits, target, lse_o, ignore_index=ignore_index
+ )
dx_fused_o = oink_ce.cross_entropy_fwd_bwd(
dloss,
logits,
@@ -125,8 +131,12 @@ def _verify_parity(logits: torch.Tensor, target: torch.Tensor, *, ignore_index:
M = int(logits.shape[0])
N = int(logits.shape[1])
- loss_acc_ours = ErrorStatsAccumulator(total_elems=M, p99_target_samples=min(M, 1_000_000))
- lse_acc_ours = ErrorStatsAccumulator(total_elems=M, p99_target_samples=min(M, 1_000_000))
+ loss_acc_ours = ErrorStatsAccumulator(
+ total_elems=M, p99_target_samples=min(M, 1_000_000)
+ )
+ lse_acc_ours = ErrorStatsAccumulator(
+ total_elems=M, p99_target_samples=min(M, 1_000_000)
+ )
dx_acc_ours = ErrorStatsAccumulator(total_elems=M * N)
dx_fused_acc_ours = ErrorStatsAccumulator(total_elems=M * N)
loss_acc_quack = (
@@ -159,23 +169,39 @@ def _verify_parity(logits: torch.Tensor, target: torch.Tensor, *, ignore_index:
ignore_index=ignore_index,
)
lse_ref = torch.logsumexp(logits_f32, dim=-1)
- (dx_ref_f32,) = torch.autograd.grad(loss_ref, logits_f32, grad_outputs=dloss_blk)
+ (dx_ref_f32,) = torch.autograd.grad(
+ loss_ref, logits_f32, grad_outputs=dloss_blk
+ )
dx_ref = dx_ref_f32.to(dtype)
- torch.testing.assert_close(loss_o[start:end], loss_ref.detach(), **_VERIFY_TOL_LOSS)
- torch.testing.assert_close(lse_o[start:end], lse_ref.detach(), **_VERIFY_TOL_LOSS)
+ torch.testing.assert_close(
+ loss_o[start:end], loss_ref.detach(), **_VERIFY_TOL_LOSS
+ )
+ torch.testing.assert_close(
+ lse_o[start:end], lse_ref.detach(), **_VERIFY_TOL_LOSS
+ )
torch.testing.assert_close(dx_o[start:end], dx_ref, **_VERIFY_TOL_DX[dtype])
- torch.testing.assert_close(dx_fused_o[start:end], dx_ref, **_VERIFY_TOL_DX[dtype])
+ torch.testing.assert_close(
+ dx_fused_o[start:end], dx_ref, **_VERIFY_TOL_DX[dtype]
+ )
loss_acc_ours.update(loss_o[start:end], loss_ref.detach())
lse_acc_ours.update(lse_o[start:end], lse_ref.detach())
dx_acc_ours.update(dx_o[start:end], dx_ref)
dx_fused_acc_ours.update(dx_fused_o[start:end], dx_ref)
if loss_q is not None and lse_q is not None and dx_q is not None:
- torch.testing.assert_close(loss_q[start:end], loss_ref.detach(), **_VERIFY_TOL_LOSS)
- torch.testing.assert_close(lse_q[start:end], lse_ref.detach(), **_VERIFY_TOL_LOSS)
+ torch.testing.assert_close(
+ loss_q[start:end], loss_ref.detach(), **_VERIFY_TOL_LOSS
+ )
+ torch.testing.assert_close(
+ lse_q[start:end], lse_ref.detach(), **_VERIFY_TOL_LOSS
+ )
torch.testing.assert_close(dx_q[start:end], dx_ref, **_VERIFY_TOL_DX[dtype])
- assert loss_acc_quack is not None and lse_acc_quack is not None and dx_acc_quack is not None
+ assert (
+ loss_acc_quack is not None
+ and lse_acc_quack is not None
+ and dx_acc_quack is not None
+ )
loss_acc_quack.update(loss_q[start:end], loss_ref.detach())
lse_acc_quack.update(lse_q[start:end], lse_ref.detach())
dx_acc_quack.update(dx_q[start:end], dx_ref)
@@ -185,7 +211,11 @@ def _verify_parity(logits: torch.Tensor, target: torch.Tensor, *, ignore_index:
stats.update(error_stats_to_row("ours_err_lse", lse_acc_ours.finalize()))
stats.update(error_stats_to_row("ours_err_dx", dx_acc_ours.finalize()))
stats.update(error_stats_to_row("ours_err_dx_fused", dx_fused_acc_ours.finalize()))
- if loss_acc_quack is not None and lse_acc_quack is not None and dx_acc_quack is not None:
+ if (
+ loss_acc_quack is not None
+ and lse_acc_quack is not None
+ and dx_acc_quack is not None
+ ):
stats.update(error_stats_to_row("quack_err_loss", loss_acc_quack.finalize()))
stats.update(error_stats_to_row("quack_err_lse", lse_acc_quack.finalize()))
stats.update(error_stats_to_row("quack_err_dx", dx_acc_quack.finalize()))
@@ -219,6 +249,7 @@ def bench_single(
bytes_io = bytes_io_model_ce(M, N, dtype, target_dtype=target.dtype, mode=mode)
if mode == "fwd":
+
def fn_oink():
return oink_ce.cross_entropy_forward(
logits, target, ignore_index=int(ignore_index), reduction="none"
@@ -275,6 +306,7 @@ def fn_quack():
)
elif mode == "fwd_bwd":
+
def fn_oink():
return oink_ce.cross_entropy_fwd_bwd(
dloss,
@@ -330,16 +362,34 @@ def main() -> None:
print(f"Running on {torch.cuda.get_device_name(device)} (SM{sm})")
p = argparse.ArgumentParser()
- p.add_argument("--dtype", type=str, default="bf16", choices=["fp16", "bf16", "fp32"])
- p.add_argument("--mode", type=str, default="fwd_bwd", choices=["fwd", "bwd", "fwd_bwd"])
+ p.add_argument(
+ "--dtype", type=str, default="bf16", choices=["fp16", "bf16", "fp32"]
+ )
+ p.add_argument(
+ "--mode", type=str, default="fwd_bwd", choices=["fwd", "bwd", "fwd_bwd"]
+ )
p.add_argument("--ignore-index", type=int, default=-100)
- p.add_argument("--iters", type=int, default=50, help="Triton do_bench rep_ms (kernel-only).")
+ p.add_argument(
+ "--iters", type=int, default=50, help="Triton do_bench rep_ms (kernel-only)."
+ )
p.add_argument("--warmup-ms", type=int, default=25)
- p.add_argument("--csv", type=str, default=None, help="Optional CSV output path; appends rows")
- p.add_argument("--json", type=str, default=None, help="Optional JSON output path (meta + rows)")
+ p.add_argument(
+ "--csv", type=str, default=None, help="Optional CSV output path; appends rows"
+ )
+ p.add_argument(
+ "--json", type=str, default=None, help="Optional JSON output path (meta + rows)"
+ )
p.add_argument("--configs", type=str, default="1024x4096,8192x4096")
- p.add_argument("--quack-suite", action="store_true", help="Run Quack-style batch/seq grid (vocab=4096)")
- p.add_argument("--dsv3", action="store_true", help="Run DSv3 set: M in {4096,16384,65536}, N in {3072,6144,8192,12288}")
+ p.add_argument(
+ "--quack-suite",
+ action="store_true",
+ help="Run Quack-style batch/seq grid (vocab=4096)",
+ )
+ p.add_argument(
+ "--dsv3",
+ action="store_true",
+ help="Run DSv3 set: M in {4096,16384,65536}, N in {3072,6144,8192,12288}",
+ )
p.add_argument(
"--skip-verify",
action="store_true",
@@ -360,8 +410,11 @@ def main() -> None:
meta = collect_device_meta(device)
rows_out: List[Dict[str, Any]] = []
- for (M, N) in cfgs:
- print(f"bench M={M:<8d} N={N:<6d} dtype={args.dtype} mode={args.mode} ...", flush=True)
+ for M, N in cfgs:
+ print(
+ f"bench M={M:<8d} N={N:<6d} dtype={args.dtype} mode={args.mode} ...",
+ flush=True,
+ )
(ms_oink, gbps_oink), quack, stats = bench_single(
M=M,
N=N,
diff --git a/oink/benchmarks/benchmark/benchmark_fused_add_rmsnorm_sm100.py b/oink/benchmarks/benchmark/benchmark_fused_add_rmsnorm_sm100.py
index 6418e61..863712d 100644
--- a/oink/benchmarks/benchmark/benchmark_fused_add_rmsnorm_sm100.py
+++ b/oink/benchmarks/benchmark/benchmark_fused_add_rmsnorm_sm100.py
@@ -90,8 +90,16 @@ def _verify_parity(
y_acc_ours = ErrorStatsAccumulator(total_elems=M * N)
z_acc_ours = ErrorStatsAccumulator(total_elems=M * N)
- y_acc_quack = ErrorStatsAccumulator(total_elems=M * N) if quack_rmsnorm_fwd_mut is not None else None
- z_acc_quack = ErrorStatsAccumulator(total_elems=M * N) if quack_rmsnorm_fwd_mut is not None else None
+ y_acc_quack = (
+ ErrorStatsAccumulator(total_elems=M * N)
+ if quack_rmsnorm_fwd_mut is not None
+ else None
+ )
+ z_acc_quack = (
+ ErrorStatsAccumulator(total_elems=M * N)
+ if quack_rmsnorm_fwd_mut is not None
+ else None
+ )
x_o = x.clone()
r_o = residual.clone()
@@ -141,7 +149,9 @@ def _verify_parity(
stats.update(error_stats_to_row("ours_err_residual_out", z_acc_ours.finalize()))
if y_acc_quack is not None and z_acc_quack is not None:
stats.update(error_stats_to_row("quack_err_y", y_acc_quack.finalize()))
- stats.update(error_stats_to_row("quack_err_residual_out", z_acc_quack.finalize()))
+ stats.update(
+ error_stats_to_row("quack_err_residual_out", z_acc_quack.finalize())
+ )
return stats
@@ -177,7 +187,9 @@ def fn():
row: Dict[str, Any] = dict(
M=int(M),
N=int(N),
- dtype="bf16" if dtype is torch.bfloat16 else ("fp16" if dtype is torch.float16 else "fp32"),
+ dtype="bf16"
+ if dtype is torch.bfloat16
+ else ("fp16" if dtype is torch.float16 else "fp32"),
ours_ms=float(ms),
ours_gbps=float(gbps),
ours_tbps=float(tbps),
@@ -247,7 +259,9 @@ def _print_table(rows: List[Dict[str, Any]]) -> None:
def main() -> None:
p = argparse.ArgumentParser()
- p.add_argument("--dtype", type=str, default="bf16", choices=["bf16", "fp16", "fp32"])
+ p.add_argument(
+ "--dtype", type=str, default="bf16", choices=["bf16", "fp16", "fp32"]
+ )
p.add_argument("--M", type=int, default=65536)
p.add_argument("--N", type=int, default=4096)
p.add_argument(
@@ -256,7 +270,9 @@ def main() -> None:
help="Run DSv3 set: M in {4096,16384,65536}, N in {6144,7168,8192}",
)
p.add_argument("--warmup-ms", type=int, default=25)
- p.add_argument("--iters", type=int, default=200, help="rep_ms for do_bench (default: 200)")
+ p.add_argument(
+ "--iters", type=int, default=200, help="rep_ms for do_bench (default: 200)"
+ )
p.add_argument("--skip-verify", action="store_true")
p.add_argument("--json", type=str, default=None)
args = p.parse_args()
@@ -266,8 +282,11 @@ def main() -> None:
cfgs = dsv3_configs() if bool(args.dsv3) else [(int(args.M), int(args.N))]
rows: List[Dict[str, Any]] = []
- for (M, N) in cfgs:
- print(f"bench M={M:<8d} N={N:<6d} dtype={_dtype_label(dtype)} fused_add_rmsnorm ...", flush=True)
+ for M, N in cfgs:
+ print(
+ f"bench M={M:<8d} N={N:<6d} dtype={_dtype_label(dtype)} fused_add_rmsnorm ...",
+ flush=True,
+ )
rows.append(
bench_one(
M=int(M),
diff --git a/oink/benchmarks/benchmark/benchmark_hbm_roofline_sm100.py b/oink/benchmarks/benchmark/benchmark_hbm_roofline_sm100.py
index 8ec4bfd..c22294e 100644
--- a/oink/benchmarks/benchmark/benchmark_hbm_roofline_sm100.py
+++ b/oink/benchmarks/benchmark/benchmark_hbm_roofline_sm100.py
@@ -95,6 +95,7 @@ def bench_one(
grid = (triton.cdiv(n_elements, block),)
if op == "copy":
+
def launch():
_copy_kernel[grid](
x,
@@ -106,6 +107,7 @@ def launch():
)
elif op == "triad":
+
def launch():
_triad_kernel[grid](
x,
@@ -134,20 +136,38 @@ def _print_summary(rows: List[Dict[str, Any]]) -> None:
return
best = max(rows, key=lambda r: float(r["tbps"]))
print("\nSummary (STREAM-like):")
- print(f"- best_tbps: {best['tbps']:.3f} TB/s ({best['op']}, BLOCK={best['block']}, warps={best['num_warps']})")
+ print(
+ f"- best_tbps: {best['tbps']:.3f} TB/s ({best['op']}, BLOCK={best['block']}, warps={best['num_warps']})"
+ )
def main() -> None:
p = argparse.ArgumentParser()
- p.add_argument("--dtype", type=str, default="bf16", choices=["bf16", "fp16", "fp32"])
+ p.add_argument(
+ "--dtype", type=str, default="bf16", choices=["bf16", "fp16", "fp32"]
+ )
p.add_argument("--op", type=str, default="copy", choices=["copy", "triad", "both"])
- p.add_argument("--gb", type=float, default=2.0, help="Size per tensor in GB (default: 2)")
+ p.add_argument(
+ "--gb", type=float, default=2.0, help="Size per tensor in GB (default: 2)"
+ )
p.add_argument("--warmup-ms", type=int, default=25)
- p.add_argument("--iters", type=int, default=100, help="rep_ms for do_bench (default: 100)")
- p.add_argument("--json", type=str, default=None, help="Write JSON results to this path")
- p.add_argument("--no-sweep", action="store_true", help="Disable tuning sweep; run a single config")
- p.add_argument("--block", type=int, default=2048, help="BLOCK size when --no-sweep is set")
- p.add_argument("--warps", type=int, default=8, help="num_warps when --no-sweep is set")
+ p.add_argument(
+ "--iters", type=int, default=100, help="rep_ms for do_bench (default: 100)"
+ )
+ p.add_argument(
+ "--json", type=str, default=None, help="Write JSON results to this path"
+ )
+ p.add_argument(
+ "--no-sweep",
+ action="store_true",
+ help="Disable tuning sweep; run a single config",
+ )
+ p.add_argument(
+ "--block", type=int, default=2048, help="BLOCK size when --no-sweep is set"
+ )
+ p.add_argument(
+ "--warps", type=int, default=8, help="num_warps when --no-sweep is set"
+ )
args = p.parse_args()
dtype = parse_dtype(args.dtype)
@@ -181,7 +201,9 @@ def main() -> None:
print(f"Running on {props.name} (SM{props.major}{props.minor})")
print(f"- dtype: {args.dtype} (elem={elem_size}B)")
- print(f"- n_elements: {n_elements:,} (~{(n_elements * elem_size) / (1024**3):.2f} GiB per tensor)")
+ print(
+ f"- n_elements: {n_elements:,} (~{(n_elements * elem_size) / (1024**3):.2f} GiB per tensor)"
+ )
print(f"- ops: {ops}")
print(f"- sweep: {sweep}")
@@ -212,7 +234,9 @@ def main() -> None:
tbps=float(tbps),
)
)
- print(f"- {op:5s} BLOCK={block:4d} warps={warps}: {tbps:.3f} TB/s ({ms:.4f} ms)")
+ print(
+ f"- {op:5s} BLOCK={block:4d} warps={warps}: {tbps:.3f} TB/s ({ms:.4f} ms)"
+ )
_print_summary(rows)
diff --git a/oink/benchmarks/benchmark/benchmark_layernorm_sm100.py b/oink/benchmarks/benchmark/benchmark_layernorm_sm100.py
index a9865d1..3c0e37d 100644
--- a/oink/benchmarks/benchmark/benchmark_layernorm_sm100.py
+++ b/oink/benchmarks/benchmark/benchmark_layernorm_sm100.py
@@ -97,7 +97,9 @@ def _verify_parity(
y_acc_ours = ErrorStatsAccumulator(total_elems=M * N)
y_acc_quack = (
- ErrorStatsAccumulator(total_elems=M * N) if (quack_layernorm is not None and b is None) else None
+ ErrorStatsAccumulator(total_elems=M * N)
+ if (quack_layernorm is not None and b is None)
+ else None
)
with torch.no_grad():
ours = oink_ln.layernorm(
@@ -138,8 +140,12 @@ def _unpack(out):
# Pure-PyTorch reference (float32 accumulation), matching Quack's unit tests:
# - compute ref output via F.layer_norm on float32
# - compute mean/rstd from float32 input
- rstd_ref_all = torch.empty((M,), device=x.device, dtype=torch.float32) if return_rstd else None
- mean_ref_all = torch.empty((M,), device=x.device, dtype=torch.float32) if return_mean else None
+ rstd_ref_all = (
+ torch.empty((M,), device=x.device, dtype=torch.float32) if return_rstd else None
+ )
+ mean_ref_all = (
+ torch.empty((M,), device=x.device, dtype=torch.float32) if return_mean else None
+ )
for start, end in iter_row_blocks(M, ref_block_rows):
x_f32 = x[start:end].float()
@@ -165,16 +171,24 @@ def _unpack(out):
rstd_ref_all[start:end] = rstd_ref
assert rstd_o is not None
- torch.testing.assert_close(rstd_o[start:end], rstd_ref, **_VERIFY_TOL_STATS)
+ torch.testing.assert_close(
+ rstd_o[start:end], rstd_ref, **_VERIFY_TOL_STATS
+ )
if rstd_q is not None:
- torch.testing.assert_close(rstd_q[start:end], rstd_ref, **_VERIFY_TOL_STATS)
+ torch.testing.assert_close(
+ rstd_q[start:end], rstd_ref, **_VERIFY_TOL_STATS
+ )
if return_mean:
mean_ref = mean_f32
assert mean_o is not None
- torch.testing.assert_close(mean_o[start:end], mean_ref, **_VERIFY_TOL_STATS)
+ torch.testing.assert_close(
+ mean_o[start:end], mean_ref, **_VERIFY_TOL_STATS
+ )
if mean_q is not None:
- torch.testing.assert_close(mean_q[start:end], mean_ref, **_VERIFY_TOL_STATS)
+ torch.testing.assert_close(
+ mean_q[start:end], mean_ref, **_VERIFY_TOL_STATS
+ )
stats: dict[str, object] = {}
stats.update(error_stats_to_row("ours_err_y", y_acc_ours.finalize()))
@@ -184,30 +198,38 @@ def _unpack(out):
if return_rstd:
assert rstd_o is not None and rstd_ref_all is not None
rstd_acc_ours = ErrorStatsAccumulator(
- total_elems=int(rstd_ref_all.numel()), p99_target_samples=int(rstd_ref_all.numel())
+ total_elems=int(rstd_ref_all.numel()),
+ p99_target_samples=int(rstd_ref_all.numel()),
)
rstd_acc_ours.update(rstd_o, rstd_ref_all)
stats.update(error_stats_to_row("ours_err_rstd", rstd_acc_ours.finalize()))
if rstd_q is not None:
rstd_acc_quack = ErrorStatsAccumulator(
- total_elems=int(rstd_ref_all.numel()), p99_target_samples=int(rstd_ref_all.numel())
+ total_elems=int(rstd_ref_all.numel()),
+ p99_target_samples=int(rstd_ref_all.numel()),
)
rstd_acc_quack.update(rstd_q, rstd_ref_all)
- stats.update(error_stats_to_row("quack_err_rstd", rstd_acc_quack.finalize()))
+ stats.update(
+ error_stats_to_row("quack_err_rstd", rstd_acc_quack.finalize())
+ )
if return_mean:
assert mean_o is not None and mean_ref_all is not None
mean_acc_ours = ErrorStatsAccumulator(
- total_elems=int(mean_ref_all.numel()), p99_target_samples=int(mean_ref_all.numel())
+ total_elems=int(mean_ref_all.numel()),
+ p99_target_samples=int(mean_ref_all.numel()),
)
mean_acc_ours.update(mean_o, mean_ref_all)
stats.update(error_stats_to_row("ours_err_mean", mean_acc_ours.finalize()))
if mean_q is not None:
mean_acc_quack = ErrorStatsAccumulator(
- total_elems=int(mean_ref_all.numel()), p99_target_samples=int(mean_ref_all.numel())
+ total_elems=int(mean_ref_all.numel()),
+ p99_target_samples=int(mean_ref_all.numel()),
)
mean_acc_quack.update(mean_q, mean_ref_all)
- stats.update(error_stats_to_row("quack_err_mean", mean_acc_quack.finalize()))
+ stats.update(
+ error_stats_to_row("quack_err_mean", mean_acc_quack.finalize())
+ )
return stats
@@ -232,7 +254,9 @@ def bench_single(
stats: dict[str, object] = {}
if verify:
- stats = _verify_parity(x, w, b, eps=eps, return_rstd=return_rstd, return_mean=return_mean)
+ stats = _verify_parity(
+ x, w, b, eps=eps, return_rstd=return_rstd, return_mean=return_mean
+ )
bytes_io = bytes_io_model_layernorm(
M,
@@ -285,17 +309,33 @@ def main() -> None:
print(f"Running on {torch.cuda.get_device_name(device)} (SM{sm})")
p = argparse.ArgumentParser()
- p.add_argument("--dtype", type=str, default="bf16", choices=["fp16", "bf16", "fp32"])
+ p.add_argument(
+ "--dtype", type=str, default="bf16", choices=["fp16", "bf16", "fp32"]
+ )
p.add_argument("--eps", type=float, default=1e-6)
p.add_argument("--return-rstd", action="store_true")
p.add_argument("--return-mean", action="store_true")
- p.add_argument("--with-bias", action="store_true", help="Benchmark bias path (Quack compare skipped)")
- p.add_argument("--iters", type=int, default=100, help="Triton do_bench rep_ms (kernel-only).")
+ p.add_argument(
+ "--with-bias",
+ action="store_true",
+ help="Benchmark bias path (Quack compare skipped)",
+ )
+ p.add_argument(
+ "--iters", type=int, default=100, help="Triton do_bench rep_ms (kernel-only)."
+ )
p.add_argument("--warmup-ms", type=int, default=25)
- p.add_argument("--csv", type=str, default=None, help="Optional CSV output path; appends rows")
- p.add_argument("--json", type=str, default=None, help="Optional JSON output path (meta + rows)")
+ p.add_argument(
+ "--csv", type=str, default=None, help="Optional CSV output path; appends rows"
+ )
+ p.add_argument(
+ "--json", type=str, default=None, help="Optional JSON output path (meta + rows)"
+ )
p.add_argument("--configs", type=str, default="1024x4096,8192x4096")
- p.add_argument("--quack-suite", action="store_true", help="Run Quack-style batch/seq grid (hidden=4096)")
+ p.add_argument(
+ "--quack-suite",
+ action="store_true",
+ help="Run Quack-style batch/seq grid (hidden=4096)",
+ )
p.add_argument(
"--dsv3",
action="store_true",
@@ -322,7 +362,7 @@ def main() -> None:
meta = collect_device_meta(device)
rows_out: List[Dict[str, Any]] = []
- for (M, N) in cfgs:
+ for M, N in cfgs:
print(f"bench M={M:<8d} N={N:<6d} dtype={args.dtype} ...", flush=True)
(ms_oink, gbps_oink), quack, stats = bench_single(
M=M,
diff --git a/oink/benchmarks/benchmark/benchmark_rmsnorm_bwd_sm100.py b/oink/benchmarks/benchmark/benchmark_rmsnorm_bwd_sm100.py
index 4ba1c47..b9909e7 100644
--- a/oink/benchmarks/benchmark/benchmark_rmsnorm_bwd_sm100.py
+++ b/oink/benchmarks/benchmark/benchmark_rmsnorm_bwd_sm100.py
@@ -140,7 +140,11 @@ def _verify_parity(
M, N = int(x.shape[0]), int(x.shape[1])
dx_acc_ours = ErrorStatsAccumulator(total_elems=M * N)
- dx_acc_quack = ErrorStatsAccumulator(total_elems=M * N) if quack_rmsnorm_bwd is not None else None
+ dx_acc_quack = (
+ ErrorStatsAccumulator(total_elems=M * N)
+ if quack_rmsnorm_bwd is not None
+ else None
+ )
with torch.no_grad():
dx_oink, dw_oink, db_oink, dres_oink = oink_rmsnorm.rmsnorm_backward(
@@ -216,7 +220,9 @@ def _verify_parity(
dw_tol = dict(atol=dw_atol, rtol=1e-3)
torch.testing.assert_close(dw_oink_f32, dw_ref_f32, **dw_tol)
if dw_quack is not None:
- torch.testing.assert_close(dw_quack.to(torch.float32), dw_ref_f32, **dw_tol)
+ torch.testing.assert_close(
+ dw_quack.to(torch.float32), dw_ref_f32, **dw_tol
+ )
dw_tol = None # handled above
if dw_tol is not None:
torch.testing.assert_close(dw_oink, dw_ref, **dw_tol)
@@ -224,7 +230,9 @@ def _verify_parity(
torch.testing.assert_close(dw_quack, dw_ref, **dw_tol)
# Record weight-grad error stats (small, so exact p99 over the full vector).
- dw_acc_ours = ErrorStatsAccumulator(total_elems=int(dw_ref.numel()), p99_target_samples=int(dw_ref.numel()))
+ dw_acc_ours = ErrorStatsAccumulator(
+ total_elems=int(dw_ref.numel()), p99_target_samples=int(dw_ref.numel())
+ )
dw_acc_ours.update(dw_oink, dw_ref)
stats.update(error_stats_to_row("ours_err_dw", dw_acc_ours.finalize()))
if dw_quack is not None:
@@ -308,7 +316,9 @@ def main() -> None:
print(f"Running on {torch.cuda.get_device_name(device)} (SM{sm})")
p = argparse.ArgumentParser()
- p.add_argument("--dtype", type=str, default="bf16", choices=["fp16", "bf16", "fp32"])
+ p.add_argument(
+ "--dtype", type=str, default="bf16", choices=["fp16", "bf16", "fp32"]
+ )
p.add_argument(
"--weight-dtype",
type=str,
@@ -324,10 +334,16 @@ def main() -> None:
help="Triton do_bench rep_ms (kernel-only).",
)
p.add_argument("--warmup-ms", type=int, default=25)
- p.add_argument("--csv", type=str, default=None, help="Optional CSV output path; appends rows")
- p.add_argument("--json", type=str, default=None, help="Optional JSON output path (meta + rows)")
+ p.add_argument(
+ "--csv", type=str, default=None, help="Optional CSV output path; appends rows"
+ )
+ p.add_argument(
+ "--json", type=str, default=None, help="Optional JSON output path (meta + rows)"
+ )
p.add_argument("--configs", type=str, default="1024x4096,8192x4096")
- p.add_argument("--quack-suite", action="store_true", help="Run Quack-style batch/seq grid")
+ p.add_argument(
+ "--quack-suite", action="store_true", help="Run Quack-style batch/seq grid"
+ )
p.add_argument(
"--dsv3",
action="store_true",
@@ -358,7 +374,7 @@ def main() -> None:
rows_out: list[dict[str, object]] = []
- for (M, N) in cfgs:
+ for M, N in cfgs:
print(f"bench M={M:<8d} N={N:<6d} dtype={args.dtype} ...", flush=True)
ours, quack, stats = bench_single(
M=M,
diff --git a/oink/benchmarks/benchmark/benchmark_rmsnorm_sm100.py b/oink/benchmarks/benchmark/benchmark_rmsnorm_sm100.py
index 20ed8ac..f4c8a5f 100644
--- a/oink/benchmarks/benchmark/benchmark_rmsnorm_sm100.py
+++ b/oink/benchmarks/benchmark/benchmark_rmsnorm_sm100.py
@@ -85,7 +85,11 @@ def _verify_parity(
N = int(x.shape[1])
y_acc_ours = ErrorStatsAccumulator(total_elems=M * N)
- y_acc_quack = ErrorStatsAccumulator(total_elems=M * N) if quack_rmsnorm_fwd is not None else None
+ y_acc_quack = (
+ ErrorStatsAccumulator(total_elems=M * N)
+ if quack_rmsnorm_fwd is not None
+ else None
+ )
with torch.no_grad():
y_o, rstd_o, res_o = oink_rmsnorm.rmsnorm_forward(
@@ -141,15 +145,20 @@ def _verify_parity(
assert rstd_q is not None
torch.testing.assert_close(rstd_q, rstd_ref, **tol_rstd)
# Stats for rstd are cheap (M elements); compute exact p99 over all rows.
- rstd_acc_ours = ErrorStatsAccumulator(total_elems=int(rstd_ref.numel()), p99_target_samples=int(rstd_ref.numel()))
+ rstd_acc_ours = ErrorStatsAccumulator(
+ total_elems=int(rstd_ref.numel()), p99_target_samples=int(rstd_ref.numel())
+ )
rstd_acc_ours.update(rstd_o, rstd_ref)
stats.update(error_stats_to_row("ours_err_rstd", rstd_acc_ours.finalize()))
if rstd_q is not None:
rstd_acc_quack = ErrorStatsAccumulator(
- total_elems=int(rstd_ref.numel()), p99_target_samples=int(rstd_ref.numel())
+ total_elems=int(rstd_ref.numel()),
+ p99_target_samples=int(rstd_ref.numel()),
)
rstd_acc_quack.update(rstd_q, rstd_ref)
- stats.update(error_stats_to_row("quack_err_rstd", rstd_acc_quack.finalize()))
+ stats.update(
+ error_stats_to_row("quack_err_rstd", rstd_acc_quack.finalize())
+ )
# Residual output semantics differ slightly across implementations:
# - Oink returns `None` when residual is None.
# - Quack returns `x` as a safe alias in that case.
@@ -227,7 +236,9 @@ def main() -> None:
print(f"Running on {torch.cuda.get_device_name(device)} (SM{sm})")
p = argparse.ArgumentParser()
- p.add_argument("--dtype", type=str, default="bf16", choices=["fp16", "bf16", "fp32"])
+ p.add_argument(
+ "--dtype", type=str, default="bf16", choices=["fp16", "bf16", "fp32"]
+ )
p.add_argument(
"--weight-dtype",
type=str,
@@ -236,15 +247,33 @@ def main() -> None:
help="RMSNorm weight dtype. `same` matches activation dtype (vLLM-style inference).",
)
p.add_argument("--eps", type=float, default=1e-6)
- p.add_argument("--store-rstd", action="store_true", help="Also write rstd (fp32 per row)")
- p.add_argument("--iters", type=int, default=100, help="Triton do_bench rep_ms (kernel-only).")
+ p.add_argument(
+ "--store-rstd", action="store_true", help="Also write rstd (fp32 per row)"
+ )
+ p.add_argument(
+ "--iters", type=int, default=100, help="Triton do_bench rep_ms (kernel-only)."
+ )
p.add_argument("--warmup-ms", type=int, default=25)
- p.add_argument("--csv", type=str, default=None, help="Optional CSV output path; appends rows")
- p.add_argument("--json", type=str, default=None, help="Optional JSON output path (meta + rows)")
+ p.add_argument(
+ "--csv", type=str, default=None, help="Optional CSV output path; appends rows"
+ )
+ p.add_argument(
+ "--json", type=str, default=None, help="Optional JSON output path (meta + rows)"
+ )
p.add_argument("--configs", type=str, default="1024x4096,8192x4096")
- p.add_argument("--quack-suite", action="store_true", help="Run Quack-style batch/seq grid")
- p.add_argument("--dsv3", action="store_true", help="Run DSv3 set: M in {4096,16384,65536}, N in {6144,7168,8192}")
- p.add_argument("--skip-verify", action="store_true", help="Skip correctness checks (Oink/Quack vs a pure-PyTorch reference)")
+ p.add_argument(
+ "--quack-suite", action="store_true", help="Run Quack-style batch/seq grid"
+ )
+ p.add_argument(
+ "--dsv3",
+ action="store_true",
+ help="Run DSv3 set: M in {4096,16384,65536}, N in {6144,7168,8192}",
+ )
+ p.add_argument(
+ "--skip-verify",
+ action="store_true",
+ help="Skip correctness checks (Oink/Quack vs a pure-PyTorch reference)",
+ )
args = p.parse_args()
dtype = parse_dtype(args.dtype)
@@ -265,7 +294,7 @@ def main() -> None:
meta = collect_device_meta(device)
rows_out: List[Dict[str, Any]] = []
- for (M, N) in cfgs:
+ for M, N in cfgs:
print(f"bench M={M:<8d} N={N:<6d} dtype={args.dtype} ...", flush=True)
(ms_oink, gbps_oink), quack, stats = bench_single(
M=M,
diff --git a/oink/benchmarks/benchmark/benchmark_softmax_sm100.py b/oink/benchmarks/benchmark/benchmark_softmax_sm100.py
index 7826efc..995b09f 100644
--- a/oink/benchmarks/benchmark/benchmark_softmax_sm100.py
+++ b/oink/benchmarks/benchmark/benchmark_softmax_sm100.py
@@ -150,6 +150,7 @@ def bench_single(
bytes_io = bytes_io_model_softmax(M, N, dtype, mode=mode)
if mode == "fwd":
+
def fn_oink():
return oink_softmax.softmax_forward(x)
@@ -174,6 +175,7 @@ def fn_quack():
return quack_softmax_bwd(dy, y_q)
elif mode == "fwd_bwd":
+
def fn_oink():
return oink_softmax.softmax_fwd_bwd(dy, x)
@@ -208,20 +210,36 @@ def main() -> None:
print(f"Running on {torch.cuda.get_device_name(device)} (SM{sm})")
p = argparse.ArgumentParser()
- p.add_argument("--dtype", type=str, default="bf16", choices=["fp16", "bf16", "fp32"])
- p.add_argument("--mode", type=str, default="fwd_bwd", choices=["fwd", "bwd", "fwd_bwd"])
- p.add_argument("--iters", type=int, default=50, help="Triton do_bench rep_ms (kernel-only).")
+ p.add_argument(
+ "--dtype", type=str, default="bf16", choices=["fp16", "bf16", "fp32"]
+ )
+ p.add_argument(
+ "--mode", type=str, default="fwd_bwd", choices=["fwd", "bwd", "fwd_bwd"]
+ )
+ p.add_argument(
+ "--iters", type=int, default=50, help="Triton do_bench rep_ms (kernel-only)."
+ )
p.add_argument("--warmup-ms", type=int, default=25)
- p.add_argument("--csv", type=str, default=None, help="Optional CSV output path; appends rows")
- p.add_argument("--json", type=str, default=None, help="Optional JSON output path (meta + rows)")
+ p.add_argument(
+ "--csv", type=str, default=None, help="Optional CSV output path; appends rows"
+ )
+ p.add_argument(
+ "--json", type=str, default=None, help="Optional JSON output path (meta + rows)"
+ )
p.add_argument("--configs", type=str, default="1024x4096,8192x4096")
- p.add_argument("--quack-suite", action="store_true", help="Run Quack-style batch/seq grid")
+ p.add_argument(
+ "--quack-suite", action="store_true", help="Run Quack-style batch/seq grid"
+ )
p.add_argument(
"--dsv3",
action="store_true",
help="Run DSv3 set: M in {4096,16384,65536}, N in {6144,7168,8192}",
)
- p.add_argument("--skip-verify", action="store_true", help="Skip correctness checks (Oink/Quack vs PyTorch softmax)")
+ p.add_argument(
+ "--skip-verify",
+ action="store_true",
+ help="Skip correctness checks (Oink/Quack vs PyTorch softmax)",
+ )
args = p.parse_args()
dtype = parse_dtype(args.dtype)
@@ -237,8 +255,11 @@ def main() -> None:
meta = collect_device_meta(device)
rows_out: List[Dict[str, Any]] = []
- for (M, N) in cfgs:
- print(f"bench M={M:<8d} N={N:<6d} dtype={args.dtype} mode={args.mode} ...", flush=True)
+ for M, N in cfgs:
+ print(
+ f"bench M={M:<8d} N={N:<6d} dtype={args.dtype} mode={args.mode} ...",
+ flush=True,
+ )
(ms_oink, gbps_oink), quack, stats = bench_single(
M=M,
N=N,
diff --git a/oink/benchmarks/readme/plot_quack_style_svg.py b/oink/benchmarks/readme/plot_quack_style_svg.py
index c089b2b..af76832 100644
--- a/oink/benchmarks/readme/plot_quack_style_svg.py
+++ b/oink/benchmarks/readme/plot_quack_style_svg.py
@@ -78,7 +78,9 @@ def _gbps_from_row(prefix: str, row: Mapping[str, Any]) -> Optional[float]:
return None
-def _aggregate_by_shape(rows: Sequence[Mapping[str, Any]]) -> Dict[Tuple[int, int], Dict[str, float]]:
+def _aggregate_by_shape(
+ rows: Sequence[Mapping[str, Any]],
+) -> Dict[Tuple[int, int], Dict[str, float]]:
"""Aggregate duplicate (M, N) rows using median (more robust than mean)."""
buckets: dict[tuple[int, int], dict[str, list[float]]] = defaultdict(
lambda: defaultdict(list)
@@ -199,7 +201,11 @@ def _plot(
continue
ours_y.append(float(rec["ours"]))
quack_y.append(float(rec["quack"]))
- max_y = max(max_y, *(v for v in ours_y if math.isfinite(v)), *(v for v in quack_y if math.isfinite(v)))
+ max_y = max(
+ max_y,
+ *(v for v in ours_y if math.isfinite(v)),
+ *(v for v in quack_y if math.isfinite(v)),
+ )
ax.plot(
x,
@@ -337,7 +343,10 @@ def main() -> None:
description="Generate Quack-style SVG plots from KernelAgent-Oink suite JSONs."
)
p.add_argument(
- "--in-dir", type=str, required=True, help="Directory containing suite JSON outputs"
+ "--in-dir",
+ type=str,
+ required=True,
+ help="Directory containing suite JSON outputs",
)
p.add_argument(
"--suite",
@@ -362,22 +371,35 @@ def main() -> None:
"`union` includes every shape across panels (may create gaps)."
),
)
- p.add_argument("--roofline-json", type=str, default=None, help="Optional /tmp/hbm_roofline_sm100_*.json path")
+ p.add_argument(
+ "--roofline-json",
+ type=str,
+ default=None,
+ help="Optional /tmp/hbm_roofline_sm100_*.json path",
+ )
p.add_argument("--out", type=str, required=True, help="Output SVG path")
- p.add_argument("--title", type=str, default=None, help="Optional figure title override")
+ p.add_argument(
+ "--title", type=str, default=None, help="Optional figure title override"
+ )
args = p.parse_args()
in_dir = os.path.abspath(args.in_dir)
if not os.path.isdir(in_dir):
raise SystemExit(f"--in-dir is not a directory: {in_dir}")
- roofline_gbps = _read_roofline_gbps(args.roofline_json) if args.roofline_json else None
+ roofline_gbps = (
+ _read_roofline_gbps(args.roofline_json) if args.roofline_json else None
+ )
panel_files = list(_panel_files_for_suite(str(args.suite)))
if args.include_layernorm:
if args.suite != "quack_suite":
- raise SystemExit("--include-layernorm is only supported for `--suite quack_suite`.")
- panel_files.append(("LayerNorm (fwd)", _layernorm_file_for_suite(str(args.suite))))
+ raise SystemExit(
+ "--include-layernorm is only supported for `--suite quack_suite`."
+ )
+ panel_files.append(
+ ("LayerNorm (fwd)", _layernorm_file_for_suite(str(args.suite)))
+ )
panels: List[Tuple[str, Dict[Tuple[int, int], Dict[str, float]]]] = []
for panel_title, filename in panel_files:
@@ -410,7 +432,11 @@ def main() -> None:
suite_name = "DSv3 CrossEntropy"
else:
suite_name = str(args.suite)
- suffix = " (+LayerNorm)" if (args.suite == "quack_suite" and args.include_layernorm) else ""
+ suffix = (
+ " (+LayerNorm)"
+ if (args.suite == "quack_suite" and args.include_layernorm)
+ else ""
+ )
if args.suite == "dsv3_cross_entropy":
title = f"SM100 {dtype.upper()} — {suite_name}{suffix}"
else:
diff --git a/oink/benchmarks/readme/run_sm100_suite.py b/oink/benchmarks/readme/run_sm100_suite.py
index 5ac1091..c31d4b5 100644
--- a/oink/benchmarks/readme/run_sm100_suite.py
+++ b/oink/benchmarks/readme/run_sm100_suite.py
@@ -21,7 +21,9 @@ def _run(cmd: List[str], *, dry_run: bool) -> None:
def main() -> None:
p = argparse.ArgumentParser()
- p.add_argument("--dtype", type=str, default="bf16", choices=["fp16", "bf16", "fp32"])
+ p.add_argument(
+ "--dtype", type=str, default="bf16", choices=["fp16", "bf16", "fp32"]
+ )
p.add_argument(
"--out-dir",
type=str,
@@ -33,7 +35,9 @@ def main() -> None:
action="store_true",
help="Skip correctness checks (Oink/Quack vs PyTorch / pure-PyTorch references)",
)
- p.add_argument("--dry-run", action="store_true", help="Print commands without executing them")
+ p.add_argument(
+ "--dry-run", action="store_true", help="Print commands without executing them"
+ )
args = p.parse_args()
# Standardize env for standalone runs outside the vLLM plugin.
diff --git a/oink/benchmarks/readme/summarize_results.py b/oink/benchmarks/readme/summarize_results.py
index 70782dd..29b768e 100644
--- a/oink/benchmarks/readme/summarize_results.py
+++ b/oink/benchmarks/readme/summarize_results.py
@@ -95,16 +95,26 @@ def _summarize_error_stats(rows: Sequence[Dict[str, Any]]) -> str:
out_rows: List[Dict[str, Any]] = []
for pfx in prefixes:
# Per-prefix worst-case across rows.
- max_abs_vals = [float(r[pfx + "_max_abs"]) for r in rows if (pfx + "_max_abs") in r]
- p99_abs_vals = [float(r[pfx + "_p99_abs"]) for r in rows if (pfx + "_p99_abs") in r]
- rel_l2_vals = [float(r[pfx + "_rel_l2"]) for r in rows if (pfx + "_rel_l2") in r]
+ max_abs_vals = [
+ float(r[pfx + "_max_abs"]) for r in rows if (pfx + "_max_abs") in r
+ ]
+ p99_abs_vals = [
+ float(r[pfx + "_p99_abs"]) for r in rows if (pfx + "_p99_abs") in r
+ ]
+ rel_l2_vals = [
+ float(r[pfx + "_rel_l2"]) for r in rows if (pfx + "_rel_l2") in r
+ ]
if not max_abs_vals and not p99_abs_vals and not rel_l2_vals:
continue
out_rows.append(
{
"metric": pfx,
- "max_abs (max over shapes)": max(max_abs_vals) if max_abs_vals else None,
- "p99_abs (max over shapes)": max(p99_abs_vals) if p99_abs_vals else None,
+ "max_abs (max over shapes)": max(max_abs_vals)
+ if max_abs_vals
+ else None,
+ "p99_abs (max over shapes)": max(p99_abs_vals)
+ if p99_abs_vals
+ else None,
"rel_l2 (max over shapes)": max(rel_l2_vals) if rel_l2_vals else None,
}
)
@@ -112,8 +122,15 @@ def _summarize_error_stats(rows: Sequence[Dict[str, Any]]) -> str:
if not out_rows:
return ""
- cols = ["metric", "max_abs (max over shapes)", "p99_abs (max over shapes)", "rel_l2 (max over shapes)"]
- return "\n".join(["", "### Error Stats (vs PyTorch ref)", "", _md_table(out_rows, cols), ""])
+ cols = [
+ "metric",
+ "max_abs (max over shapes)",
+ "p99_abs (max over shapes)",
+ "rel_l2 (max over shapes)",
+ ]
+ return "\n".join(
+ ["", "### Error Stats (vs PyTorch ref)", "", _md_table(out_rows, cols), ""]
+ )
def summarize_one(path: str) -> str:
@@ -143,7 +160,9 @@ def summarize_one(path: str) -> str:
if method is not None:
parts.append(f"- method: `{method}`")
if meta.get("warmup_ms") is not None and meta.get("rep_ms") is not None:
- parts.append(f"- warmup_ms: `{meta.get('warmup_ms')}` | rep_ms: `{meta.get('rep_ms')}`")
+ parts.append(
+ f"- warmup_ms: `{meta.get('warmup_ms')}` | rep_ms: `{meta.get('rep_ms')}`"
+ )
if rows:
parts.append("")
@@ -153,7 +172,9 @@ def summarize_one(path: str) -> str:
gm = _geomean(speeds)
if gm is not None:
parts.append("")
- parts.append(f"- geomean speedup vs Quack: `{gm:.3f}x` (over {len(speeds)} shapes)")
+ parts.append(
+ f"- geomean speedup vs Quack: `{gm:.3f}x` (over {len(speeds)} shapes)"
+ )
err_block = _summarize_error_stats(rows)
if err_block:
@@ -167,9 +188,21 @@ def summarize_one(path: str) -> str:
def main() -> None:
- p = argparse.ArgumentParser(description="Summarize KernelAgent-Oink benchmark JSONs into Markdown tables.")
- p.add_argument("--in-dir", type=str, required=True, help="Directory containing benchmark JSON files")
- p.add_argument("--out", type=str, default=None, help="Optional output markdown path (default: stdout)")
+ p = argparse.ArgumentParser(
+ description="Summarize KernelAgent-Oink benchmark JSONs into Markdown tables."
+ )
+ p.add_argument(
+ "--in-dir",
+ type=str,
+ required=True,
+ help="Directory containing benchmark JSON files",
+ )
+ p.add_argument(
+ "--out",
+ type=str,
+ default=None,
+ help="Optional output markdown path (default: stdout)",
+ )
args = p.parse_args()
in_dir = os.path.abspath(args.in_dir)
@@ -177,7 +210,9 @@ def main() -> None:
raise SystemExit(f"--in-dir is not a directory: {in_dir}")
json_paths = sorted(
- os.path.join(in_dir, name) for name in os.listdir(in_dir) if name.endswith(".json")
+ os.path.join(in_dir, name)
+ for name in os.listdir(in_dir)
+ if name.endswith(".json")
)
if not json_paths:
raise SystemExit(f"No .json files found under: {in_dir}")
diff --git a/oink/src/kernelagent_oink/blackwell/cross_entropy.py b/oink/src/kernelagent_oink/blackwell/cross_entropy.py
index 94f052f..3e6eef1 100644
--- a/oink/src/kernelagent_oink/blackwell/cross_entropy.py
+++ b/oink/src/kernelagent_oink/blackwell/cross_entropy.py
@@ -103,9 +103,8 @@ def _convert_logits_2d(x: Tensor) -> cute.Tensor:
softmax and RMSNorm kernels.
"""
assert x.dim() == 2, "Input logits must be 2D (M, N)"
- return (
- from_dlpack(x.detach(), assumed_align=16)
- .mark_compact_shape_dynamic(mode=0, stride_order=(0, 1))
+ return from_dlpack(x.detach(), assumed_align=16).mark_compact_shape_dynamic(
+ mode=0, stride_order=(0, 1)
)
@@ -136,7 +135,11 @@ def _calculate_threads_per_row(self) -> int:
else (
16
if N <= 128
- else (32 if N <= 3072 else (64 if N <= 6144 else (128 if N <= 16384 else 256)))
+ else (
+ 32
+ if N <= 3072
+ else (64 if N <= 6144 else (128 if N <= 16384 else 256))
+ )
)
)
@@ -183,7 +186,9 @@ def __call__(
num_copy_bits = math.gcd(self.N, 128 // self.dtype.width) * self.dtype.width
tiler_mn, tv_layout = self._get_tv_layout(num_copy_bits=num_copy_bits)
num_threads = (
- cute.size(tv_layout, mode=[0]) if _KERNEL_ACCEPTS_LAYOUT_ARGS else self._get_num_threads()
+ cute.size(tv_layout, mode=[0])
+ if _KERNEL_ACCEPTS_LAYOUT_ARGS
+ else self._get_num_threads()
)
num_warps = num_threads // cute.arch.WARP_SIZE
kernel = (
@@ -267,7 +272,9 @@ def _kernel_impl(
cute.make_ordered_layout(tiler_mn, order=(1, 0)),
byte_alignment=16,
)
- reduction_buffer, mbar_ptr = self._allocate_reduction_buffer_and_mbar(smem, tv_layout)
+ reduction_buffer, mbar_ptr = self._allocate_reduction_buffer_and_mbar(
+ smem, tv_layout
+ )
# Copy setup: gmem -> smem via cp.async, 128-bit or narrower as needed.
num_copy_elems_X = tv_layout.shape[1][0]
@@ -277,7 +284,9 @@ def _kernel_impl(
gX.element_type,
num_bits_per_copy=num_copy_bits_X,
)
- thr_copy_X = cute.make_tiled_copy(copy_atom_load_X, tv_layout, tiler_mn).get_slice(tidx)
+ thr_copy_X = cute.make_tiled_copy(
+ copy_atom_load_X, tv_layout, tiler_mn
+ ).get_slice(tidx)
tXgX = thr_copy_X.partition_S(gX)
tXsX = thr_copy_X.partition_D(sX)
@@ -414,13 +423,21 @@ def _calculate_threads_per_row(self) -> int:
else (
16
if N <= 128
- else (32 if N <= 3072 else (64 if N <= 6144 else (128 if N <= 16384 else 256)))
+ else (
+ 32
+ if N <= 3072
+ else (64 if N <= 6144 else (128 if N <= 16384 else 256))
+ )
)
)
- def _get_tv_layout(self, num_copy_bits: int = 128) -> tuple[cute.Shape, cute.Layout]:
+ def _get_tv_layout(
+ self, num_copy_bits: int = 128
+ ) -> tuple[cute.Shape, cute.Layout]:
vecsize = num_copy_bits // self.dtype.width
- assert self.N % vecsize == 0, f"Input N {self.N} is not divisible by vector size {vecsize}"
+ assert self.N % vecsize == 0, (
+ f"Input N {self.N} is not divisible by vector size {vecsize}"
+ )
N = min(self.N, 16384)
num_threads = 128 if N <= 16384 else 256
threads_per_row = self._calculate_threads_per_row()
@@ -452,7 +469,9 @@ def __call__(
num_copy_bits = math.gcd(self.N, 128 // self.dtype.width) * self.dtype.width
tiler_mn, tv_layout = self._get_tv_layout(num_copy_bits=num_copy_bits)
num_threads = (
- cute.size(tv_layout, mode=[0]) if _KERNEL_ACCEPTS_LAYOUT_ARGS else self._get_num_threads()
+ cute.size(tv_layout, mode=[0])
+ if _KERNEL_ACCEPTS_LAYOUT_ARGS
+ else self._get_num_threads()
)
# Broadcast (M,) tensors along the N dimension with stride 0.
mDLoss, mTarget, mLSE = [
@@ -564,8 +583,12 @@ def _kernel_impl(
gdX.element_type,
num_bits_per_copy=num_copy_bits_X,
)
- thr_copy_X = cute.make_tiled_copy(copy_atom_load_X, tv_layout, tiler_mn).get_slice(tidx)
- thr_copy_dX = cute.make_tiled_copy(copy_atom_store_dX, tv_layout, tiler_mn).get_slice(tidx)
+ thr_copy_X = cute.make_tiled_copy(
+ copy_atom_load_X, tv_layout, tiler_mn
+ ).get_slice(tidx)
+ thr_copy_dX = cute.make_tiled_copy(
+ copy_atom_store_dX, tv_layout, tiler_mn
+ ).get_slice(tidx)
tXgX = thr_copy_X.partition_S(gX)
tXsX = thr_copy_X.partition_D(sX)
@@ -898,8 +921,14 @@ def _cross_entropy_forward_ptr_into(
assert logits.is_cuda and logits.dim() == 2
assert target.is_cuda and target.dim() == 1 and target.shape[0] == logits.shape[0]
assert target.dtype is torch.int64
- assert loss.is_cuda and loss.shape == (logits.shape[0],) and loss.dtype is torch.float32
- assert lse.is_cuda and lse.shape == (logits.shape[0],) and lse.dtype is torch.float32
+ assert (
+ loss.is_cuda
+ and loss.shape == (logits.shape[0],)
+ and loss.dtype is torch.float32
+ )
+ assert (
+ lse.is_cuda and lse.shape == (logits.shape[0],) and lse.dtype is torch.float32
+ )
M, N = logits.shape
device_index = logits.get_device()
@@ -991,10 +1020,18 @@ def _cross_entropy_backward_ptr_into(
assert logits.is_cuda and logits.dim() == 2
assert target.is_cuda and target.dim() == 1 and target.shape[0] == logits.shape[0]
assert target.dtype is torch.int64
- assert dloss.is_cuda and dloss.shape == (logits.shape[0],) and dloss.dtype is torch.float32
- assert lse.is_cuda and lse.shape == (logits.shape[0],) and lse.dtype is torch.float32
+ assert (
+ dloss.is_cuda
+ and dloss.shape == (logits.shape[0],)
+ and dloss.dtype is torch.float32
+ )
+ assert (
+ lse.is_cuda and lse.shape == (logits.shape[0],) and lse.dtype is torch.float32
+ )
assert dx.is_cuda and dx.shape == logits.shape and dx.dtype == logits.dtype
- assert dx.stride() == logits.stride(), "Pointer path expects dx to match logits strides"
+ assert dx.stride() == logits.stride(), (
+ "Pointer path expects dx to match logits strides"
+ )
M, N = logits.shape
device_index = logits.get_device()
@@ -1060,7 +1097,9 @@ def _cross_entropy_backward_ptr_into(
mem_space=rt.AddressSpace.gmem,
assumed_align=4,
)
- ptr_dx = rt.make_ptr(dtype_x, dx.data_ptr(), mem_space=rt.AddressSpace.gmem, assumed_align=16)
+ ptr_dx = rt.make_ptr(
+ dtype_x, dx.data_ptr(), mem_space=rt.AddressSpace.gmem, assumed_align=16
+ )
ptr_lse = rt.make_ptr(
cutlass.Float32,
lse.data_ptr(),
@@ -1169,7 +1208,9 @@ def verify_cross_entropy_parity(
mask = torch.rand(M, device=device) < 0.1
target[mask] = ignore_index
- loss, lse = cross_entropy_forward(logits, target, ignore_index=ignore_index, reduction="none")
+ loss, lse = cross_entropy_forward(
+ logits, target, ignore_index=ignore_index, reduction="none"
+ )
logits_ref = logits.detach().clone().requires_grad_()
target_ref = target.detach().clone()
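The `cross_entropy.py` hunks above reflow asserts and calls around `cross_entropy_forward(..., reduction="none")`, which returns per-row `loss` and `lse`, plus a parity check against a PyTorch reference. A minimal dense sketch of the conventional per-row quantities, assuming ignored rows contribute zero loss (the kernel's exact `ignore_index`/`lse` convention is defined in the full file, not in these hunks):

```python
import torch

def cross_entropy_forward_ref(logits: torch.Tensor,
                              target: torch.Tensor,
                              ignore_index: int = -100):
    # Per-row loss = logsumexp(logits) - logits[target]; zero for ignored rows.
    lse = torch.logsumexp(logits.float(), dim=-1)                # (M,)
    rows = torch.arange(logits.shape[0], device=logits.device)
    picked = logits.float()[rows, target.clamp_min(0)]           # (M,)
    loss = lse - picked
    loss = torch.where(target == ignore_index, torch.zeros_like(loss), loss)
    return loss, lse
```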
diff --git a/oink/src/kernelagent_oink/blackwell/layernorm.py b/oink/src/kernelagent_oink/blackwell/layernorm.py
index 0e4d640..67f67ce 100644
--- a/oink/src/kernelagent_oink/blackwell/layernorm.py
+++ b/oink/src/kernelagent_oink/blackwell/layernorm.py
@@ -138,7 +138,11 @@ def _calculate_threads_per_row(self) -> int:
else (
16
if N <= 128
- else (32 if N <= 3072 else (64 if N <= 6144 else (128 if N <= 16384 else 256)))
+ else (
+ 32
+ if N <= 3072
+ else (64 if N <= 6144 else (128 if N <= 16384 else 256))
+ )
)
)
@@ -186,7 +190,9 @@ def __call__(
self._set_cluster_n()
tiler_mn, tv_layout = self._get_tv_layout()
num_threads = (
- cute.size(tv_layout, mode=[0]) if _KERNEL_ACCEPTS_LAYOUT_ARGS else self._get_num_threads()
+ cute.size(tv_layout, mode=[0])
+ if _KERNEL_ACCEPTS_LAYOUT_ARGS
+ else self._get_num_threads()
)
num_warps = num_threads // cute.arch.WARP_SIZE
@@ -281,7 +287,9 @@ def launch_from_ptrs(
mX = cute.make_tensor(ptr_x, layout_mn)
mO = cute.make_tensor(ptr_out, layout_mn)
mW = cute.make_tensor(ptr_w, layout_n)
- mB = cute.make_tensor(ptr_b, layout_n) if const_expr(ptr_b is not None) else None
+ mB = (
+ cute.make_tensor(ptr_b, layout_n) if const_expr(ptr_b is not None) else None
+ )
mRstd = (
cute.make_tensor(ptr_rstd, layout_m)
if const_expr(ptr_rstd is not None)
@@ -323,15 +331,15 @@ def _kernel_impl(
cute.make_ordered_layout(tiler_mn, order=(1, 0)),
byte_alignment=16,
)
- reduction_buffer, mbar_ptr = self._allocate_reduction_buffer_and_mbar(smem, tv_layout)
+ reduction_buffer, mbar_ptr = self._allocate_reduction_buffer_and_mbar(
+ smem, tv_layout
+ )
shape = mX.shape
idX = cute.make_identity_tensor(shape)
# Slice for CTAs: use domain_offset_i64 to handle >2^31 elements.
- mX, mO = [
- domain_offset_i64((bidx * tiler_mn[0], 0), mT) for mT in (mX, mO)
- ]
+ mX, mO = [domain_offset_i64((bidx * tiler_mn[0], 0), mT) for mT in (mX, mO)]
gX, gO = [cute.local_tile(mT, tiler_mn, (0, cluster_y)) for mT in (mX, mO)]
cX = cute.local_tile(idX, tiler_mn, (bidx, cluster_y))
gW = cute.local_tile(mW, tiler_mn, (0, cluster_y))
@@ -390,39 +398,23 @@ def _kernel_impl(
).get_slice(tidx)
tWgW = thr_copy_WB.partition_S(gW)
- tBgB = (
- thr_copy_WB.partition_S(gB)
- if const_expr(gB is not None)
- else None
- )
+ tBgB = thr_copy_WB.partition_S(gB) if const_expr(gB is not None) else None
tXgX = thr_copy_X.partition_S(gX)
tXsX = thr_copy_X.partition_D(sX)
tXgO = thr_copy_O.partition_D(gO)
tXrRstd = (
- thr_copy_O.partition_D(gRstd)
- if const_expr(mRstd is not None)
- else None
+ thr_copy_O.partition_D(gRstd) if const_expr(mRstd is not None) else None
)
tXrMean = (
- thr_copy_O.partition_D(gMean)
- if const_expr(mMean is not None)
- else None
+ thr_copy_O.partition_D(gMean) if const_expr(mMean is not None) else None
)
tXcX = thr_copy_X.partition_S(cX)[(0, None), None, None]
# Fragments for gmem->rmem.
tWrW = cute.make_fragment_like(tWgW)
- tBrB = (
- cute.make_fragment_like(tBgB)
- if const_expr(mB is not None)
- else None
- )
+ tBrB = cute.make_fragment_like(tBgB) if const_expr(mB is not None) else None
tXrW = thr_copy_X.retile(tWrW)
- tXrB = (
- thr_copy_X.retile(tBrB)
- if const_expr(mB is not None)
- else None
- )
+ tXrB = thr_copy_X.retile(tBrB) if const_expr(mB is not None) else None
tXrX, tXrO = [cute.make_fragment_like(thr) for thr in (tXgX, tXgO)]
num_warps = cute.size(tv_layout, mode=[0]) // cute.arch.WARP_SIZE
@@ -458,9 +450,7 @@ def _kernel_impl(
mbar_ptr + 0 if const_expr(self.cluster_n > 1) else None,
init_val=0.0,
hook_fn=(
- cute.arch.cluster_wait
- if const_expr(self.cluster_n > 1)
- else None
+ cute.arch.cluster_wait if const_expr(self.cluster_n > 1) else None
),
)
mean = sum_x / shape[1]
@@ -486,10 +476,7 @@ def _kernel_impl(
if (
tXcX[0][1] == 0
and row < shape[0]
- and (
- self.cluster_n == 1
- or cute.arch.block_idx_in_cluster() == 0
- )
+ and (self.cluster_n == 1 or cute.arch.block_idx_in_cluster() == 0)
):
tXrRstd[0] = rstd
@@ -497,10 +484,7 @@ def _kernel_impl(
if (
tXcX[0][1] == 0
and row < shape[0]
- and (
- self.cluster_n == 1
- or cute.arch.block_idx_in_cluster() == 0
- )
+ and (self.cluster_n == 1 or cute.arch.block_idx_in_cluster() == 0)
):
tXrMean[0] = mean
@@ -861,7 +845,9 @@ def _layernorm_forward_ptr_into(
)
_PTR_COMPILE_CACHE[key] = compiled
- ptr_x = rt.make_ptr(dtype_x, x.data_ptr(), mem_space=rt.AddressSpace.gmem, assumed_align=16)
+ ptr_x = rt.make_ptr(
+ dtype_x, x.data_ptr(), mem_space=rt.AddressSpace.gmem, assumed_align=16
+ )
ptr_out = rt.make_ptr(
dtype_x, out.data_ptr(), mem_space=rt.AddressSpace.gmem, assumed_align=16
)
@@ -978,8 +964,12 @@ def _layernorm_backward_dx_kernel(
smem = cutlass.utils.SmemAllocator()
num_warps = const_expr(block_threads // cute.arch.WARP_SIZE)
warp_sums_layout = cute.make_layout((num_warps,), stride=(1,))
- warp_sums_wdy = smem.allocate_tensor(Float32, warp_sums_layout, byte_alignment=4)
- warp_sums_xhatwdy = smem.allocate_tensor(Float32, warp_sums_layout, byte_alignment=4)
+ warp_sums_wdy = smem.allocate_tensor(
+ Float32, warp_sums_layout, byte_alignment=4
+ )
+ warp_sums_xhatwdy = smem.allocate_tensor(
+ Float32, warp_sums_layout, byte_alignment=4
+ )
lane = cute.arch.lane_idx()
warp_idx = cute.arch.warp_idx()
@@ -1177,8 +1167,12 @@ def _layernorm_backward_dx_sm100(
alignment=16,
divisibility=128 // cutlass.Float32.width,
)
- mRstd = from_dlpack(rstd_1d.detach(), assumed_align=4).mark_layout_dynamic(leading_dim=0)
- mMean = from_dlpack(mean_1d.detach(), assumed_align=4).mark_layout_dynamic(leading_dim=0)
+ mRstd = from_dlpack(rstd_1d.detach(), assumed_align=4).mark_layout_dynamic(
+ leading_dim=0
+ )
+ mMean = from_dlpack(mean_1d.detach(), assumed_align=4).mark_layout_dynamic(
+ leading_dim=0
+ )
stream = cuda.CUstream(torch.cuda.current_stream().cuda_stream)
key = (N, dtype)
@@ -1231,8 +1225,12 @@ def _layernorm_backward_params_sm100(
mX = _convert_row_major(x_2d)
mdO = _convert_row_major(dout_2d)
- mRstd = from_dlpack(rstd_1d.detach(), assumed_align=4).mark_layout_dynamic(leading_dim=0)
- mMean = from_dlpack(mean_1d.detach(), assumed_align=4).mark_layout_dynamic(leading_dim=0)
+ mRstd = from_dlpack(rstd_1d.detach(), assumed_align=4).mark_layout_dynamic(
+ leading_dim=0
+ )
+ mMean = from_dlpack(mean_1d.detach(), assumed_align=4).mark_layout_dynamic(
+ leading_dim=0
+ )
mdW_partial = (
from_dlpack(dw_partial, assumed_align=16).mark_compact_shape_dynamic(mode=0)
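The `layernorm.py` backward hunks allocate `warp_sums_wdy` and `warp_sums_xhatwdy`, the two row reductions required by the standard LayerNorm input-gradient identity. A minimal dense sketch of that identity (reference only; the per-SM partial dW/dB accumulation handled by `_layernorm_backward_params_sm100` is a separate path):

```python
import torch

def layernorm_backward_dx_ref(x, dy, w, mean, rstd):
    # dx = rstd * (w*dy - mean(w*dy) - xhat * mean(xhat * w*dy)), row-wise.
    # The two means correspond to the warp_sums_wdy / warp_sums_xhatwdy buffers.
    xhat = (x.float() - mean[:, None]) * rstd[:, None]
    wdy = w.float() * dy.float()
    c1 = wdy.mean(dim=-1, keepdim=True)
    c2 = (xhat * wdy).mean(dim=-1, keepdim=True)
    dx = (wdy - c1 - xhat * c2) * rstd[:, None]
    return dx.to(x.dtype)
```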
diff --git a/oink/src/kernelagent_oink/blackwell/lite_quack.py b/oink/src/kernelagent_oink/blackwell/lite_quack.py
index 590d773..e8ce93a 100644
--- a/oink/src/kernelagent_oink/blackwell/lite_quack.py
+++ b/oink/src/kernelagent_oink/blackwell/lite_quack.py
@@ -85,6 +85,7 @@ def _cutlass_dsl_version() -> Optional[tuple[int, int, int]]:
# Tensor conversion helpers (from quack.utils)
# -------------------------
+
def convert_from_dlpack(
x: Tensor,
leading_dim: int,
@@ -108,7 +109,9 @@ def convert_from_dlpack(
@dsl_user_op
-def elem_pointer(x: cute.Tensor, coord: cute.Coord, *, loc=None, ip=None) -> cute.Pointer:
+def elem_pointer(
+ x: cute.Tensor, coord: cute.Coord, *, loc=None, ip=None
+) -> cute.Pointer:
return x.iterator + cute.crd2idx(coord, x.layout, loc=loc, ip=ip)
@@ -159,7 +162,9 @@ def store_shared_remote(
).ir_value()
if const_expr(isinstance(val, float)):
val = Float32(val)
- assert isinstance(val, (Float32, Int32, cutlass.Int64)), "val must be Float32, Int32, or Int64"
+ assert isinstance(val, (Float32, Int32, cutlass.Int64)), (
+ "val must be Float32, Int32, or Int64"
+ )
suffix = {Float32: "f32", Int32: "s32", cutlass.Int64: "s64"}[type(val)]
constraint = {Float32: "f", Int32: "r", cutlass.Int64: "l"}[type(val)]
llvm.inline_asm(
@@ -178,19 +183,27 @@ def predicate_k(tAcA: cute.Tensor, limit: cutlass.Int32) -> cute.Tensor:
# Only compute predicates for the "k" dimension. For the mn dimension, we will use "if".
tApA = cute.make_fragment(
cute.make_layout(
- (cute.size(tAcA, mode=[0, 1]), cute.size(tAcA, mode=[1]), cute.size(tAcA, mode=[2])),
+ (
+ cute.size(tAcA, mode=[0, 1]),
+ cute.size(tAcA, mode=[1]),
+ cute.size(tAcA, mode=[2]),
+ ),
stride=(cute.size(tAcA, mode=[2]), 0, 1),
),
cutlass.Boolean,
)
for rest_v in cutlass.range_constexpr(tApA.shape[0]):
for rest_k in cutlass.range_constexpr(tApA.shape[2]):
- tApA[rest_v, 0, rest_k] = cute.elem_less(tAcA[(0, rest_v), 0, rest_k][1], limit)
+ tApA[rest_v, 0, rest_k] = cute.elem_less(
+ tAcA[(0, rest_v), 0, rest_k][1], limit
+ )
return tApA
@dsl_user_op
-def domain_offset_i64(coord: cute.Coord, tensor: cute.Tensor, *, loc=None, ip=None) -> cute.Tensor:
+def domain_offset_i64(
+ coord: cute.Coord, tensor: cute.Tensor, *, loc=None, ip=None
+) -> cute.Tensor:
flat_coord_i64 = tuple(cutlass.Int64(c) for c in cute.flatten(coord))
flat_stride = cute.flatten_to_tuple(tensor.stride)
assert len(flat_coord_i64) == len(flat_stride), (
@@ -228,7 +241,9 @@ def coord_offset_i64(
@cute.jit
-def fill_oob(tXsX: cute.Tensor, tXpX: Optional[cute.Tensor], fill_value: cutlass.Numeric) -> None:
+def fill_oob(
+ tXsX: cute.Tensor, tXpX: Optional[cute.Tensor], fill_value: cutlass.Numeric
+) -> None:
"""Fill out-of-bounds values in shared memory tensor."""
tXrX_fill = cute.make_fragment_like(tXsX[(None, 0), None, 0])
tXrX_fill.fill(fill_value)
@@ -256,7 +271,9 @@ def f32x2_to_i64(a: Float32, b: Float32, *, loc=None, ip=None) -> cutlass.Int64:
)
vec_i64x1 = vector.bitcast(T.vector(1, T.i64()), vec_f32x2, loc=loc, ip=ip)
res = cutlass.Int64(
- vector.extract(vec_i64x1, dynamic_position=[], static_position=[0], loc=loc, ip=ip)
+ vector.extract(
+ vec_i64x1, dynamic_position=[], static_position=[0], loc=loc, ip=ip
+ )
)
return res
@@ -272,10 +289,14 @@ def i64_to_f32x2(c: cutlass.Int64, *, loc=None, ip=None) -> Tuple[Float32, Float
)
vec_f32x2 = vector.bitcast(T.vector(2, T.f32()), vec_i64x1, loc=loc, ip=ip)
res0 = Float32(
- vector.extract(vec_f32x2, dynamic_position=[], static_position=[0], loc=loc, ip=ip)
+ vector.extract(
+ vec_f32x2, dynamic_position=[], static_position=[0], loc=loc, ip=ip
+ )
)
res1 = Float32(
- vector.extract(vec_f32x2, dynamic_position=[], static_position=[1], loc=loc, ip=ip)
+ vector.extract(
+ vec_f32x2, dynamic_position=[], static_position=[1], loc=loc, ip=ip
+ )
)
return res0, res1
@@ -372,7 +393,9 @@ def block_or_cluster_reduce(
"""Perform either block or cluster reduction based on whether mbar_ptr is provided."""
if cutlass.const_expr(mbar_ptr is None):
return block_reduce(val, op, reduction_buffer, init_val=init_val)
- return cluster_reduce(val, op, reduction_buffer, mbar_ptr, init_val=init_val, phase=phase)
+ return cluster_reduce(
+ val, op, reduction_buffer, mbar_ptr, init_val=init_val, phase=phase
+ )
@cute.jit
@@ -393,7 +416,9 @@ def row_reduce(
val = x
warp_op = {
cute.ReductionOp.ADD: operator.add,
- cute.ReductionOp.MAX: cute.arch.fmax if cutlass.const_expr(x.dtype == Float32) else max,
+ cute.ReductionOp.MAX: cute.arch.fmax
+ if cutlass.const_expr(x.dtype == Float32)
+ else max,
cute.ReductionOp.MIN: min,
cute.ReductionOp.MUL: operator.mul,
}[op]
@@ -521,7 +546,9 @@ def online_softmax_reduce(
reduction_buffer[row_idx, lane_idx]
)
max_x_final = warp_reduce(max_x_single_warp, cute.arch.fmax)
- sum_exp_x *= cute.math.exp(max_x_single_warp - max_x_final, fastmath=True)
+ sum_exp_x *= cute.math.exp(
+ max_x_single_warp - max_x_final, fastmath=True
+ )
sum_exp_x = warp_reduce(sum_exp_x, operator.add)
if cutlass.const_expr(return_exp_x):
exp_x *= cute.math.exp(max_x - max_x_final, fastmath=True)
@@ -533,16 +560,23 @@ def online_softmax_reduce(
num_warps = rows_per_block * warps_per_row
cute.arch.mbarrier_arrive_and_expect_tx(
mbar_ptr,
- num_warps * cluster_n * reduction_buffer.element_type.width // 8,
+ num_warps
+ * cluster_n
+ * reduction_buffer.element_type.width
+ // 8,
)
if lane_idx < cluster_n:
store_shared_remote(
f32x2_to_i64(max_x, sum_exp_x),
- elem_pointer(reduction_buffer, (row_idx, (col_idx, cta_rank_in_cluster))),
+ elem_pointer(
+ reduction_buffer, (row_idx, (col_idx, cta_rank_in_cluster))
+ ),
mbar_ptr,
peer_cta_rank_in_cluster=lane_idx,
)
- cute.arch.mbarrier_wait(mbar_ptr, phase=phase if phase is not None else 0)
+ cute.arch.mbarrier_wait(
+ mbar_ptr, phase=phase if phase is not None else 0
+ )
num_iter = cute.ceil_div(warps_per_row * cluster_n, cute.arch.WARP_SIZE)
max_x_single_warp = cute.make_fragment(num_iter, Float32)
max_x_single_warp.fill(-Float32.inf)
@@ -591,7 +625,9 @@ def get_copy_atom(
num_copy_bits = const_expr(min(128, num_copy_elems * dtype.width))
copy_op = cpasync.CopyG2SOp() if is_async else cute.nvgpu.CopyUniversalOp()
- return cute.make_copy_atom(copy_op, dtype, num_bits_per_copy=num_copy_bits, loc=loc, ip=ip)
+ return cute.make_copy_atom(
+ copy_op, dtype, num_bits_per_copy=num_copy_bits, loc=loc, ip=ip
+ )
@dsl_user_op
@@ -606,7 +642,9 @@ def copy(
ip=None,
**kwargs,
) -> None:
- copy_atom = get_copy_atom(src.element_type, num_copy_elems, is_async, loc=loc, ip=ip)
+ copy_atom = get_copy_atom(
+ src.element_type, num_copy_elems, is_async, loc=loc, ip=ip
+ )
cute.copy(copy_atom, src, dst, pred=pred, loc=loc, ip=ip, **kwargs)
@@ -637,15 +675,21 @@ def _set_cluster_n(self) -> None:
def _get_num_threads(self) -> int:
return 128 if self.N <= 16384 else 256
- def _get_tv_layout(self, num_copy_bits: int = 128) -> Tuple[cute.Shape, cute.Layout]:
+ def _get_tv_layout(
+ self, num_copy_bits: int = 128
+ ) -> Tuple[cute.Shape, cute.Layout]:
vecsize = num_copy_bits // self.dtype.width
- assert self.N % vecsize == 0, f"Input N {self.N} is not divisible by vector size {vecsize}"
+ assert self.N % vecsize == 0, (
+ f"Input N {self.N} is not divisible by vector size {vecsize}"
+ )
num_threads = self._get_num_threads()
assert num_threads % cute.arch.WARP_SIZE == 0
threads_per_row = self._calculate_threads_per_row()
self._set_cluster_n()
- num_blocks_N = cute.ceil_div(self.N // vecsize, threads_per_row * self.cluster_n)
+ num_blocks_N = cute.ceil_div(
+ self.N // vecsize, threads_per_row * self.cluster_n
+ )
cols_per_block = num_threads // threads_per_row
tiler_mn = (cols_per_block, vecsize * num_blocks_N * threads_per_row)
tv_layout = cute.make_layout(
@@ -660,11 +704,16 @@ def _get_tv_layout(self, num_copy_bits: int = 128) -> Tuple[cute.Shape, cute.Lay
def _smem_size_in_bytes(self, tiler_mn, num_warps: int) -> int:
return (
cute.size_in_bytes(self.dtype, cute.make_layout(tiler_mn))
- + self.stage * num_warps * self.cluster_n * (self.reduction_dtype.width // 8)
+ + self.stage
+ * num_warps
+ * self.cluster_n
+ * (self.reduction_dtype.width // 8)
+ self.stage * (cutlass.Int64.width // 8)
)
- def _get_reduction_buffer_layout(self, tv_layout: cute.Layout, cluster_n: int) -> cute.Layout:
+ def _get_reduction_buffer_layout(
+ self, tv_layout: cute.Layout, cluster_n: int
+ ) -> cute.Layout:
num_warps = cute.size(tv_layout, mode=[0]) // cute.arch.WARP_SIZE
warps_per_row = max(tv_layout.shape[0][0] // cute.arch.WARP_SIZE, 1)
return cute.make_ordered_layout(
@@ -723,7 +772,9 @@ def __init__(self, dtype: cutlass.Numeric, N: int):
super().__init__(dtype, N, stage=2, reduction_dtype=Float32)
self.reload_wdy = None if N <= 16 * 1024 else "smem"
if self.N > 128 * 1024 and self.dtype.width >= 32:
- raise ValueError("RMSNormBackward does not support N > 128k with dtype >= 32 bits")
+ raise ValueError(
+ "RMSNormBackward does not support N > 128k with dtype >= 32 bits"
+ )
def _get_num_threads(self) -> int:
return 128 if self.N <= 4096 else 256
@@ -736,7 +787,11 @@ def _calculate_threads_per_row(self) -> int:
else (
16
if N <= 128
- else (32 if N <= 256 else (64 if N <= 512 else (128 if N <= 4096 else 256)))
+ else (
+ 32
+ if N <= 256
+ else (64 if N <= 512 else (128 if N <= 4096 else 256))
+ )
)
)
@@ -745,7 +800,11 @@ def _set_cluster_n(self) -> None:
cluster_n = (
1
if N <= 8 * 1024
- else (2 if N <= 16 * 1024 else (4 if N <= 32 * 1024 else (8 if N <= 64 * 1024 else 16)))
+ else (
+ 2
+ if N <= 16 * 1024
+ else (4 if N <= 32 * 1024 else (8 if N <= 64 * 1024 else 16))
+ )
)
self.cluster_n = cluster_n
@@ -755,7 +814,10 @@ def _smem_size_in_bytes(self, tiler_mn, num_warps: int, do_dtype=None) -> int:
return (
cute.size_in_bytes(self.dtype, cute.make_layout(tiler_mn)) * 2
+ cute.size_in_bytes(do_dtype, cute.make_layout(tiler_mn)) * 2
- + self.stage * num_warps * self.cluster_n * (self.reduction_dtype.width // 8)
+ + self.stage
+ * num_warps
+ * self.cluster_n
+ * (self.reduction_dtype.width // 8)
+ self.stage * (cutlass.Int64.width // 8) * 2
)
@@ -783,7 +845,9 @@ def new_stride(t):
)
mX, mdO, mdResO, mdX, mdRes = [
- cute.make_tensor(t.iterator, cute.make_layout(semistatic_shape, stride=new_stride(t)))
+ cute.make_tensor(
+ t.iterator, cute.make_layout(semistatic_shape, stride=new_stride(t))
+ )
if const_expr(t is not None)
else None
for t in (mX, mdO, mdResO, mdX, mdRes)
@@ -802,7 +866,9 @@ def new_stride(t):
num_copy_bits=128 // largest_dtype_width * mX.element_type.width
)
num_threads = (
- cute.size(tv_layout, mode=[0]) if _KERNEL_ACCEPTS_LAYOUT_ARGS else self._get_num_threads()
+ cute.size(tv_layout, mode=[0])
+ if _KERNEL_ACCEPTS_LAYOUT_ARGS
+ else self._get_num_threads()
)
num_warps = num_threads // cute.arch.WARP_SIZE
if const_expr(mW is not None):
@@ -814,7 +880,9 @@ def new_stride(t):
num_blocks = sm_count
kernel = (
- self.kernel(mX, mW, mdO, mdResO, mRstd, mdX, mdW, mdB, mdRes, tv_layout, tiler_mn)
+ self.kernel(
+ mX, mW, mdO, mdResO, mRstd, mdX, mdW, mdB, mdRes, tv_layout, tiler_mn
+ )
if _KERNEL_ACCEPTS_LAYOUT_ARGS
else self.kernel(mX, mW, mdO, mdResO, mRstd, mdX, mdW, mdB, mdRes)
)
@@ -822,7 +890,9 @@ def new_stride(t):
grid=[num_blocks, self.cluster_n, 1],
block=[num_threads, 1, 1],
cluster=[1, self.cluster_n, 1] if self.cluster_n > 1 else None,
- smem=self._smem_size_in_bytes(tiler_mn, num_warps, do_dtype=mdO.element_type),
+ smem=self._smem_size_in_bytes(
+ tiler_mn, num_warps, do_dtype=mdO.element_type
+ ),
stream=stream,
)
@@ -856,7 +926,9 @@ def _kernel_impl(
idX = cute.make_identity_tensor(shape)
smem = cutlass.utils.SmemAllocator()
- smem_layout = cute.make_ordered_layout((tiler_mn[0], tiler_mn[1], 2), order=(1, 0, 2))
+ smem_layout = cute.make_ordered_layout(
+ (tiler_mn[0], tiler_mn[1], 2), order=(1, 0, 2)
+ )
sX = smem.allocate_tensor(mX.element_type, smem_layout, byte_alignment=16)
sdO = smem.allocate_tensor(mdO.element_type, smem_layout, byte_alignment=16)
reduction_buffer, mbar_ptr = self._allocate_reduction_buffer_and_mbar(
@@ -870,8 +942,12 @@ def _kernel_impl(
mbar_full_ptr, mbar_empty_ptr = None, None
num_copy_elems_X = tv_layout.shape[1][0]
- copy_atom_load_X = get_copy_atom(mX.element_type, num_copy_elems_X, is_async=False)
- thr_copy_X = cute.make_tiled_copy(copy_atom_load_X, tv_layout, tiler_mn).get_slice(tidx)
+ copy_atom_load_X = get_copy_atom(
+ mX.element_type, num_copy_elems_X, is_async=False
+ )
+ thr_copy_X = cute.make_tiled_copy(
+ copy_atom_load_X, tv_layout, tiler_mn
+ ).get_slice(tidx)
copy_fn = partial(copy, num_copy_elems=num_copy_elems_X)
gX, gdO, gdResO, gdX, gdRes, cX = [
@@ -898,7 +974,8 @@ def _kernel_impl(
tXcX = thr_copy_X.partition_S(cX)[(0, None), None, None, None]
tXrX, tXrdO, tXrdX = [
- cute.make_fragment_like(thr[None, None, None, 0]) for thr in (tXgX, tXgdO, tXgdX)
+ cute.make_fragment_like(thr[None, None, None, 0])
+ for thr in (tXgX, tXgdO, tXgdX)
]
tXrdResO = None
if const_expr(mdResO is not None):
@@ -959,10 +1036,24 @@ def _kernel_impl(
for bidx in cutlass.range(bidx_start, cute.ceil_div(M, tiler_mn[0]), gdim):
row = tXcX[None, None, None, bidx][0][0]
if row + gdim * tiler_mn[0] < M:
- tXgX_cur = coord_offset_i64(bidx + gdim, tXgX, dim=3)[None, None, None, 0]
- tXgdO_cur = coord_offset_i64(bidx + gdim, tXgdO, dim=3)[None, None, None, 0]
- copy_fn(tXgX_cur, tXsX[None, None, None, stage ^ 1], pred=tXpX, is_async=True)
- copy_fn(tXgdO_cur, tXsdO[None, None, None, stage ^ 1], pred=tXpX, is_async=True)
+ tXgX_cur = coord_offset_i64(bidx + gdim, tXgX, dim=3)[
+ None, None, None, 0
+ ]
+ tXgdO_cur = coord_offset_i64(bidx + gdim, tXgdO, dim=3)[
+ None, None, None, 0
+ ]
+ copy_fn(
+ tXgX_cur,
+ tXsX[None, None, None, stage ^ 1],
+ pred=tXpX,
+ is_async=True,
+ )
+ copy_fn(
+ tXgdO_cur,
+ tXsdO[None, None, None, stage ^ 1],
+ pred=tXpX,
+ is_async=True,
+ )
elif tiler_mn[0] > 1:
fill_oob(
tXsX[None, None, None, stage ^ 1],
@@ -979,7 +1070,9 @@ def _kernel_impl(
if row < M or tiler_mn[0] == 1:
rstd_val = mRstd[row]
if const_expr(mdResO is not None):
- tXgdResO_cur = coord_offset_i64(bidx, tXgdResO, dim=3)[None, None, None, 0]
+ tXgdResO_cur = coord_offset_i64(bidx, tXgdResO, dim=3)[
+ None, None, None, 0
+ ]
if row < M or tiler_mn[0] == 1:
copy_fn(tXgdResO_cur, tXrdResO, pred=tXpX)
elif tiler_mn[0] > 1:
@@ -1036,7 +1129,9 @@ def _kernel_impl(
copy_fn(tXrdX, tXgdX_cur, pred=tXpX)
if const_expr(mdRes is not None):
tXrdRes.store(dx.to(tXrdRes.element_type))
- tXgdRes_cur = coord_offset_i64(bidx, tXgdRes, dim=3)[None, None, None, 0]
+ tXgdRes_cur = coord_offset_i64(bidx, tXgdRes, dim=3)[
+ None, None, None, 0
+ ]
if row < M or tiler_mn[0] == 1:
copy_fn(tXrdRes, tXgdRes_cur, pred=tXpX)
if const_expr(mdW is not None):
@@ -1204,7 +1299,9 @@ def get_sm_count(
num_sms = props.multi_processor_count
sm_count_multiple = (
- 16 if N <= 256 else (8 if N <= 1024 else (4 if N <= 2048 else (2 if N <= 4096 else 1)))
+ 16
+ if N <= 256
+ else (8 if N <= 1024 else (4 if N <= 2048 else (2 if N <= 4096 else 1)))
)
sm_count = num_sms
if N <= 8192:
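The `_smem_size_in_bytes` helpers reformatted above in `lite_quack.py` sum three terms: the data tile(s), one reduction slot per (warp, cluster CTA) per stage, and one 8-byte mbarrier per stage. A small standalone sketch of the same arithmetic (helper name hypothetical), with an illustrative bf16 tile:

```python
def smem_bytes(tile_elems: int, dtype_bytes: int, stage: int,
               num_warps: int, cluster_n: int, red_dtype_bytes: int = 4) -> int:
    # tile storage + per-(warp, cluster CTA) fp32 reduction slots per stage
    # + one Int64 mbarrier per stage, mirroring the forward-path formula above.
    return (tile_elems * dtype_bytes
            + stage * num_warps * cluster_n * red_dtype_bytes
            + stage * 8)

# Illustrative: a (4, 4096) bf16 tile, 1 stage, 4 warps, no cluster.
print(smem_bytes(4 * 4096, 2, stage=1, num_warps=4, cluster_n=1))  # 32792
```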
diff --git a/oink/src/kernelagent_oink/blackwell/rmsnorm.py b/oink/src/kernelagent_oink/blackwell/rmsnorm.py
index 9df9f16..e921947 100644
--- a/oink/src/kernelagent_oink/blackwell/rmsnorm.py
+++ b/oink/src/kernelagent_oink/blackwell/rmsnorm.py
@@ -121,7 +121,9 @@ def _env_flag(name: str, default: bool) -> bool:
# - If you want to force stage-2 even when the pointer path is available (for
# experimentation / A-B testing), set this env var **before** importing this
# module.
-_FORCE_RMSNORM_STAGE2_FWD = _env_flag("KERNELAGENT_OINK_FORCE_RMSNORM_STAGE2", default=False)
+_FORCE_RMSNORM_STAGE2_FWD = _env_flag(
+ "KERNELAGENT_OINK_FORCE_RMSNORM_STAGE2", default=False
+)
# CuTeDSL stability probe for the experimental cluster_n>1 + direct-GMEM schedule.
#
@@ -2771,7 +2773,9 @@ def rmsnorm_forward(
# Preserve stride contracts for torch.compile consistency, even
# when using the optional stage-2 implementation.
if y.stride() != x.stride():
- y_strided = torch.empty_strided(x.shape, x.stride(), device=x.device, dtype=x.dtype)
+ y_strided = torch.empty_strided(
+ x.shape, x.stride(), device=x.device, dtype=x.dtype
+ )
y_strided.copy_(y)
y = y_strided
if residual is not None and residual_out is not None:
@@ -3036,7 +3040,9 @@ def new_stride(t):
)
mX, mdO, mdResO, mdX, mdRes = [
- cute.make_tensor(t.iterator, cute.make_layout(semistatic_shape, stride=new_stride(t)))
+ cute.make_tensor(
+ t.iterator, cute.make_layout(semistatic_shape, stride=new_stride(t))
+ )
if const_expr(t is not None)
else None
for t in (mX, mdO, mdResO, mdX, mdRes)
@@ -3056,7 +3062,9 @@ def new_stride(t):
num_copy_bits=128 // largest_dtype_width * mX.element_type.width
)
num_threads = (
- cute.size(tv_layout, mode=[0]) if _KERNEL_ACCEPTS_LAYOUT_ARGS else self._get_num_threads()
+ cute.size(tv_layout, mode=[0])
+ if _KERNEL_ACCEPTS_LAYOUT_ARGS
+ else self._get_num_threads()
)
num_warps = num_threads // cute.arch.WARP_SIZE
if const_expr(mW is not None):
@@ -3067,7 +3075,9 @@ def new_stride(t):
num_blocks = sm_count
kernel = (
- self.kernel(mX, mW, mdO, mdResO, mRstd, mdX, mdW, mdB, mdRes, tv_layout, tiler_mn)
+ self.kernel(
+ mX, mW, mdO, mdResO, mRstd, mdX, mdW, mdB, mdRes, tv_layout, tiler_mn
+ )
if _KERNEL_ACCEPTS_LAYOUT_ARGS
else self.kernel(mX, mW, mdO, mdResO, mRstd, mdX, mdW, mdB, mdRes)
)
@@ -3075,7 +3085,9 @@ def new_stride(t):
grid=[num_blocks, self.cluster_n, 1],
block=[num_threads, 1, 1],
cluster=[1, self.cluster_n, 1] if self.cluster_n > 1 else None,
- smem=self._smem_size_in_bytes(tiler_mn, num_warps, do_dtype=mdO.element_type),
+ smem=self._smem_size_in_bytes(
+ tiler_mn, num_warps, do_dtype=mdO.element_type
+ ),
stream=stream,
)
@@ -3160,8 +3172,8 @@ def _convert_mx(t: Tensor) -> cute.Tensor:
if db_partial is not None
else None
)
- rstd_tensor = (
- from_dlpack(rstd.detach(), assumed_align=4).mark_layout_dynamic(leading_dim=0)
+ rstd_tensor = from_dlpack(rstd.detach(), assumed_align=4).mark_layout_dynamic(
+ leading_dim=0
)
current_stream = cuda.CUstream(torch.cuda.current_stream().cuda_stream)
@@ -3261,7 +3273,9 @@ def rmsnorm_backward(
else:
dw_partial = None
db_partial = (
- torch.empty(sm_count, N, device=device, dtype=torch.float32) if has_bias else None
+ torch.empty(sm_count, N, device=device, dtype=torch.float32)
+ if has_bias
+ else None
)
_rmsnorm_bwd_sm100(
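The `rmsnorm_forward` hunk above copies a stage-2 result into an `empty_strided` buffer whenever its strides diverge from the input's, so the padded-row stride contract survives for `torch.compile`. The same contract as a standalone helper (name hypothetical), with a padded-row example:

```python
import torch

def restride_like(y: torch.Tensor, x: torch.Tensor) -> torch.Tensor:
    # If the produced tensor does not match x's (possibly padded-row) strides,
    # materialize one that does, mirroring the copy in the hunk above.
    if y.stride() == x.stride():
        return y
    y_strided = torch.empty_strided(x.shape, x.stride(), device=x.device, dtype=x.dtype)
    y_strided.copy_(y)
    return y_strided

x = torch.randn(4, 128)[:, :100]           # padded-row view: stride (128, 1)
y = restride_like(torch.randn(4, 100), x)  # contiguous result gets re-strided
assert y.stride() == x.stride()
```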
diff --git a/oink/src/kernelagent_oink/blackwell/rmsnorm_with_stage2.py b/oink/src/kernelagent_oink/blackwell/rmsnorm_with_stage2.py
index fec5bf4..2b5b36d 100644
--- a/oink/src/kernelagent_oink/blackwell/rmsnorm_with_stage2.py
+++ b/oink/src/kernelagent_oink/blackwell/rmsnorm_with_stage2.py
@@ -101,7 +101,9 @@ def copy_tiled(
class RMSNormSM100WithStage2:
- def __init__(self, N: int, dtype: type[cutlass.Numeric], stage: Optional[int] = None):
+ def __init__(
+ self, N: int, dtype: type[cutlass.Numeric], stage: Optional[int] = None
+ ):
self.N = N
self.dtype = dtype
self.stage = 1 if stage is None else stage
@@ -172,7 +174,10 @@ def _tv_layout(self, num_copy_bits: int = 256) -> Tuple[cute.Shape, cute.Layout]
tiler_mn = (cols_per_block, vecsize * num_blocks_N * tpr)
tv_layout = cute.make_layout(
((tpr, cols_per_block), (vecsize, num_blocks_N)),
- stride=((vecsize * cols_per_block, 1), (cols_per_block, cols_per_block * vecsize * tpr)),
+ stride=(
+ (vecsize * cols_per_block, 1),
+ (cols_per_block, cols_per_block * vecsize * tpr),
+ ),
)
return tiler_mn, tv_layout
@@ -198,7 +203,9 @@ def new_stride(t):
)
mX, mRes, mO, mResO = [
- cute.make_tensor(t.iterator, cute.make_layout(semistatic_shape, stride=new_stride(t)))
+ cute.make_tensor(
+ t.iterator, cute.make_layout(semistatic_shape, stride=new_stride(t))
+ )
if const_expr(t is not None)
else None
for t in (mX, mRes, mO, mResO)
@@ -209,36 +216,48 @@ def new_stride(t):
copy_bits = const_expr(128)
tiler_mn, tv_layout = self._tv_layout(num_copy_bits=copy_bits)
num_threads = (
- cute.size(tv_layout, mode=[0]) if _KERNEL_ACCEPTS_LAYOUT_ARGS else self._num_threads()
+ cute.size(tv_layout, mode=[0])
+ if _KERNEL_ACCEPTS_LAYOUT_ARGS
+ else self._num_threads()
)
num_warps = num_threads // cute.arch.WARP_SIZE
threads_per_row = (
- tv_layout.shape[0][0] if _KERNEL_ACCEPTS_LAYOUT_ARGS else self._threads_per_row()
+ tv_layout.shape[0][0]
+ if _KERNEL_ACCEPTS_LAYOUT_ARGS
+ else self._threads_per_row()
)
warps_per_row = max(threads_per_row // cute.arch.WARP_SIZE, 1)
cluster_n = self._cluster_n()
if const_expr(mW is not None):
mW = cute.make_tensor(
- mW.iterator, cute.prepend(mW.layout, cute.make_layout((tiler_mn[0],), stride=(0,)))
+ mW.iterator,
+ cute.prepend(mW.layout, cute.make_layout((tiler_mn[0],), stride=(0,))),
)
if const_expr(mB is not None):
mB = cute.make_tensor(
- mB.iterator, cute.prepend(mB.layout, cute.make_layout((tiler_mn[0],), stride=(0,)))
+ mB.iterator,
+ cute.prepend(mB.layout, cute.make_layout((tiler_mn[0],), stride=(0,))),
)
if const_expr(mRstd is not None):
mRstd = cute.make_tensor(
- mRstd.iterator, cute.append(mRstd.layout, cute.make_layout((self.N,), stride=(0,)))
+ mRstd.iterator,
+ cute.append(mRstd.layout, cute.make_layout((self.N,), stride=(0,))),
)
stage_bufs = 2 if self.stage > 1 else 1
- tile_bytes_x = cute.size_in_bytes(self.dtype, cute.make_layout(tiler_mn)) * stage_bufs
+ tile_bytes_x = (
+ cute.size_in_bytes(self.dtype, cute.make_layout(tiler_mn)) * stage_bufs
+ )
tile_bytes_res = (
- cute.size_in_bytes(mRes.element_type, cute.make_layout(tiler_mn)) * stage_bufs
+ cute.size_in_bytes(mRes.element_type, cute.make_layout(tiler_mn))
+ * stage_bufs
if const_expr(mRes is not None)
else 0
)
- red_bytes = self.stage * num_warps * cluster_n * (self.reduction_dtype.width // 8)
+ red_bytes = (
+ self.stage * num_warps * cluster_n * (self.reduction_dtype.width // 8)
+ )
mbar_bytes = self.stage * (cutlass.Int64.width // 8)
smem_bytes = tile_bytes_x + tile_bytes_res + red_bytes + mbar_bytes
@@ -299,11 +318,15 @@ def _kernel_impl(
tidx, _, _ = cute.arch.thread_idx()
bidx, _, _ = cute.arch.block_idx()
cluster_n = self._cluster_n()
- cluster_y = const_expr(0) if const_expr(cluster_n == 1) else cute.arch.block_idx()[1]
+ cluster_y = (
+ const_expr(0) if const_expr(cluster_n == 1) else cute.arch.block_idx()[1]
+ )
smem = cutlass.utils.SmemAllocator()
sX0 = smem.allocate_tensor(
- mX.element_type, cute.make_ordered_layout(tiler_mn, order=(1, 0)), byte_alignment=32
+ mX.element_type,
+ cute.make_ordered_layout(tiler_mn, order=(1, 0)),
+ byte_alignment=32,
)
sX1 = (
smem.allocate_tensor(
@@ -316,7 +339,9 @@ def _kernel_impl(
)
sRes0 = (
smem.allocate_tensor(
- mRes.element_type, cute.make_ordered_layout(tiler_mn, order=(1, 0)), byte_alignment=32
+ mRes.element_type,
+ cute.make_ordered_layout(tiler_mn, order=(1, 0)),
+ byte_alignment=32,
)
if const_expr(mRes is not None)
else None
@@ -331,18 +356,24 @@ def _kernel_impl(
else None
)
- reduction_buffer, mbar_ptr = self._alloc_reduction_and_mbar(smem, num_warps, warps_per_row)
+ reduction_buffer, mbar_ptr = self._alloc_reduction_and_mbar(
+ smem, num_warps, warps_per_row
+ )
shape = mX.shape
idX = cute.make_identity_tensor(shape)
num_copy_elems_X = tv_layout.shape[1][0]
use_async = const_expr(self.N >= 1024)
- copy_atom = get_copy_atom_bw(mX.element_type, num_copy_elems_X, is_async=use_async)
+ copy_atom = get_copy_atom_bw(
+ mX.element_type, num_copy_elems_X, is_async=use_async
+ )
thr_copy = cute.make_tiled_copy(copy_atom, tv_layout, tiler_mn).get_slice(tidx)
gW, gB = [
- cute.local_tile(t, tiler_mn, (0, cluster_y)) if const_expr(t is not None) else None
+ cute.local_tile(t, tiler_mn, (0, cluster_y))
+ if const_expr(t is not None)
+ else None
for t in (mW, mB)
]
tXgW = thr_copy.partition_S(gW) if const_expr(mW is not None) else None
@@ -350,32 +381,50 @@ def _kernel_impl(
tXrW = cute.make_fragment_like(tXgW) if const_expr(mW is not None) else None
tXrB = cute.make_fragment_like(tXgB) if const_expr(mB is not None) else None
if const_expr(mW is not None):
- cute.copy(get_copy_atom_bw(mW.element_type, num_copy_elems_X, is_async=False), tXgW, tXrW)
+ cute.copy(
+ get_copy_atom_bw(mW.element_type, num_copy_elems_X, is_async=False),
+ tXgW,
+ tXrW,
+ )
if const_expr(mB is not None):
- cute.copy(get_copy_atom_bw(mB.element_type, num_copy_elems_X, is_async=False), tXgB, tXrB)
+ cute.copy(
+ get_copy_atom_bw(mB.element_type, num_copy_elems_X, is_async=False),
+ tXgB,
+ tXrB,
+ )
self._init_cluster(tidx, mbar_ptr)
mX_i, mRes_i, mO_i, mResO_i = [
- qutils.domain_offset_i64((bidx * tiler_mn[0], 0), t) if t is not None else None
+ qutils.domain_offset_i64((bidx * tiler_mn[0], 0), t)
+ if t is not None
+ else None
for t in (mX, mRes, mO, mResO)
]
gX_i = cute.local_tile(mX_i, tiler_mn, (0, cluster_y))
gO_i = cute.local_tile(mO_i, tiler_mn, (0, cluster_y))
gRes_i = (
- cute.local_tile(mRes_i, tiler_mn, (0, cluster_y)) if const_expr(mRes is not None) else None
+ cute.local_tile(mRes_i, tiler_mn, (0, cluster_y))
+ if const_expr(mRes is not None)
+ else None
)
gResO_i = (
- cute.local_tile(mResO_i, tiler_mn, (0, cluster_y)) if const_expr(mResO is not None) else None
+ cute.local_tile(mResO_i, tiler_mn, (0, cluster_y))
+ if const_expr(mResO is not None)
+ else None
)
gRstd_i = (
- cute.local_tile(mRstd, tiler_mn, (bidx, cluster_y)) if const_expr(mRstd is not None) else None
+ cute.local_tile(mRstd, tiler_mn, (bidx, cluster_y))
+ if const_expr(mRstd is not None)
+ else None
)
cX_i = cute.local_tile(idX, tiler_mn, (bidx, cluster_y))
tXcX_i = thr_copy.partition_S(cX_i)[(0, None), None, None]
row_i = tXcX_i[0][0]
- tXgRstd_i = thr_copy.partition_D(gRstd_i) if const_expr(mRstd is not None) else None
+ tXgRstd_i = (
+ thr_copy.partition_D(gRstd_i) if const_expr(mRstd is not None) else None
+ )
# Intra-row K-loop cp.async ping-pong (two-pass) for N≈6k/8k (stage=2)
if const_expr(self.stage > 1 and (shape[1] == 6144 or shape[1] == 8192)):
@@ -388,37 +437,72 @@ def _kernel_impl(
tiler_mn_tile = (tiler_mn[0], tile_n)
sX0_tile = cute.local_tile(sX0, tiler_mn_tile, (0, 0))
- sX1_tile = cute.local_tile(sX1, tiler_mn_tile, (0, 0)) if const_expr(self.stage > 1) else None
+ sX1_tile = (
+ cute.local_tile(sX1, tiler_mn_tile, (0, 0))
+ if const_expr(self.stage > 1)
+ else None
+ )
sRes0_tile = (
- cute.local_tile(sRes0, tiler_mn_tile, (0, 0)) if const_expr(mRes is not None) else None
+ cute.local_tile(sRes0, tiler_mn_tile, (0, 0))
+ if const_expr(mRes is not None)
+ else None
)
sRes1_tile = (
- cute.local_tile(sRes1, tiler_mn_tile, (0, 0)) if const_expr(mRes is not None and self.stage > 1) else None
+ cute.local_tile(sRes1, tiler_mn_tile, (0, 0))
+ if const_expr(mRes is not None and self.stage > 1)
+ else None
)
tv_layout_tile = cute.make_layout(
((tpr, tiler_mn[0]), (vecsize, tile_factor)),
- stride=((vecsize * tiler_mn[0], 1), (tiler_mn[0], tiler_mn[0] * vecsize * tpr)),
+ stride=(
+ (vecsize * tiler_mn[0], 1),
+ (tiler_mn[0], tiler_mn[0] * vecsize * tpr),
+ ),
)
- thr_copy_tile = cute.make_tiled_copy(copy_atom, tv_layout_tile, tiler_mn_tile).get_slice(tidx)
+ thr_copy_tile = cute.make_tiled_copy(
+ copy_atom, tv_layout_tile, tiler_mn_tile
+ ).get_slice(tidx)
sum_sq_acc = cute.Float32(0.0)
k_off0 = const_expr(0) * tile_n
- gX_0 = cute.local_tile(qutils.domain_offset_i64((0, k_off0), mX_i), tiler_mn_tile, (0, cluster_y))
+ gX_0 = cute.local_tile(
+ qutils.domain_offset_i64((0, k_off0), mX_i),
+ tiler_mn_tile,
+ (0, cluster_y),
+ )
tXgX_0 = thr_copy_tile.partition_S(gX_0)
tXsX_0 = thr_copy_tile.partition_D(sX0_tile)
- cX_0 = cute.local_tile(cute.domain_offset((0, k_off0), cX_i), tiler_mn_tile, (0, cluster_y))
+ cX_0 = cute.local_tile(
+ cute.domain_offset((0, k_off0), cX_i), tiler_mn_tile, (0, cluster_y)
+ )
tXc_0 = thr_copy_tile.partition_S(cX_0)
tXp_0 = qutils.predicate_k(tXc_0, limit=shape[1])
tXp_ping = tXp_0
tXp_pong = tXp_0
if row_i < shape[0]:
- copy_tiled(tXgX_0, tXsX_0, num_copy_elems=vecsize, is_async=use_async, pred=tXp_0)
+ copy_tiled(
+ tXgX_0,
+ tXsX_0,
+ num_copy_elems=vecsize,
+ is_async=use_async,
+ pred=tXp_0,
+ )
if const_expr(mRes is not None):
- gRes_0 = cute.local_tile(qutils.domain_offset_i64((0, k_off0), mRes_i), tiler_mn_tile, (0, cluster_y))
+ gRes_0 = cute.local_tile(
+ qutils.domain_offset_i64((0, k_off0), mRes_i),
+ tiler_mn_tile,
+ (0, cluster_y),
+ )
tXgRes_0 = thr_copy_tile.partition_S(gRes_0)
tXsRes_0 = thr_copy_tile.partition_D(sRes0_tile)
- copy_tiled(tXgRes_0, tXsRes_0, num_copy_elems=vecsize, is_async=use_async, pred=tXp_0)
+ copy_tiled(
+ tXgRes_0,
+ tXsRes_0,
+ num_copy_elems=vecsize,
+ is_async=use_async,
+ pred=tXp_0,
+ )
if const_expr(use_async):
cute.arch.cp_async_commit_group()
@@ -426,29 +510,57 @@ def _kernel_impl(
next_t = t + 1
if next_t < num_tiles:
k_off_n = next_t * tile_n
- gX_n = cute.local_tile(qutils.domain_offset_i64((0, k_off_n), mX_i), tiler_mn_tile, (0, cluster_y))
+ gX_n = cute.local_tile(
+ qutils.domain_offset_i64((0, k_off_n), mX_i),
+ tiler_mn_tile,
+ (0, cluster_y),
+ )
tXgX_n = thr_copy_tile.partition_S(gX_n)
- cX_n = cute.local_tile(cute.domain_offset((0, k_off_n), cX_i), tiler_mn_tile, (0, cluster_y))
+ cX_n = cute.local_tile(
+ cute.domain_offset((0, k_off_n), cX_i),
+ tiler_mn_tile,
+ (0, cluster_y),
+ )
tXc_n = thr_copy_tile.partition_S(cX_n)
tXp_n = qutils.predicate_k(tXc_n, limit=shape[1])
if const_expr((t % 2) == 0):
tXsX_n = thr_copy_tile.partition_D(sX1_tile)
tXsRes_n = (
- thr_copy_tile.partition_D(sRes1_tile) if const_expr(mRes is not None) else None
+ thr_copy_tile.partition_D(sRes1_tile)
+ if const_expr(mRes is not None)
+ else None
)
tXp_pong = tXp_n
else:
tXsX_n = thr_copy_tile.partition_D(sX0_tile)
tXsRes_n = (
- thr_copy_tile.partition_D(sRes0_tile) if const_expr(mRes is not None) else None
+ thr_copy_tile.partition_D(sRes0_tile)
+ if const_expr(mRes is not None)
+ else None
)
tXp_ping = tXp_n
if row_i < shape[0]:
- copy_tiled(tXgX_n, tXsX_n, num_copy_elems=vecsize, is_async=use_async, pred=tXp_n)
+ copy_tiled(
+ tXgX_n,
+ tXsX_n,
+ num_copy_elems=vecsize,
+ is_async=use_async,
+ pred=tXp_n,
+ )
if const_expr(mRes is not None):
- gRes_n = cute.local_tile(qutils.domain_offset_i64((0, k_off_n), mRes_i), tiler_mn_tile, (0, cluster_y))
+ gRes_n = cute.local_tile(
+ qutils.domain_offset_i64((0, k_off_n), mRes_i),
+ tiler_mn_tile,
+ (0, cluster_y),
+ )
tXgRes_n = thr_copy_tile.partition_S(gRes_n)
- copy_tiled(tXgRes_n, tXsRes_n, num_copy_elems=vecsize, is_async=use_async, pred=tXp_n)
+ copy_tiled(
+ tXgRes_n,
+ tXsRes_n,
+ num_copy_elems=vecsize,
+ is_async=use_async,
+ pred=tXp_n,
+ )
if const_expr(use_async):
cute.arch.cp_async_commit_group()
if const_expr(use_async):
@@ -456,36 +568,62 @@ def _kernel_impl(
if const_expr((t % 2) == 0):
tXsX_cur = thr_copy_tile.partition_D(sX0_tile)
- tXsRes_cur = thr_copy_tile.partition_D(sRes0_tile) if const_expr(mRes is not None) else None
+ tXsRes_cur = (
+ thr_copy_tile.partition_D(sRes0_tile)
+ if const_expr(mRes is not None)
+ else None
+ )
pred_cur = tXp_ping
else:
tXsX_cur = thr_copy_tile.partition_D(sX1_tile)
- tXsRes_cur = thr_copy_tile.partition_D(sRes1_tile) if const_expr(mRes is not None) else None
+ tXsRes_cur = (
+ thr_copy_tile.partition_D(sRes1_tile)
+ if const_expr(mRes is not None)
+ else None
+ )
pred_cur = tXp_pong
qutils.fill_oob(tXsX_cur, pred_cur, mX.element_type.zero)
if const_expr(mRes is not None):
qutils.fill_oob(tXsRes_cur, pred_cur, mRes.element_type.zero)
k_off = t * tile_n
- gX_t = cute.local_tile(qutils.domain_offset_i64((0, k_off), mX_i), tiler_mn_tile, (0, cluster_y))
+ gX_t = cute.local_tile(
+ qutils.domain_offset_i64((0, k_off), mX_i),
+ tiler_mn_tile,
+ (0, cluster_y),
+ )
tXgX_t = thr_copy_tile.partition_S(gX_t)
tXrX = cute.make_fragment_like(tXgX_t)
cute.autovec_copy(tXsX_cur, tXrX)
x = tXrX.load().to(cute.Float32)
if const_expr(mRes is not None):
- gRes_t = cute.local_tile(qutils.domain_offset_i64((0, k_off), mRes_i), tiler_mn_tile, (0, cluster_y))
+ gRes_t = cute.local_tile(
+ qutils.domain_offset_i64((0, k_off), mRes_i),
+ tiler_mn_tile,
+ (0, cluster_y),
+ )
tXgRes_t = thr_copy_tile.partition_S(gRes_t)
tXrRes = cute.make_fragment_like(tXgRes_t)
cute.autovec_copy(tXsRes_cur, tXrRes)
x += tXrRes.load().to(cute.Float32)
if const_expr(mResO is not None):
- gResO_t = cute.local_tile(qutils.domain_offset_i64((0, k_off), mResO_i), tiler_mn_tile, (0, cluster_y))
+ gResO_t = cute.local_tile(
+ qutils.domain_offset_i64((0, k_off), mResO_i),
+ tiler_mn_tile,
+ (0, cluster_y),
+ )
tXgResO_t = thr_copy_tile.partition_D(gResO_t)
tXrResO = cute.make_fragment_like(tXgResO_t)
tXrResO.store(x.to(tXrResO.element_type))
if row_i < shape[0]:
- copy_tiled(tXrResO, tXgResO_t, num_copy_elems=vecsize, is_async=False, pred=pred_cur)
+ copy_tiled(
+ tXrResO,
+ tXgResO_t,
+ num_copy_elems=vecsize,
+ is_async=False,
+ pred=pred_cur,
+ )
sum_sq_tile = row_reduce(
x * x,
@@ -494,7 +632,9 @@ def _kernel_impl(
reduction_buffer[None, None, 0],
mbar_ptr,
init_val=0.0,
- hook_fn=(cute.arch.cluster_wait if const_expr(cluster_n > 1) else None),
+ hook_fn=(
+ cute.arch.cluster_wait if const_expr(cluster_n > 1) else None
+ ),
)
sum_sq_acc = sum_sq_acc + sum_sq_tile
@@ -509,32 +649,46 @@ def _kernel_impl(
for t in cutlass.range_constexpr(num_tiles):
k_off = t * tile_n
- cX_t = cute.local_tile(cute.domain_offset((0, k_off), cX_i), tiler_mn_tile, (0, cluster_y))
+ cX_t = cute.local_tile(
+ cute.domain_offset((0, k_off), cX_i), tiler_mn_tile, (0, cluster_y)
+ )
tXc_t = thr_copy_tile.partition_S(cX_t)
tXp_t = qutils.predicate_k(tXc_t, limit=shape[1])
if const_expr((t % 2) == 0):
tXsX_cur = thr_copy_tile.partition_D(sX0_tile)
tXsRes_cur = (
- thr_copy_tile.partition_D(sRes0_tile) if const_expr(mRes is not None) else None
+ thr_copy_tile.partition_D(sRes0_tile)
+ if const_expr(mRes is not None)
+ else None
)
else:
tXsX_cur = thr_copy_tile.partition_D(sX1_tile)
tXsRes_cur = (
- thr_copy_tile.partition_D(sRes1_tile) if const_expr(mRes is not None) else None
+ thr_copy_tile.partition_D(sRes1_tile)
+ if const_expr(mRes is not None)
+ else None
)
qutils.fill_oob(tXsX_cur, tXp_t, mX.element_type.zero)
if const_expr(mRes is not None):
qutils.fill_oob(tXsRes_cur, tXp_t, mRes.element_type.zero)
- gX_t = cute.local_tile(qutils.domain_offset_i64((0, k_off), mX_i), tiler_mn_tile, (0, cluster_y))
+ gX_t = cute.local_tile(
+ qutils.domain_offset_i64((0, k_off), mX_i),
+ tiler_mn_tile,
+ (0, cluster_y),
+ )
tXgX_t = thr_copy_tile.partition_S(gX_t)
tXrX = cute.make_fragment_like(tXgX_t)
cute.autovec_copy(tXsX_cur, tXrX)
x = tXrX.load().to(cute.Float32)
if const_expr(mRes is not None):
- gRes_t = cute.local_tile(qutils.domain_offset_i64((0, k_off), mRes_i), tiler_mn_tile, (0, cluster_y))
+ gRes_t = cute.local_tile(
+ qutils.domain_offset_i64((0, k_off), mRes_i),
+ tiler_mn_tile,
+ (0, cluster_y),
+ )
tXgRes_t = thr_copy_tile.partition_S(gRes_t)
tXrRes = cute.make_fragment_like(tXgRes_t)
cute.autovec_copy(tXsRes_cur, tXrRes)
@@ -542,35 +696,67 @@ def _kernel_impl(
y = x * rstd
if const_expr(mW is not None):
- gW_t = cute.local_tile(qutils.domain_offset_i64((0, k_off), mW), tiler_mn_tile, (0, cluster_y))
+ gW_t = cute.local_tile(
+ qutils.domain_offset_i64((0, k_off), mW),
+ tiler_mn_tile,
+ (0, cluster_y),
+ )
tWgW_t = thr_copy_tile.partition_S(gW_t)
tWrW_t = cute.make_fragment_like(tWgW_t)
- copy_tiled(tWgW_t, tWrW_t, num_copy_elems=vecsize, is_async=False, pred=tXp_t)
+ copy_tiled(
+ tWgW_t,
+ tWrW_t,
+ num_copy_elems=vecsize,
+ is_async=False,
+ pred=tXp_t,
+ )
y = y * tWrW_t.load().to(cute.Float32)
if const_expr(mB is not None):
- gB_t = cute.local_tile(qutils.domain_offset_i64((0, k_off), mB), tiler_mn_tile, (0, cluster_y))
+ gB_t = cute.local_tile(
+ qutils.domain_offset_i64((0, k_off), mB),
+ tiler_mn_tile,
+ (0, cluster_y),
+ )
tWgB_t = thr_copy_tile.partition_S(gB_t)
tWrB_t = cute.make_fragment_like(tWgB_t)
- copy_tiled(tWgB_t, tWrB_t, num_copy_elems=vecsize, is_async=False, pred=tXp_t)
+ copy_tiled(
+ tWgB_t,
+ tWrB_t,
+ num_copy_elems=vecsize,
+ is_async=False,
+ pred=tXp_t,
+ )
y = y + tWrB_t.load().to(cute.Float32)
- gO_t = cute.local_tile(qutils.domain_offset_i64((0, k_off), mO_i), tiler_mn_tile, (0, cluster_y))
+ gO_t = cute.local_tile(
+ qutils.domain_offset_i64((0, k_off), mO_i),
+ tiler_mn_tile,
+ (0, cluster_y),
+ )
tXgO_t = thr_copy_tile.partition_D(gO_t)
tXrO = cute.make_fragment_like(tXgO_t)
tXrO.store(y.to(tXrO.element_type))
if row_i < shape[0]:
- copy_tiled(tXrO, tXgO_t, num_copy_elems=vecsize, is_async=False, pred=tXp_t)
+ copy_tiled(
+ tXrO, tXgO_t, num_copy_elems=vecsize, is_async=False, pred=tXp_t
+ )
return
# Fallback: single-stage path identical to current rmsnorm.py
tXgX_i = thr_copy.partition_S(gX_i)
- tXgRes_i = thr_copy.partition_S(gRes_i) if const_expr(mRes is not None) else None
+ tXgRes_i = (
+ thr_copy.partition_S(gRes_i) if const_expr(mRes is not None) else None
+ )
tXgO_i = thr_copy.partition_D(gO_i)
- tXgResO_i = thr_copy.partition_D(gResO_i) if const_expr(mResO is not None) else None
+ tXgResO_i = (
+ thr_copy.partition_D(gResO_i) if const_expr(mResO is not None) else None
+ )
is_even_N_i = const_expr(shape[1] == tiler_mn[1] * cluster_n)
tXpX_i = (
- qutils.predicate_k(thr_copy.partition_S(cX_i), limit=shape[1]) if not is_even_N_i else None
+ qutils.predicate_k(thr_copy.partition_S(cX_i), limit=shape[1])
+ if not is_even_N_i
+ else None
)
if row_i < shape[0]:
@@ -594,7 +780,9 @@ def _kernel_impl(
tXrResO.store(x.to(tXrResO.element_type))
if row_i < shape[0]:
cute.copy(
- get_copy_atom_bw(tXrResO.element_type, num_copy_elems_X, is_async=False),
+ get_copy_atom_bw(
+ tXrResO.element_type, num_copy_elems_X, is_async=False
+ ),
tXrResO,
tXgResO_i,
)
@@ -715,7 +903,9 @@ def _alloc_reduction_and_mbar(
(num_warps // warps_per_row, (warps_per_row, cluster_n), self.stage),
order=(1, 0, 2),
)
- reduction_buffer = smem.allocate_tensor(self.reduction_dtype, red_layout, byte_alignment=4)
+ reduction_buffer = smem.allocate_tensor(
+ self.reduction_dtype, red_layout, byte_alignment=4
+ )
if const_expr(cluster_n > 1):
mbar_ptr = smem.allocate_array(cutlass.Int64, num_elems=self.stage)
else:
@@ -745,9 +935,9 @@ def rmsnorm_forward_with_stage2(
dtype = TORCH2CUTE_DTYPE[x.dtype]
def _convert_x(t: Tensor) -> cute.Tensor:
- return from_dlpack(
- t.detach(), assumed_align=32
- ).mark_layout_dynamic(leading_dim=1)
+ return from_dlpack(t.detach(), assumed_align=32).mark_layout_dynamic(
+ leading_dim=1
+ )
mX = _convert_x(x)
mRes = _convert_x(residual) if residual is not None else None
@@ -755,7 +945,9 @@ def _convert_x(t: Tensor) -> cute.Tensor:
mO = from_dlpack(out.detach(), assumed_align=32).mark_layout_dynamic(leading_dim=1)
mW = (
- from_dlpack(weight.detach(), assumed_align=32).mark_layout_dynamic(leading_dim=0)
+ from_dlpack(weight.detach(), assumed_align=32).mark_layout_dynamic(
+ leading_dim=0
+ )
if weight is not None
else None
)
@@ -766,7 +958,9 @@ def _convert_x(t: Tensor) -> cute.Tensor:
)
if store_rstd:
rstd = torch.empty(M, device=x.device, dtype=torch.float32)
- mRstd = from_dlpack(rstd.detach(), assumed_align=4).mark_layout_dynamic(leading_dim=0)
+ mRstd = from_dlpack(rstd.detach(), assumed_align=4).mark_layout_dynamic(
+ leading_dim=0
+ )
else:
rstd = None
mRstd = None
@@ -775,7 +969,9 @@ def _convert_x(t: Tensor) -> cute.Tensor:
mResO = None
if residual is not None:
residual_out = torch.empty_like(residual)
- mResO = from_dlpack(residual_out.detach(), assumed_align=32).mark_layout_dynamic(leading_dim=1)
+ mResO = from_dlpack(
+ residual_out.detach(), assumed_align=32
+ ).mark_layout_dynamic(leading_dim=1)
# Enable the intra-row cp.async K-loop only for DSv3-style large-N rows
# with very large M, where there is enough work per row to amortize the
@@ -788,7 +984,7 @@ def _convert_x(t: Tensor) -> cute.Tensor:
op._tpr_override = 128 # type: ignore[attr-defined]
# Prefer 1 row/CTA at N=6144; keep 2 rows/CTA at N=8192 to match
# the original tuning there.
- op._nt_override = (128 if N == 6144 else 256) # type: ignore[attr-defined]
+ op._nt_override = 128 if N == 6144 else 256 # type: ignore[attr-defined]
stream = cuda.CUstream(torch.cuda.current_stream().cuda_stream)
key = (
@@ -803,7 +999,9 @@ def _convert_x(t: Tensor) -> cute.Tensor:
)
compiled = _COMPILE_CACHE.get(key)
if compiled is None:
- compiled = cute.compile(op, mX, mW, mB, mRes, mO, mResO, mRstd, stream, Float32(eps))
+ compiled = cute.compile(
+ op, mX, mW, mB, mRes, mO, mResO, mRstd, stream, Float32(eps)
+ )
_COMPILE_CACHE[key] = compiled
compiled(mX, mW, mB, mRes, mO, mResO, mRstd, stream, Float32(eps))
return out, rstd, residual_out
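`rmsnorm_forward_with_stage2` returns `(out, rstd, residual_out)`: the residual is added to `x` before the row reduction, and that pre-normalization sum is what the kernel writes to `residual_out`. A minimal dense reference of that fused contract, assuming the usual RMSNorm definition with fp32 accumulation and a per-row fp32 `rstd`:

```python
import torch

def fused_add_rmsnorm_ref(x, weight, residual=None, eps: float = 1e-6):
    # z = x + residual is both the value that gets normalized and the tensor
    # returned as residual_out; rstd is kept in fp32 per row.
    z = x.float() + (residual.float() if residual is not None else 0.0)
    rstd = torch.rsqrt(z.pow(2).mean(dim=-1, keepdim=True) + eps)
    out = (z * rstd) * weight.float()
    residual_out = z.to(x.dtype) if residual is not None else None
    return out.to(x.dtype), rstd.squeeze(-1), residual_out
```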
diff --git a/oink/src/kernelagent_oink/blackwell/softmax.py b/oink/src/kernelagent_oink/blackwell/softmax.py
index a8a2791..6a7eb54 100644
--- a/oink/src/kernelagent_oink/blackwell/softmax.py
+++ b/oink/src/kernelagent_oink/blackwell/softmax.py
@@ -134,7 +134,9 @@ def __call__(self, mX: cute.Tensor, mO: cute.Tensor, stream: cuda.CUstream) -> N
# Use the generic ReductionBase tiling with 128-bit vectorization.
tiler_mn, tv_layout = self._get_tv_layout()
num_threads = (
- cute.size(tv_layout, mode=[0]) if _KERNEL_ACCEPTS_LAYOUT_ARGS else self._get_num_threads()
+ cute.size(tv_layout, mode=[0])
+ if _KERNEL_ACCEPTS_LAYOUT_ARGS
+ else self._get_num_threads()
)
num_warps = num_threads // cute.arch.WARP_SIZE
kernel = (
@@ -201,7 +203,9 @@ def _kernel_impl(
cute.make_ordered_layout(tiler_mn, order=(1, 0)),
byte_alignment=16,
)
- reduction_buffer, mbar_ptr = self._allocate_reduction_buffer_and_mbar(smem, tv_layout)
+ reduction_buffer, mbar_ptr = self._allocate_reduction_buffer_and_mbar(
+ smem, tv_layout
+ )
# Copy atoms for gmem <-> smem and smem <-> gmem.
# Use 128-bit cp.async for global->shared and 128-bit vectorized stores.
@@ -216,8 +220,12 @@ def _kernel_impl(
num_bits_per_copy=128,
)
- thr_copy_load = cute.make_tiled_copy(copy_atom_load, tv_layout, tiler_mn).get_slice(tidx)
- thr_copy_store = cute.make_tiled_copy(copy_atom_store, tv_layout, tiler_mn).get_slice(tidx)
+ thr_copy_load = cute.make_tiled_copy(
+ copy_atom_load, tv_layout, tiler_mn
+ ).get_slice(tidx)
+ thr_copy_store = cute.make_tiled_copy(
+ copy_atom_store, tv_layout, tiler_mn
+ ).get_slice(tidx)
tXgX = thr_copy_load.partition_S(gX)
tXsX = thr_copy_load.partition_D(sX)
@@ -349,7 +357,10 @@ def _smem_size_in_bytes(self, tiler_mn, num_warps: int) -> int:
# Store both y and dy tiles plus reduction buffers and mbarriers.
return (
cute.size_in_bytes(self.dtype, cute.make_layout(tiler_mn)) * 2
- + self.stage * num_warps * self.cluster_n * (self.reduction_dtype.width // 8)
+ + self.stage
+ * num_warps
+ * self.cluster_n
+ * (self.reduction_dtype.width // 8)
+ self.stage * (cutlass.Int64.width // 8)
)
@@ -367,7 +378,9 @@ def __call__(
# Use the generic ReductionBase tiling with 128-bit vectorization.
tiler_mn, tv_layout = self._get_tv_layout()
num_threads = (
- cute.size(tv_layout, mode=[0]) if _KERNEL_ACCEPTS_LAYOUT_ARGS else self._get_num_threads()
+ cute.size(tv_layout, mode=[0])
+ if _KERNEL_ACCEPTS_LAYOUT_ARGS
+ else self._get_num_threads()
)
num_warps = num_threads // cute.arch.WARP_SIZE
kernel = (
@@ -423,7 +436,9 @@ def _kernel_impl(
mdY, mY, mdX = [
domain_offset_i64((bidx * tiler_mn[0], 0), mT) for mT in (mdY, mY, mdX)
]
- gdY, gY, gdX = [cute.local_tile(mT, tiler_mn, (0, cluster_y)) for mT in (mdY, mY, mdX)]
+ gdY, gY, gdX = [
+ cute.local_tile(mT, tiler_mn, (0, cluster_y)) for mT in (mdY, mY, mdX)
+ ]
cX = cute.local_tile(idX, tiler_mn, (bidx, cluster_y))
smem = cutlass.utils.SmemAllocator()
@@ -437,7 +452,9 @@ def _kernel_impl(
cute.make_ordered_layout(tiler_mn, order=(1, 0)),
byte_alignment=16,
)
- reduction_buffer, mbar_ptr = self._allocate_reduction_buffer_and_mbar(smem, tv_layout)
+ reduction_buffer, mbar_ptr = self._allocate_reduction_buffer_and_mbar(
+ smem, tv_layout
+ )
copy_atom_load = cute.make_copy_atom(
cute.nvgpu.cpasync.CopyG2SOp(),
@@ -450,8 +467,12 @@ def _kernel_impl(
num_bits_per_copy=128,
)
- thr_copy_load = cute.make_tiled_copy(copy_atom_load, tv_layout, tiler_mn).get_slice(tidx)
- thr_copy_store = cute.make_tiled_copy(copy_atom_store, tv_layout, tiler_mn).get_slice(tidx)
+ thr_copy_load = cute.make_tiled_copy(
+ copy_atom_load, tv_layout, tiler_mn
+ ).get_slice(tidx)
+ thr_copy_store = cute.make_tiled_copy(
+ copy_atom_store, tv_layout, tiler_mn
+ ).get_slice(tidx)
tdYgdY = thr_copy_load.partition_S(gdY)
tdYsdY = thr_copy_load.partition_D(sdY)
@@ -460,7 +481,9 @@ def _kernel_impl(
tdXgdX = thr_copy_store.partition_D(gdX)
tXcX = thr_copy_load.partition_S(cX)[(0, None), None, None]
- tdYrdY, tYrY, tdXrdX = [cute.make_fragment_like(thr) for thr in (tdYgdY, tYgY, tdXgdX)]
+ tdYrdY, tYrY, tdXrdX = [
+ cute.make_fragment_like(thr) for thr in (tdYgdY, tYgY, tdXgdX)
+ ]
num_warps = cute.size(tv_layout, mode=[0]) // cute.arch.WARP_SIZE
self._initialize_cluster(tidx, mbar_ptr, num_warps)
@@ -535,9 +558,8 @@ def _convert_2d_tensor(x: Tensor) -> cute.Tensor:
# the shape compact with row-major stride order (0, 1), with mode=0 (batch).
# We intentionally do not call mark_layout_dynamic here to avoid the
# leading_dim stride==1 constraint used in RMSNorm.
- return (
- from_dlpack(x.detach(), assumed_align=16)
- .mark_compact_shape_dynamic(mode=0, stride_order=(0, 1))
+ return from_dlpack(x.detach(), assumed_align=16).mark_compact_shape_dynamic(
+ mode=0, stride_order=(0, 1)
)
@@ -581,7 +603,9 @@ def _softmax_forward_ptr_into(*, x: Tensor, out: Tensor) -> None:
compiled = _PTR_FWD_COMPILE_CACHE.get(key)
if compiled is None:
op = SoftmaxFwdSM100(dtype_x, int(N))
- ptr_x = rt.make_ptr(dtype_x, x.data_ptr(), mem_space=rt.AddressSpace.gmem, assumed_align=16)
+ ptr_x = rt.make_ptr(
+ dtype_x, x.data_ptr(), mem_space=rt.AddressSpace.gmem, assumed_align=16
+ )
ptr_out = rt.make_ptr(
dtype_x, out.data_ptr(), mem_space=rt.AddressSpace.gmem, assumed_align=16
)
@@ -596,8 +620,12 @@ def _softmax_forward_ptr_into(*, x: Tensor, out: Tensor) -> None:
)
_PTR_FWD_COMPILE_CACHE[key] = compiled
- ptr_x = rt.make_ptr(dtype_x, x.data_ptr(), mem_space=rt.AddressSpace.gmem, assumed_align=16)
- ptr_out = rt.make_ptr(dtype_x, out.data_ptr(), mem_space=rt.AddressSpace.gmem, assumed_align=16)
+ ptr_x = rt.make_ptr(
+ dtype_x, x.data_ptr(), mem_space=rt.AddressSpace.gmem, assumed_align=16
+ )
+ ptr_out = rt.make_ptr(
+ dtype_x, out.data_ptr(), mem_space=rt.AddressSpace.gmem, assumed_align=16
+ )
compiled(ptr_x, ptr_out, Int32(int(M)), Int32(int(x.stride(0))), stream)
@@ -606,7 +634,9 @@ def _softmax_backward_ptr_into(*, dy: Tensor, y: Tensor, dx: Tensor) -> None:
assert dy.is_cuda and dy.dim() == 2
assert y.is_cuda and y.shape == dy.shape and y.dtype == dy.dtype
assert dx.is_cuda and dx.shape == dy.shape and dx.dtype == dy.dtype
- assert dy.stride() == y.stride() == dx.stride(), "Pointer path expects matching strides"
+ assert dy.stride() == y.stride() == dx.stride(), (
+ "Pointer path expects matching strides"
+ )
M, N = dy.shape
device_index = dy.get_device()
@@ -619,9 +649,15 @@ def _softmax_backward_ptr_into(*, dy: Tensor, y: Tensor, dx: Tensor) -> None:
compiled = _PTR_BWD_COMPILE_CACHE.get(key)
if compiled is None:
op = SoftmaxBwdSM100(dtype_x, int(N))
- ptr_dy = rt.make_ptr(dtype_x, dy.data_ptr(), mem_space=rt.AddressSpace.gmem, assumed_align=16)
- ptr_y = rt.make_ptr(dtype_x, y.data_ptr(), mem_space=rt.AddressSpace.gmem, assumed_align=16)
- ptr_dx = rt.make_ptr(dtype_x, dx.data_ptr(), mem_space=rt.AddressSpace.gmem, assumed_align=16)
+ ptr_dy = rt.make_ptr(
+ dtype_x, dy.data_ptr(), mem_space=rt.AddressSpace.gmem, assumed_align=16
+ )
+ ptr_y = rt.make_ptr(
+ dtype_x, y.data_ptr(), mem_space=rt.AddressSpace.gmem, assumed_align=16
+ )
+ ptr_dx = rt.make_ptr(
+ dtype_x, dx.data_ptr(), mem_space=rt.AddressSpace.gmem, assumed_align=16
+ )
ld = Int32(int(dy.stride(0)))
compiled = cute.compile(
op.launch_from_ptrs,
@@ -634,9 +670,15 @@ def _softmax_backward_ptr_into(*, dy: Tensor, y: Tensor, dx: Tensor) -> None:
)
_PTR_BWD_COMPILE_CACHE[key] = compiled
- ptr_dy = rt.make_ptr(dtype_x, dy.data_ptr(), mem_space=rt.AddressSpace.gmem, assumed_align=16)
- ptr_y = rt.make_ptr(dtype_x, y.data_ptr(), mem_space=rt.AddressSpace.gmem, assumed_align=16)
- ptr_dx = rt.make_ptr(dtype_x, dx.data_ptr(), mem_space=rt.AddressSpace.gmem, assumed_align=16)
+ ptr_dy = rt.make_ptr(
+ dtype_x, dy.data_ptr(), mem_space=rt.AddressSpace.gmem, assumed_align=16
+ )
+ ptr_y = rt.make_ptr(
+ dtype_x, y.data_ptr(), mem_space=rt.AddressSpace.gmem, assumed_align=16
+ )
+ ptr_dx = rt.make_ptr(
+ dtype_x, dx.data_ptr(), mem_space=rt.AddressSpace.gmem, assumed_align=16
+ )
compiled(ptr_dy, ptr_y, ptr_dx, Int32(int(M)), Int32(int(dy.stride(0))), stream)
@@ -679,8 +721,14 @@ def softmax_backward(dy: Tensor, y: Tensor) -> Tensor:
N = dy.size(1)
dtype = TORCH2CUTE_DTYPE[dy.dtype]
- if _can_use_ptr_path_2d(dy) and _can_use_ptr_path_2d(y) and dy.stride() == y.stride():
- dx = torch.empty_strided(dy.shape, dy.stride(), device=dy.device, dtype=dy.dtype)
+ if (
+ _can_use_ptr_path_2d(dy)
+ and _can_use_ptr_path_2d(y)
+ and dy.stride() == y.stride()
+ ):
+ dx = torch.empty_strided(
+ dy.shape, dy.stride(), device=dy.device, dtype=dy.dtype
+ )
_softmax_backward_ptr_into(dy=dy, y=y, dx=dx)
return dx
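As a side note on the stride-preserving allocation above, the following standalone sketch (shapes and padding chosen purely for illustration) shows how `torch.empty_strided` keeps a padded-row `[M, N]` layout intact:

```python
import torch

# Build a padded-row [M, N] view: unit stride along N, row stride > N.
M, N, row_stride = 4, 6, 8  # illustrative sizes only
storage = torch.randn(M * row_stride)
dy = storage.as_strided((M, N), (row_stride, 1))
assert dy.stride() == (row_stride, 1)

# Allocating the output with the input's strides preserves the padding,
# which is what the pointer path (and torch.compile stride checks) rely on.
dx = torch.empty_strided(dy.shape, dy.stride(), device=dy.device, dtype=dy.dtype)
assert dx.shape == dy.shape and dx.stride() == dy.stride()
```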
From 7e818eecf3448f9f8173ac4ad5154177fea830e8 Mon Sep 17 00:00:00 2001
From: Laura Wang <3700467+Laurawly@users.noreply.github.com>
Date: Wed, 21 Jan 2026 20:11:32 -0800
Subject: [PATCH 7/8] oink: add license headers to benchmarks
---
oink/benchmarks/benchmark/bench_utils.py | 14 ++++++++++++++
.../benchmark/benchmark_cross_entropy_sm100.py | 14 ++++++++++++++
.../benchmark/benchmark_fused_add_rmsnorm_sm100.py | 14 ++++++++++++++
.../benchmark/benchmark_hbm_roofline_sm100.py | 14 ++++++++++++++
.../benchmark/benchmark_layernorm_sm100.py | 14 ++++++++++++++
.../benchmark/benchmark_rmsnorm_bwd_sm100.py | 14 ++++++++++++++
.../benchmark/benchmark_rmsnorm_sm100.py | 14 ++++++++++++++
.../benchmark/benchmark_softmax_sm100.py | 14 ++++++++++++++
oink/benchmarks/readme/plot_quack_style_svg.py | 14 ++++++++++++++
oink/benchmarks/readme/run_sm100_suite.py | 14 ++++++++++++++
oink/benchmarks/readme/summarize_results.py | 14 ++++++++++++++
11 files changed, 154 insertions(+)
diff --git a/oink/benchmarks/benchmark/bench_utils.py b/oink/benchmarks/benchmark/bench_utils.py
index 0a9ae4b..ef996ec 100644
--- a/oink/benchmarks/benchmark/bench_utils.py
+++ b/oink/benchmarks/benchmark/bench_utils.py
@@ -1,3 +1,17 @@
+# Copyright (c) Meta Platforms, Inc. and affiliates.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
from __future__ import annotations
import csv
diff --git a/oink/benchmarks/benchmark/benchmark_cross_entropy_sm100.py b/oink/benchmarks/benchmark/benchmark_cross_entropy_sm100.py
index ff1a99b..3c8bf44 100644
--- a/oink/benchmarks/benchmark/benchmark_cross_entropy_sm100.py
+++ b/oink/benchmarks/benchmark/benchmark_cross_entropy_sm100.py
@@ -1,3 +1,17 @@
+# Copyright (c) Meta Platforms, Inc. and affiliates.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
from __future__ import annotations
import argparse
diff --git a/oink/benchmarks/benchmark/benchmark_fused_add_rmsnorm_sm100.py b/oink/benchmarks/benchmark/benchmark_fused_add_rmsnorm_sm100.py
index 863712d..8a0227b 100644
--- a/oink/benchmarks/benchmark/benchmark_fused_add_rmsnorm_sm100.py
+++ b/oink/benchmarks/benchmark/benchmark_fused_add_rmsnorm_sm100.py
@@ -1,3 +1,17 @@
+# Copyright (c) Meta Platforms, Inc. and affiliates.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
"""
Benchmark fused_add_rmsnorm (in-place) on SM100.
diff --git a/oink/benchmarks/benchmark/benchmark_hbm_roofline_sm100.py b/oink/benchmarks/benchmark/benchmark_hbm_roofline_sm100.py
index c22294e..22fb48d 100644
--- a/oink/benchmarks/benchmark/benchmark_hbm_roofline_sm100.py
+++ b/oink/benchmarks/benchmark/benchmark_hbm_roofline_sm100.py
@@ -1,3 +1,17 @@
+# Copyright (c) Meta Platforms, Inc. and affiliates.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
"""
HBM roofline microbenchmark for SM100 (GB200 / Blackwell).
diff --git a/oink/benchmarks/benchmark/benchmark_layernorm_sm100.py b/oink/benchmarks/benchmark/benchmark_layernorm_sm100.py
index 3c0e37d..20895b7 100644
--- a/oink/benchmarks/benchmark/benchmark_layernorm_sm100.py
+++ b/oink/benchmarks/benchmark/benchmark_layernorm_sm100.py
@@ -1,3 +1,17 @@
+# Copyright (c) Meta Platforms, Inc. and affiliates.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
from __future__ import annotations
import argparse
diff --git a/oink/benchmarks/benchmark/benchmark_rmsnorm_bwd_sm100.py b/oink/benchmarks/benchmark/benchmark_rmsnorm_bwd_sm100.py
index b9909e7..31b335b 100644
--- a/oink/benchmarks/benchmark/benchmark_rmsnorm_bwd_sm100.py
+++ b/oink/benchmarks/benchmark/benchmark_rmsnorm_bwd_sm100.py
@@ -1,3 +1,17 @@
+# Copyright (c) Meta Platforms, Inc. and affiliates.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
from __future__ import annotations
import argparse
diff --git a/oink/benchmarks/benchmark/benchmark_rmsnorm_sm100.py b/oink/benchmarks/benchmark/benchmark_rmsnorm_sm100.py
index f4c8a5f..39e6cd7 100644
--- a/oink/benchmarks/benchmark/benchmark_rmsnorm_sm100.py
+++ b/oink/benchmarks/benchmark/benchmark_rmsnorm_sm100.py
@@ -1,3 +1,17 @@
+# Copyright (c) Meta Platforms, Inc. and affiliates.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
from __future__ import annotations
import argparse
diff --git a/oink/benchmarks/benchmark/benchmark_softmax_sm100.py b/oink/benchmarks/benchmark/benchmark_softmax_sm100.py
index 995b09f..a5b2b3c 100644
--- a/oink/benchmarks/benchmark/benchmark_softmax_sm100.py
+++ b/oink/benchmarks/benchmark/benchmark_softmax_sm100.py
@@ -1,3 +1,17 @@
+# Copyright (c) Meta Platforms, Inc. and affiliates.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
from __future__ import annotations
import argparse
diff --git a/oink/benchmarks/readme/plot_quack_style_svg.py b/oink/benchmarks/readme/plot_quack_style_svg.py
index af76832..88eebdf 100644
--- a/oink/benchmarks/readme/plot_quack_style_svg.py
+++ b/oink/benchmarks/readme/plot_quack_style_svg.py
@@ -1,3 +1,17 @@
+# Copyright (c) Meta Platforms, Inc. and affiliates.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
"""
Generate Quack-style SVG performance plots (Oink vs Quack) from the SM100 suite
JSON artifacts under `/tmp/kernelagent_oink_sm100_suite_{bf16,fp16}`.
diff --git a/oink/benchmarks/readme/run_sm100_suite.py b/oink/benchmarks/readme/run_sm100_suite.py
index c31d4b5..fb9d603 100644
--- a/oink/benchmarks/readme/run_sm100_suite.py
+++ b/oink/benchmarks/readme/run_sm100_suite.py
@@ -1,3 +1,17 @@
+# Copyright (c) Meta Platforms, Inc. and affiliates.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
from __future__ import annotations
import argparse
diff --git a/oink/benchmarks/readme/summarize_results.py b/oink/benchmarks/readme/summarize_results.py
index 29b768e..684694d 100644
--- a/oink/benchmarks/readme/summarize_results.py
+++ b/oink/benchmarks/readme/summarize_results.py
@@ -1,3 +1,17 @@
+# Copyright (c) Meta Platforms, Inc. and affiliates.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
from __future__ import annotations
import argparse
From 5d195d6c2d80b9415d8a5433c54494ccac769eac Mon Sep 17 00:00:00 2001
From: Laura Wang <3700467+Laurawly@users.noreply.github.com>
Date: Thu, 22 Jan 2026 09:59:37 -0800
Subject: [PATCH 8/8] oink: update SM100 benchmarks, plots, and Blackwell kernels
---
oink/benchmarks/README.md | 6 +
.../benchmark_fused_add_rmsnorm_sm100.py | 55 +-
.../benchmark/benchmark_rmsnorm_bwd_sm100.py | 11 +-
.../media/sm100_bf16_oink_vs_quack.svg | 350 ++--
.../media/sm100_bf16_oink_vs_quack_dsv3.svg | 714 +++----
.../sm100_bf16_oink_vs_quack_dsv3_all.svg | 570 +++---
..._bf16_oink_vs_quack_dsv3_cross_entropy.svg | 180 +-
...bf16_oink_vs_quack_dsv3_with_layernorm.svg | 1742 ++++++++---------
...m100_bf16_oink_vs_quack_with_layernorm.svg | 460 ++---
.../media/sm100_fp16_oink_vs_quack.svg | 350 ++--
.../media/sm100_fp16_oink_vs_quack_dsv3.svg | 714 +++----
.../sm100_fp16_oink_vs_quack_dsv3_all.svg | 570 +++---
..._fp16_oink_vs_quack_dsv3_cross_entropy.svg | 180 +-
...fp16_oink_vs_quack_dsv3_with_layernorm.svg | 1742 ++++++++---------
...m100_fp16_oink_vs_quack_with_layernorm.svg | 460 ++---
oink/benchmarks/readme/run_sm100_suite.py | 17 +
.../blackwell/cross_entropy.py | 1045 +++++++++-
.../kernelagent_oink/blackwell/fast_launch.py | 115 ++
.../kernelagent_oink/blackwell/layernorm.py | 845 ++++++--
.../kernelagent_oink/blackwell/lite_quack.py | 181 +-
.../src/kernelagent_oink/blackwell/rmsnorm.py | 1287 +++++++++++-
.../src/kernelagent_oink/blackwell/softmax.py | 834 +++++++-
22 files changed, 7996 insertions(+), 4432 deletions(-)
create mode 100644 oink/src/kernelagent_oink/blackwell/fast_launch.py
diff --git a/oink/benchmarks/README.md b/oink/benchmarks/README.md
index ceb7932..a5c4676 100644
--- a/oink/benchmarks/README.md
+++ b/oink/benchmarks/README.md
@@ -96,6 +96,12 @@ CUDA_VISIBLE_DEVICES=0 python oink/benchmarks/benchmark/benchmark_fused_add_rmsn
--json /tmp/fused_add_rmsnorm_sm100_bf16.json
```
+Note on the Quack baseline: Oink exposes an **in-place** fused op (updates `x` and `residual`).
+Quack’s fused kernel produces `out` and `residual_out` out-of-place, so by default the benchmark
+times `quack::_rmsnorm_fwd` **plus** two explicit copies (`x.copy_(out)`, `residual.copy_(residual_out)`)
+to match the in-place semantics (integration-realistic). Use `--quack-baseline kernel` to time only
+the Quack fused kernel with preallocated outputs.
+
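To make the default (`kernel_inplace`) baseline concrete, here is a minimal sketch of what it times; the `quack_fused_rmsnorm` callable and its argument order are simplified placeholders, not Quack's actual `_rmsnorm_fwd` signature:

```python
import torch

def make_quack_inplace_step(x, residual, weight, quack_fused_rmsnorm):
    """Return a closure that reproduces Oink's in-place semantics on top of
    an out-of-place Quack-style fused kernel (illustrative sketch only)."""
    out = torch.empty_like(x)
    residual_out = torch.empty_like(residual)

    def step():
        # Out-of-place fused add + RMSNorm (placeholder signature).
        quack_fused_rmsnorm(x, weight, out, residual, residual_out)
        # Two explicit copies make the result in-place, matching the
        # torch.ops.oink.fused_add_rms_norm / vLLM semantics:
        x.copy_(out)                  # x <- rmsnorm(x + residual)
        residual.copy_(residual_out)  # residual <- x + residual
        return x, residual

    return step
```

Timing `step` therefore includes the copy traffic that `--quack-baseline kernel` deliberately excludes.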
### RMSNorm backward
```bash
diff --git a/oink/benchmarks/benchmark/benchmark_fused_add_rmsnorm_sm100.py b/oink/benchmarks/benchmark/benchmark_fused_add_rmsnorm_sm100.py
index 8a0227b..1787d7d 100644
--- a/oink/benchmarks/benchmark/benchmark_fused_add_rmsnorm_sm100.py
+++ b/oink/benchmarks/benchmark/benchmark_fused_add_rmsnorm_sm100.py
@@ -31,6 +31,15 @@
DSv3 suite (Oink vs Quack, multi-shape):
CUDA_VISIBLE_DEVICES=0 python oink/benchmarks/benchmark/benchmark_fused_add_rmsnorm_sm100.py --dtype bf16 --dsv3 \\
--json /tmp/kernelagent_oink_sm100_suite_bf16/fused_add_rmsnorm_dsv3.json
+
+Quack baseline note:
+- Oink exposes an **in-place** fused op (writes `x` and `residual` in-place).
+- Quack provides an equivalent fused kernel, but typically returns `out` and
+ `residual_out` (out-of-place) and does not expose a public "update my input
+ buffers in-place" API.
+- For integration realism (vLLM-style semantics) we default to timing the
+  Quack fused kernel plus two explicit copies that apply the in-place updates,
+  so the benchmark covers the full semantic cost.
"""
from __future__ import annotations
@@ -177,6 +186,7 @@ def bench_one(
warmup_ms: int,
iters_ms: int,
verify: bool,
+ quack_baseline: str,
) -> Dict[str, Any]:
device = torch.device("cuda")
x = torch.randn((M, N), device=device, dtype=dtype)
@@ -212,23 +222,40 @@ def fn():
row.update(stats)
if quack_rmsnorm_fwd_mut is not None:
- out_q = torch.empty_like(x)
- res_out_q = torch.empty_like(residual)
+ x_q = x.clone()
+ residual_q = residual.clone()
+ out_q = torch.empty_like(x_q)
+ res_out_q = torch.empty_like(residual_q)
- def fn_q():
+ def fn_q_kernel():
quack_rmsnorm_fwd_mut(
- x,
+ x_q,
w,
out_q,
None, # bias
None, # rstd
None, # mean
- residual,
+ residual_q,
res_out_q,
1e-6,
False, # is_layernorm
)
+ if quack_baseline == "kernel":
+ fn_q = fn_q_kernel
+ elif quack_baseline == "kernel_inplace":
+
+ def fn_q():
+ fn_q_kernel()
+ # Apply the same in-place semantics as vLLM expects:
+ # - x is overwritten with y
+ # - residual is overwritten with z = x + residual
+ x_q.copy_(out_q)
+ residual_q.copy_(res_out_q)
+
+ else:
+ raise ValueError(f"Unknown quack_baseline: {quack_baseline}")
+
ms_q = do_bench_triton(fn_q, warmup_ms=warmup_ms, rep_ms=iters_ms)
gbps_q = bytes_io / (ms_q * 1e-3) / 1e9
row.update(
@@ -287,6 +314,18 @@ def main() -> None:
p.add_argument(
"--iters", type=int, default=200, help="rep_ms for do_bench (default: 200)"
)
+ p.add_argument(
+ "--quack-baseline",
+ type=str,
+ default="kernel_inplace",
+ choices=["kernel", "kernel_inplace"],
+ help=(
+ "How to time Quack for the in-place fused op.\n"
+ "- kernel: Quack fused kernel only (preallocated out/residual_out).\n"
+ "- kernel_inplace: Quack fused kernel + 2 explicit copies to apply "
+ "in-place semantics (integration-realistic)."
+ ),
+ )
p.add_argument("--skip-verify", action="store_true")
p.add_argument("--json", type=str, default=None)
args = p.parse_args()
@@ -309,6 +348,7 @@ def main() -> None:
warmup_ms=int(args.warmup_ms),
iters_ms=int(args.iters),
verify=not bool(args.skip_verify),
+ quack_baseline=str(args.quack_baseline),
)
)
@@ -324,7 +364,10 @@ def main() -> None:
warmup_ms=int(args.warmup_ms),
rep_ms=int(args.iters),
method="triton.testing.do_bench(mean)",
- note="Oink fused_add_rmsnorm_inplace_ vs Quack quack::_rmsnorm_fwd(residual=..., residual_out=...) when available",
+ note=(
+ "Oink fused_add_rmsnorm_inplace_ vs Quack baseline "
+ f"({args.quack_baseline}) when available"
+ ),
),
)
diff --git a/oink/benchmarks/benchmark/benchmark_rmsnorm_bwd_sm100.py b/oink/benchmarks/benchmark/benchmark_rmsnorm_bwd_sm100.py
index 31b335b..50ecb2e 100644
--- a/oink/benchmarks/benchmark/benchmark_rmsnorm_bwd_sm100.py
+++ b/oink/benchmarks/benchmark/benchmark_rmsnorm_bwd_sm100.py
@@ -17,7 +17,6 @@
import argparse
import csv
import os
-import sys
from dataclasses import dataclass
from typing import List, Optional, Tuple
@@ -30,19 +29,17 @@
# Ensure SM100 (GB200) architecture is recognized by CuTeDSL when running outside vLLM.
os.environ.setdefault("CUTE_DSL_ARCH", "sm_100a")
-# Make the in-repo KernelAgent Oink package importable without an editable install.
-_HERE = os.path.dirname(os.path.abspath(__file__))
-_OINK_SRC = os.path.abspath(os.path.join(_HERE, "..", "src"))
-if _OINK_SRC not in sys.path:
- sys.path.insert(0, _OINK_SRC)
-
from bench_utils import ( # noqa: E402
ErrorStatsAccumulator,
collect_device_meta,
+ ensure_oink_src_on_path,
error_stats_to_row,
iter_row_blocks,
write_json,
)
+
+ensure_oink_src_on_path()
+
from kernelagent_oink.blackwell import rmsnorm as oink_rmsnorm # noqa: E402
try:
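The `ensure_oink_src_on_path` helper imported above is not shown in this hunk; a plausible implementation (an assumption based on the inline code it replaces, not the actual `bench_utils.py` contents) would be:

```python
import os
import sys

def ensure_oink_src_on_path() -> None:
    """Make the in-repo `kernelagent_oink` package importable without an
    editable install by prepending `oink/src` to sys.path (idempotent)."""
    here = os.path.dirname(os.path.abspath(__file__))
    oink_src = os.path.abspath(os.path.join(here, "..", "src"))
    if oink_src not in sys.path:
        sys.path.insert(0, oink_src)
```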
diff --git a/oink/benchmarks/media/sm100_bf16_oink_vs_quack.svg b/oink/benchmarks/media/sm100_bf16_oink_vs_quack.svg
index e32e3a7..96b5b83 100644
--- a/oink/benchmarks/media/sm100_bf16_oink_vs_quack.svg
+++ b/oink/benchmarks/media/sm100_bf16_oink_vs_quack.svg
@@ [generated SVG diff body omitted] @@
[The plot was regenerated: the embedded creation timestamp changes from
 2026-01-12T23:31:37 to 2026-01-22T03:16:57; the remaining changes are
 re-emitted clip-path IDs and path/marker geometry with no readable content.]
diff --git a/oink/benchmarks/media/sm100_bf16_oink_vs_quack_dsv3.svg b/oink/benchmarks/media/sm100_bf16_oink_vs_quack_dsv3.svg
index b70ba9b..254623e 100644
--- a/oink/benchmarks/media/sm100_bf16_oink_vs_quack_dsv3.svg
+++ b/oink/benchmarks/media/sm100_bf16_oink_vs_quack_dsv3.svg
@@ -1,12 +1,12 @@
-