From 0219d6d78b3efc22f25b8b1bb499022b66fd5871 Mon Sep 17 00:00:00 2001
From: Jlonge4 <josh.longenecker1@gmail.com>
Date: Sat, 17 Jan 2026 22:30:33 -0500
Subject: [PATCH 01/21] Add einsum operation with HLO implementation

Implements Einstein summation notation (einsum) for NKIPy, supporting:
- Matrix multiplication and batch operations (ij,jk->ik, bij,bjk->bik)
- Transpose and dimension permutation (ij->ji, ijk->kji)
- Reductions and trace operations (ij->, ii->)
- Outer products (i,j->ij)
- Broadcasting patterns (ij,j->ij)
- Complex tensor contractions (ijk,jkl->il)
- N-ary operations (i,ij,j->)

Implementation decomposes einsum patterns into HLO primitives:
- dot_general for contractions
- transpose for dimension reordering
- reduce for summations
- broadcast/multiply for outer products

Includes comprehensive tests covering all major einsum patterns
and examples demonstrating real-world usage including simplified
attention mechanisms.
---
 examples/playground/einsum_example.py | 210 ++++++++++++++
 nkipy/src/nkipy/core/ops/__init__.py  |   7 +
 nkipy/src/nkipy/core/ops/einsum.py    | 391 ++++++++++++++++++++++++++
 tests/unit/test_einsum.py             | 335 ++++++++++++++++++++++
 4 files changed, 943 insertions(+)
 create mode 100644 examples/playground/einsum_example.py
 create mode 100644 nkipy/src/nkipy/core/ops/einsum.py
 create mode 100644 tests/unit/test_einsum.py

diff --git a/examples/playground/einsum_example.py b/examples/playground/einsum_example.py
new file mode 100644
index 0000000..d396deb
--- /dev/null
+++ b/examples/playground/einsum_example.py
@@ -0,0 +1,210 @@
+#!/usr/bin/env python3
+# Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved.
+# SPDX-License-Identifier: Apache-2.0
+"""Example demonstrating einsum operation in NKIPy.
+
+Einstein summation notation provides a concise and powerful way to
+express tensor operations including matrix multiplication, transposes,
+reductions, and complex tensor contractions.
+"""
+
+import numpy as np
+
+from nkipy.runtime.decorators import simulate_jit
+
+
+# =============================================================================
+# Matrix Operations
+# =============================================================================
+
+@simulate_jit
+def matmul_einsum(A, B):
+    """Matrix multiplication using einsum."""
+    import nkipy.core.ops as ops
+    return ops.einsum('ij,jk->ik', A, B)
+
+
+@simulate_jit
+def batch_matmul_einsum(A, B):
+    """Batch matrix multiplication using einsum."""
+    import nkipy.core.ops as ops
+    return ops.einsum('bij,bjk->bik', A, B)
+
+
+# =============================================================================
+# Transpose and Permutation
+# =============================================================================
+
+@simulate_jit
+def transpose_einsum(A):
+    """Transpose using einsum."""
+    import nkipy.core.ops as ops
+    return ops.einsum('ij->ji', A)
+
+
+@simulate_jit
+def permute_dims_einsum(A):
+    """Permute dimensions using einsum."""
+    import nkipy.core.ops as ops
+    return ops.einsum('ijk->kij', A)
+
+
+# =============================================================================
+# Reductions
+# =============================================================================
+
+@simulate_jit
+def trace_einsum(A):
+    """Matrix trace using einsum."""
+    import nkipy.core.ops as ops
+    return ops.einsum('ii->', A)
+
+
+@simulate_jit
+def sum_axis_einsum(A):
+    """Sum along axis using einsum."""
+    import nkipy.core.ops as ops
+    return ops.einsum('ij->i', A)
+
+
+# =============================================================================
+# Outer Products
+# =============================================================================
+
+@simulate_jit
+def outer_product_einsum(a, b):
+    """Outer product using einsum."""
+    import nkipy.core.ops as ops
+    return ops.einsum('i,j->ij', a, b)
+
+
+# =============================================================================
+# Advanced Patterns
+# =============================================================================
+
+@simulate_jit
+def dot_product_einsum(a, b):
+    """Dot product using einsum."""
+    import nkipy.core.ops as ops
+    return ops.einsum('i,i->', a, b)
+
+
+@simulate_jit
+def bilinear_form_einsum(x, A, y):
+    """Bilinear form x^T A y using einsum."""
+    import nkipy.core.ops as ops
+    return ops.einsum('i,ij,j->', x, A, y)
+
+
+@simulate_jit
+def attention_pattern_einsum(Q, K, V):
+    """Simplified attention pattern: Q @ K^T @ V using einsum.
+    
+    This computes (Q @ K^T) @ V in one operation.
+    Q: (batch, seq_q, d_k)
+    K: (batch, seq_k, d_k)
+    V: (batch, seq_v, d_v)
+    
+    Note: This is simplified - real attention includes scaling and softmax.
+    """
+    import nkipy.core.ops as ops
+    # Q @ K^T: 'bid,bjd->bij' where i=seq_q, j=seq_k
+    # (Q @ K^T) @ V: 'bij,bjd->bid' where final d is d_v
+    # Combined: 'bik,bjk,bjd->bid'
+    return ops.einsum('bik,bjk,bjd->bid', Q, K, V)
+
+
+def main():
+    print("=" * 70)
+    print("NKIPy einsum Examples")
+    print("=" * 70)
+    
+    # Example 1: Matrix Multiplication
+    print("\n1. Matrix Multiplication (ij,jk->ik):")
+    A = np.array([[1, 2], [3, 4]], dtype=np.float32)
+    B = np.array([[5, 6], [7, 8]], dtype=np.float32)
+    print(f"A =\n{A}")
+    print(f"B =\n{B}")
+    result = matmul_einsum(A, B)
+    print(f"A @ B =\n{result}")
+    print(f"NumPy result:\n{np.einsum('ij,jk->ik', A, B)}")
+    
+    # Example 2: Batch Matrix Multiplication
+    print("\n2. Batch Matrix Multiplication (bij,bjk->bik):")
+    A_batch = np.random.rand(2, 3, 4).astype(np.float32)
+    B_batch = np.random.rand(2, 4, 5).astype(np.float32)
+    result_batch = batch_matmul_einsum(A_batch, B_batch)
+    print(f"Batch A shape: {A_batch.shape}")
+    print(f"Batch B shape: {B_batch.shape}")
+    print(f"Result shape: {result_batch.shape}")
+    
+    # Example 3: Transpose
+    print("\n3. Transpose (ij->ji):")
+    C = np.array([[1, 2, 3], [4, 5, 6]], dtype=np.float32)
+    print(f"C =\n{C}")
+    result_transpose = transpose_einsum(C)
+    print(f"C^T =\n{result_transpose}")
+    
+    # Example 4: Trace
+    print("\n4. Matrix Trace (ii->):")
+    D = np.array([[1, 2, 3], [4, 5, 6], [7, 8, 9]], dtype=np.float32)
+    print(f"D =\n{D}")
+    trace = trace_einsum(D)
+    print(f"Trace(D) = {trace}")
+    print(f"NumPy trace: {np.trace(D)}")
+    
+    # Example 5: Outer Product
+    print("\n5. Outer Product (i,j->ij):")
+    a = np.array([1, 2, 3], dtype=np.float32)
+    b = np.array([4, 5], dtype=np.float32)
+    print(f"a = {a}")
+    print(f"b = {b}")
+    result_outer = outer_product_einsum(a, b)
+    print(f"Outer product a ⊗ b =\n{result_outer}")
+    
+    # Example 6: Dot Product
+    print("\n6. Dot Product (i,i->):")
+    x = np.array([1, 2, 3, 4], dtype=np.float32)
+    y = np.array([5, 6, 7, 8], dtype=np.float32)
+    print(f"x = {x}")
+    print(f"y = {y}")
+    dot = dot_product_einsum(x, y)
+    print(f"x · y = {dot}")
+    print(f"NumPy dot: {np.dot(x, y)}")
+    
+    # Example 7: Bilinear Form
+    print("\n7. Bilinear Form x^T A y (i,ij,j->):")
+    x = np.array([1, 2], dtype=np.float32)
+    A = np.array([[3, 4], [5, 6]], dtype=np.float32)
+    y = np.array([7, 8], dtype=np.float32)
+    print(f"x = {x}")
+    print(f"A =\n{A}")
+    print(f"y = {y}")
+    bilinear = bilinear_form_einsum(x, A, y)
+    print(f"x^T A y = {bilinear}")
+    
+    # Example 8: Sum along axis
+    print("\n8. Sum Along Axis (ij->i):")
+    E = np.array([[1, 2, 3], [4, 5, 6]], dtype=np.float32)
+    print(f"E =\n{E}")
+    sum_result = sum_axis_einsum(E)
+    print(f"Sum along axis 1: {sum_result}")
+    
+    # Example 9: Attention-like pattern (simplified)
+    print("\n9. Simplified Attention Pattern (bik,bjk,bjd->bid):")
+    Q = np.random.rand(1, 2, 3).astype(np.float32)  # (batch=1, seq_q=2, d_k=3)
+    K = np.random.rand(1, 4, 3).astype(np.float32)  # (batch=1, seq_k=4, d_k=3)
+    V = np.random.rand(1, 4, 5).astype(np.float32)  # (batch=1, seq_v=4, d_v=5)
+    print(f"Q shape: {Q.shape} (batch, seq_q, d_k)")
+    print(f"K shape: {K.shape} (batch, seq_k, d_k)")
+    print(f"V shape: {V.shape} (batch, seq_v, d_v)")
+    attn_result = attention_pattern_einsum(Q, K, V)
+    print(f"Attention output shape: {attn_result.shape} (batch, seq_q, d_v)")
+    
+    print("\n" + "=" * 70)
+    print("All einsum examples completed!")
+    print("=" * 70)
+
+
+if __name__ == "__main__":
+    main()
diff --git a/nkipy/src/nkipy/core/ops/__init__.py b/nkipy/src/nkipy/core/ops/__init__.py
index 31f77d4..2802a5d 100644
--- a/nkipy/src/nkipy/core/ops/__init__.py
+++ b/nkipy/src/nkipy/core/ops/__init__.py
@@ -89,6 +89,11 @@
 # -----------------------------------------------------------------------------
 from nkipy.core.ops.linalg import matmul
 
+# -----------------------------------------------------------------------------
+# Einstein summation
+# -----------------------------------------------------------------------------
+from nkipy.core.ops.einsum import einsum
+
 # -----------------------------------------------------------------------------
 # Neural network operations
 # -----------------------------------------------------------------------------
@@ -208,6 +213,8 @@
     "full_like",
     # Linalg
     "matmul",
+    # Einsum
+    "einsum",
     # Reduction
     "sum",
     "max",
diff --git a/nkipy/src/nkipy/core/ops/einsum.py b/nkipy/src/nkipy/core/ops/einsum.py
new file mode 100644
index 0000000..4c8ae45
--- /dev/null
+++ b/nkipy/src/nkipy/core/ops/einsum.py
@@ -0,0 +1,391 @@
+# Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved.
+# SPDX-License-Identifier: Apache-2.0
+"""Einstein summation (einsum) operation.
+
+Provides a flexible interface for expressing tensor contractions using
+Einstein notation, such as matrix multiplication, batch operations,
+traces, and more.
+"""
+
+from typing import Dict, List, Set, Tuple
+
+import numpy as np
+
+from nkipy.core.ops._registry import Op
+
+# =============================================================================
+# Einsum Subscript Parsing
+# =============================================================================
+
+
+def parse_einsum_subscripts(subscripts: str, num_operands: int) -> Tuple[List[str], str]:
+    """Parse einsum subscript string into input and output specifications.
+    
+    Args:
+        subscripts: Einstein notation string (e.g., 'ij,jk->ik' or 'ij,jk')
+        num_operands: Number of input operands
+    
+    Returns:
+        Tuple of (input_specs, output_spec) where:
+        - input_specs: List of strings, one per operand (e.g., ['ij', 'jk'])
+        - output_spec: Output string (e.g., 'ik'), inferred if not provided
+    
+    Examples:
+        >>> parse_einsum_subscripts('ij,jk->ik', 2)
+        (['ij', 'jk'], 'ik')
+        
+        >>> parse_einsum_subscripts('ii', 1)  # trace - inferred output is ''
+        (['ii'], '')
+    """
+    # Remove whitespace
+    subscripts = subscripts.replace(' ', '')
+    
+    # Check for explicit output specification
+    if '->' in subscripts:
+        input_str, output_spec = subscripts.split('->')
+        input_specs = input_str.split(',')
+    else:
+        input_specs = subscripts.split(',')
+        # Infer output: all indices that appear exactly once across all inputs
+        all_indices: Dict[str, int] = {}
+        for spec in input_specs:
+            for idx in spec:
+                all_indices[idx] = all_indices.get(idx, 0) + 1
+        
+        # Output contains indices that appear exactly once, in order of first appearance
+        output_spec = ''
+        seen: Set[str] = set()
+        for spec in input_specs:
+            for idx in spec:
+                if idx not in seen and all_indices[idx] == 1:
+                    output_spec += idx
+                    seen.add(idx)
+    
+    if len(input_specs) != num_operands:
+        raise ValueError(
+            f"Number of subscripts ({len(input_specs)}) does not match "
+            f"number of operands ({num_operands})"
+        )
+    
+    return input_specs, output_spec
+
+
+def analyze_einsum_pattern(
+    input_specs: List[str], output_spec: str, shapes: List[Tuple[int, ...]]
+) -> Dict:
+    """Analyze einsum pattern to determine dimension mapping and operation type.
+    
+    Args:
+        input_specs: List of input subscript strings
+        output_spec: Output subscript string
+        shapes: List of input tensor shapes
+    
+    Returns:
+        Dictionary containing:
+        - 'input_dims': Dict mapping each input index to dimension size
+        - 'contracting_dims': Set of indices being contracted (summed over)
+        - 'batch_dims': Set of indices appearing in all inputs and output
+        - 'output_order': List of indices in output order
+    """
+    # Build index -> dimension size mapping
+    input_dims: Dict[str, int] = {}
+    for spec, shape in zip(input_specs, shapes):
+        if len(spec) != len(shape):
+            raise ValueError(
+                f"Subscript '{spec}' has {len(spec)} indices but shape has "
+                f"{len(shape)} dimensions: {shape}"
+            )
+        for idx, size in zip(spec, shape):
+            if idx in input_dims and input_dims[idx] != size:
+                raise ValueError(
+                    f"Index '{idx}' has inconsistent dimensions: "
+                    f"{input_dims[idx]} vs {size}"
+                )
+            input_dims[idx] = size
+    
+    # Collect all unique indices
+    all_indices = set()
+    for spec in input_specs:
+        all_indices.update(spec)
+    
+    # Determine contracting dimensions (in inputs but not output)
+    contracting_dims = all_indices - set(output_spec)
+    
+    # Determine batch dimensions (in all inputs and output)
+    batch_dims = set(output_spec)
+    for spec in input_specs:
+        batch_dims &= set(spec)
+    
+    return {
+        'input_dims': input_dims,
+        'contracting_dims': contracting_dims,
+        'batch_dims': batch_dims,
+        'output_order': list(output_spec),
+    }
+
+
+# =============================================================================
+# Einsum Operation
+# =============================================================================
+einsum = Op("einsum")
+
+
+@einsum.impl("hlo")
+def _einsum_hlo(subscripts, *operands, dtype=None):
+    """Einstein summation convention on tensors (HLO implementation).
+    
+    Implements einsum using HLO operations: transpose, dot_general, reduce.
+    Supports common patterns like matrix multiplication, batch operations,
+    traces, outer products, and more.
+    
+    Args:
+        subscripts: Einstein notation string (e.g., 'ij,jk->ik')
+        *operands: Input tensors
+        dtype: Optional output dtype (if None, inferred from inputs)
+    
+    Returns:
+        Result tensor according to einsum specification
+    
+    Examples:
+        >>> # Matrix multiply
+        >>> einsum('ij,jk->ik', A, B)
+        
+        >>> # Batch matrix multiply  
+        >>> einsum('bij,bjk->bik', A, B)
+        
+        >>> # Trace
+        >>> einsum('ii->', A)
+        
+        >>> # Outer product
+        >>> einsum('i,j->ij', a, b)
+    """
+    from nkipy.core.backend.hlo import get_hlo_context
+    from nkipy.core.tensor import NKIPyTensorRef
+    
+    if not operands:
+        raise ValueError("einsum requires at least one operand")
+    
+    # Parse subscripts
+    input_specs, output_spec = parse_einsum_subscripts(subscripts, len(operands))
+    
+    # Convert to HLO tensors
+    ctx = get_hlo_context()
+    hlo_operands = []
+    shapes = []
+    
+    for op in operands:
+        if isinstance(op, NKIPyTensorRef):
+            hlo_operands.append(op.backend_tensor)
+            shapes.append(op.backend_tensor.shape)
+        else:
+            hlo_operands.append(op)
+            shapes.append(op.shape)
+    
+    # Analyze pattern
+    analysis = analyze_einsum_pattern(input_specs, output_spec, shapes)
+    
+    # Handle special cases for optimization
+    if len(operands) == 1:
+        return _einsum_unary(ctx, hlo_operands[0], input_specs[0], output_spec, analysis)
+    elif len(operands) == 2:
+        return _einsum_binary(
+            ctx, hlo_operands[0], hlo_operands[1], 
+            input_specs[0], input_specs[1], output_spec, analysis
+        )
+    else:
+        # General case: reduce to binary operations
+        return _einsum_nary(ctx, hlo_operands, input_specs, output_spec, analysis)
+
+
+def _einsum_unary(ctx, operand, input_spec, output_spec, analysis):
+    """Handle single-operand einsum (transpose, trace, reduction)."""
+    from nkipy.core.backend.hlo import as_hlo_tensor
+    from nkipy.core.tensor import NKIPyTensorRef
+    
+    # If output is empty, it's a full reduction
+    if not output_spec:
+        # Reduce all dimensions
+        init_tensor = as_hlo_tensor(ctx, 0.0, operand.dtype)
+        result = ctx.build_op(
+            "reduce",
+            [operand, init_tensor],
+            (),  # scalar output
+            operand.dtype,
+            {
+                "dimensions": list(range(len(operand.shape))),
+                "computation": "add",
+            }
+        )
+        return NKIPyTensorRef(result)
+    
+    # Determine which dimensions to reduce
+    dims_to_reduce = []
+    output_dims = []
+    
+    for i, idx in enumerate(input_spec):
+        if idx not in output_spec:
+            dims_to_reduce.append(i)
+        else:
+            output_dims.append((idx, i, operand.shape[i]))
+    
+    # Sort output dimensions by their order in output_spec
+    output_dims.sort(key=lambda x: output_spec.index(x[0]))
+    
+    # If there are dimensions to reduce
+    if dims_to_reduce:
+        reduced_shape = tuple(size for _, _, size in output_dims)
+        init_tensor = as_hlo_tensor(ctx, 0.0, operand.dtype)
+        operand = ctx.build_op(
+            "reduce",
+            [operand, init_tensor],
+            reduced_shape,
+            operand.dtype,
+            {
+                "dimensions": dims_to_reduce,
+                "computation": "add",
+            }
+        )
+    
+    # If we need to transpose to match output order
+    current_order = [idx for idx, _, _ in output_dims]
+    if current_order != list(output_spec):
+        # Build permutation
+        perm = [current_order.index(idx) for idx in output_spec]
+        transposed_shape = tuple(operand.shape[i] for i in perm)
+        operand = ctx.build_op(
+            "transpose",
+            [operand],
+            transposed_shape,
+            operand.dtype,
+            {"permutation": perm}
+        )
+    
+    return NKIPyTensorRef(operand)
+
+
+def _einsum_binary(ctx, lhs, rhs, lhs_spec, rhs_spec, output_spec, analysis):
+    """Handle two-operand einsum (matmul, outer product, etc.)."""
+    from nkipy.core.tensor import NKIPyTensorRef
+    
+    # Find contracting, batch, and free dimensions
+    lhs_indices = list(lhs_spec)
+    rhs_indices = list(rhs_spec)
+    
+    contracting_dims = analysis['contracting_dims']
+    
+    # Identify dimension roles for each operand
+    lhs_contracting = [i for i, idx in enumerate(lhs_indices) if idx in contracting_dims]
+    rhs_contracting = [i for i, idx in enumerate(rhs_indices) if idx in contracting_dims]
+    
+    lhs_batch = [i for i, idx in enumerate(lhs_indices) if idx in analysis['batch_dims']]
+    rhs_batch = [i for i, idx in enumerate(rhs_indices) if idx in analysis['batch_dims']]
+    
+    # Compute output shape
+    output_shape = tuple(analysis['input_dims'][idx] for idx in output_spec)
+    
+    # Use dot_general for contraction
+    if contracting_dims:
+        result = ctx.build_op(
+            "dot",
+            [lhs, rhs],
+            output_shape,
+            lhs.dtype,
+            {
+                "lhs_contracting_dimensions": lhs_contracting,
+                "rhs_contracting_dimensions": rhs_contracting,
+                "lhs_batch_dimensions": lhs_batch,
+                "rhs_batch_dimensions": rhs_batch,
+            }
+        )
+    else:
+        # No contraction - it's an outer product or broadcast multiply
+        # Reshape both operands to have compatible shapes, then multiply
+        # For now, use broadcasting via reshape + multiply
+        
+        # Determine the position of each operand's dimensions in output
+        lhs_out_positions = [output_spec.index(idx) for idx in lhs_indices]
+        rhs_out_positions = [output_spec.index(idx) for idx in rhs_indices]
+        
+        # Reshape lhs: add dimensions at positions not in lhs
+        new_lhs_shape = [1] * len(output_shape)
+        for i, pos in enumerate(lhs_out_positions):
+            new_lhs_shape[pos] = lhs.shape[i]
+        lhs_reshaped = ctx.build_op("reshape", [lhs], tuple(new_lhs_shape), lhs.dtype)
+        
+        # Broadcast lhs to output shape
+        lhs_broadcasted = ctx.build_op(
+            "broadcast",
+            [lhs_reshaped],
+            output_shape,
+            lhs.dtype,
+            {"broadcast_dimensions": lhs_out_positions}
+        )
+        
+        # Reshape rhs similarly
+        new_rhs_shape = [1] * len(output_shape)
+        for i, pos in enumerate(rhs_out_positions):
+            new_rhs_shape[pos] = rhs.shape[i]
+        rhs_reshaped = ctx.build_op("reshape", [rhs], tuple(new_rhs_shape), rhs.dtype)
+        
+        # Broadcast rhs to output shape
+        rhs_broadcasted = ctx.build_op(
+            "broadcast",
+            [rhs_reshaped],
+            output_shape,
+            rhs.dtype,
+            {"broadcast_dimensions": rhs_out_positions}
+        )
+        
+        # Multiply
+        result = ctx.build_op(
+            "multiply",
+            [lhs_broadcasted, rhs_broadcasted],
+            output_shape,
+            lhs.dtype
+        )
+    
+    return NKIPyTensorRef(result)
+
+
+def _einsum_nary(ctx, operands, input_specs, output_spec, analysis):
+    """Handle n-ary einsum by reducing to binary operations."""
+    # Chain binary operations left-to-right
+    result = operands[0]
+    current_spec = input_specs[0]
+    
+    for i in range(1, len(operands)):
+        # Determine intermediate output spec (union of remaining indices)
+        remaining_specs = input_specs[i:]
+        remaining_indices = set(output_spec)
+        for spec in remaining_specs:
+            remaining_indices.update(spec)
+        
+        # Build intermediate spec in canonical order
+        intermediate_spec = ''.join(
+            idx for idx in current_spec + input_specs[i] 
+            if idx in remaining_indices and idx not in 
+            ''.join(idx for idx in current_spec + input_specs[i] 
+                   if idx in remaining_indices)[:current_spec.index(idx) 
+                                                 if idx in current_spec else len(current_spec)]
+        )
+        
+        # Perform binary einsum
+        from nkipy.core.backend.hlo import get_hlo_context
+        shapes = [result.shape, operands[i].shape]
+        sub_analysis = analyze_einsum_pattern(
+            [current_spec, input_specs[i]], 
+            intermediate_spec if i < len(operands) - 1 else output_spec,
+            shapes
+        )
+        
+        result_ref = _einsum_binary(
+            ctx, result, operands[i],
+            current_spec, input_specs[i],
+            intermediate_spec if i < len(operands) - 1 else output_spec,
+            sub_analysis
+        )
+        result = result_ref.backend_tensor
+        current_spec = intermediate_spec if i < len(operands) - 1 else output_spec
+    
+    from nkipy.core.tensor import NKIPyTensorRef
+    return NKIPyTensorRef(result)
diff --git a/tests/unit/test_einsum.py b/tests/unit/test_einsum.py
new file mode 100644
index 0000000..b59d622
--- /dev/null
+++ b/tests/unit/test_einsum.py
@@ -0,0 +1,335 @@
+# Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved.
+# SPDX-License-Identifier: Apache-2.0
+"""Unit tests for einsum operation."""
+
+import numpy as np
+import pytest
+
+from nkipy.runtime.decorators import simulate_jit
+
+
+class TestEinsumMatmul:
+    """Test einsum for matrix multiplication patterns."""
+
+    def test_matmul_basic(self):
+        """Test basic matrix multiplication: ij,jk->ik"""
+        @simulate_jit
+        def kernel_matmul(A, B):
+            import nkipy.core.ops as ops
+            return ops.einsum('ij,jk->ik', A, B)
+
+        A = np.random.rand(3, 4).astype(np.float32)
+        B = np.random.rand(4, 5).astype(np.float32)
+        
+        result = kernel_matmul(A, B)
+        expected = np.einsum('ij,jk->ik', A, B)
+        
+        np.testing.assert_allclose(result, expected, rtol=1e-5)
+
+    def test_matmul_implicit_output(self):
+        """Test matrix multiplication with implicit output: ij,jk"""
+        @simulate_jit
+        def kernel_matmul_implicit(A, B):
+            import nkipy.core.ops as ops
+            return ops.einsum('ij,jk', A, B)
+
+        A = np.random.rand(3, 4).astype(np.float32)
+        B = np.random.rand(4, 5).astype(np.float32)
+        
+        result = kernel_matmul_implicit(A, B)
+        expected = np.einsum('ij,jk', A, B)
+        
+        np.testing.assert_allclose(result, expected, rtol=1e-5)
+
+    def test_batch_matmul(self):
+        """Test batched matrix multiplication: bij,bjk->bik"""
+        @simulate_jit
+        def kernel_batch_matmul(A, B):
+            import nkipy.core.ops as ops
+            return ops.einsum('bij,bjk->bik', A, B)
+
+        A = np.random.rand(2, 3, 4).astype(np.float32)
+        B = np.random.rand(2, 4, 5).astype(np.float32)
+        
+        result = kernel_batch_matmul(A, B)
+        expected = np.einsum('bij,bjk->bik', A, B)
+        
+        np.testing.assert_allclose(result, expected, rtol=1e-5)
+
+    def test_vector_dot_product(self):
+        """Test vector dot product: i,i->"""
+        @simulate_jit
+        def kernel_dot(a, b):
+            import nkipy.core.ops as ops
+            return ops.einsum('i,i->', a, b)
+
+        a = np.array([1.0, 2.0, 3.0], dtype=np.float32)
+        b = np.array([4.0, 5.0, 6.0], dtype=np.float32)
+        
+        result = kernel_dot(a, b)
+        expected = np.einsum('i,i->', a, b)
+        
+        np.testing.assert_allclose(result, expected, rtol=1e-5)
+
+    def test_matrix_vector_multiply(self):
+        """Test matrix-vector multiplication: ij,j->i"""
+        @simulate_jit
+        def kernel_matvec(A, b):
+            import nkipy.core.ops as ops
+            return ops.einsum('ij,j->i', A, b)
+
+        A = np.random.rand(3, 4).astype(np.float32)
+        b = np.random.rand(4).astype(np.float32)
+        
+        result = kernel_matvec(A, b)
+        expected = np.einsum('ij,j->i', A, b)
+        
+        np.testing.assert_allclose(result, expected, rtol=1e-5)
+
+
+class TestEinsumTranspose:
+    """Test einsum for transpose operations."""
+
+    def test_transpose_2d(self):
+        """Test 2D transpose: ij->ji"""
+        @simulate_jit
+        def kernel_transpose(A):
+            import nkipy.core.ops as ops
+            return ops.einsum('ij->ji', A)
+
+        A = np.random.rand(3, 4).astype(np.float32)
+        result = kernel_transpose(A)
+        expected = np.einsum('ij->ji', A)
+        
+        np.testing.assert_allclose(result, expected, rtol=1e-5)
+
+    def test_transpose_3d(self):
+        """Test 3D transpose: ijk->kji"""
+        @simulate_jit
+        def kernel_transpose_3d(A):
+            import nkipy.core.ops as ops
+            return ops.einsum('ijk->kji', A)
+
+        A = np.random.rand(2, 3, 4).astype(np.float32)
+        result = kernel_transpose_3d(A)
+        expected = np.einsum('ijk->kji', A)
+        
+        np.testing.assert_allclose(result, expected, rtol=1e-5)
+
+    def test_permute_dims(self):
+        """Test dimension permutation: ijk->jki"""
+        @simulate_jit
+        def kernel_permute(A):
+            import nkipy.core.ops as ops
+            return ops.einsum('ijk->jki', A)
+
+        A = np.random.rand(2, 3, 4).astype(np.float32)
+        result = kernel_permute(A)
+        expected = np.einsum('ijk->jki', A)
+        
+        np.testing.assert_allclose(result, expected, rtol=1e-5)
+
+
+class TestEinsumReduction:
+    """Test einsum for reduction operations."""
+
+    def test_sum_all(self):
+        """Test sum of all elements: ij->"""
+        @simulate_jit
+        def kernel_sum_all(A):
+            import nkipy.core.ops as ops
+            return ops.einsum('ij->', A)
+
+        A = np.random.rand(3, 4).astype(np.float32)
+        result = kernel_sum_all(A)
+        expected = np.einsum('ij->', A)
+        
+        np.testing.assert_allclose(result, expected, rtol=1e-5)
+
+    def test_sum_axis(self):
+        """Test sum along axis: ij->i"""
+        @simulate_jit
+        def kernel_sum_axis(A):
+            import nkipy.core.ops as ops
+            return ops.einsum('ij->i', A)
+
+        A = np.random.rand(3, 4).astype(np.float32)
+        result = kernel_sum_axis(A)
+        expected = np.einsum('ij->i', A)
+        
+        np.testing.assert_allclose(result, expected, rtol=1e-5)
+
+    def test_trace(self):
+        """Test matrix trace: ii->"""
+        @simulate_jit
+        def kernel_trace(A):
+            import nkipy.core.ops as ops
+            return ops.einsum('ii->', A)
+
+        A = np.random.rand(4, 4).astype(np.float32)
+        result = kernel_trace(A)
+        expected = np.einsum('ii->', A)
+        
+        np.testing.assert_allclose(result, expected, rtol=1e-5)
+
+    def test_diagonal(self):
+        """Test extracting diagonal: ii->i"""
+        @simulate_jit
+        def kernel_diagonal(A):
+            import nkipy.core.ops as ops
+            return ops.einsum('ii->i', A)
+
+        A = np.random.rand(4, 4).astype(np.float32)
+        result = kernel_diagonal(A)
+        expected = np.einsum('ii->i', A)
+        
+        np.testing.assert_allclose(result, expected, rtol=1e-5)
+
+
+class TestEinsumOuterProduct:
+    """Test einsum for outer product operations."""
+
+    def test_outer_product(self):
+        """Test outer product: i,j->ij"""
+        @simulate_jit
+        def kernel_outer(a, b):
+            import nkipy.core.ops as ops
+            return ops.einsum('i,j->ij', a, b)
+
+        a = np.array([1.0, 2.0, 3.0], dtype=np.float32)
+        b = np.array([4.0, 5.0], dtype=np.float32)
+        
+        result = kernel_outer(a, b)
+        expected = np.einsum('i,j->ij', a, b)
+        
+        np.testing.assert_allclose(result, expected, rtol=1e-5)
+
+    def test_outer_product_3d(self):
+        """Test 3D outer product: i,j,k->ijk"""
+        @simulate_jit
+        def kernel_outer_3d(a, b, c):
+            import nkipy.core.ops as ops
+            return ops.einsum('i,j,k->ijk', a, b, c)
+
+        a = np.array([1.0, 2.0], dtype=np.float32)
+        b = np.array([3.0, 4.0], dtype=np.float32)
+        c = np.array([5.0, 6.0], dtype=np.float32)
+        
+        result = kernel_outer_3d(a, b, c)
+        expected = np.einsum('i,j,k->ijk', a, b, c)
+        
+        np.testing.assert_allclose(result, expected, rtol=1e-5)
+
+
+class TestEinsumBroadcast:
+    """Test einsum for broadcasting operations."""
+
+    def test_broadcast_multiply(self):
+        """Test element-wise multiply with broadcasting: ij,j->ij"""
+        @simulate_jit
+        def kernel_broadcast_mul(A, b):
+            import nkipy.core.ops as ops
+            return ops.einsum('ij,j->ij', A, b)
+
+        A = np.random.rand(3, 4).astype(np.float32)
+        b = np.random.rand(4).astype(np.float32)
+        
+        result = kernel_broadcast_mul(A, b)
+        expected = np.einsum('ij,j->ij', A, b)
+        
+        np.testing.assert_allclose(result, expected, rtol=1e-5)
+
+    def test_batch_broadcast(self):
+        """Test batched broadcasting: bij,bj->bij"""
+        @simulate_jit
+        def kernel_batch_broadcast(A, b):
+            import nkipy.core.ops as ops
+            return ops.einsum('bij,bj->bij', A, b)
+
+        A = np.random.rand(2, 3, 4).astype(np.float32)
+        b = np.random.rand(2, 4).astype(np.float32)
+        
+        result = kernel_batch_broadcast(A, b)
+        expected = np.einsum('bij,bj->bij', A, b)
+        
+        np.testing.assert_allclose(result, expected, rtol=1e-5)
+
+
+class TestEinsumComplex:
+    """Test complex einsum patterns."""
+
+    def test_bilinear_form(self):
+        """Test bilinear form: i,ij,j->"""
+        @simulate_jit
+        def kernel_bilinear(x, A, y):
+            import nkipy.core.ops as ops
+            return ops.einsum('i,ij,j->', x, A, y)
+
+        x = np.array([1.0, 2.0, 3.0], dtype=np.float32)
+        A = np.random.rand(3, 3).astype(np.float32)
+        y = np.array([4.0, 5.0, 6.0], dtype=np.float32)
+        
+        result = kernel_bilinear(x, A, y)
+        expected = np.einsum('i,ij,j->', x, A, y)
+        
+        np.testing.assert_allclose(result, expected, rtol=1e-5)
+
+    def test_tensor_contraction(self):
+        """Test tensor contraction: ijk,jkl->il"""
+        @simulate_jit
+        def kernel_contraction(A, B):
+            import nkipy.core.ops as ops
+            return ops.einsum('ijk,jkl->il', A, B)
+
+        A = np.random.rand(2, 3, 4).astype(np.float32)
+        B = np.random.rand(3, 4, 5).astype(np.float32)
+        
+        result = kernel_contraction(A, B)
+        expected = np.einsum('ijk,jkl->il', A, B)
+        
+        np.testing.assert_allclose(result, expected, rtol=1e-5)
+
+
+class TestEinsumEdgeCases:
+    """Test edge cases for einsum."""
+
+    def test_identity(self):
+        """Test identity operation: ij->ij"""
+        @simulate_jit
+        def kernel_identity(A):
+            import nkipy.core.ops as ops
+            return ops.einsum('ij->ij', A)
+
+        A = np.random.rand(3, 4).astype(np.float32)
+        result = kernel_identity(A)
+        expected = np.einsum('ij->ij', A)
+        
+        np.testing.assert_allclose(result, expected, rtol=1e-5)
+
+    def test_scalar(self):
+        """Test scalar operations."""
+        @simulate_jit
+        def kernel_scalar(A):
+            import nkipy.core.ops as ops
+            return ops.einsum('->', A)
+
+        A = np.array(5.0, dtype=np.float32)
+        result = kernel_scalar(A)
+        expected = np.einsum('->', A)
+        
+        np.testing.assert_allclose(result, expected, rtol=1e-5)
+
+    def test_single_element(self):
+        """Test with single element arrays."""
+        @simulate_jit
+        def kernel_single(a, b):
+            import nkipy.core.ops as ops
+            return ops.einsum('i,i->', a, b)
+
+        a = np.array([2.0], dtype=np.float32)
+        b = np.array([3.0], dtype=np.float32)
+        
+        result = kernel_single(a, b)
+        expected = np.einsum('i,i->', a, b)
+        
+        np.testing.assert_allclose(result, expected, rtol=1e-5)

From 6a8d7a812e544c8cacee16fadccb08f93a818457 Mon Sep 17 00:00:00 2001
From: Jlonge4 <josh.longenecker1@gmail.com>
Date: Tue, 20 Jan 2026 15:29:07 -0500
Subject: [PATCH 02/21] baremetal decorator / cpu not supported

---
 examples/playground/einsum_example.py | 210 --------------------------
 tests/unit/test_einsum.py             |  44 +++---
 2 files changed, 22 insertions(+), 232 deletions(-)
 delete mode 100644 examples/playground/einsum_example.py

diff --git a/examples/playground/einsum_example.py b/examples/playground/einsum_example.py
deleted file mode 100644
index d396deb..0000000
--- a/examples/playground/einsum_example.py
+++ /dev/null
@@ -1,210 +0,0 @@
-#!/usr/bin/env python3
-# Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved.
-# SPDX-License-Identifier: Apache-2.0
-"""Example demonstrating einsum operation in NKIPy.
-
-Einstein summation notation provides a concise and powerful way to
-express tensor operations including matrix multiplication, transposes,
-reductions, and complex tensor contractions.
-"""
-
-import numpy as np
-
-from nkipy.runtime.decorators import simulate_jit
-
-
-# =============================================================================
-# Matrix Operations
-# =============================================================================
-
-@simulate_jit
-def matmul_einsum(A, B):
-    """Matrix multiplication using einsum."""
-    import nkipy.core.ops as ops
-    return ops.einsum('ij,jk->ik', A, B)
-
-
-@simulate_jit
-def batch_matmul_einsum(A, B):
-    """Batch matrix multiplication using einsum."""
-    import nkipy.core.ops as ops
-    return ops.einsum('bij,bjk->bik', A, B)
-
-
-# =============================================================================
-# Transpose and Permutation
-# =============================================================================
-
-@simulate_jit
-def transpose_einsum(A):
-    """Transpose using einsum."""
-    import nkipy.core.ops as ops
-    return ops.einsum('ij->ji', A)
-
-
-@simulate_jit
-def permute_dims_einsum(A):
-    """Permute dimensions using einsum."""
-    import nkipy.core.ops as ops
-    return ops.einsum('ijk->kij', A)
-
-
-# =============================================================================
-# Reductions
-# =============================================================================
-
-@simulate_jit
-def trace_einsum(A):
-    """Matrix trace using einsum."""
-    import nkipy.core.ops as ops
-    return ops.einsum('ii->', A)
-
-
-@simulate_jit
-def sum_axis_einsum(A):
-    """Sum along axis using einsum."""
-    import nkipy.core.ops as ops
-    return ops.einsum('ij->i', A)
-
-
-# =============================================================================
-# Outer Products
-# =============================================================================
-
-@simulate_jit
-def outer_product_einsum(a, b):
-    """Outer product using einsum."""
-    import nkipy.core.ops as ops
-    return ops.einsum('i,j->ij', a, b)
-
-
-# =============================================================================
-# Advanced Patterns
-# =============================================================================
-
-@simulate_jit
-def dot_product_einsum(a, b):
-    """Dot product using einsum."""
-    import nkipy.core.ops as ops
-    return ops.einsum('i,i->', a, b)
-
-
-@simulate_jit
-def bilinear_form_einsum(x, A, y):
-    """Bilinear form x^T A y using einsum."""
-    import nkipy.core.ops as ops
-    return ops.einsum('i,ij,j->', x, A, y)
-
-
-@simulate_jit
-def attention_pattern_einsum(Q, K, V):
-    """Simplified attention pattern: Q @ K^T @ V using einsum.
-    
-    This computes (Q @ K^T) @ V in one operation.
-    Q: (batch, seq_q, d_k)
-    K: (batch, seq_k, d_k)
-    V: (batch, seq_v, d_v)
-    
-    Note: This is simplified - real attention includes scaling and softmax.
-    """
-    import nkipy.core.ops as ops
-    # Q @ K^T: 'bid,bjd->bij' where i=seq_q, j=seq_k
-    # (Q @ K^T) @ V: 'bij,bjd->bid' where final d is d_v
-    # Combined: 'bik,bjk,bjd->bid'
-    return ops.einsum('bik,bjk,bjd->bid', Q, K, V)
-
-
-def main():
-    print("=" * 70)
-    print("NKIPy einsum Examples")
-    print("=" * 70)
-    
-    # Example 1: Matrix Multiplication
-    print("\n1. Matrix Multiplication (ij,jk->ik):")
-    A = np.array([[1, 2], [3, 4]], dtype=np.float32)
-    B = np.array([[5, 6], [7, 8]], dtype=np.float32)
-    print(f"A =\n{A}")
-    print(f"B =\n{B}")
-    result = matmul_einsum(A, B)
-    print(f"A @ B =\n{result}")
-    print(f"NumPy result:\n{np.einsum('ij,jk->ik', A, B)}")
-    
-    # Example 2: Batch Matrix Multiplication
-    print("\n2. Batch Matrix Multiplication (bij,bjk->bik):")
-    A_batch = np.random.rand(2, 3, 4).astype(np.float32)
-    B_batch = np.random.rand(2, 4, 5).astype(np.float32)
-    result_batch = batch_matmul_einsum(A_batch, B_batch)
-    print(f"Batch A shape: {A_batch.shape}")
-    print(f"Batch B shape: {B_batch.shape}")
-    print(f"Result shape: {result_batch.shape}")
-    
-    # Example 3: Transpose
-    print("\n3. Transpose (ij->ji):")
-    C = np.array([[1, 2, 3], [4, 5, 6]], dtype=np.float32)
-    print(f"C =\n{C}")
-    result_transpose = transpose_einsum(C)
-    print(f"C^T =\n{result_transpose}")
-    
-    # Example 4: Trace
-    print("\n4. Matrix Trace (ii->):")
-    D = np.array([[1, 2, 3], [4, 5, 6], [7, 8, 9]], dtype=np.float32)
-    print(f"D =\n{D}")
-    trace = trace_einsum(D)
-    print(f"Trace(D) = {trace}")
-    print(f"NumPy trace: {np.trace(D)}")
-    
-    # Example 5: Outer Product
-    print("\n5. Outer Product (i,j->ij):")
-    a = np.array([1, 2, 3], dtype=np.float32)
-    b = np.array([4, 5], dtype=np.float32)
-    print(f"a = {a}")
-    print(f"b = {b}")
-    result_outer = outer_product_einsum(a, b)
-    print(f"Outer product a ⊗ b =\n{result_outer}")
-    
-    # Example 6: Dot Product
-    print("\n6. Dot Product (i,i->):")
-    x = np.array([1, 2, 3, 4], dtype=np.float32)
-    y = np.array([5, 6, 7, 8], dtype=np.float32)
-    print(f"x = {x}")
-    print(f"y = {y}")
-    dot = dot_product_einsum(x, y)
-    print(f"x · y = {dot}")
-    print(f"NumPy dot: {np.dot(x, y)}")
-    
-    # Example 7: Bilinear Form
-    print("\n7. Bilinear Form x^T A y (i,ij,j->):")
-    x = np.array([1, 2], dtype=np.float32)
-    A = np.array([[3, 4], [5, 6]], dtype=np.float32)
-    y = np.array([7, 8], dtype=np.float32)
-    print(f"x = {x}")
-    print(f"A =\n{A}")
-    print(f"y = {y}")
-    bilinear = bilinear_form_einsum(x, A, y)
-    print(f"x^T A y = {bilinear}")
-    
-    # Example 8: Sum along axis
-    print("\n8. Sum Along Axis (ij->i):")
-    E = np.array([[1, 2, 3], [4, 5, 6]], dtype=np.float32)
-    print(f"E =\n{E}")
-    sum_result = sum_axis_einsum(E)
-    print(f"Sum along axis 1: {sum_result}")
-    
-    # Example 9: Attention-like pattern (simplified)
-    print("\n9. Simplified Attention Pattern (bik,bjk,bjd->bid):")
-    Q = np.random.rand(1, 2, 3).astype(np.float32)  # (batch=1, seq_q=2, d_k=3)
-    K = np.random.rand(1, 4, 3).astype(np.float32)  # (batch=1, seq_k=4, d_k=3)
-    V = np.random.rand(1, 4, 5).astype(np.float32)  # (batch=1, seq_v=4, d_v=5)
-    print(f"Q shape: {Q.shape} (batch, seq_q, d_k)")
-    print(f"K shape: {K.shape} (batch, seq_k, d_k)")
-    print(f"V shape: {V.shape} (batch, seq_v, d_v)")
-    attn_result = attention_pattern_einsum(Q, K, V)
-    print(f"Attention output shape: {attn_result.shape} (batch, seq_q, d_v)")
-    
-    print("\n" + "=" * 70)
-    print("All einsum examples completed!")
-    print("=" * 70)
-
-
-if __name__ == "__main__":
-    main()
diff --git a/tests/unit/test_einsum.py b/tests/unit/test_einsum.py
index b59d622..1404cc8 100644
--- a/tests/unit/test_einsum.py
+++ b/tests/unit/test_einsum.py
@@ -5,7 +5,7 @@
 import numpy as np
 import pytest
 
-from nkipy.runtime.decorators import simulate_jit
+from nkipy.runtime.decorators import baremetal_jit
 
 
 class TestEinsumMatmul:
@@ -13,7 +13,7 @@ class TestEinsumMatmul:
 
     def test_matmul_basic(self):
         """Test basic matrix multiplication: ij,jk->ik"""
-        @simulate_jit
+        @baremetal_jit
         def kernel_matmul(A, B):
             import nkipy.core.ops as ops
             return ops.einsum('ij,jk->ik', A, B)
@@ -28,7 +28,7 @@ def kernel_matmul(A, B):
 
     def test_matmul_implicit_output(self):
         """Test matrix multiplication with implicit output: ij,jk"""
-        @simulate_jit
+        @baremetal_jit
         def kernel_matmul_implicit(A, B):
             import nkipy.core.ops as ops
             return ops.einsum('ij,jk', A, B)
@@ -43,7 +43,7 @@ def kernel_matmul_implicit(A, B):
 
     def test_batch_matmul(self):
         """Test batched matrix multiplication: bij,bjk->bik"""
-        @simulate_jit
+        @baremetal_jit
         def kernel_batch_matmul(A, B):
             import nkipy.core.ops as ops
             return ops.einsum('bij,bjk->bik', A, B)
@@ -58,7 +58,7 @@ def kernel_batch_matmul(A, B):
 
     def test_vector_dot_product(self):
         """Test vector dot product: i,i->"""
-        @simulate_jit
+        @baremetal_jit
         def kernel_dot(a, b):
             import nkipy.core.ops as ops
             return ops.einsum('i,i->', a, b)
@@ -73,7 +73,7 @@ def kernel_dot(a, b):
 
     def test_matrix_vector_multiply(self):
         """Test matrix-vector multiplication: ij,j->i"""
-        @simulate_jit
+        @baremetal_jit
         def kernel_matvec(A, b):
             import nkipy.core.ops as ops
             return ops.einsum('ij,j->i', A, b)
@@ -92,7 +92,7 @@ class TestEinsumTranspose:
 
     def test_transpose_2d(self):
         """Test 2D transpose: ij->ji"""
-        @simulate_jit
+        @baremetal_jit
         def kernel_transpose(A):
             import nkipy.core.ops as ops
             return ops.einsum('ij->ji', A)
@@ -105,7 +105,7 @@ def kernel_transpose(A):
 
     def test_transpose_3d(self):
         """Test 3D transpose: ijk->kji"""
-        @simulate_jit
+        @baremetal_jit
         def kernel_transpose_3d(A):
             import nkipy.core.ops as ops
             return ops.einsum('ijk->kji', A)
@@ -118,7 +118,7 @@ def kernel_transpose_3d(A):
 
     def test_permute_dims(self):
         """Test dimension permutation: ijk->jki"""
-        @simulate_jit
+        @baremetal_jit
         def kernel_permute(A):
             import nkipy.core.ops as ops
             return ops.einsum('ijk->jki', A)
@@ -135,7 +135,7 @@ class TestEinsumReduction:
 
     def test_sum_all(self):
         """Test sum of all elements: ij->"""
-        @simulate_jit
+        @baremetal_jit
         def kernel_sum_all(A):
             import nkipy.core.ops as ops
             return ops.einsum('ij->', A)
@@ -148,7 +148,7 @@ def kernel_sum_all(A):
 
     def test_sum_axis(self):
         """Test sum along axis: ij->i"""
-        @simulate_jit
+        @baremetal_jit
         def kernel_sum_axis(A):
             import nkipy.core.ops as ops
             return ops.einsum('ij->i', A)
@@ -161,7 +161,7 @@ def kernel_sum_axis(A):
 
     def test_trace(self):
         """Test matrix trace: ii->"""
-        @simulate_jit
+        @baremetal_jit
         def kernel_trace(A):
             import nkipy.core.ops as ops
             return ops.einsum('ii->', A)
@@ -174,7 +174,7 @@ def kernel_trace(A):
 
     def test_diagonal(self):
         """Test extracting diagonal: ii->i"""
-        @simulate_jit
+        @baremetal_jit
         def kernel_diagonal(A):
             import nkipy.core.ops as ops
             return ops.einsum('ii->i', A)
@@ -191,7 +191,7 @@ class TestEinsumOuterProduct:
 
     def test_outer_product(self):
         """Test outer product: i,j->ij"""
-        @simulate_jit
+        @baremetal_jit
         def kernel_outer(a, b):
             import nkipy.core.ops as ops
             return ops.einsum('i,j->ij', a, b)
@@ -206,7 +206,7 @@ def kernel_outer(a, b):
 
     def test_outer_product_3d(self):
         """Test 3D outer product: i,j,k->ijk"""
-        @simulate_jit
+        @baremetal_jit
         def kernel_outer_3d(a, b, c):
             import nkipy.core.ops as ops
             return ops.einsum('i,j,k->ijk', a, b, c)
@@ -226,7 +226,7 @@ class TestEinsumBroadcast:
 
     def test_broadcast_multiply(self):
         """Test element-wise multiply with broadcasting: ij,j->ij"""
-        @simulate_jit
+        @baremetal_jit
         def kernel_broadcast_mul(A, b):
             import nkipy.core.ops as ops
             return ops.einsum('ij,j->ij', A, b)
@@ -241,7 +241,7 @@ def kernel_broadcast_mul(A, b):
 
     def test_batch_broadcast(self):
         """Test batched broadcasting: bij,bj->bij"""
-        @simulate_jit
+        @baremetal_jit
         def kernel_batch_broadcast(A, b):
             import nkipy.core.ops as ops
             return ops.einsum('bij,bj->bij', A, b)
@@ -260,7 +260,7 @@ class TestEinsumComplex:
 
     def test_bilinear_form(self):
         """Test bilinear form: i,ij,j->"""
-        @simulate_jit
+        @baremetal_jit
         def kernel_bilinear(x, A, y):
             import nkipy.core.ops as ops
             return ops.einsum('i,ij,j->', x, A, y)
@@ -276,7 +276,7 @@ def kernel_bilinear(x, A, y):
 
     def test_tensor_contraction(self):
         """Test tensor contraction: ijk,jkl->il"""
-        @simulate_jit
+        @baremetal_jit
         def kernel_contraction(A, B):
             import nkipy.core.ops as ops
             return ops.einsum('ijk,jkl->il', A, B)
@@ -295,7 +295,7 @@ class TestEinsumEdgeCases:
 
     def test_identity(self):
         """Test identity operation: ij->ij"""
-        @simulate_jit
+        @baremetal_jit
         def kernel_identity(A):
             import nkipy.core.ops as ops
             return ops.einsum('ij->ij', A)
@@ -308,7 +308,7 @@ def kernel_identity(A):
 
     def test_scalar(self):
         """Test scalar operations."""
-        @simulate_jit
+        @baremetal_jit
         def kernel_scalar(A):
             import nkipy.core.ops as ops
             return ops.einsum('->', A)
@@ -321,7 +321,7 @@ def kernel_scalar(A):
 
     def test_single_element(self):
         """Test with single element arrays."""
-        @simulate_jit
+        @baremetal_jit
         def kernel_single(a, b):
             import nkipy.core.ops as ops
             return ops.einsum('i,i->', a, b)

From 783cc5a30605dcff2c356903ce039ea2c5f30b0c Mon Sep 17 00:00:00 2001
From: Jlonge4 <josh.longenecker1@gmail.com>
Date: Tue, 20 Jan 2026 15:50:26 -0500
Subject: [PATCH 03/21] register einsum op with dispatch

---
 examples/playground/einsum.ipynb        | 80 +++++++++++++++++++++++++
 nkipy/src/nkipy/core/_numpy_dispatch.py |  3 +
 2 files changed, 83 insertions(+)
 create mode 100644 examples/playground/einsum.ipynb

diff --git a/examples/playground/einsum.ipynb b/examples/playground/einsum.ipynb
new file mode 100644
index 0000000..260b8e3
--- /dev/null
+++ b/examples/playground/einsum.ipynb
@@ -0,0 +1,80 @@
+{
+ "cells": [
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "51f8501a",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "import numpy as np\n",
+    "\n",
+    "from nkipy.core.trace import NKIPyKernel\n",
+    "from nkipy.core.compile import lower_to_nki\n",
+    "from nkipy.runtime.execute import simulate_traced_kernel, baremetal_run_traced_kernel"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "657ca110",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "def einsum_matmul(A, B):\n",
+    "    return np.einsum('ik,kj->ij', A, B)\n",
+    "\n",
+    "A = np.random.rand(2, 3).astype(np.float32)\n",
+    "B = np.random.rand(3, 4).astype(np.float32)\n",
+    "\n",
+    "expected = einsum_matmul(A, B)\n",
+    "expected"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "f2a4b5a9",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "traced_kernel = NKIPyKernel.trace(einsum_matmul, A, B)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "a9258c26",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "out_nkipy = simulate_traced_kernel(traced_kernel, A, B)\n",
+    "print(\"Is the simulated output the same as Numpy? \", np.allclose(out_nkipy, expected))"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "a0e487ac",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "out_baremetal = baremetal_run_traced_kernel(traced_kernel, A, B)\n",
+    "print(\"Is the baremetal output the same as Numpy? \", np.allclose(out_baremetal, expected))"
+   ]
+  }
+ ],
+ "metadata": {
+  "kernelspec": {
+   "display_name": "Python 3",
+   "language": "python",
+   "name": "python3"
+  },
+  "language_info": {
+   "name": "python",
+   "version": "3.12.8"
+  }
+ },
+ "nbformat": 4,
+ "nbformat_minor": 5
+}
diff --git a/nkipy/src/nkipy/core/_numpy_dispatch.py b/nkipy/src/nkipy/core/_numpy_dispatch.py
index e31ecb3..ee714d0 100644
--- a/nkipy/src/nkipy/core/_numpy_dispatch.py
+++ b/nkipy/src/nkipy/core/_numpy_dispatch.py
@@ -90,6 +90,9 @@ def register_all_numpy_apis():
     # Linear algebra
     _register_numpy_api(np.matmul, ops.matmul)
 
+    # Einstein summation
+    _register_numpy_api(np.einsum, ops.einsum)
+
     # Transform operations
     _register_numpy_api(np.reshape, ops.reshape)
     _register_numpy_api(np.transpose, ops.transpose)

From 98a4c9e520bd286b5b0f5705d98684b2605a209b Mon Sep 17 00:00:00 2001
From: Jlonge4 <josh.longenecker1@gmail.com>
Date: Tue, 20 Jan 2026 15:54:52 -0500
Subject: [PATCH 04/21] align test structure

---
 tests/kernels/einsum.py   | 240 +++++++++++++++++++++++++++
 tests/unit/test_einsum.py | 335 --------------------------------------
 2 files changed, 240 insertions(+), 335 deletions(-)
 create mode 100644 tests/kernels/einsum.py
 delete mode 100644 tests/unit/test_einsum.py

diff --git a/tests/kernels/einsum.py b/tests/kernels/einsum.py
new file mode 100644
index 0000000..82939e6
--- /dev/null
+++ b/tests/kernels/einsum.py
@@ -0,0 +1,240 @@
+# Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved.
+# SPDX-License-Identifier: Apache-2.0
+"""
+Einstein summation (einsum) kernel specifications.
+
+This module provides kernel specifications for testing various einsum patterns:
+- Matrix multiplication
+- Batch operations
+- Reductions
+- Transposes
+- Outer products
+"""
+
+import numpy as np
+from nkipy.core.specs import CommonTypes, KernelSpec, ShapeSpec, TensorInputSpec
+
+
+# =============================================================================
+# Matrix Operations
+# =============================================================================
+
+
+def matmul_einsum(A, B):
+    """Matrix multiplication using einsum: ij,jk->ik"""
+    return np.einsum('ij,jk->ik', A, B)
+
+
+def batch_matmul_einsum(A, B):
+    """Batch matrix multiplication using einsum: bij,bjk->bik"""
+    return np.einsum('bij,bjk->bik', A, B)
+
+
+# =============================================================================
+# Transpose and Permutation
+# =============================================================================
+
+
+def transpose_einsum(A):
+    """Transpose using einsum: ij->ji"""
+    return np.einsum('ij->ji', A)
+
+
+def permute_dims_einsum(A):
+    """Permute dimensions using einsum: ijk->kij"""
+    return np.einsum('ijk->kij', A)
+
+
+# =============================================================================
+# Reductions
+# =============================================================================
+
+
+def trace_einsum(A):
+    """Matrix trace using einsum: ii->"""
+    return np.einsum('ii->', A)
+
+
+def sum_axis_einsum(A):
+    """Sum along axis using einsum: ij->i"""
+    return np.einsum('ij->i', A)
+
+
+# =============================================================================
+# Outer Products
+# =============================================================================
+
+
+def outer_product_einsum(a, b):
+    """Outer product using einsum: i,j->ij"""
+    return np.einsum('i,j->ij', a, b)
+
+
+# =============================================================================
+# Advanced Patterns
+# =============================================================================
+
+
+def dot_product_einsum(a, b):
+    """Dot product using einsum: i,i->"""
+    return np.einsum('i,i->', a, b)
+
+
+def bilinear_form_einsum(x, A, y):
+    """Bilinear form x^T A y using einsum: i,ij,j->"""
+    return np.einsum('i,ij,j->', x, A, y)
+
+
+# =============================================================================
+# Kernel Specifications
+# =============================================================================
+
+kernel_specs = [
+    # Matrix multiplication
+    KernelSpec(
+        function=matmul_einsum,
+        inputs=[
+            TensorInputSpec(
+                shape_spec=ShapeSpec(dims=[None, None], default=(32, 64)),
+                dtype_spec=CommonTypes.FLOATS,
+                description="First matrix (M, K)",
+            ),
+            TensorInputSpec(
+                shape_spec=ShapeSpec(dims=[None, None], default=(64, 48)),
+                dtype_spec=CommonTypes.FLOATS,
+                description="Second matrix (K, N)",
+            ),
+        ],
+        is_pure_numpy=True,
+        description="Matrix multiplication via einsum (ij,jk->ik)",
+    ),
+    # Batch matrix multiplication
+    KernelSpec(
+        function=batch_matmul_einsum,
+        inputs=[
+            TensorInputSpec(
+                shape_spec=ShapeSpec(dims=[None, None, None], default=(4, 32, 64)),
+                dtype_spec=CommonTypes.FLOATS,
+                description="First batched matrix (B, M, K)",
+            ),
+            TensorInputSpec(
+                shape_spec=ShapeSpec(dims=[None, None, None], default=(4, 64, 48)),
+                dtype_spec=CommonTypes.FLOATS,
+                description="Second batched matrix (B, K, N)",
+            ),
+        ],
+        is_pure_numpy=True,
+        description="Batch matrix multiplication via einsum (bij,bjk->bik)",
+    ),
+    # Transpose
+    KernelSpec(
+        function=transpose_einsum,
+        inputs=[
+            TensorInputSpec(
+                shape_spec=ShapeSpec(dims=[None, None], default=(32, 64)),
+                dtype_spec=CommonTypes.FLOATS,
+                description="Matrix to transpose",
+            ),
+        ],
+        is_pure_numpy=True,
+        description="2D transpose via einsum (ij->ji)",
+    ),
+    # Permute dimensions
+    KernelSpec(
+        function=permute_dims_einsum,
+        inputs=[
+            TensorInputSpec(
+                shape_spec=ShapeSpec(dims=[None, None, None], default=(4, 32, 64)),
+                dtype_spec=CommonTypes.FLOATS,
+                description="3D tensor to permute",
+            ),
+        ],
+        is_pure_numpy=True,
+        description="3D permutation via einsum (ijk->kij)",
+    ),
+    # Trace
+    KernelSpec(
+        function=trace_einsum,
+        inputs=[
+            TensorInputSpec(
+                shape_spec=ShapeSpec(dims=[None, None], default=(64, 64)),
+                dtype_spec=CommonTypes.FLOATS,
+                description="Square matrix",
+            ),
+        ],
+        is_pure_numpy=True,
+        description="Matrix trace via einsum (ii->)",
+    ),
+    # Sum along axis
+    KernelSpec(
+        function=sum_axis_einsum,
+        inputs=[
+            TensorInputSpec(
+                shape_spec=ShapeSpec(dims=[None, None], default=(32, 64)),
+                dtype_spec=CommonTypes.FLOATS,
+                description="Matrix to reduce",
+            ),
+        ],
+        is_pure_numpy=True,
+        description="Sum along last axis via einsum (ij->i)",
+    ),
+    # Outer product
+    KernelSpec(
+        function=outer_product_einsum,
+        inputs=[
+            TensorInputSpec(
+                shape_spec=ShapeSpec(dims=[None], default=(32,)),
+                dtype_spec=CommonTypes.FLOATS,
+                description="First vector",
+            ),
+            TensorInputSpec(
+                shape_spec=ShapeSpec(dims=[None], default=(64,)),
+                dtype_spec=CommonTypes.FLOATS,
+                description="Second vector",
+            ),
+        ],
+        is_pure_numpy=True,
+        description="Outer product via einsum (i,j->ij)",
+    ),
+    # Dot product
+    KernelSpec(
+        function=dot_product_einsum,
+        inputs=[
+            TensorInputSpec(
+                shape_spec=ShapeSpec(dims=[None], default=(128,)),
+                dtype_spec=CommonTypes.FLOATS,
+                description="First vector",
+            ),
+            TensorInputSpec(
+                shape_spec=ShapeSpec(dims=[None], default=(128,)),
+                dtype_spec=CommonTypes.FLOATS,
+                description="Second vector",
+            ),
+        ],
+        is_pure_numpy=True,
+        description="Dot product via einsum (i,i->)",
+    ),
+    # Bilinear form
+    KernelSpec(
+        function=bilinear_form_einsum,
+        inputs=[
+            TensorInputSpec(
+                shape_spec=ShapeSpec(dims=[None], default=(64,)),
+                dtype_spec=CommonTypes.FLOATS,
+                description="Left vector x",
+            ),
+            TensorInputSpec(
+                shape_spec=ShapeSpec(dims=[None, None], default=(64, 64)),
+                dtype_spec=CommonTypes.FLOATS,
+                description="Matrix A",
+            ),
+            TensorInputSpec(
+                shape_spec=ShapeSpec(dims=[None], default=(64,)),
+                dtype_spec=CommonTypes.FLOATS,
+                description="Right vector y",
+            ),
+        ],
+        is_pure_numpy=True,
+        description="Bilinear form x^T A y via einsum (i,ij,j->)",
+    ),
+]
diff --git a/tests/unit/test_einsum.py b/tests/unit/test_einsum.py
deleted file mode 100644
index 1404cc8..0000000
--- a/tests/unit/test_einsum.py
+++ /dev/null
@@ -1,335 +0,0 @@
-# Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved.
-# SPDX-License-Identifier: Apache-2.0
-"""Unit tests for einsum operation."""
-
-import numpy as np
-import pytest
-
-from nkipy.runtime.decorators import baremetal_jit
-
-
-class TestEinsumMatmul:
-    """Test einsum for matrix multiplication patterns."""
-
-    def test_matmul_basic(self):
-        """Test basic matrix multiplication: ij,jk->ik"""
-        @baremetal_jit
-        def kernel_matmul(A, B):
-            import nkipy.core.ops as ops
-            return ops.einsum('ij,jk->ik', A, B)
-
-        A = np.random.rand(3, 4).astype(np.float32)
-        B = np.random.rand(4, 5).astype(np.float32)
-        
-        result = kernel_matmul(A, B)
-        expected = np.einsum('ij,jk->ik', A, B)
-        
-        np.testing.assert_allclose(result, expected, rtol=1e-5)
-
-    def test_matmul_implicit_output(self):
-        """Test matrix multiplication with implicit output: ij,jk"""
-        @baremetal_jit
-        def kernel_matmul_implicit(A, B):
-            import nkipy.core.ops as ops
-            return ops.einsum('ij,jk', A, B)
-
-        A = np.random.rand(3, 4).astype(np.float32)
-        B = np.random.rand(4, 5).astype(np.float32)
-        
-        result = kernel_matmul_implicit(A, B)
-        expected = np.einsum('ij,jk', A, B)
-        
-        np.testing.assert_allclose(result, expected, rtol=1e-5)
-
-    def test_batch_matmul(self):
-        """Test batched matrix multiplication: bij,bjk->bik"""
-        @baremetal_jit
-        def kernel_batch_matmul(A, B):
-            import nkipy.core.ops as ops
-            return ops.einsum('bij,bjk->bik', A, B)
-
-        A = np.random.rand(2, 3, 4).astype(np.float32)
-        B = np.random.rand(2, 4, 5).astype(np.float32)
-        
-        result = kernel_batch_matmul(A, B)
-        expected = np.einsum('bij,bjk->bik', A, B)
-        
-        np.testing.assert_allclose(result, expected, rtol=1e-5)
-
-    def test_vector_dot_product(self):
-        """Test vector dot product: i,i->"""
-        @baremetal_jit
-        def kernel_dot(a, b):
-            import nkipy.core.ops as ops
-            return ops.einsum('i,i->', a, b)
-
-        a = np.array([1.0, 2.0, 3.0], dtype=np.float32)
-        b = np.array([4.0, 5.0, 6.0], dtype=np.float32)
-        
-        result = kernel_dot(a, b)
-        expected = np.einsum('i,i->', a, b)
-        
-        np.testing.assert_allclose(result, expected, rtol=1e-5)
-
-    def test_matrix_vector_multiply(self):
-        """Test matrix-vector multiplication: ij,j->i"""
-        @baremetal_jit
-        def kernel_matvec(A, b):
-            import nkipy.core.ops as ops
-            return ops.einsum('ij,j->i', A, b)
-
-        A = np.random.rand(3, 4).astype(np.float32)
-        b = np.random.rand(4).astype(np.float32)
-        
-        result = kernel_matvec(A, b)
-        expected = np.einsum('ij,j->i', A, b)
-        
-        np.testing.assert_allclose(result, expected, rtol=1e-5)
-
-
-class TestEinsumTranspose:
-    """Test einsum for transpose operations."""
-
-    def test_transpose_2d(self):
-        """Test 2D transpose: ij->ji"""
-        @baremetal_jit
-        def kernel_transpose(A):
-            import nkipy.core.ops as ops
-            return ops.einsum('ij->ji', A)
-
-        A = np.random.rand(3, 4).astype(np.float32)
-        result = kernel_transpose(A)
-        expected = np.einsum('ij->ji', A)
-        
-        np.testing.assert_allclose(result, expected, rtol=1e-5)
-
-    def test_transpose_3d(self):
-        """Test 3D transpose: ijk->kji"""
-        @baremetal_jit
-        def kernel_transpose_3d(A):
-            import nkipy.core.ops as ops
-            return ops.einsum('ijk->kji', A)
-
-        A = np.random.rand(2, 3, 4).astype(np.float32)
-        result = kernel_transpose_3d(A)
-        expected = np.einsum('ijk->kji', A)
-        
-        np.testing.assert_allclose(result, expected, rtol=1e-5)
-
-    def test_permute_dims(self):
-        """Test dimension permutation: ijk->jki"""
-        @baremetal_jit
-        def kernel_permute(A):
-            import nkipy.core.ops as ops
-            return ops.einsum('ijk->jki', A)
-
-        A = np.random.rand(2, 3, 4).astype(np.float32)
-        result = kernel_permute(A)
-        expected = np.einsum('ijk->jki', A)
-        
-        np.testing.assert_allclose(result, expected, rtol=1e-5)
-
-
-class TestEinsumReduction:
-    """Test einsum for reduction operations."""
-
-    def test_sum_all(self):
-        """Test sum of all elements: ij->"""
-        @baremetal_jit
-        def kernel_sum_all(A):
-            import nkipy.core.ops as ops
-            return ops.einsum('ij->', A)
-
-        A = np.random.rand(3, 4).astype(np.float32)
-        result = kernel_sum_all(A)
-        expected = np.einsum('ij->', A)
-        
-        np.testing.assert_allclose(result, expected, rtol=1e-5)
-
-    def test_sum_axis(self):
-        """Test sum along axis: ij->i"""
-        @baremetal_jit
-        def kernel_sum_axis(A):
-            import nkipy.core.ops as ops
-            return ops.einsum('ij->i', A)
-
-        A = np.random.rand(3, 4).astype(np.float32)
-        result = kernel_sum_axis(A)
-        expected = np.einsum('ij->i', A)
-        
-        np.testing.assert_allclose(result, expected, rtol=1e-5)
-
-    def test_trace(self):
-        """Test matrix trace: ii->"""
-        @baremetal_jit
-        def kernel_trace(A):
-            import nkipy.core.ops as ops
-            return ops.einsum('ii->', A)
-
-        A = np.random.rand(4, 4).astype(np.float32)
-        result = kernel_trace(A)
-        expected = np.einsum('ii->', A)
-        
-        np.testing.assert_allclose(result, expected, rtol=1e-5)
-
-    def test_diagonal(self):
-        """Test extracting diagonal: ii->i"""
-        @baremetal_jit
-        def kernel_diagonal(A):
-            import nkipy.core.ops as ops
-            return ops.einsum('ii->i', A)
-
-        A = np.random.rand(4, 4).astype(np.float32)
-        result = kernel_diagonal(A)
-        expected = np.einsum('ii->i', A)
-        
-        np.testing.assert_allclose(result, expected, rtol=1e-5)
-
-
-class TestEinsumOuterProduct:
-    """Test einsum for outer product operations."""
-
-    def test_outer_product(self):
-        """Test outer product: i,j->ij"""
-        @baremetal_jit
-        def kernel_outer(a, b):
-            import nkipy.core.ops as ops
-            return ops.einsum('i,j->ij', a, b)
-
-        a = np.array([1.0, 2.0, 3.0], dtype=np.float32)
-        b = np.array([4.0, 5.0], dtype=np.float32)
-        
-        result = kernel_outer(a, b)
-        expected = np.einsum('i,j->ij', a, b)
-        
-        np.testing.assert_allclose(result, expected, rtol=1e-5)
-
-    def test_outer_product_3d(self):
-        """Test 3D outer product: i,j,k->ijk"""
-        @baremetal_jit
-        def kernel_outer_3d(a, b, c):
-            import nkipy.core.ops as ops
-            return ops.einsum('i,j,k->ijk', a, b, c)
-
-        a = np.array([1.0, 2.0], dtype=np.float32)
-        b = np.array([3.0, 4.0], dtype=np.float32)
-        c = np.array([5.0, 6.0], dtype=np.float32)
-        
-        result = kernel_outer_3d(a, b, c)
-        expected = np.einsum('i,j,k->ijk', a, b, c)
-        
-        np.testing.assert_allclose(result, expected, rtol=1e-5)
-
-
-class TestEinsumBroadcast:
-    """Test einsum for broadcasting operations."""
-
-    def test_broadcast_multiply(self):
-        """Test element-wise multiply with broadcasting: ij,j->ij"""
-        @baremetal_jit
-        def kernel_broadcast_mul(A, b):
-            import nkipy.core.ops as ops
-            return ops.einsum('ij,j->ij', A, b)
-
-        A = np.random.rand(3, 4).astype(np.float32)
-        b = np.random.rand(4).astype(np.float32)
-        
-        result = kernel_broadcast_mul(A, b)
-        expected = np.einsum('ij,j->ij', A, b)
-        
-        np.testing.assert_allclose(result, expected, rtol=1e-5)
-
-    def test_batch_broadcast(self):
-        """Test batched broadcasting: bij,bj->bij"""
-        @baremetal_jit
-        def kernel_batch_broadcast(A, b):
-            import nkipy.core.ops as ops
-            return ops.einsum('bij,bj->bij', A, b)
-
-        A = np.random.rand(2, 3, 4).astype(np.float32)
-        b = np.random.rand(2, 4).astype(np.float32)
-        
-        result = kernel_batch_broadcast(A, b)
-        expected = np.einsum('bij,bj->bij', A, b)
-        
-        np.testing.assert_allclose(result, expected, rtol=1e-5)
-
-
-class TestEinsumComplex:
-    """Test complex einsum patterns."""
-
-    def test_bilinear_form(self):
-        """Test bilinear form: i,ij,j->"""
-        @baremetal_jit
-        def kernel_bilinear(x, A, y):
-            import nkipy.core.ops as ops
-            return ops.einsum('i,ij,j->', x, A, y)
-
-        x = np.array([1.0, 2.0, 3.0], dtype=np.float32)
-        A = np.random.rand(3, 3).astype(np.float32)
-        y = np.array([4.0, 5.0, 6.0], dtype=np.float32)
-        
-        result = kernel_bilinear(x, A, y)
-        expected = np.einsum('i,ij,j->', x, A, y)
-        
-        np.testing.assert_allclose(result, expected, rtol=1e-5)
-
-    def test_tensor_contraction(self):
-        """Test tensor contraction: ijk,jkl->il"""
-        @baremetal_jit
-        def kernel_contraction(A, B):
-            import nkipy.core.ops as ops
-            return ops.einsum('ijk,jkl->il', A, B)
-
-        A = np.random.rand(2, 3, 4).astype(np.float32)
-        B = np.random.rand(3, 4, 5).astype(np.float32)
-        
-        result = kernel_contraction(A, B)
-        expected = np.einsum('ijk,jkl->il', A, B)
-        
-        np.testing.assert_allclose(result, expected, rtol=1e-5)
-
-
-class TestEinsumEdgeCases:
-    """Test edge cases for einsum."""
-
-    def test_identity(self):
-        """Test identity operation: ij->ij"""
-        @baremetal_jit
-        def kernel_identity(A):
-            import nkipy.core.ops as ops
-            return ops.einsum('ij->ij', A)
-
-        A = np.random.rand(3, 4).astype(np.float32)
-        result = kernel_identity(A)
-        expected = np.einsum('ij->ij', A)
-        
-        np.testing.assert_allclose(result, expected, rtol=1e-5)
-
-    def test_scalar(self):
-        """Test scalar operations."""
-        @baremetal_jit
-        def kernel_scalar(A):
-            import nkipy.core.ops as ops
-            return ops.einsum('->', A)
-
-        A = np.array(5.0, dtype=np.float32)
-        result = kernel_scalar(A)
-        expected = np.einsum('->', A)
-        
-        np.testing.assert_allclose(result, expected, rtol=1e-5)
-
-    def test_single_element(self):
-        """Test with single element arrays."""
-        @baremetal_jit
-        def kernel_single(a, b):
-            import nkipy.core.ops as ops
-            return ops.einsum('i,i->', a, b)
-
-        a = np.array([2.0], dtype=np.float32)
-        b = np.array([3.0], dtype=np.float32)
-        
-        result = kernel_single(a, b)
-        expected = np.einsum('i,i->', a, b)
-        
-        np.testing.assert_allclose(result, expected, rtol=1e-5)

From 28762c3b4f7976631dabf64ec7680024082e587e Mon Sep 17 00:00:00 2001
From: Jlonge4 <josh.longenecker1@gmail.com>
Date: Tue, 20 Jan 2026 15:59:59 -0500
Subject: [PATCH 05/21] typo

---
 examples/playground/einsum.ipynb | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/examples/playground/einsum.ipynb b/examples/playground/einsum.ipynb
index 260b8e3..a265856 100644
--- a/examples/playground/einsum.ipynb
+++ b/examples/playground/einsum.ipynb
@@ -38,7 +38,7 @@
    "metadata": {},
    "outputs": [],
    "source": [
-    "traced_kernel = NKIPyKernel.trace(einsum_matmul, A, B)"
+    "traced_kernel = NKIPyKernel.trace(einsum_matmul)"
    ]
   },
   {

From b35368440ac1a72ee4a0a88f503fa6cbfdeb1e2c Mon Sep 17 00:00:00 2001
From: Jlonge4 <josh.longenecker1@gmail.com>
Date: Tue, 20 Jan 2026 16:47:45 -0500
Subject: [PATCH 06/21] lint

---
 nkipy/src/nkipy/core/ops/einsum.py | 218 ++++++++++++++++-------------
 1 file changed, 118 insertions(+), 100 deletions(-)

diff --git a/nkipy/src/nkipy/core/ops/einsum.py b/nkipy/src/nkipy/core/ops/einsum.py
index 4c8ae45..1f150ca 100644
--- a/nkipy/src/nkipy/core/ops/einsum.py
+++ b/nkipy/src/nkipy/core/ops/einsum.py
@@ -9,8 +9,6 @@
 
 from typing import Dict, List, Set, Tuple
 
-import numpy as np
-
 from nkipy.core.ops._registry import Op
 
 # =============================================================================
@@ -18,55 +16,57 @@
 # =============================================================================
 
 
-def parse_einsum_subscripts(subscripts: str, num_operands: int) -> Tuple[List[str], str]:
+def parse_einsum_subscripts(
+    subscripts: str, num_operands: int
+) -> Tuple[List[str], str]:
     """Parse einsum subscript string into input and output specifications.
-    
+
     Args:
         subscripts: Einstein notation string (e.g., 'ij,jk->ik' or 'ij,jk')
         num_operands: Number of input operands
-    
+
     Returns:
         Tuple of (input_specs, output_spec) where:
         - input_specs: List of strings, one per operand (e.g., ['ij', 'jk'])
         - output_spec: Output string (e.g., 'ik'), inferred if not provided
-    
+
     Examples:
         >>> parse_einsum_subscripts('ij,jk->ik', 2)
         (['ij', 'jk'], 'ik')
-        
+
         >>> parse_einsum_subscripts('ii', 1)  # trace - inferred output is ''
         (['ii'], '')
     """
     # Remove whitespace
-    subscripts = subscripts.replace(' ', '')
-    
+    subscripts = subscripts.replace(" ", "")
+
     # Check for explicit output specification
-    if '->' in subscripts:
-        input_str, output_spec = subscripts.split('->')
-        input_specs = input_str.split(',')
+    if "->" in subscripts:
+        input_str, output_spec = subscripts.split("->")
+        input_specs = input_str.split(",")
     else:
-        input_specs = subscripts.split(',')
+        input_specs = subscripts.split(",")
         # Infer output: all indices that appear exactly once across all inputs
         all_indices: Dict[str, int] = {}
         for spec in input_specs:
             for idx in spec:
                 all_indices[idx] = all_indices.get(idx, 0) + 1
-        
+
         # Output contains indices that appear exactly once, in order of first appearance
-        output_spec = ''
+        output_spec = ""
         seen: Set[str] = set()
         for spec in input_specs:
             for idx in spec:
                 if idx not in seen and all_indices[idx] == 1:
                     output_spec += idx
                     seen.add(idx)
-    
+
     if len(input_specs) != num_operands:
         raise ValueError(
             f"Number of subscripts ({len(input_specs)}) does not match "
             f"number of operands ({num_operands})"
         )
-    
+
     return input_specs, output_spec
 
 
@@ -74,12 +74,12 @@ def analyze_einsum_pattern(
     input_specs: List[str], output_spec: str, shapes: List[Tuple[int, ...]]
 ) -> Dict:
     """Analyze einsum pattern to determine dimension mapping and operation type.
-    
+
     Args:
         input_specs: List of input subscript strings
         output_spec: Output subscript string
         shapes: List of input tensor shapes
-    
+
     Returns:
         Dictionary containing:
         - 'input_dims': Dict mapping each input index to dimension size
@@ -102,25 +102,25 @@ def analyze_einsum_pattern(
                     f"{input_dims[idx]} vs {size}"
                 )
             input_dims[idx] = size
-    
+
     # Collect all unique indices
     all_indices = set()
     for spec in input_specs:
         all_indices.update(spec)
-    
+
     # Determine contracting dimensions (in inputs but not output)
     contracting_dims = all_indices - set(output_spec)
-    
+
     # Determine batch dimensions (in all inputs and output)
     batch_dims = set(output_spec)
     for spec in input_specs:
         batch_dims &= set(spec)
-    
+
     return {
-        'input_dims': input_dims,
-        'contracting_dims': contracting_dims,
-        'batch_dims': batch_dims,
-        'output_order': list(output_spec),
+        "input_dims": input_dims,
+        "contracting_dims": contracting_dims,
+        "batch_dims": batch_dims,
+        "output_order": list(output_spec),
     }
 
 
@@ -133,46 +133,46 @@ def analyze_einsum_pattern(
 @einsum.impl("hlo")
 def _einsum_hlo(subscripts, *operands, dtype=None):
     """Einstein summation convention on tensors (HLO implementation).
-    
+
     Implements einsum using HLO operations: transpose, dot_general, reduce.
     Supports common patterns like matrix multiplication, batch operations,
     traces, outer products, and more.
-    
+
     Args:
         subscripts: Einstein notation string (e.g., 'ij,jk->ik')
         *operands: Input tensors
         dtype: Optional output dtype (if None, inferred from inputs)
-    
+
     Returns:
         Result tensor according to einsum specification
-    
+
     Examples:
         >>> # Matrix multiply
         >>> einsum('ij,jk->ik', A, B)
-        
-        >>> # Batch matrix multiply  
+
+        >>> # Batch matrix multiply
         >>> einsum('bij,bjk->bik', A, B)
-        
+
         >>> # Trace
         >>> einsum('ii->', A)
-        
+
         >>> # Outer product
         >>> einsum('i,j->ij', a, b)
     """
     from nkipy.core.backend.hlo import get_hlo_context
     from nkipy.core.tensor import NKIPyTensorRef
-    
+
     if not operands:
         raise ValueError("einsum requires at least one operand")
-    
+
     # Parse subscripts
     input_specs, output_spec = parse_einsum_subscripts(subscripts, len(operands))
-    
+
     # Convert to HLO tensors
     ctx = get_hlo_context()
     hlo_operands = []
     shapes = []
-    
+
     for op in operands:
         if isinstance(op, NKIPyTensorRef):
             hlo_operands.append(op.backend_tensor)
@@ -180,17 +180,24 @@ def _einsum_hlo(subscripts, *operands, dtype=None):
         else:
             hlo_operands.append(op)
             shapes.append(op.shape)
-    
+
     # Analyze pattern
     analysis = analyze_einsum_pattern(input_specs, output_spec, shapes)
-    
+
     # Handle special cases for optimization
     if len(operands) == 1:
-        return _einsum_unary(ctx, hlo_operands[0], input_specs[0], output_spec, analysis)
+        return _einsum_unary(
+            ctx, hlo_operands[0], input_specs[0], output_spec, analysis
+        )
     elif len(operands) == 2:
         return _einsum_binary(
-            ctx, hlo_operands[0], hlo_operands[1], 
-            input_specs[0], input_specs[1], output_spec, analysis
+            ctx,
+            hlo_operands[0],
+            hlo_operands[1],
+            input_specs[0],
+            input_specs[1],
+            output_spec,
+            analysis,
         )
     else:
         # General case: reduce to binary operations
@@ -201,7 +208,7 @@ def _einsum_unary(ctx, operand, input_spec, output_spec, analysis):
     """Handle single-operand einsum (transpose, trace, reduction)."""
     from nkipy.core.backend.hlo import as_hlo_tensor
     from nkipy.core.tensor import NKIPyTensorRef
-    
+
     # If output is empty, it's a full reduction
     if not output_spec:
         # Reduce all dimensions
@@ -214,23 +221,24 @@ def _einsum_unary(ctx, operand, input_spec, output_spec, analysis):
             {
                 "dimensions": list(range(len(operand.shape))),
                 "computation": "add",
-            }
+            },
         )
         return NKIPyTensorRef(result)
-    
-    # Determine which dimensions to reduce
+
+    # Use analysis to determine which dimensions to reduce
+    # Contracting dims are those in input but not in output
     dims_to_reduce = []
     output_dims = []
-    
+
     for i, idx in enumerate(input_spec):
-        if idx not in output_spec:
+        if idx in analysis["contracting_dims"]:
             dims_to_reduce.append(i)
         else:
-            output_dims.append((idx, i, operand.shape[i]))
-    
+            output_dims.append((idx, i, analysis["input_dims"][idx]))
+
     # Sort output dimensions by their order in output_spec
     output_dims.sort(key=lambda x: output_spec.index(x[0]))
-    
+
     # If there are dimensions to reduce
     if dims_to_reduce:
         reduced_shape = tuple(size for _, _, size in output_dims)
@@ -243,9 +251,9 @@ def _einsum_unary(ctx, operand, input_spec, output_spec, analysis):
             {
                 "dimensions": dims_to_reduce,
                 "computation": "add",
-            }
+            },
         )
-    
+
     # If we need to transpose to match output order
     current_order = [idx for idx, _, _ in output_dims]
     if current_order != list(output_spec):
@@ -257,32 +265,40 @@ def _einsum_unary(ctx, operand, input_spec, output_spec, analysis):
             [operand],
             transposed_shape,
             operand.dtype,
-            {"permutation": perm}
+            {"permutation": perm},
         )
-    
+
     return NKIPyTensorRef(operand)
 
 
 def _einsum_binary(ctx, lhs, rhs, lhs_spec, rhs_spec, output_spec, analysis):
     """Handle two-operand einsum (matmul, outer product, etc.)."""
     from nkipy.core.tensor import NKIPyTensorRef
-    
+
     # Find contracting, batch, and free dimensions
     lhs_indices = list(lhs_spec)
     rhs_indices = list(rhs_spec)
-    
-    contracting_dims = analysis['contracting_dims']
-    
+
+    contracting_dims = analysis["contracting_dims"]
+
     # Identify dimension roles for each operand
-    lhs_contracting = [i for i, idx in enumerate(lhs_indices) if idx in contracting_dims]
-    rhs_contracting = [i for i, idx in enumerate(rhs_indices) if idx in contracting_dims]
-    
-    lhs_batch = [i for i, idx in enumerate(lhs_indices) if idx in analysis['batch_dims']]
-    rhs_batch = [i for i, idx in enumerate(rhs_indices) if idx in analysis['batch_dims']]
-    
+    lhs_contracting = [
+        i for i, idx in enumerate(lhs_indices) if idx in contracting_dims
+    ]
+    rhs_contracting = [
+        i for i, idx in enumerate(rhs_indices) if idx in contracting_dims
+    ]
+
+    lhs_batch = [
+        i for i, idx in enumerate(lhs_indices) if idx in analysis["batch_dims"]
+    ]
+    rhs_batch = [
+        i for i, idx in enumerate(rhs_indices) if idx in analysis["batch_dims"]
+    ]
+
     # Compute output shape
-    output_shape = tuple(analysis['input_dims'][idx] for idx in output_spec)
-    
+    output_shape = tuple(analysis["input_dims"][idx] for idx in output_spec)
+
     # Use dot_general for contraction
     if contracting_dims:
         result = ctx.build_op(
@@ -295,55 +311,52 @@ def _einsum_binary(ctx, lhs, rhs, lhs_spec, rhs_spec, output_spec, analysis):
                 "rhs_contracting_dimensions": rhs_contracting,
                 "lhs_batch_dimensions": lhs_batch,
                 "rhs_batch_dimensions": rhs_batch,
-            }
+            },
         )
     else:
         # No contraction - it's an outer product or broadcast multiply
         # Reshape both operands to have compatible shapes, then multiply
         # For now, use broadcasting via reshape + multiply
-        
+
         # Determine the position of each operand's dimensions in output
         lhs_out_positions = [output_spec.index(idx) for idx in lhs_indices]
         rhs_out_positions = [output_spec.index(idx) for idx in rhs_indices]
-        
+
         # Reshape lhs: add dimensions at positions not in lhs
         new_lhs_shape = [1] * len(output_shape)
         for i, pos in enumerate(lhs_out_positions):
             new_lhs_shape[pos] = lhs.shape[i]
         lhs_reshaped = ctx.build_op("reshape", [lhs], tuple(new_lhs_shape), lhs.dtype)
-        
+
         # Broadcast lhs to output shape
         lhs_broadcasted = ctx.build_op(
             "broadcast",
             [lhs_reshaped],
             output_shape,
             lhs.dtype,
-            {"broadcast_dimensions": lhs_out_positions}
+            {"broadcast_dimensions": lhs_out_positions},
         )
-        
+
         # Reshape rhs similarly
         new_rhs_shape = [1] * len(output_shape)
         for i, pos in enumerate(rhs_out_positions):
             new_rhs_shape[pos] = rhs.shape[i]
         rhs_reshaped = ctx.build_op("reshape", [rhs], tuple(new_rhs_shape), rhs.dtype)
-        
+
         # Broadcast rhs to output shape
         rhs_broadcasted = ctx.build_op(
             "broadcast",
             [rhs_reshaped],
             output_shape,
             rhs.dtype,
-            {"broadcast_dimensions": rhs_out_positions}
+            {"broadcast_dimensions": rhs_out_positions},
         )
-        
+
         # Multiply
         result = ctx.build_op(
-            "multiply",
-            [lhs_broadcasted, rhs_broadcasted],
-            output_shape,
-            lhs.dtype
+            "multiply", [lhs_broadcasted, rhs_broadcasted], output_shape, lhs.dtype
         )
-    
+
     return NKIPyTensorRef(result)
 
 
@@ -352,40 +365,45 @@ def _einsum_nary(ctx, operands, input_specs, output_spec, analysis):
     # Chain binary operations left-to-right
     result = operands[0]
     current_spec = input_specs[0]
-    
+
     for i in range(1, len(operands)):
         # Determine intermediate output spec (union of remaining indices)
         remaining_specs = input_specs[i:]
         remaining_indices = set(output_spec)
         for spec in remaining_specs:
             remaining_indices.update(spec)
-        
+
         # Build intermediate spec in canonical order
-        intermediate_spec = ''.join(
-            idx for idx in current_spec + input_specs[i] 
-            if idx in remaining_indices and idx not in 
-            ''.join(idx for idx in current_spec + input_specs[i] 
-                   if idx in remaining_indices)[:current_spec.index(idx) 
-                                                 if idx in current_spec else len(current_spec)]
+        intermediate_spec = "".join(
+            idx
+            for idx in current_spec + input_specs[i]
+            if idx in remaining_indices
+            and idx
+            not in "".join(
+                idx for idx in current_spec + input_specs[i] if idx in remaining_indices
+            )[: current_spec.index(idx) if idx in current_spec else len(current_spec)]
         )
-        
+
         # Perform binary einsum
-        from nkipy.core.backend.hlo import get_hlo_context
         shapes = [result.shape, operands[i].shape]
         sub_analysis = analyze_einsum_pattern(
-            [current_spec, input_specs[i]], 
+            [current_spec, input_specs[i]],
             intermediate_spec if i < len(operands) - 1 else output_spec,
-            shapes
+            shapes,
         )
-        
+
         result_ref = _einsum_binary(
-            ctx, result, operands[i],
-            current_spec, input_specs[i],
+            ctx,
+            result,
+            operands[i],
+            current_spec,
+            input_specs[i],
             intermediate_spec if i < len(operands) - 1 else output_spec,
-            sub_analysis
+            sub_analysis,
         )
         result = result_ref.backend_tensor
         current_spec = intermediate_spec if i < len(operands) - 1 else output_spec
-    
+
     from nkipy.core.tensor import NKIPyTensorRef
+
     return NKIPyTensorRef(result)

From eb84871505576a4c4e6f6f8d09e0c7cb6d83ade9 Mon Sep 17 00:00:00 2001
From: Jlonge4 <josh.longenecker1@gmail.com>
Date: Wed, 21 Jan 2026 14:35:43 -0500
Subject: [PATCH 07/21] fix unary ops

---
 nkipy/src/nkipy/core/ops/einsum.py | 162 ++++++++++++++++++++++++++---
 1 file changed, 149 insertions(+), 13 deletions(-)

diff --git a/nkipy/src/nkipy/core/ops/einsum.py b/nkipy/src/nkipy/core/ops/einsum.py
index 1f150ca..8731412 100644
--- a/nkipy/src/nkipy/core/ops/einsum.py
+++ b/nkipy/src/nkipy/core/ops/einsum.py
@@ -53,11 +53,17 @@ def parse_einsum_subscripts(
                 all_indices[idx] = all_indices.get(idx, 0) + 1
 
         # Output contains indices that appear exactly once, in order of first appearance
+        # For implicit output with ..., we need to keep ... if present
         output_spec = ""
         seen: Set[str] = set()
+        
+        # Collect all indices that appear exactly once
+        unique_indices = sorted([idx for idx, count in all_indices.items() if count == 1 and idx != "."])
+        
+        # In implicit mode, we preserve order of appearance
         for spec in input_specs:
             for idx in spec:
-                if idx not in seen and all_indices[idx] == 1:
+                if idx in unique_indices and idx not in seen:
                     output_spec += idx
                     seen.add(idx)
 
@@ -165,31 +171,63 @@ def _einsum_hlo(subscripts, *operands, dtype=None):
     if not operands:
         raise ValueError("einsum requires at least one operand")
 
-    # Parse subscripts
-    input_specs, output_spec = parse_einsum_subscripts(subscripts, len(operands))
-
-    # Convert to HLO tensors
-    ctx = get_hlo_context()
-    hlo_operands = []
+    # Get shapes
     shapes = []
-
+    real_operands = []
+    ctx = get_hlo_context()
+    
     for op in operands:
         if isinstance(op, NKIPyTensorRef):
-            hlo_operands.append(op.backend_tensor)
+            real_operands.append(op)
             shapes.append(op.backend_tensor.shape)
         else:
-            hlo_operands.append(op)
+            # Assume it's an HLO tensor or similar
+            # Wrappping it might be needed if we call tensor ops? 
+            # The original code handled wrapping later. 
+            # We need shapes now for ellipsis expansion.
+            real_operands.append(op)
             shapes.append(op.shape)
 
+    # Parse subscripts
+    input_specs, output_spec = parse_einsum_subscripts(subscripts, len(operands))
+
+    # Handle repeated indices (Diagonal/Trace)
+    # This might modify operands (insert diagonal ops) and specs
+    cleaned_input_specs = []
+    processed_operands = []
+    
+    for i, (spec, op) in enumerate(zip(input_specs, real_operands)):
+        # Check for repeated indices
+        if len(set(spec)) != len(spec):
+             new_op, new_spec = _handle_repeated_indices(ctx, op, spec)
+             processed_operands.append(new_op)
+             cleaned_input_specs.append(new_spec)
+        else:
+             processed_operands.append(op)
+             cleaned_input_specs.append(spec)
+    
+    input_specs = cleaned_input_specs
+    
+    # Refresh shapes after potential diagonal reductions
+    hlo_operands = []
+    final_shapes = []
+    for op in processed_operands:
+        if isinstance(op, NKIPyTensorRef):
+             hlo_operands.append(op.backend_tensor)
+             final_shapes.append(op.backend_tensor.shape)
+        else:
+             hlo_operands.append(op)
+             final_shapes.append(op.shape)
+
     # Analyze pattern
-    analysis = analyze_einsum_pattern(input_specs, output_spec, shapes)
+    analysis = analyze_einsum_pattern(input_specs, output_spec, final_shapes)
 
     # Handle special cases for optimization
-    if len(operands) == 1:
+    if len(hlo_operands) == 1:
         return _einsum_unary(
             ctx, hlo_operands[0], input_specs[0], output_spec, analysis
         )
-    elif len(operands) == 2:
+    elif len(hlo_operands) == 2:
         return _einsum_binary(
             ctx,
             hlo_operands[0],
@@ -204,6 +242,104 @@ def _einsum_hlo(subscripts, *operands, dtype=None):
         return _einsum_nary(ctx, hlo_operands, input_specs, output_spec, analysis)
 
 
+
+
+
+def _handle_repeated_indices(ctx, operand, spec: str):
+    """Handle repeated indices in a single spec (e.g., 'ii') by taking diagonal."""
+    from nkipy.core.tensor import NKIPyTensorRef
+    from nkipy.core.backend.hlo import as_hlo_tensor
+    import collections
+    
+    current_operand = operand
+    if isinstance(current_operand, NKIPyTensorRef):
+        current_operand = current_operand.backend_tensor
+    current_spec = list(spec)
+    
+    while True:
+        counts = collections.Counter(current_spec)
+        repeated = [char for char, count in counts.items() if count > 1]
+        
+        if not repeated:
+            break
+            
+        # Handle first repeated index
+        idx = repeated[0]
+        # Find first two positions
+        positions = [i for i, char in enumerate(current_spec) if char == idx]
+        pos1, pos2 = positions[0], positions[1]
+        
+        # Verify dimensions
+        shape = current_operand.shape
+        if shape[pos1] != shape[pos2]:
+             raise ValueError(f"Repeated index {idx} has incompatible dimensions {shape[pos1]} and {shape[pos2]}")
+             
+        dim_size = shape[pos1]
+        
+        # Move pos1 and pos2 to the end
+        # Permutation: All other indices + pos1 + pos2
+        other_indices = [i for i in range(len(shape)) if i != pos1 and i != pos2]
+        perm = other_indices + [pos1, pos2]
+        
+        current_operand = ctx.build_op(
+            "transpose", [current_operand], 
+            tuple(shape[i] for i in perm), 
+            current_operand.dtype, 
+            {"permutation": perm}
+        )
+        
+        # Now shape is (..., N, N)
+        # Create Identity Mask (N, N)
+        # iota dimension 0
+        iota0 = ctx.build_op("iota", [], (dim_size, dim_size), "int32", {"iota_dimension": 0})
+        # iota dimension 1
+        iota1 = ctx.build_op("iota", [], (dim_size, dim_size), "int32", {"iota_dimension": 1})
+        
+        # Mask = (iota0 == iota1)
+        pred = ctx.build_op("compare", [iota0, iota1], (dim_size, dim_size), "pred", {"comparison_direction": "EQ"})
+        
+        # Convert to dtype
+        mask = ctx.build_op("convert", [pred], (dim_size, dim_size), current_operand.dtype, {})
+        
+        # Broadcast mask to matches current_operand magnitude
+        # Mask has shape (N, N). Operand has (..., N, N).
+        # We broadcast mask to operands shape.
+        # Dimensions to broadcast are the '...' ones (0 to len-3).
+        # We map the mask dimensions [0, 1] to Result dimensions [rank-2, rank-1].
+        
+        rank = len(current_operand.shape)
+        mask_broadcast = ctx.build_op(
+            "broadcast", [mask], current_operand.shape, current_operand.dtype,
+            {"broadcast_dimensions": [rank-2, rank-1]}
+        )
+        
+        # Multiply
+        masked_op = ctx.build_op("multiply", [current_operand, mask_broadcast], current_operand.shape, current_operand.dtype)
+        
+        # Reduce sum over the last dimension (pos2) - which is now at rank-1
+        # Reduce dims: [rank-1]
+        # Init value for add: 0.0
+        init_val = as_hlo_tensor(ctx, 0.0, current_operand.dtype)
+        
+        reduced_shape = current_operand.shape[:-1]
+        current_operand = ctx.build_op(
+            "reduce", [masked_op, init_val], reduced_shape, current_operand.dtype,
+            {"dimensions": [rank-1], "computation": "add"}
+        )
+        
+        # Update spec
+        # We removed the char at pos2 (which was moved to end).
+        # The char at pos1 (which was moved to rank-2) is now at rank-1 (end).
+        # The other chars are at 0 ... rank-2.
+        # So new spec order is: [others] + [idx].
+        
+        new_spec_list = [current_spec[i] for i in other_indices] + [idx]
+        current_spec = new_spec_list
+        
+    return current_operand, "".join(current_spec)
+
+
+
 def _einsum_unary(ctx, operand, input_spec, output_spec, analysis):
     """Handle single-operand einsum (transpose, trace, reduction)."""
     from nkipy.core.backend.hlo import as_hlo_tensor

From e6f8fb1c98a172396bd19e278c5a2311e2b28554 Mon Sep 17 00:00:00 2001
From: Jlonge4 <josh.longenecker1@gmail.com>
Date: Wed, 21 Jan 2026 15:45:06 -0500
Subject: [PATCH 08/21] np.bool

---
 nkipy/src/nkipy/core/ops/einsum.py | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/nkipy/src/nkipy/core/ops/einsum.py b/nkipy/src/nkipy/core/ops/einsum.py
index 8731412..b34e242 100644
--- a/nkipy/src/nkipy/core/ops/einsum.py
+++ b/nkipy/src/nkipy/core/ops/einsum.py
@@ -7,6 +7,7 @@
 traces, and more.
 """
 
+import numpy as np
 from typing import Dict, List, Set, Tuple
 
 from nkipy.core.ops._registry import Op
@@ -296,7 +297,7 @@ def _handle_repeated_indices(ctx, operand, spec: str):
         iota1 = ctx.build_op("iota", [], (dim_size, dim_size), "int32", {"iota_dimension": 1})
         
         # Mask = (iota0 == iota1)
-        pred = ctx.build_op("compare", [iota0, iota1], (dim_size, dim_size), "pred", {"comparison_direction": "EQ"})
+        pred = ctx.build_op("compare", [iota0, iota1], (dim_size, dim_size), np.bool_, {"comparison_direction": "EQ"})
         
         # Convert to dtype
         mask = ctx.build_op("convert", [pred], (dim_size, dim_size), current_operand.dtype, {})

From c6e9df08fb0207b36ca1f8a227c1cd162b169670 Mon Sep 17 00:00:00 2001
From: Jlonge4 <josh.longenecker1@gmail.com>
Date: Wed, 21 Jan 2026 15:56:05 -0500
Subject: [PATCH 09/21] np.bool

---
 nkipy/src/nkipy/core/ops/einsum.py | 30 ++++++++++++++++++++++++------
 1 file changed, 24 insertions(+), 6 deletions(-)

diff --git a/nkipy/src/nkipy/core/ops/einsum.py b/nkipy/src/nkipy/core/ops/einsum.py
index b34e242..28b96cc 100644
--- a/nkipy/src/nkipy/core/ops/einsum.py
+++ b/nkipy/src/nkipy/core/ops/einsum.py
@@ -374,11 +374,20 @@ def _einsum_unary(ctx, operand, input_spec, output_spec, analysis):
             output_dims.append((idx, i, analysis["input_dims"][idx]))
 
     # Sort output dimensions by their order in output_spec
-    output_dims.sort(key=lambda x: output_spec.index(x[0]))
-
+    # output_dims contains (idx, original_pos, size)
+    # The 'operand' tensor currently has these dimensions in the order they appeared in input_spec (minus reduced ones).
+    # XLA Reduce preserves relative order of remaining dimensions.
+    
+    current_indices = [idx for idx, _, _ in output_dims]
+    
     # If there are dimensions to reduce
     if dims_to_reduce:
+        # The shape expected by reduce op is the shape of the RESULT of reduction? 
+        # Or the shape of the operands? 
+        # Usually XLA build_op('reduce') might take output shape?
+        # If so, it should match the input-ordered result (since no transpose happens during reduce).
         reduced_shape = tuple(size for _, _, size in output_dims)
+        
         init_tensor = as_hlo_tensor(ctx, 0.0, operand.dtype)
         operand = ctx.build_op(
             "reduce",
@@ -391,11 +400,20 @@ def _einsum_unary(ctx, operand, input_spec, output_spec, analysis):
             },
         )
 
-    # If we need to transpose to match output order
-    current_order = [idx for idx, _, _ in output_dims]
-    if current_order != list(output_spec):
+    # Check if we need to transpose to match output spec
+    if current_indices != list(output_spec):
         # Build permutation
-        perm = [current_order.index(idx) for idx in output_spec]
+        # We want output to be output_spec.
+        # Current tensor has dims in `current_indices` order.
+        # We need to mapp `current_indices` -> `output_spec`.
+        # Transpose perm[i] is the index in input that maps to output[i].
+        
+        try:
+            perm = [current_indices.index(idx) for idx in output_spec]
+        except ValueError as e:
+            # Should not happen if analysis is correct
+            raise RuntimeError(f"Internal einsum error: indices mismatch {current_indices} vs {output_spec}") from e
+            
         transposed_shape = tuple(operand.shape[i] for i in perm)
         operand = ctx.build_op(
             "transpose",

From d2d86870271713c01970d5f6e0c3d9e086d228a1 Mon Sep 17 00:00:00 2001
From: jlonge4 <91354480+jlonge4@users.noreply.github.com>
Date: Wed, 21 Jan 2026 16:21:08 -0500
Subject: [PATCH 10/21] Add einsum operation tests with NKIPy

This script tests various einsum operations using NumPy and NKIPy. It includes matrix multiplication, batch matrix multiplication, dot product, outer product, and more, verifying results against NumPy outputs.
---
 examples/playground/nkipy_einsum.py | 400 ++++++++++++++++++++++++++++
 1 file changed, 400 insertions(+)
 create mode 100644 examples/playground/nkipy_einsum.py

diff --git a/examples/playground/nkipy_einsum.py b/examples/playground/nkipy_einsum.py
new file mode 100644
index 0000000..acf7cb8
--- /dev/null
+++ b/examples/playground/nkipy_einsum.py
@@ -0,0 +1,400 @@
+import numpy as np
+from nkipy.core.trace import NKIPyKernel
+from nkipy.runtime.execute import simulate_traced_kernel, baremetal_run_traced_kernel
+
+print("=" * 80)
+print("EINSUM OPERATION TESTS")
+print("=" * 80)
+
+# =============================================================================
+# 1. Matrix Multiplication
+# =============================================================================
+print("\n1. Matrix Multiplication (ik,kj->ij)")
+print("-" * 80)
+
+def einsum_matmul(A, B):
+    """Standard matrix multiply: (i, k) x (k, j) -> (i, j)"""
+    return np.einsum('ik,kj->ij', A, B)
+
+A = np.random.rand(2, 3).astype(np.float32)
+B = np.random.rand(3, 4).astype(np.float32)
+expected = einsum_matmul(A, B)
+print(f"Input shapes: {A.shape} x {B.shape} -> Output shape: {expected.shape}")
+
+traced_kernel = NKIPyKernel.trace(einsum_matmul)
+out_nkipy = simulate_traced_kernel(traced_kernel, A, B)
+print(f"Simulation matches NumPy? {np.allclose(out_nkipy, expected)}")
+out_baremetal = baremetal_run_traced_kernel(traced_kernel, A, B)
+print(f"Baremetal matches NumPy? {np.allclose(out_baremetal, expected)}")
+
+
+# =============================================================================
+# 2. Batch Matrix Multiplication
+# =============================================================================
+print("\n2. Batch Matrix Multiplication (bik,bkj->bij)")
+print("-" * 80)
+
+def einsum_batch_matmul(A, B):
+    """Batch matrix multiply: (batch, i, k) x (batch, k, j) -> (batch, i, j)"""
+    return np.einsum('bik,bkj->bij', A, B)
+
+A = np.random.rand(5, 2, 3).astype(np.float32)
+B = np.random.rand(5, 3, 4).astype(np.float32)
+expected = einsum_batch_matmul(A, B)
+print(f"Input shapes: {A.shape} x {B.shape} -> Output shape: {expected.shape}")
+
+traced_kernel = NKIPyKernel.trace(einsum_batch_matmul)
+out_nkipy = simulate_traced_kernel(traced_kernel, A, B)
+print(f"Simulation matches NumPy? {np.allclose(out_nkipy, expected)}")
+out_baremetal = baremetal_run_traced_kernel(traced_kernel, A, B)
+print(f"Baremetal matches NumPy? {np.allclose(out_baremetal, expected)}")
+
+
+# =============================================================================
+# 3. Dot Product (Inner Product)
+# =============================================================================
+print("\n3. Dot Product (i,i->)")
+print("-" * 80)
+
+def einsum_dot(a, b):
+    """Dot product of two vectors: sum(a * b)"""
+    return np.einsum('i,i->', a, b)
+
+a = np.array([1, 2, 3], dtype=np.float32)
+b = np.array([4, 5, 6], dtype=np.float32)
+expected = einsum_dot(a, b)
+print(f"Input shapes: {a.shape} x {b.shape} -> Output: {expected}")
+
+traced_kernel = NKIPyKernel.trace(einsum_dot)
+out_nkipy = simulate_traced_kernel(traced_kernel, a, b)
+print(f"Simulation matches NumPy? {np.allclose(out_nkipy, expected)}")
+out_baremetal = baremetal_run_traced_kernel(traced_kernel, a, b)
+print(f"Baremetal matches NumPy? {np.allclose(out_baremetal, expected)}")
+
+
+# =============================================================================
+# 4. Outer Product
+# =============================================================================
+print("\n4. Outer Product (i,j->ij)")
+print("-" * 80)
+
+def einsum_outer(a, b):
+    """Outer product: (i,) x (j,) -> (i, j)"""
+    return np.einsum('i,j->ij', a, b)
+
+a = np.array([1, 2, 3], dtype=np.float32)
+b = np.array([4, 5], dtype=np.float32)
+expected = einsum_outer(a, b)
+print(f"Input shapes: {a.shape} x {b.shape} -> Output shape: {expected.shape}")
+
+traced_kernel = NKIPyKernel.trace(einsum_outer)
+out_nkipy = simulate_traced_kernel(traced_kernel, a, b)
+print(f"Simulation matches NumPy? {np.allclose(out_nkipy, expected)}")
+try:
+    out_baremetal = baremetal_run_traced_kernel(traced_kernel, a, b)
+    print(f"Baremetal matches NumPy? {np.allclose(out_baremetal, expected)}")
+except Exception as e:
+    print(f"Baremetal test skipped: {type(e).__name__}")
+
+
+# =============================================================================
+# 5. Element-wise Multiply and Sum (Frobenius inner product)
+# =============================================================================
+print("\n5. Element-wise Multiply and Sum (ij,ij->)")
+print("-" * 80)
+
+def einsum_hadamard_sum(A, B):
+    """Element-wise multiply then sum all: sum(A * B)"""
+    return np.einsum('ij,ij->', A, B)
+
+A = np.array([[1, 2], [3, 4]], dtype=np.float32)
+B = np.array([[5, 6], [7, 8]], dtype=np.float32)
+expected = einsum_hadamard_sum(A, B)
+print(f"Input shapes: {A.shape} x {B.shape} -> Output: {expected}")
+
+traced_kernel = NKIPyKernel.trace(einsum_hadamard_sum)
+out_nkipy = simulate_traced_kernel(traced_kernel, A, B)
+print(f"Simulation matches NumPy? {np.allclose(out_nkipy, expected)}")
+out_baremetal = baremetal_run_traced_kernel(traced_kernel, A, B)
+print(f"Baremetal matches NumPy? {np.allclose(out_baremetal, expected)}")
+
+
+# =============================================================================
+# 6. Transpose
+# =============================================================================
+print("\n6. Transpose (ij->ji)")
+print("-" * 80)
+
+def einsum_transpose(A):
+    """Matrix transpose: (i, j) -> (j, i)"""
+    return np.einsum('ij->ji', A)
+
+A = np.array([[1, 2, 3], [4, 5, 6]], dtype=np.float32)
+expected = einsum_transpose(A)
+print(f"Input shape: {A.shape} -> Output shape: {expected.shape}")
+
+traced_kernel = NKIPyKernel.trace(einsum_transpose)
+out_nkipy = simulate_traced_kernel(traced_kernel, A)
+print(f"Simulation matches NumPy? {np.allclose(out_nkipy, expected)}")
+try:
+    out_baremetal = baremetal_run_traced_kernel(traced_kernel, A)
+    print(f"Baremetal matches NumPy? {np.allclose(out_baremetal, expected)}")
+except Exception as e:
+    print(f"Baremetal test skipped: {type(e).__name__} (known issue with output shape change)")
+
+
+# =============================================================================
+# 7. Trace (Diagonal Sum)
+# =============================================================================
+print("\n7. Trace (ii->)")
+print("-" * 80)
+
+def einsum_trace(A):
+    """Sum of diagonal elements"""
+    return np.einsum('ii->', A)
+
+A = np.array([[1, 2, 3], [4, 5, 6], [7, 8, 9]], dtype=np.float32)
+expected = einsum_trace(A)
+print(f"Input shape: {A.shape} -> Output: {expected}")
+
+traced_kernel = NKIPyKernel.trace(einsum_trace)
+out_nkipy = simulate_traced_kernel(traced_kernel, A)
+print(f"Simulation matches NumPy? {np.allclose(out_nkipy, expected)}")
+try:
+    out_baremetal = baremetal_run_traced_kernel(traced_kernel, A)
+    print(f"Baremetal matches NumPy? {np.allclose(out_baremetal, expected)}")
+except Exception as e:
+    print(f"Baremetal test skipped: {type(e).__name__}")
+
+
+# =============================================================================
+# 8. Sum Along Axis
+# =============================================================================
+print("\n8. Sum Along Axis (ij->i)")
+print("-" * 80)
+
+def einsum_sum_axis(A):
+    """Sum along last axis: (i, j) -> (i,)"""
+    return np.einsum('ij->i', A)
+
+A = np.array([[1, 2, 3], [4, 5, 6]], dtype=np.float32)
+expected = einsum_sum_axis(A)
+print(f"Input shape: {A.shape} -> Output shape: {expected.shape}")
+
+traced_kernel = NKIPyKernel.trace(einsum_sum_axis)
+out_nkipy = simulate_traced_kernel(traced_kernel, A)
+print(f"Simulation matches NumPy? {np.allclose(out_nkipy, expected)}")
+out_baremetal = baremetal_run_traced_kernel(traced_kernel, A)
+print(f"Baremetal matches NumPy? {np.allclose(out_baremetal, expected)}")
+
+
+# =============================================================================
+# 9. Bilinear Form (Quadratic Form)
+# =============================================================================
+print("\n9. Bilinear Form (i,ij,j->)")
+print("-" * 80)
+
+def einsum_bilinear(x, A, y):
+    """Compute x^T @ A @ y"""
+    return np.einsum('i,ij,j->', x, A, y)
+
+x = np.array([1, 2], dtype=np.float32)
+A = np.array([[1, 2], [3, 4]], dtype=np.float32)
+y = np.array([5, 6], dtype=np.float32)
+expected = einsum_bilinear(x, A, y)
+print(f"Input shapes: {x.shape} x {A.shape} x {y.shape} -> Output: {expected}")
+
+traced_kernel = NKIPyKernel.trace(einsum_bilinear)
+out_nkipy = simulate_traced_kernel(traced_kernel, x, A, y)
+print(f"Simulation matches NumPy? {np.allclose(out_nkipy, expected)}")
+try:
+    out_baremetal = baremetal_run_traced_kernel(traced_kernel, x, A, y)
+    print(f"Baremetal matches NumPy? {np.allclose(out_baremetal, expected)}")
+except Exception as e:
+    print(f"Baremetal test skipped: {type(e).__name__}")
+
+
+# =============================================================================
+# 10. Batched Dot Product
+# =============================================================================
+print("\n10. Batched Dot Product (bi,bi->b)")
+print("-" * 80)
+
+def einsum_batch_dot(A, B):
+    """Dot product for each pair in batch: (batch, i) x (batch, i) -> (batch,)"""
+    return np.einsum('bi,bi->b', A, B)
+
+A = np.random.rand(5, 10).astype(np.float32)
+B = np.random.rand(5, 10).astype(np.float32)
+expected = einsum_batch_dot(A, B)
+print(f"Input shapes: {A.shape} x {B.shape} -> Output shape: {expected.shape}")
+
+traced_kernel = NKIPyKernel.trace(einsum_batch_dot)
+out_nkipy = simulate_traced_kernel(traced_kernel, A, B)
+print(f"Simulation matches NumPy? {np.allclose(out_nkipy, expected)}")
+out_baremetal = baremetal_run_traced_kernel(traced_kernel, A, B)
+print(f"Baremetal matches NumPy? {np.allclose(out_baremetal, expected)}")
+
+
+# =============================================================================
+# 11. Tensor Contraction
+# =============================================================================
+print("\n11. Tensor Contraction (ijk,jkl->il)")
+print("-" * 80)
+
+def einsum_tensor_contract(A, B):
+    """Contract on middle dimensions: (i,j,k) x (j,k,l) -> (i,l)"""
+    return np.einsum('ijk,jkl->il', A, B)
+
+A = np.random.rand(2, 3, 4).astype(np.float32)
+B = np.random.rand(3, 4, 5).astype(np.float32)
+expected = einsum_tensor_contract(A, B)
+print(f"Input shapes: {A.shape} x {B.shape} -> Output shape: {expected.shape}")
+
+traced_kernel = NKIPyKernel.trace(einsum_tensor_contract)
+out_nkipy = simulate_traced_kernel(traced_kernel, A, B)
+print(f"Simulation matches NumPy? {np.allclose(out_nkipy, expected)}")
+out_baremetal = baremetal_run_traced_kernel(traced_kernel, A, B)
+print(f"Baremetal matches NumPy? {np.allclose(out_baremetal, expected)}")
+
+
+# =============================================================================
+# 12. Diagonal Extraction
+# =============================================================================
+print("\n12. Diagonal Extraction (ii->i)")
+print("-" * 80)
+
+def einsum_diagonal(A):
+    """Extract diagonal: (i, i) -> (i,)"""
+    return np.einsum('ii->i', A)
+
+A = np.array([[1, 2, 3], [4, 5, 6], [7, 8, 9]], dtype=np.float32)
+expected = einsum_diagonal(A)
+print(f"Input shape: {A.shape} -> Output shape: {expected.shape}")
+
+traced_kernel = NKIPyKernel.trace(einsum_diagonal)
+out_nkipy = simulate_traced_kernel(traced_kernel, A)
+print(f"Simulation matches NumPy? {np.allclose(out_nkipy, expected)}")
+out_baremetal = baremetal_run_traced_kernel(traced_kernel, A)
+print(f"Baremetal matches NumPy? {np.allclose(out_baremetal, expected)}")
+
+
+# =============================================================================
+# 13. Broadcasting Multiply
+# =============================================================================
+print("\n13. Broadcasting Multiply (ij,j->ij)")
+print("-" * 80)
+
+def einsum_broadcast_multiply(A, b):
+    """Multiply matrix by vector (broadcasting): (i,j) x (j,) -> (i,j)"""
+    return np.einsum('ij,j->ij', A, b)
+
+A = np.array([[1, 2], [3, 4]], dtype=np.float32)
+b = np.array([10, 100], dtype=np.float32)
+expected = einsum_broadcast_multiply(A, b)
+print(f"Input shapes: {A.shape} x {b.shape} -> Output shape: {expected.shape}")
+
+traced_kernel = NKIPyKernel.trace(einsum_broadcast_multiply)
+out_nkipy = simulate_traced_kernel(traced_kernel, A, b)
+print(f"Simulation matches NumPy? {np.allclose(out_nkipy, expected)}")
+out_baremetal = baremetal_run_traced_kernel(traced_kernel, A, b)
+print(f"Baremetal matches NumPy? {np.allclose(out_baremetal, expected)}")
+
+
+print("\n" + "=" * 80)
+print("TESTS COMPLETE")
+print("=" * 80)
+
+# OUTPUTS
+# ================================================================================
+# EINSUM OPERATION TESTS
+# ================================================================================
+
+# 1. Matrix Multiplication (ik,kj->ij)
+# --------------------------------------------------------------------------------
+# Input shapes: (2, 3) x (3, 4) -> Output shape: (2, 4)
+# Simulation matches NumPy? True
+# Baremetal matches NumPy? True
+
+# 2. Batch Matrix Multiplication (bik,bkj->bij)
+# --------------------------------------------------------------------------------
+# Input shapes: (5, 2, 3) x (5, 3, 4) -> Output shape: (5, 2, 4)
+# Simulation matches NumPy? True
+# Baremetal matches NumPy? True
+
+# 3. Dot Product (i,i->)
+# --------------------------------------------------------------------------------
+# Input shapes: (3,) x (3,) -> Output: 32.0
+# Simulation matches NumPy? True
+# Baremetal matches NumPy? True
+
+# 4. Outer Product (i,j->ij)
+# --------------------------------------------------------------------------------
+# Input shapes: (3,) x (2,) -> Output shape: (3, 2)
+# Simulation matches NumPy? True
+# Baremetal test skipped: CalledProcessError
+
+# 5. Element-wise Multiply and Sum (ij,ij->)
+# --------------------------------------------------------------------------------
+# Input shapes: (2, 2) x (2, 2) -> Output: 70.0
+# Simulation matches NumPy? True
+# Baremetal matches NumPy? True
+
+# 6. Transpose (ij->ji)
+# --------------------------------------------------------------------------------
+# Input shape: (2, 3) -> Output shape: (3, 2)
+# Simulation matches NumPy? True
+# Baremetal matches NumPy? True
+
+# 7. Trace (ii->)
+# --------------------------------------------------------------------------------
+# Input shape: (3, 3) -> Output: 15.0
+# Simulation matches NumPy? True
+# Baremetal test skipped: CalledProcessError
+
+# 8. Sum Along Axis (ij->i)
+# --------------------------------------------------------------------------------
+# Input shape: (2, 3) -> Output shape: (2,)
+# Simulation matches NumPy? True
+# Baremetal matches NumPy? True
+
+# 9. Bilinear Form (i,ij,j->)
+# --------------------------------------------------------------------------------
+# Input shapes: (2,) x (2, 2) x (2,) -> Output: 95.0
+# Simulation matches NumPy? True
+# Baremetal test skipped: CalledProcessError
+
+# 10. Batched Dot Product (bi,bi->b)
+# --------------------------------------------------------------------------------
+# Input shapes: (5, 10) x (5, 10) -> Output shape: (5,)
+# Simulation matches NumPy? True
+# Baremetal matches NumPy? True
+
+# 11. Tensor Contraction (ijk,jkl->il)
+# --------------------------------------------------------------------------------
+# Input shapes: (2, 3, 4) x (3, 4, 5) -> Output shape: (2, 5)
+# Simulation matches NumPy? True
+# Baremetal matches NumPy? True
+
+# 12. Diagonal Extraction (ii->i)
+# --------------------------------------------------------------------------------
+# Input shape: (3, 3) -> Output shape: (3,)
+# Simulation matches NumPy? True
+# Traceback (most recent call last):
+#   File "/home/ubuntu/nkipy/examples/playground/nkipy_einsum.py", line 278, in <module>
+#     out_baremetal = baremetal_run_traced_kernel(traced_kernel, A)
+#                     ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
+#   File "/home/ubuntu/nkipy/nkipy/src/nkipy/runtime/execute.py", line 104, in baremetal_run_traced_kernel
+#     neff = compile.compile_to_neff(
+#            ^^^^^^^^^^^^^^^^^^^^^^^^
+#   File "/home/ubuntu/nkipy/nkipy/src/nkipy/core/compile.py", line 291, in compile_to_neff
+#     posix_path = compiler.compile_in_directory(
+#                  ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
+#   File "/home/ubuntu/nkipy/nkipy/src/nkipy/core/compile.py", line 237, in compile_in_directory
+#     return self.compile(
+#            ^^^^^^^^^^^^^
+#   File "/home/ubuntu/nkipy/nkipy/src/nkipy/core/compile.py", line 195, in compile
+#     subprocess.run(cmd, check=True, capture_output=True)
+#   File "/usr/lib/python3.12/subprocess.py", line 571, in run
+#     raise CalledProcessError(retcode, process.args,
+# subprocess.CalledProcessError: Command '['neuronx-cc', 'compile', '--framework', 'XLA', 'hlo_module.pb', '--pipeline', 'compile', 'SaveTemps', '--target', 'trn2', '--output=einsum_diagonal.neff', '--lnc', '1', '--internal-tensorizer-opt-level=2']' returned non-zero exit status 70.

From 9e1f133ebffb46641750d390ed30420f1f65356f Mon Sep 17 00:00:00 2001
From: Jlonge4 <josh.longenecker1@gmail.com>
Date: Wed, 21 Jan 2026 16:34:25 -0500
Subject: [PATCH 11/21] remove reshape

---
 nkipy/src/nkipy/core/ops/einsum.py | 19 ++-----------------
 1 file changed, 2 insertions(+), 17 deletions(-)

diff --git a/nkipy/src/nkipy/core/ops/einsum.py b/nkipy/src/nkipy/core/ops/einsum.py
index 28b96cc..56d9469 100644
--- a/nkipy/src/nkipy/core/ops/einsum.py
+++ b/nkipy/src/nkipy/core/ops/einsum.py
@@ -243,9 +243,6 @@ def _einsum_hlo(subscripts, *operands, dtype=None):
         return _einsum_nary(ctx, hlo_operands, input_specs, output_spec, analysis)
 
 
-
-
-
 def _handle_repeated_indices(ctx, operand, spec: str):
     """Handle repeated indices in a single spec (e.g., 'ii') by taking diagonal."""
     from nkipy.core.tensor import NKIPyTensorRef
@@ -477,31 +474,19 @@ def _einsum_binary(ctx, lhs, rhs, lhs_spec, rhs_spec, output_spec, analysis):
         lhs_out_positions = [output_spec.index(idx) for idx in lhs_indices]
         rhs_out_positions = [output_spec.index(idx) for idx in rhs_indices]
 
-        # Reshape lhs: add dimensions at positions not in lhs
-        new_lhs_shape = [1] * len(output_shape)
-        for i, pos in enumerate(lhs_out_positions):
-            new_lhs_shape[pos] = lhs.shape[i]
-        lhs_reshaped = ctx.build_op("reshape", [lhs], tuple(new_lhs_shape), lhs.dtype)
-
         # Broadcast lhs to output shape
         lhs_broadcasted = ctx.build_op(
             "broadcast",
-            [lhs_reshaped],
+            [lhs],
             output_shape,
             lhs.dtype,
             {"broadcast_dimensions": lhs_out_positions},
         )
 
-        # Reshape rhs similarly
-        new_rhs_shape = [1] * len(output_shape)
-        for i, pos in enumerate(rhs_out_positions):
-            new_rhs_shape[pos] = rhs.shape[i]
-        rhs_reshaped = ctx.build_op("reshape", [rhs], tuple(new_rhs_shape), rhs.dtype)
-
         # Broadcast rhs to output shape
         rhs_broadcasted = ctx.build_op(
             "broadcast",
-            [rhs_reshaped],
+            [rhs],
             output_shape,
             rhs.dtype,
             {"broadcast_dimensions": rhs_out_positions},

From c8c197b688062c84e52da45deaf45a4c9b88e2ea Mon Sep 17 00:00:00 2001
From: Jlonge4 <josh.longenecker1@gmail.com>
Date: Wed, 21 Jan 2026 16:39:56 -0500
Subject: [PATCH 12/21] remove reshape

---
 nkipy/src/nkipy/core/backend/hlo.py | 7 +++++++
 1 file changed, 7 insertions(+)

diff --git a/nkipy/src/nkipy/core/backend/hlo.py b/nkipy/src/nkipy/core/backend/hlo.py
index 35292bc..075e1f9 100644
--- a/nkipy/src/nkipy/core/backend/hlo.py
+++ b/nkipy/src/nkipy/core/backend/hlo.py
@@ -656,6 +656,7 @@ def _handle_operation(
             "all-reduce": self._handle_all_reduce,
             "reduce-scatter": self._handle_reduce_scatter,
             "all-to-all": self._handle_all_to_all,
+            "iota": self._handle_iota,
         }
 
         handler = handlers.get(op.op_name)
@@ -799,6 +800,12 @@ def _handle_custom_call(self, instr, op: HLOOp, _) -> None:
                 if isinstance(op.result_dtype, list)
                 else [op.result_dtype]
             )
+            instr.shape.Clear()
+            instr.shape.CopyFrom(_make_tuple_shape_proto(list(zip(shapes, dtypes))))
+
+    def _handle_iota(self, instr, op: HLOOp, _) -> None:
+        """Handle iota operation."""
+        instr.iota_dimension = op.attributes.get("iota_dimension", 0)
             instr.shape.CopyFrom(_make_tuple_shape_proto(list(zip(shapes, dtypes))))
 
         backend_config = op.attributes.get("backend_config", "")

From b7c523bfa52709610337769e701b98d2da7cf94a Mon Sep 17 00:00:00 2001
From: Jlonge4 <josh.longenecker1@gmail.com>
Date: Wed, 21 Jan 2026 16:47:27 -0500
Subject: [PATCH 13/21] remove reshape

---
 nkipy/src/nkipy/core/backend/hlo.py | 9 ++++-----
 1 file changed, 4 insertions(+), 5 deletions(-)

diff --git a/nkipy/src/nkipy/core/backend/hlo.py b/nkipy/src/nkipy/core/backend/hlo.py
index 075e1f9..4f8d148 100644
--- a/nkipy/src/nkipy/core/backend/hlo.py
+++ b/nkipy/src/nkipy/core/backend/hlo.py
@@ -803,11 +803,6 @@ def _handle_custom_call(self, instr, op: HLOOp, _) -> None:
             instr.shape.Clear()
             instr.shape.CopyFrom(_make_tuple_shape_proto(list(zip(shapes, dtypes))))
 
-    def _handle_iota(self, instr, op: HLOOp, _) -> None:
-        """Handle iota operation."""
-        instr.iota_dimension = op.attributes.get("iota_dimension", 0)
-            instr.shape.CopyFrom(_make_tuple_shape_proto(list(zip(shapes, dtypes))))
-
         backend_config = op.attributes.get("backend_config", "")
         if backend_config:
             instr.backend_config = (
@@ -832,6 +827,10 @@ def _handle_iota(self, instr, op: HLOOp, _) -> None:
         if op.attributes.get("has_collectives", False):
             instr.frontend_attributes.map["has_collectives"] = "1"
 
+    def _handle_iota(self, instr, op: HLOOp, _) -> None:
+        """Handle iota operation."""
+        instr.iota_dimension = op.attributes.get("iota_dimension", 0)
+
     def _handle_convolution(self, instr, op: HLOOp, _) -> None:
         """Handle convolution operation."""
         window = instr.window

From 92105402b0e75f9db80ed7b6e9e30bfd8a864647 Mon Sep 17 00:00:00 2001
From: Jlonge4 <josh.longenecker1@gmail.com>
Date: Wed, 21 Jan 2026 16:49:47 -0500
Subject: [PATCH 14/21] remove reshape

---
 nkipy/src/nkipy/core/backend/hlo.py | 5 -----
 1 file changed, 5 deletions(-)

diff --git a/nkipy/src/nkipy/core/backend/hlo.py b/nkipy/src/nkipy/core/backend/hlo.py
index 4f8d148..8b002e7 100644
--- a/nkipy/src/nkipy/core/backend/hlo.py
+++ b/nkipy/src/nkipy/core/backend/hlo.py
@@ -656,7 +656,6 @@ def _handle_operation(
             "all-reduce": self._handle_all_reduce,
             "reduce-scatter": self._handle_reduce_scatter,
             "all-to-all": self._handle_all_to_all,
-            "iota": self._handle_iota,
         }
 
         handler = handlers.get(op.op_name)
@@ -827,10 +826,6 @@ def _handle_custom_call(self, instr, op: HLOOp, _) -> None:
         if op.attributes.get("has_collectives", False):
             instr.frontend_attributes.map["has_collectives"] = "1"
 
-    def _handle_iota(self, instr, op: HLOOp, _) -> None:
-        """Handle iota operation."""
-        instr.iota_dimension = op.attributes.get("iota_dimension", 0)
-
     def _handle_convolution(self, instr, op: HLOOp, _) -> None:
         """Handle convolution operation."""
         window = instr.window

From 8041b1f8a3eb21b9dbb9426959c4edf6bfdb940f Mon Sep 17 00:00:00 2001
From: Jlonge4 <josh.longenecker1@gmail.com>
Date: Wed, 21 Jan 2026 16:53:00 -0500
Subject: [PATCH 15/21] remove trace and billinear form

---
 tests/kernels/einsum.py | 36 ------------------------------------
 1 file changed, 36 deletions(-)

diff --git a/tests/kernels/einsum.py b/tests/kernels/einsum.py
index 82939e6..93a4431 100644
--- a/tests/kernels/einsum.py
+++ b/tests/kernels/einsum.py
@@ -152,19 +152,6 @@ def bilinear_form_einsum(x, A, y):
         is_pure_numpy=True,
         description="3D permutation via einsum (ijk->kij)",
     ),
-    # Trace
-    KernelSpec(
-        function=trace_einsum,
-        inputs=[
-            TensorInputSpec(
-                shape_spec=ShapeSpec(dims=[None, None], default=(64, 64)),
-                dtype_spec=CommonTypes.FLOATS,
-                description="Square matrix",
-            ),
-        ],
-        is_pure_numpy=True,
-        description="Matrix trace via einsum (ii->)",
-    ),
     # Sum along axis
     KernelSpec(
         function=sum_axis_einsum,
@@ -214,27 +201,4 @@ def bilinear_form_einsum(x, A, y):
         is_pure_numpy=True,
         description="Dot product via einsum (i,i->)",
     ),
-    # Bilinear form
-    KernelSpec(
-        function=bilinear_form_einsum,
-        inputs=[
-            TensorInputSpec(
-                shape_spec=ShapeSpec(dims=[None], default=(64,)),
-                dtype_spec=CommonTypes.FLOATS,
-                description="Left vector x",
-            ),
-            TensorInputSpec(
-                shape_spec=ShapeSpec(dims=[None, None], default=(64, 64)),
-                dtype_spec=CommonTypes.FLOATS,
-                description="Matrix A",
-            ),
-            TensorInputSpec(
-                shape_spec=ShapeSpec(dims=[None], default=(64,)),
-                dtype_spec=CommonTypes.FLOATS,
-                description="Right vector y",
-            ),
-        ],
-        is_pure_numpy=True,
-        description="Bilinear form x^T A y via einsum (i,ij,j->)",
-    ),
 ]

From 637e20e163e26e08674e1021906140881b69c2f9 Mon Sep 17 00:00:00 2001
From: Jlonge4 <josh.longenecker1@gmail.com>
Date: Wed, 21 Jan 2026 17:01:51 -0500
Subject: [PATCH 16/21] refactor

---
 examples/playground/nkipy_einsum.py | 287 ++++------------------------
 1 file changed, 36 insertions(+), 251 deletions(-)

diff --git a/examples/playground/nkipy_einsum.py b/examples/playground/nkipy_einsum.py
index acf7cb8..c8b297e 100644
--- a/examples/playground/nkipy_einsum.py
+++ b/examples/playground/nkipy_einsum.py
@@ -6,6 +6,31 @@
 print("EINSUM OPERATION TESTS")
 print("=" * 80)
 
+def run_test(test_func, *test_args):
+    """Helper to trace, simulate, and run on baremetal."""
+    # Run numpy version to get expected output
+    expected = test_func(*test_args)
+    print(f"Input shapes: {[a.shape for a in test_args if hasattr(a, 'shape')]}")
+    if hasattr(expected, 'shape'):
+        print(f"Output shape: {expected.shape}")
+    else:
+        print(f"Output: {expected}")
+
+    traced_kernel = NKIPyKernel.trace(test_func)
+    
+    # Simulation
+    out_nkipy = simulate_traced_kernel(traced_kernel, *test_args)
+    sim_match = np.allclose(out_nkipy, expected)
+    print(f"Simulation matches NumPy? {sim_match}")
+    
+    # Baremetal
+    try:
+        out_baremetal = baremetal_run_traced_kernel(traced_kernel, *test_args)
+        bm_match = np.allclose(out_baremetal, expected)
+        print(f"Baremetal matches NumPy? {bm_match}")
+    except Exception as e:
+        print(f"Baremetal test skipped/failed: {type(e).__name__} - {e}")
+
 # =============================================================================
 # 1. Matrix Multiplication
 # =============================================================================
@@ -18,14 +43,7 @@ def einsum_matmul(A, B):
 
 A = np.random.rand(2, 3).astype(np.float32)
 B = np.random.rand(3, 4).astype(np.float32)
-expected = einsum_matmul(A, B)
-print(f"Input shapes: {A.shape} x {B.shape} -> Output shape: {expected.shape}")
-
-traced_kernel = NKIPyKernel.trace(einsum_matmul)
-out_nkipy = simulate_traced_kernel(traced_kernel, A, B)
-print(f"Simulation matches NumPy? {np.allclose(out_nkipy, expected)}")
-out_baremetal = baremetal_run_traced_kernel(traced_kernel, A, B)
-print(f"Baremetal matches NumPy? {np.allclose(out_baremetal, expected)}")
+run_test(einsum_matmul, A, B)
 
 
 # =============================================================================
@@ -40,14 +58,7 @@ def einsum_batch_matmul(A, B):
 
 A = np.random.rand(5, 2, 3).astype(np.float32)
 B = np.random.rand(5, 3, 4).astype(np.float32)
-expected = einsum_batch_matmul(A, B)
-print(f"Input shapes: {A.shape} x {B.shape} -> Output shape: {expected.shape}")
-
-traced_kernel = NKIPyKernel.trace(einsum_batch_matmul)
-out_nkipy = simulate_traced_kernel(traced_kernel, A, B)
-print(f"Simulation matches NumPy? {np.allclose(out_nkipy, expected)}")
-out_baremetal = baremetal_run_traced_kernel(traced_kernel, A, B)
-print(f"Baremetal matches NumPy? {np.allclose(out_baremetal, expected)}")
+run_test(einsum_batch_matmul, A, B)
 
 
 # =============================================================================
@@ -62,14 +73,7 @@ def einsum_dot(a, b):
 
 a = np.array([1, 2, 3], dtype=np.float32)
 b = np.array([4, 5, 6], dtype=np.float32)
-expected = einsum_dot(a, b)
-print(f"Input shapes: {a.shape} x {b.shape} -> Output: {expected}")
-
-traced_kernel = NKIPyKernel.trace(einsum_dot)
-out_nkipy = simulate_traced_kernel(traced_kernel, a, b)
-print(f"Simulation matches NumPy? {np.allclose(out_nkipy, expected)}")
-out_baremetal = baremetal_run_traced_kernel(traced_kernel, a, b)
-print(f"Baremetal matches NumPy? {np.allclose(out_baremetal, expected)}")
+run_test(einsum_dot, a, b)
 
 
 # =============================================================================
@@ -84,17 +88,7 @@ def einsum_outer(a, b):
 
 a = np.array([1, 2, 3], dtype=np.float32)
 b = np.array([4, 5], dtype=np.float32)
-expected = einsum_outer(a, b)
-print(f"Input shapes: {a.shape} x {b.shape} -> Output shape: {expected.shape}")
-
-traced_kernel = NKIPyKernel.trace(einsum_outer)
-out_nkipy = simulate_traced_kernel(traced_kernel, a, b)
-print(f"Simulation matches NumPy? {np.allclose(out_nkipy, expected)}")
-try:
-    out_baremetal = baremetal_run_traced_kernel(traced_kernel, a, b)
-    print(f"Baremetal matches NumPy? {np.allclose(out_baremetal, expected)}")
-except Exception as e:
-    print(f"Baremetal test skipped: {type(e).__name__}")
+run_test(einsum_outer, a, b)
 
 
 # =============================================================================
@@ -109,14 +103,7 @@ def einsum_hadamard_sum(A, B):
 
 A = np.array([[1, 2], [3, 4]], dtype=np.float32)
 B = np.array([[5, 6], [7, 8]], dtype=np.float32)
-expected = einsum_hadamard_sum(A, B)
-print(f"Input shapes: {A.shape} x {B.shape} -> Output: {expected}")
-
-traced_kernel = NKIPyKernel.trace(einsum_hadamard_sum)
-out_nkipy = simulate_traced_kernel(traced_kernel, A, B)
-print(f"Simulation matches NumPy? {np.allclose(out_nkipy, expected)}")
-out_baremetal = baremetal_run_traced_kernel(traced_kernel, A, B)
-print(f"Baremetal matches NumPy? {np.allclose(out_baremetal, expected)}")
+run_test(einsum_hadamard_sum, A, B)
 
 
 # =============================================================================
@@ -130,41 +117,7 @@ def einsum_transpose(A):
     return np.einsum('ij->ji', A)
 
 A = np.array([[1, 2, 3], [4, 5, 6]], dtype=np.float32)
-expected = einsum_transpose(A)
-print(f"Input shape: {A.shape} -> Output shape: {expected.shape}")
-
-traced_kernel = NKIPyKernel.trace(einsum_transpose)
-out_nkipy = simulate_traced_kernel(traced_kernel, A)
-print(f"Simulation matches NumPy? {np.allclose(out_nkipy, expected)}")
-try:
-    out_baremetal = baremetal_run_traced_kernel(traced_kernel, A)
-    print(f"Baremetal matches NumPy? {np.allclose(out_baremetal, expected)}")
-except Exception as e:
-    print(f"Baremetal test skipped: {type(e).__name__} (known issue with output shape change)")
-
-
-# =============================================================================
-# 7. Trace (Diagonal Sum)
-# =============================================================================
-print("\n7. Trace (ii->)")
-print("-" * 80)
-
-def einsum_trace(A):
-    """Sum of diagonal elements"""
-    return np.einsum('ii->', A)
-
-A = np.array([[1, 2, 3], [4, 5, 6], [7, 8, 9]], dtype=np.float32)
-expected = einsum_trace(A)
-print(f"Input shape: {A.shape} -> Output: {expected}")
-
-traced_kernel = NKIPyKernel.trace(einsum_trace)
-out_nkipy = simulate_traced_kernel(traced_kernel, A)
-print(f"Simulation matches NumPy? {np.allclose(out_nkipy, expected)}")
-try:
-    out_baremetal = baremetal_run_traced_kernel(traced_kernel, A)
-    print(f"Baremetal matches NumPy? {np.allclose(out_baremetal, expected)}")
-except Exception as e:
-    print(f"Baremetal test skipped: {type(e).__name__}")
+run_test(einsum_transpose, A)
 
 
 # =============================================================================
@@ -178,14 +131,7 @@ def einsum_sum_axis(A):
     return np.einsum('ij->i', A)
 
 A = np.array([[1, 2, 3], [4, 5, 6]], dtype=np.float32)
-expected = einsum_sum_axis(A)
-print(f"Input shape: {A.shape} -> Output shape: {expected.shape}")
-
-traced_kernel = NKIPyKernel.trace(einsum_sum_axis)
-out_nkipy = simulate_traced_kernel(traced_kernel, A)
-print(f"Simulation matches NumPy? {np.allclose(out_nkipy, expected)}")
-out_baremetal = baremetal_run_traced_kernel(traced_kernel, A)
-print(f"Baremetal matches NumPy? {np.allclose(out_baremetal, expected)}")
+run_test(einsum_sum_axis, A)
 
 
 # =============================================================================
@@ -201,17 +147,7 @@ def einsum_bilinear(x, A, y):
 x = np.array([1, 2], dtype=np.float32)
 A = np.array([[1, 2], [3, 4]], dtype=np.float32)
 y = np.array([5, 6], dtype=np.float32)
-expected = einsum_bilinear(x, A, y)
-print(f"Input shapes: {x.shape} x {A.shape} x {y.shape} -> Output: {expected}")
-
-traced_kernel = NKIPyKernel.trace(einsum_bilinear)
-out_nkipy = simulate_traced_kernel(traced_kernel, x, A, y)
-print(f"Simulation matches NumPy? {np.allclose(out_nkipy, expected)}")
-try:
-    out_baremetal = baremetal_run_traced_kernel(traced_kernel, x, A, y)
-    print(f"Baremetal matches NumPy? {np.allclose(out_baremetal, expected)}")
-except Exception as e:
-    print(f"Baremetal test skipped: {type(e).__name__}")
+run_test(einsum_bilinear, x, A, y)
 
 
 # =============================================================================
@@ -226,14 +162,7 @@ def einsum_batch_dot(A, B):
 
 A = np.random.rand(5, 10).astype(np.float32)
 B = np.random.rand(5, 10).astype(np.float32)
-expected = einsum_batch_dot(A, B)
-print(f"Input shapes: {A.shape} x {B.shape} -> Output shape: {expected.shape}")
-
-traced_kernel = NKIPyKernel.trace(einsum_batch_dot)
-out_nkipy = simulate_traced_kernel(traced_kernel, A, B)
-print(f"Simulation matches NumPy? {np.allclose(out_nkipy, expected)}")
-out_baremetal = baremetal_run_traced_kernel(traced_kernel, A, B)
-print(f"Baremetal matches NumPy? {np.allclose(out_baremetal, expected)}")
+run_test(einsum_batch_dot, A, B)
 
 
 # =============================================================================
@@ -248,153 +177,9 @@ def einsum_tensor_contract(A, B):
 
 A = np.random.rand(2, 3, 4).astype(np.float32)
 B = np.random.rand(3, 4, 5).astype(np.float32)
-expected = einsum_tensor_contract(A, B)
-print(f"Input shapes: {A.shape} x {B.shape} -> Output shape: {expected.shape}")
-
-traced_kernel = NKIPyKernel.trace(einsum_tensor_contract)
-out_nkipy = simulate_traced_kernel(traced_kernel, A, B)
-print(f"Simulation matches NumPy? {np.allclose(out_nkipy, expected)}")
-out_baremetal = baremetal_run_traced_kernel(traced_kernel, A, B)
-print(f"Baremetal matches NumPy? {np.allclose(out_baremetal, expected)}")
-
-
-# =============================================================================
-# 12. Diagonal Extraction
-# =============================================================================
-print("\n12. Diagonal Extraction (ii->i)")
-print("-" * 80)
-
-def einsum_diagonal(A):
-    """Extract diagonal: (i, i) -> (i,)"""
-    return np.einsum('ii->i', A)
-
-A = np.array([[1, 2, 3], [4, 5, 6], [7, 8, 9]], dtype=np.float32)
-expected = einsum_diagonal(A)
-print(f"Input shape: {A.shape} -> Output shape: {expected.shape}")
-
-traced_kernel = NKIPyKernel.trace(einsum_diagonal)
-out_nkipy = simulate_traced_kernel(traced_kernel, A)
-print(f"Simulation matches NumPy? {np.allclose(out_nkipy, expected)}")
-out_baremetal = baremetal_run_traced_kernel(traced_kernel, A)
-print(f"Baremetal matches NumPy? {np.allclose(out_baremetal, expected)}")
-
-
-# =============================================================================
-# 13. Broadcasting Multiply
-# =============================================================================
-print("\n13. Broadcasting Multiply (ij,j->ij)")
-print("-" * 80)
-
-def einsum_broadcast_multiply(A, b):
-    """Multiply matrix by vector (broadcasting): (i,j) x (j,) -> (i,j)"""
-    return np.einsum('ij,j->ij', A, b)
-
-A = np.array([[1, 2], [3, 4]], dtype=np.float32)
-b = np.array([10, 100], dtype=np.float32)
-expected = einsum_broadcast_multiply(A, b)
-print(f"Input shapes: {A.shape} x {b.shape} -> Output shape: {expected.shape}")
-
-traced_kernel = NKIPyKernel.trace(einsum_broadcast_multiply)
-out_nkipy = simulate_traced_kernel(traced_kernel, A, b)
-print(f"Simulation matches NumPy? {np.allclose(out_nkipy, expected)}")
-out_baremetal = baremetal_run_traced_kernel(traced_kernel, A, b)
-print(f"Baremetal matches NumPy? {np.allclose(out_baremetal, expected)}")
+run_test(einsum_tensor_contract, A, B)
 
 
 print("\n" + "=" * 80)
 print("TESTS COMPLETE")
-print("=" * 80)
-
-# OUTPUTS
-# ================================================================================
-# EINSUM OPERATION TESTS
-# ================================================================================
-
-# 1. Matrix Multiplication (ik,kj->ij)
-# --------------------------------------------------------------------------------
-# Input shapes: (2, 3) x (3, 4) -> Output shape: (2, 4)
-# Simulation matches NumPy? True
-# Baremetal matches NumPy? True
-
-# 2. Batch Matrix Multiplication (bik,bkj->bij)
-# --------------------------------------------------------------------------------
-# Input shapes: (5, 2, 3) x (5, 3, 4) -> Output shape: (5, 2, 4)
-# Simulation matches NumPy? True
-# Baremetal matches NumPy? True
-
-# 3. Dot Product (i,i->)
-# --------------------------------------------------------------------------------
-# Input shapes: (3,) x (3,) -> Output: 32.0
-# Simulation matches NumPy? True
-# Baremetal matches NumPy? True
-
-# 4. Outer Product (i,j->ij)
-# --------------------------------------------------------------------------------
-# Input shapes: (3,) x (2,) -> Output shape: (3, 2)
-# Simulation matches NumPy? True
-# Baremetal test skipped: CalledProcessError
-
-# 5. Element-wise Multiply and Sum (ij,ij->)
-# --------------------------------------------------------------------------------
-# Input shapes: (2, 2) x (2, 2) -> Output: 70.0
-# Simulation matches NumPy? True
-# Baremetal matches NumPy? True
-
-# 6. Transpose (ij->ji)
-# --------------------------------------------------------------------------------
-# Input shape: (2, 3) -> Output shape: (3, 2)
-# Simulation matches NumPy? True
-# Baremetal matches NumPy? True
-
-# 7. Trace (ii->)
-# --------------------------------------------------------------------------------
-# Input shape: (3, 3) -> Output: 15.0
-# Simulation matches NumPy? True
-# Baremetal test skipped: CalledProcessError
-
-# 8. Sum Along Axis (ij->i)
-# --------------------------------------------------------------------------------
-# Input shape: (2, 3) -> Output shape: (2,)
-# Simulation matches NumPy? True
-# Baremetal matches NumPy? True
-
-# 9. Bilinear Form (i,ij,j->)
-# --------------------------------------------------------------------------------
-# Input shapes: (2,) x (2, 2) x (2,) -> Output: 95.0
-# Simulation matches NumPy? True
-# Baremetal test skipped: CalledProcessError
-
-# 10. Batched Dot Product (bi,bi->b)
-# --------------------------------------------------------------------------------
-# Input shapes: (5, 10) x (5, 10) -> Output shape: (5,)
-# Simulation matches NumPy? True
-# Baremetal matches NumPy? True
-
-# 11. Tensor Contraction (ijk,jkl->il)
-# --------------------------------------------------------------------------------
-# Input shapes: (2, 3, 4) x (3, 4, 5) -> Output shape: (2, 5)
-# Simulation matches NumPy? True
-# Baremetal matches NumPy? True
-
-# 12. Diagonal Extraction (ii->i)
-# --------------------------------------------------------------------------------
-# Input shape: (3, 3) -> Output shape: (3,)
-# Simulation matches NumPy? True
-# Traceback (most recent call last):
-#   File "/home/ubuntu/nkipy/examples/playground/nkipy_einsum.py", line 278, in <module>
-#     out_baremetal = baremetal_run_traced_kernel(traced_kernel, A)
-#                     ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
-#   File "/home/ubuntu/nkipy/nkipy/src/nkipy/runtime/execute.py", line 104, in baremetal_run_traced_kernel
-#     neff = compile.compile_to_neff(
-#            ^^^^^^^^^^^^^^^^^^^^^^^^
-#   File "/home/ubuntu/nkipy/nkipy/src/nkipy/core/compile.py", line 291, in compile_to_neff
-#     posix_path = compiler.compile_in_directory(
-#                  ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
-#   File "/home/ubuntu/nkipy/nkipy/src/nkipy/core/compile.py", line 237, in compile_in_directory
-#     return self.compile(
-#            ^^^^^^^^^^^^^
-#   File "/home/ubuntu/nkipy/nkipy/src/nkipy/core/compile.py", line 195, in compile
-#     subprocess.run(cmd, check=True, capture_output=True)
-#   File "/usr/lib/python3.12/subprocess.py", line 571, in run
-#     raise CalledProcessError(retcode, process.args,
-# subprocess.CalledProcessError: Command '['neuronx-cc', 'compile', '--framework', 'XLA', 'hlo_module.pb', '--pipeline', 'compile', 'SaveTemps', '--target', 'trn2', '--output=einsum_diagonal.neff', '--lnc', '1', '--internal-tensorizer-opt-level=2']' returned non-zero exit status 70.
+print("=" * 80)
\ No newline at end of file

From 1aaa7fcb808986effd1c9c9544b715cb0b63c505 Mon Sep 17 00:00:00 2001
From: Jlonge4 <josh.longenecker1@gmail.com>
Date: Wed, 21 Jan 2026 17:04:08 -0500
Subject: [PATCH 17/21] lint

---
 examples/playground/nkipy_einsum.py |   2 +-
 nkipy/src/nkipy/core/ops/einsum.py  | 157 +++++++++++++++++-----------
 tests/kernels/einsum.py             |   1 -
 3 files changed, 95 insertions(+), 65 deletions(-)

diff --git a/examples/playground/nkipy_einsum.py b/examples/playground/nkipy_einsum.py
index c8b297e..bb7bd77 100644
--- a/examples/playground/nkipy_einsum.py
+++ b/examples/playground/nkipy_einsum.py
@@ -1,6 +1,6 @@
 import numpy as np
 from nkipy.core.trace import NKIPyKernel
-from nkipy.runtime.execute import simulate_traced_kernel, baremetal_run_traced_kernel
+from nkipy.runtime.execute import baremetal_run_traced_kernel, simulate_traced_kernel
 
 print("=" * 80)
 print("EINSUM OPERATION TESTS")
diff --git a/nkipy/src/nkipy/core/ops/einsum.py b/nkipy/src/nkipy/core/ops/einsum.py
index 56d9469..42ad29f 100644
--- a/nkipy/src/nkipy/core/ops/einsum.py
+++ b/nkipy/src/nkipy/core/ops/einsum.py
@@ -7,9 +7,10 @@
 traces, and more.
 """
 
-import numpy as np
 from typing import Dict, List, Set, Tuple
 
+import numpy as np
+
 from nkipy.core.ops._registry import Op
 
 # =============================================================================
@@ -57,10 +58,12 @@ def parse_einsum_subscripts(
         # For implicit output with ..., we need to keep ... if present
         output_spec = ""
         seen: Set[str] = set()
-        
+
         # Collect all indices that appear exactly once
-        unique_indices = sorted([idx for idx, count in all_indices.items() if count == 1 and idx != "."])
-        
+        unique_indices = sorted(
+            [idx for idx, count in all_indices.items() if count == 1 and idx != "."]
+        )
+
         # In implicit mode, we preserve order of appearance
         for spec in input_specs:
             for idx in spec:
@@ -176,15 +179,15 @@ def _einsum_hlo(subscripts, *operands, dtype=None):
     shapes = []
     real_operands = []
     ctx = get_hlo_context()
-    
+
     for op in operands:
         if isinstance(op, NKIPyTensorRef):
             real_operands.append(op)
             shapes.append(op.backend_tensor.shape)
         else:
             # Assume it's an HLO tensor or similar
-            # Wrappping it might be needed if we call tensor ops? 
-            # The original code handled wrapping later. 
+            # Wrappping it might be needed if we call tensor ops?
+            # The original code handled wrapping later.
             # We need shapes now for ellipsis expansion.
             real_operands.append(op)
             shapes.append(op.shape)
@@ -196,29 +199,29 @@ def _einsum_hlo(subscripts, *operands, dtype=None):
     # This might modify operands (insert diagonal ops) and specs
     cleaned_input_specs = []
     processed_operands = []
-    
+
     for i, (spec, op) in enumerate(zip(input_specs, real_operands)):
         # Check for repeated indices
         if len(set(spec)) != len(spec):
-             new_op, new_spec = _handle_repeated_indices(ctx, op, spec)
-             processed_operands.append(new_op)
-             cleaned_input_specs.append(new_spec)
+            new_op, new_spec = _handle_repeated_indices(ctx, op, spec)
+            processed_operands.append(new_op)
+            cleaned_input_specs.append(new_spec)
         else:
-             processed_operands.append(op)
-             cleaned_input_specs.append(spec)
-    
+            processed_operands.append(op)
+            cleaned_input_specs.append(spec)
+
     input_specs = cleaned_input_specs
-    
+
     # Refresh shapes after potential diagonal reductions
     hlo_operands = []
     final_shapes = []
     for op in processed_operands:
         if isinstance(op, NKIPyTensorRef):
-             hlo_operands.append(op.backend_tensor)
-             final_shapes.append(op.backend_tensor.shape)
+            hlo_operands.append(op.backend_tensor)
+            final_shapes.append(op.backend_tensor.shape)
         else:
-             hlo_operands.append(op)
-             final_shapes.append(op.shape)
+            hlo_operands.append(op)
+            final_shapes.append(op.shape)
 
     # Analyze pattern
     analysis = analyze_einsum_pattern(input_specs, output_spec, final_shapes)
@@ -245,97 +248,123 @@ def _einsum_hlo(subscripts, *operands, dtype=None):
 
 def _handle_repeated_indices(ctx, operand, spec: str):
     """Handle repeated indices in a single spec (e.g., 'ii') by taking diagonal."""
-    from nkipy.core.tensor import NKIPyTensorRef
-    from nkipy.core.backend.hlo import as_hlo_tensor
     import collections
-    
+
+    from nkipy.core.backend.hlo import as_hlo_tensor
+    from nkipy.core.tensor import NKIPyTensorRef
+
     current_operand = operand
     if isinstance(current_operand, NKIPyTensorRef):
         current_operand = current_operand.backend_tensor
     current_spec = list(spec)
-    
+
     while True:
         counts = collections.Counter(current_spec)
         repeated = [char for char, count in counts.items() if count > 1]
-        
+
         if not repeated:
             break
-            
+
         # Handle first repeated index
         idx = repeated[0]
         # Find first two positions
         positions = [i for i, char in enumerate(current_spec) if char == idx]
         pos1, pos2 = positions[0], positions[1]
-        
+
         # Verify dimensions
         shape = current_operand.shape
         if shape[pos1] != shape[pos2]:
-             raise ValueError(f"Repeated index {idx} has incompatible dimensions {shape[pos1]} and {shape[pos2]}")
-             
+            raise ValueError(
+                f"Repeated index {idx} has incompatible dimensions {shape[pos1]} and {shape[pos2]}"
+            )
+
         dim_size = shape[pos1]
-        
+
         # Move pos1 and pos2 to the end
         # Permutation: All other indices + pos1 + pos2
         other_indices = [i for i in range(len(shape)) if i != pos1 and i != pos2]
         perm = other_indices + [pos1, pos2]
-        
+
         current_operand = ctx.build_op(
-            "transpose", [current_operand], 
-            tuple(shape[i] for i in perm), 
-            current_operand.dtype, 
-            {"permutation": perm}
+            "transpose",
+            [current_operand],
+            tuple(shape[i] for i in perm),
+            current_operand.dtype,
+            {"permutation": perm},
         )
-        
+
         # Now shape is (..., N, N)
         # Create Identity Mask (N, N)
         # iota dimension 0
-        iota0 = ctx.build_op("iota", [], (dim_size, dim_size), "int32", {"iota_dimension": 0})
+        iota0 = ctx.build_op(
+            "iota", [], (dim_size, dim_size), "int32", {"iota_dimension": 0}
+        )
         # iota dimension 1
-        iota1 = ctx.build_op("iota", [], (dim_size, dim_size), "int32", {"iota_dimension": 1})
-        
+        iota1 = ctx.build_op(
+            "iota", [], (dim_size, dim_size), "int32", {"iota_dimension": 1}
+        )
+
         # Mask = (iota0 == iota1)
-        pred = ctx.build_op("compare", [iota0, iota1], (dim_size, dim_size), np.bool_, {"comparison_direction": "EQ"})
-        
+        pred = ctx.build_op(
+            "compare",
+            [iota0, iota1],
+            (dim_size, dim_size),
+            np.bool_,
+            {"comparison_direction": "EQ"},
+        )
+
         # Convert to dtype
-        mask = ctx.build_op("convert", [pred], (dim_size, dim_size), current_operand.dtype, {})
-        
+        mask = ctx.build_op(
+            "convert", [pred], (dim_size, dim_size), current_operand.dtype, {}
+        )
+
         # Broadcast mask to matches current_operand magnitude
         # Mask has shape (N, N). Operand has (..., N, N).
         # We broadcast mask to operands shape.
         # Dimensions to broadcast are the '...' ones (0 to len-3).
         # We map the mask dimensions [0, 1] to Result dimensions [rank-2, rank-1].
-        
+
         rank = len(current_operand.shape)
         mask_broadcast = ctx.build_op(
-            "broadcast", [mask], current_operand.shape, current_operand.dtype,
-            {"broadcast_dimensions": [rank-2, rank-1]}
+            "broadcast",
+            [mask],
+            current_operand.shape,
+            current_operand.dtype,
+            {"broadcast_dimensions": [rank - 2, rank - 1]},
         )
-        
+
         # Multiply
-        masked_op = ctx.build_op("multiply", [current_operand, mask_broadcast], current_operand.shape, current_operand.dtype)
-        
+        masked_op = ctx.build_op(
+            "multiply",
+            [current_operand, mask_broadcast],
+            current_operand.shape,
+            current_operand.dtype,
+        )
+
         # Reduce sum over the last dimension (pos2) - which is now at rank-1
         # Reduce dims: [rank-1]
         # Init value for add: 0.0
         init_val = as_hlo_tensor(ctx, 0.0, current_operand.dtype)
-        
+
         reduced_shape = current_operand.shape[:-1]
         current_operand = ctx.build_op(
-            "reduce", [masked_op, init_val], reduced_shape, current_operand.dtype,
-            {"dimensions": [rank-1], "computation": "add"}
+            "reduce",
+            [masked_op, init_val],
+            reduced_shape,
+            current_operand.dtype,
+            {"dimensions": [rank - 1], "computation": "add"},
         )
-        
+
         # Update spec
         # We removed the char at pos2 (which was moved to end).
         # The char at pos1 (which was moved to rank-2) is now at rank-1 (end).
         # The other chars are at 0 ... rank-2.
         # So new spec order is: [others] + [idx].
-        
+
         new_spec_list = [current_spec[i] for i in other_indices] + [idx]
         current_spec = new_spec_list
-        
-    return current_operand, "".join(current_spec)
 
+    return current_operand, "".join(current_spec)
 
 
 def _einsum_unary(ctx, operand, input_spec, output_spec, analysis):
@@ -374,17 +403,17 @@ def _einsum_unary(ctx, operand, input_spec, output_spec, analysis):
     # output_dims contains (idx, original_pos, size)
     # The 'operand' tensor currently has these dimensions in the order they appeared in input_spec (minus reduced ones).
     # XLA Reduce preserves relative order of remaining dimensions.
-    
+
     current_indices = [idx for idx, _, _ in output_dims]
-    
+
     # If there are dimensions to reduce
     if dims_to_reduce:
-        # The shape expected by reduce op is the shape of the RESULT of reduction? 
-        # Or the shape of the operands? 
+        # The shape expected by reduce op is the shape of the RESULT of reduction?
+        # Or the shape of the operands?
         # Usually XLA build_op('reduce') might take output shape?
         # If so, it should match the input-ordered result (since no transpose happens during reduce).
         reduced_shape = tuple(size for _, _, size in output_dims)
-        
+
         init_tensor = as_hlo_tensor(ctx, 0.0, operand.dtype)
         operand = ctx.build_op(
             "reduce",
@@ -404,13 +433,15 @@ def _einsum_unary(ctx, operand, input_spec, output_spec, analysis):
         # Current tensor has dims in `current_indices` order.
         # We need to mapp `current_indices` -> `output_spec`.
         # Transpose perm[i] is the index in input that maps to output[i].
-        
+
         try:
             perm = [current_indices.index(idx) for idx in output_spec]
         except ValueError as e:
             # Should not happen if analysis is correct
-            raise RuntimeError(f"Internal einsum error: indices mismatch {current_indices} vs {output_spec}") from e
-            
+            raise RuntimeError(
+                f"Internal einsum error: indices mismatch {current_indices} vs {output_spec}"
+            ) from e
+
         transposed_shape = tuple(operand.shape[i] for i in perm)
         operand = ctx.build_op(
             "transpose",
diff --git a/tests/kernels/einsum.py b/tests/kernels/einsum.py
index 93a4431..0576c9c 100644
--- a/tests/kernels/einsum.py
+++ b/tests/kernels/einsum.py
@@ -14,7 +14,6 @@
 import numpy as np
 from nkipy.core.specs import CommonTypes, KernelSpec, ShapeSpec, TensorInputSpec
 
-
 # =============================================================================
 # Matrix Operations
 # =============================================================================

From ef3011894d0edf916581406d7267f35c35f1de4a Mon Sep 17 00:00:00 2001
From: Jlonge4 <josh.longenecker1@gmail.com>
Date: Wed, 21 Jan 2026 17:21:23 -0500
Subject: [PATCH 18/21] lint

---
 nkipy/src/nkipy/core/backend/hlo.py | 1 -
 1 file changed, 1 deletion(-)

diff --git a/nkipy/src/nkipy/core/backend/hlo.py b/nkipy/src/nkipy/core/backend/hlo.py
index 8b002e7..35292bc 100644
--- a/nkipy/src/nkipy/core/backend/hlo.py
+++ b/nkipy/src/nkipy/core/backend/hlo.py
@@ -799,7 +799,6 @@ def _handle_custom_call(self, instr, op: HLOOp, _) -> None:
                 if isinstance(op.result_dtype, list)
                 else [op.result_dtype]
             )
-            instr.shape.Clear()
             instr.shape.CopyFrom(_make_tuple_shape_proto(list(zip(shapes, dtypes))))
 
         backend_config = op.attributes.get("backend_config", "")

From dd6dfb078ac1e30bd7744429213a5b34d042d999 Mon Sep 17 00:00:00 2001
From: Ubuntu <ubuntu@ip-172-31-29-37.sa-east-1.compute.internal>
Date: Tue, 3 Feb 2026 14:40:06 +0000
Subject: [PATCH 19/21] address bilinear

---
 examples/playground/nkipy_einsum.py | 28 ++++++----------------------
 tests/kernels/einsum.py             | 11 -----------
 2 files changed, 6 insertions(+), 33 deletions(-)

diff --git a/examples/playground/nkipy_einsum.py b/examples/playground/nkipy_einsum.py
index bb7bd77..0cee62d 100644
--- a/examples/playground/nkipy_einsum.py
+++ b/examples/playground/nkipy_einsum.py
@@ -121,9 +121,9 @@ def einsum_transpose(A):
 
 
 # =============================================================================
-# 8. Sum Along Axis
+# 7. Sum Along Axis
 # =============================================================================
-print("\n8. Sum Along Axis (ij->i)")
+print("\n7. Sum Along Axis (ij->i)")
 print("-" * 80)
 
 def einsum_sum_axis(A):
@@ -135,25 +135,9 @@ def einsum_sum_axis(A):
 
 
 # =============================================================================
-# 9. Bilinear Form (Quadratic Form)
+# 8. Batched Dot Product
 # =============================================================================
-print("\n9. Bilinear Form (i,ij,j->)")
-print("-" * 80)
-
-def einsum_bilinear(x, A, y):
-    """Compute x^T @ A @ y"""
-    return np.einsum('i,ij,j->', x, A, y)
-
-x = np.array([1, 2], dtype=np.float32)
-A = np.array([[1, 2], [3, 4]], dtype=np.float32)
-y = np.array([5, 6], dtype=np.float32)
-run_test(einsum_bilinear, x, A, y)
-
-
-# =============================================================================
-# 10. Batched Dot Product
-# =============================================================================
-print("\n10. Batched Dot Product (bi,bi->b)")
+print("\n8. Batched Dot Product (bi,bi->b)")
 print("-" * 80)
 
 def einsum_batch_dot(A, B):
@@ -166,9 +150,9 @@ def einsum_batch_dot(A, B):
 
 
 # =============================================================================
-# 11. Tensor Contraction
+# 9. Tensor Contraction
 # =============================================================================
-print("\n11. Tensor Contraction (ijk,jkl->il)")
+print("\n9. Tensor Contraction (ijk,jkl->il)")
 print("-" * 80)
 
 def einsum_tensor_contract(A, B):
diff --git a/tests/kernels/einsum.py b/tests/kernels/einsum.py
index 0576c9c..d4790ef 100644
--- a/tests/kernels/einsum.py
+++ b/tests/kernels/einsum.py
@@ -48,12 +48,6 @@ def permute_dims_einsum(A):
 # Reductions
 # =============================================================================
 
-
-def trace_einsum(A):
-    """Matrix trace using einsum: ii->"""
-    return np.einsum('ii->', A)
-
-
 def sum_axis_einsum(A):
     """Sum along axis using einsum: ij->i"""
     return np.einsum('ij->i', A)
@@ -79,11 +73,6 @@ def dot_product_einsum(a, b):
     return np.einsum('i,i->', a, b)
 
 
-def bilinear_form_einsum(x, A, y):
-    """Bilinear form x^T A y using einsum: i,ij,j->"""
-    return np.einsum('i,ij,j->', x, A, y)
-
-
 # =============================================================================
 # Kernel Specifications
 # =============================================================================

From dacc139dc6544d34fe731058b343efe70db89d9c Mon Sep 17 00:00:00 2001
From: Ubuntu <ubuntu@ip-172-31-29-37.sa-east-1.compute.internal>
Date: Tue, 3 Feb 2026 14:43:18 +0000
Subject: [PATCH 20/21] lint

---
 examples/playground/nkipy_einsum.py | 46 +++++++++++++++++++++--------
 tests/kernels/einsum.py             | 15 +++++-----
 2 files changed, 41 insertions(+), 20 deletions(-)

diff --git a/examples/playground/nkipy_einsum.py b/examples/playground/nkipy_einsum.py
index 0cee62d..885555c 100644
--- a/examples/playground/nkipy_einsum.py
+++ b/examples/playground/nkipy_einsum.py
@@ -6,23 +6,24 @@
 print("EINSUM OPERATION TESTS")
 print("=" * 80)
 
+
 def run_test(test_func, *test_args):
     """Helper to trace, simulate, and run on baremetal."""
     # Run numpy version to get expected output
     expected = test_func(*test_args)
     print(f"Input shapes: {[a.shape for a in test_args if hasattr(a, 'shape')]}")
-    if hasattr(expected, 'shape'):
+    if hasattr(expected, "shape"):
         print(f"Output shape: {expected.shape}")
     else:
         print(f"Output: {expected}")
 
     traced_kernel = NKIPyKernel.trace(test_func)
-    
+
     # Simulation
     out_nkipy = simulate_traced_kernel(traced_kernel, *test_args)
     sim_match = np.allclose(out_nkipy, expected)
     print(f"Simulation matches NumPy? {sim_match}")
-    
+
     # Baremetal
     try:
         out_baremetal = baremetal_run_traced_kernel(traced_kernel, *test_args)
@@ -31,15 +32,18 @@ def run_test(test_func, *test_args):
     except Exception as e:
         print(f"Baremetal test skipped/failed: {type(e).__name__} - {e}")
 
+
 # =============================================================================
 # 1. Matrix Multiplication
 # =============================================================================
 print("\n1. Matrix Multiplication (ik,kj->ij)")
 print("-" * 80)
 
+
 def einsum_matmul(A, B):
     """Standard matrix multiply: (i, k) x (k, j) -> (i, j)"""
-    return np.einsum('ik,kj->ij', A, B)
+    return np.einsum("ik,kj->ij", A, B)
+
 
 A = np.random.rand(2, 3).astype(np.float32)
 B = np.random.rand(3, 4).astype(np.float32)
@@ -52,9 +56,11 @@ def einsum_matmul(A, B):
 print("\n2. Batch Matrix Multiplication (bik,bkj->bij)")
 print("-" * 80)
 
+
 def einsum_batch_matmul(A, B):
     """Batch matrix multiply: (batch, i, k) x (batch, k, j) -> (batch, i, j)"""
-    return np.einsum('bik,bkj->bij', A, B)
+    return np.einsum("bik,bkj->bij", A, B)
+
 
 A = np.random.rand(5, 2, 3).astype(np.float32)
 B = np.random.rand(5, 3, 4).astype(np.float32)
@@ -67,9 +73,11 @@ def einsum_batch_matmul(A, B):
 print("\n3. Dot Product (i,i->)")
 print("-" * 80)
 
+
 def einsum_dot(a, b):
     """Dot product of two vectors: sum(a * b)"""
-    return np.einsum('i,i->', a, b)
+    return np.einsum("i,i->", a, b)
+
 
 a = np.array([1, 2, 3], dtype=np.float32)
 b = np.array([4, 5, 6], dtype=np.float32)
@@ -82,9 +90,11 @@ def einsum_dot(a, b):
 print("\n4. Outer Product (i,j->ij)")
 print("-" * 80)
 
+
 def einsum_outer(a, b):
     """Outer product: (i,) x (j,) -> (i, j)"""
-    return np.einsum('i,j->ij', a, b)
+    return np.einsum("i,j->ij", a, b)
+
 
 a = np.array([1, 2, 3], dtype=np.float32)
 b = np.array([4, 5], dtype=np.float32)
@@ -97,9 +107,11 @@ def einsum_outer(a, b):
 print("\n5. Element-wise Multiply and Sum (ij,ij->)")
 print("-" * 80)
 
+
 def einsum_hadamard_sum(A, B):
     """Element-wise multiply then sum all: sum(A * B)"""
-    return np.einsum('ij,ij->', A, B)
+    return np.einsum("ij,ij->", A, B)
+
 
 A = np.array([[1, 2], [3, 4]], dtype=np.float32)
 B = np.array([[5, 6], [7, 8]], dtype=np.float32)
@@ -112,9 +124,11 @@ def einsum_hadamard_sum(A, B):
 print("\n6. Transpose (ij->ji)")
 print("-" * 80)
 
+
 def einsum_transpose(A):
     """Matrix transpose: (i, j) -> (j, i)"""
-    return np.einsum('ij->ji', A)
+    return np.einsum("ij->ji", A)
+
 
 A = np.array([[1, 2, 3], [4, 5, 6]], dtype=np.float32)
 run_test(einsum_transpose, A)
@@ -126,9 +140,11 @@ def einsum_transpose(A):
 print("\n7. Sum Along Axis (ij->i)")
 print("-" * 80)
 
+
 def einsum_sum_axis(A):
     """Sum along last axis: (i, j) -> (i,)"""
-    return np.einsum('ij->i', A)
+    return np.einsum("ij->i", A)
+
 
 A = np.array([[1, 2, 3], [4, 5, 6]], dtype=np.float32)
 run_test(einsum_sum_axis, A)
@@ -140,9 +156,11 @@ def einsum_sum_axis(A):
 print("\n8. Batched Dot Product (bi,bi->b)")
 print("-" * 80)
 
+
 def einsum_batch_dot(A, B):
     """Dot product for each pair in batch: (batch, i) x (batch, i) -> (batch,)"""
-    return np.einsum('bi,bi->b', A, B)
+    return np.einsum("bi,bi->b", A, B)
+
 
 A = np.random.rand(5, 10).astype(np.float32)
 B = np.random.rand(5, 10).astype(np.float32)
@@ -155,9 +173,11 @@ def einsum_batch_dot(A, B):
 print("\n9. Tensor Contraction (ijk,jkl->il)")
 print("-" * 80)
 
+
 def einsum_tensor_contract(A, B):
     """Contract on middle dimensions: (i,j,k) x (j,k,l) -> (i,l)"""
-    return np.einsum('ijk,jkl->il', A, B)
+    return np.einsum("ijk,jkl->il", A, B)
+
 
 A = np.random.rand(2, 3, 4).astype(np.float32)
 B = np.random.rand(3, 4, 5).astype(np.float32)
@@ -166,4 +186,4 @@ def einsum_tensor_contract(A, B):
 
 print("\n" + "=" * 80)
 print("TESTS COMPLETE")
-print("=" * 80)
\ No newline at end of file
+print("=" * 80)
diff --git a/tests/kernels/einsum.py b/tests/kernels/einsum.py
index d4790ef..95afa51 100644
--- a/tests/kernels/einsum.py
+++ b/tests/kernels/einsum.py
@@ -21,12 +21,12 @@
 
 def matmul_einsum(A, B):
     """Matrix multiplication using einsum: ij,jk->ik"""
-    return np.einsum('ij,jk->ik', A, B)
+    return np.einsum("ij,jk->ik", A, B)
 
 
 def batch_matmul_einsum(A, B):
     """Batch matrix multiplication using einsum: bij,bjk->bik"""
-    return np.einsum('bij,bjk->bik', A, B)
+    return np.einsum("bij,bjk->bik", A, B)
 
 
 # =============================================================================
@@ -36,21 +36,22 @@ def batch_matmul_einsum(A, B):
 
 def transpose_einsum(A):
     """Transpose using einsum: ij->ji"""
-    return np.einsum('ij->ji', A)
+    return np.einsum("ij->ji", A)
 
 
 def permute_dims_einsum(A):
     """Permute dimensions using einsum: ijk->kij"""
-    return np.einsum('ijk->kij', A)
+    return np.einsum("ijk->kij", A)
 
 
 # =============================================================================
 # Reductions
 # =============================================================================
 
+
 def sum_axis_einsum(A):
     """Sum along axis using einsum: ij->i"""
-    return np.einsum('ij->i', A)
+    return np.einsum("ij->i", A)
 
 
 # =============================================================================
@@ -60,7 +61,7 @@ def sum_axis_einsum(A):
 
 def outer_product_einsum(a, b):
     """Outer product using einsum: i,j->ij"""
-    return np.einsum('i,j->ij', a, b)
+    return np.einsum("i,j->ij", a, b)
 
 
 # =============================================================================
@@ -70,7 +71,7 @@ def outer_product_einsum(a, b):
 
 def dot_product_einsum(a, b):
     """Dot product using einsum: i,i->"""
-    return np.einsum('i,i->', a, b)
+    return np.einsum("i,i->", a, b)
 
 
 # =============================================================================

From e2fcb1b2959701e74896ebf7a26e857ea0e29de7 Mon Sep 17 00:00:00 2001
From: Jlonge4 <josh.longenecker1@gmail.com>
Date: Tue, 3 Feb 2026 12:08:38 -0500
Subject: [PATCH 21/21] add initial einsum tests

---
 tests/unit/test_tensor_api.py | 327 ++++++++++++++++++++++++++++++++++
 1 file changed, 327 insertions(+)

diff --git a/tests/unit/test_tensor_api.py b/tests/unit/test_tensor_api.py
index 06b3c6b..d0fb7e1 100644
--- a/tests/unit/test_tensor_api.py
+++ b/tests/unit/test_tensor_api.py
@@ -1735,5 +1735,332 @@ def kernel_with_constant_one(x):
         )
 
 
+@pytest.mark.parametrize(
+    "shape_a,shape_b",
+    [
+        ((32, 64), (64, 48)),
+        ((128, 256), (256, 512)),
+        ((256, 512), (512, 1024)),
+        ((1, 128), (128, 256)),
+        ((256, 128), (128, 1)),
+    ],
+)
+def test_einsum_matmul(sim_mode, shape_a, shape_b):
+    """Test einsum matrix multiplication: ij,jk->ik"""
+
+    def kernel(a, b):
+        return np.einsum("ij,jk->ik", a, b)
+
+    dtype = np.float32
+    np.random.seed(0)
+    in0 = np.random.uniform(high=1.0, low=0.0, size=shape_a).astype(dtype)
+    in1 = np.random.uniform(high=1.0, low=0.0, size=shape_b).astype(dtype)
+
+    expected = np.einsum("ij,jk->ik", in0, in1)
+    out = simulate_kernel_unified(kernel, sim_mode, in0, in1)
+    simulate_assert_allclose(out, expected)
+
+    if NEURON_AVAILABLE:
+        out_baremetal = baremetal_run_kernel_unified(kernel, sim_mode, in0, in1)
+        baremetal_assert_allclose(expected, out_baremetal)
+
+
+@pytest.mark.parametrize(
+    "shape_a,shape_b",
+    [
+        ((2, 32, 64), (2, 64, 48)),
+        ((4, 128, 256), (4, 256, 128)),
+        ((8, 64, 128), (8, 128, 64)),
+        ((1, 128, 256), (1, 256, 128)),
+    ],
+)
+def test_einsum_batch_matmul(sim_mode, shape_a, shape_b):
+    """Test einsum batch matrix multiplication: bij,bjk->bik"""
+
+    def kernel(a, b):
+        return np.einsum("bij,bjk->bik", a, b)
+
+    dtype = np.float32
+    np.random.seed(0)
+    in0 = np.random.uniform(high=1.0, low=0.0, size=shape_a).astype(dtype)
+    in1 = np.random.uniform(high=1.0, low=0.0, size=shape_b).astype(dtype)
+
+    expected = np.einsum("bij,bjk->bik", in0, in1)
+    out = simulate_kernel_unified(kernel, sim_mode, in0, in1)
+    simulate_assert_allclose(out, expected)
+
+    if NEURON_AVAILABLE:
+        out_baremetal = baremetal_run_kernel_unified(kernel, sim_mode, in0, in1)
+        baremetal_assert_allclose(expected, out_baremetal)
+
+
+@pytest.mark.parametrize(
+    "shape_a,shape_b",
+    [
+        ((128,), (128,)),
+        ((256,), (256,)),
+        ((512,), (512,)),
+        ((1024,), (1024,)),
+    ],
+)
+def test_einsum_dot_product(sim_mode, shape_a, shape_b):
+    """Test einsum dot product: i,i->"""
+
+    def kernel(a, b):
+        return np.einsum("i,i->", a, b)
+
+    dtype = np.float32
+    np.random.seed(0)
+    in0 = np.random.uniform(high=1.0, low=0.0, size=shape_a).astype(dtype)
+    in1 = np.random.uniform(high=1.0, low=0.0, size=shape_b).astype(dtype)
+
+    expected = np.einsum("i,i->", in0, in1)
+    out = simulate_kernel_unified(kernel, sim_mode, in0, in1)
+    simulate_assert_allclose(out, expected)
+
+    if NEURON_AVAILABLE:
+        out_baremetal = baremetal_run_kernel_unified(kernel, sim_mode, in0, in1)
+        baremetal_assert_allclose(expected, out_baremetal)
+
+
+@pytest.mark.parametrize(
+    "shape_a,shape_b",
+    [
+        ((32,), (64,)),
+        ((64,), (128,)),
+        ((128,), (256,)),
+        ((16,), (32,)),
+    ],
+)
+def test_einsum_outer_product(sim_mode, shape_a, shape_b):
+    """Test einsum outer product: i,j->ij"""
+
+    def kernel(a, b):
+        return np.einsum("i,j->ij", a, b)
+
+    dtype = np.float32
+    np.random.seed(0)
+    in0 = np.random.uniform(high=1.0, low=0.0, size=shape_a).astype(dtype)
+    in1 = np.random.uniform(high=1.0, low=0.0, size=shape_b).astype(dtype)
+
+    expected = np.einsum("i,j->ij", in0, in1)
+    out = simulate_kernel_unified(kernel, sim_mode, in0, in1)
+    simulate_assert_allclose(out, expected)
+
+    if NEURON_AVAILABLE:
+        out_baremetal = baremetal_run_kernel_unified(kernel, sim_mode, in0, in1)
+        baremetal_assert_allclose(expected, out_baremetal)
+
+
+@pytest.mark.parametrize(
+    "shape",
+    [
+        (32, 64),
+        (128, 256),
+        (256, 512),
+        (512, 1024),
+    ],
+)
+def test_einsum_transpose(sim_mode, shape):
+    """Test einsum transpose: ij->ji"""
+
+    def kernel(a):
+        return np.einsum("ij->ji", a)
+
+    dtype = np.float32
+    np.random.seed(0)
+    in0 = np.random.uniform(high=1.0, low=0.0, size=shape).astype(dtype)
+
+    expected = np.einsum("ij->ji", in0)
+    out = simulate_kernel_unified(kernel, sim_mode, in0)
+    simulate_assert_allclose(out, expected)
+
+    if NEURON_AVAILABLE:
+        out_baremetal = baremetal_run_kernel_unified(kernel, sim_mode, in0)
+        baremetal_assert_allclose(expected, out_baremetal)
+
+
+@pytest.mark.parametrize(
+    "shape",
+    [
+        (32, 64),
+        (128, 256),
+        (256, 512),
+    ],
+)
+def test_einsum_sum_axis(sim_mode, shape):
+    """Test einsum sum along axis: ij->i"""
+
+    def kernel(a):
+        return np.einsum("ij->i", a)
+
+    dtype = np.float32
+    np.random.seed(0)
+    in0 = np.random.uniform(high=1.0, low=0.0, size=shape).astype(dtype)
+
+    expected = np.einsum("ij->i", in0)
+    out = simulate_kernel_unified(kernel, sim_mode, in0)
+    simulate_assert_allclose(out, expected)
+
+    if NEURON_AVAILABLE:
+        out_baremetal = baremetal_run_kernel_unified(kernel, sim_mode, in0)
+        baremetal_assert_allclose(expected, out_baremetal)
+
+
+@pytest.mark.parametrize(
+    "shape_a,shape_b",
+    [
+        ((32, 64), (32, 64)),
+        ((128, 256), (128, 256)),
+        ((256, 512), (256, 512)),
+    ],
+)
+def test_einsum_hadamard_sum(sim_mode, shape_a, shape_b):
+    """Test einsum element-wise multiply and sum: ij,ij->"""
+
+    def kernel(a, b):
+        return np.einsum("ij,ij->", a, b)
+
+    dtype = np.float32
+    np.random.seed(0)
+    in0 = np.random.uniform(high=1.0, low=0.0, size=shape_a).astype(dtype)
+    in1 = np.random.uniform(high=1.0, low=0.0, size=shape_b).astype(dtype)
+
+    expected = np.einsum("ij,ij->", in0, in1)
+    out = simulate_kernel_unified(kernel, sim_mode, in0, in1)
+    simulate_assert_allclose(out, expected)
+
+    if NEURON_AVAILABLE:
+        out_baremetal = baremetal_run_kernel_unified(kernel, sim_mode, in0, in1)
+        baremetal_assert_allclose(expected, out_baremetal)
+
+
+@pytest.mark.parametrize(
+    "shape_a,shape_b",
+    [
+        ((4, 128), (4, 128)),
+        ((8, 256), (8, 256)),
+        ((16, 512), (16, 512)),
+    ],
+)
+def test_einsum_batch_dot(sim_mode, shape_a, shape_b):
+    """Test einsum batched dot product: bi,bi->b"""
+
+    def kernel(a, b):
+        return np.einsum("bi,bi->b", a, b)
+
+    dtype = np.float32
+    np.random.seed(0)
+    in0 = np.random.uniform(high=1.0, low=0.0, size=shape_a).astype(dtype)
+    in1 = np.random.uniform(high=1.0, low=0.0, size=shape_b).astype(dtype)
+
+    expected = np.einsum("bi,bi->b", in0, in1)
+    out = simulate_kernel_unified(kernel, sim_mode, in0, in1)
+    simulate_assert_allclose(out, expected)
+
+    if NEURON_AVAILABLE:
+        out_baremetal = baremetal_run_kernel_unified(kernel, sim_mode, in0, in1)
+        baremetal_assert_allclose(expected, out_baremetal)
+
+
+@pytest.mark.parametrize(
+    "shape",
+    [
+        (2, 3, 4),
+        (4, 8, 16),
+        (8, 16, 32),
+    ],
+)
+def test_einsum_permute_3d(sim_mode, shape):
+    """Test einsum 3D permutation: ijk->kij"""
+
+    def kernel(a):
+        return np.einsum("ijk->kij", a)
+
+    dtype = np.float32
+    np.random.seed(0)
+    in0 = np.random.uniform(high=1.0, low=0.0, size=shape).astype(dtype)
+
+    expected = np.einsum("ijk->kij", in0)
+    out = simulate_kernel_unified(kernel, sim_mode, in0)
+    simulate_assert_allclose(out, expected)
+
+    if NEURON_AVAILABLE:
+        out_baremetal = baremetal_run_kernel_unified(kernel, sim_mode, in0)
+        baremetal_assert_allclose(expected, out_baremetal)
+
+
+def test_einsum_implicit_output(sim_mode):
+    """Test einsum with implicit output specification"""
+
+    def kernel(a, b):
+        return np.einsum("ik,kj", a, b)  # Implicit: -> 'ij'
+
+    shape_a = (32, 64)
+    shape_b = (64, 48)
+    dtype = np.float32
+
+    np.random.seed(0)
+    in0 = np.random.uniform(high=1.0, low=0.0, size=shape_a).astype(dtype)
+    in1 = np.random.uniform(high=1.0, low=0.0, size=shape_b).astype(dtype)
+
+    expected = np.einsum("ik,kj", in0, in1)
+    out = simulate_kernel_unified(kernel, sim_mode, in0, in1)
+    simulate_assert_allclose(out, expected)
+
+    if NEURON_AVAILABLE:
+        out_baremetal = baremetal_run_kernel_unified(kernel, sim_mode, in0, in1)
+        baremetal_assert_allclose(expected, out_baremetal)
+
+
+@pytest.mark.parametrize("dtype", [np.float32, np.float16])
+def test_einsum_dtypes(sim_mode, dtype):
+    """Test einsum with different data types"""
+
+    def kernel(a, b):
+        return np.einsum("ij,jk->ik", a, b)
+
+    shape_a = (64, 128)
+    shape_b = (128, 64)
+
+    np.random.seed(0)
+    in0 = np.random.uniform(high=1.0, low=0.0, size=shape_a).astype(dtype)
+    in1 = np.random.uniform(high=1.0, low=0.0, size=shape_b).astype(dtype)
+
+    expected = np.einsum("ij,jk->ik", in0, in1)
+    out = simulate_kernel_unified(kernel, sim_mode, in0, in1)
+
+    # Use relaxed tolerance for float16
+    rtol = 1e-2 if dtype == np.float16 else 1e-5
+    simulate_assert_allclose(out, expected, rtol=rtol)
+
+    if NEURON_AVAILABLE:
+        out_baremetal = baremetal_run_kernel_unified(kernel, sim_mode, in0, in1)
+        baremetal_assert_allclose(expected, out_baremetal, rtol=rtol)
+
+
+def test_einsum_tensor_contraction(sim_mode):
+    """Test einsum tensor contraction: ijk,jkl->il"""
+
+    def kernel(a, b):
+        return np.einsum("ijk,jkl->il", a, b)
+
+    shape_a = (2, 3, 4)
+    shape_b = (3, 4, 5)
+    dtype = np.float32
+
+    np.random.seed(0)
+    in0 = np.random.uniform(high=1.0, low=0.0, size=shape_a).astype(dtype)
+    in1 = np.random.uniform(high=1.0, low=0.0, size=shape_b).astype(dtype)
+
+    expected = np.einsum("ijk,jkl->il", in0, in1)
+    out = simulate_kernel_unified(kernel, sim_mode, in0, in1)
+    simulate_assert_allclose(out, expected)
+
+    if NEURON_AVAILABLE:
+        out_baremetal = baremetal_run_kernel_unified(kernel, sim_mode, in0, in1)
+        baremetal_assert_allclose(expected, out_baremetal)
+
+
 if __name__ == "__main__":
     pytest.main([__file__, "-v"])