From 8499133d024ecf5de332462068ebdf14411a9d98 Mon Sep 17 00:00:00 2001
From: Ankai Jin <ankaijin@hexaconta.cs.ucla.edu>
Date: Tue, 17 Feb 2026 16:22:05 -0800
Subject: [PATCH 1/4] Add 3 kernels to test suite

---
 .../breadth_first_search.cpp                  | 114 +++++++++++++++++
 .../breadth_first_search.h                    |  12 ++
 tests/app/breadth_first_search/main.cpp       |  47 +++++++
 tests/app/database_join/database_join.cpp     |  65 ++++++++++
 tests/app/database_join/database_join.h       |  12 ++
 tests/app/database_join/main.cpp              |  52 ++++++++
 tests/app/sort_merge/main.cpp                 |  36 ++++++
 tests/app/sort_merge/sort_merge.cpp           | 120 ++++++++++++++++++
 tests/app/sort_merge/sort_merge.h             |  12 ++
 9 files changed, 470 insertions(+)
 create mode 100644 tests/app/breadth_first_search/breadth_first_search.cpp
 create mode 100644 tests/app/breadth_first_search/breadth_first_search.h
 create mode 100644 tests/app/breadth_first_search/main.cpp
 create mode 100644 tests/app/database_join/database_join.cpp
 create mode 100644 tests/app/database_join/database_join.h
 create mode 100644 tests/app/database_join/main.cpp
 create mode 100644 tests/app/sort_merge/main.cpp
 create mode 100644 tests/app/sort_merge/sort_merge.cpp
 create mode 100644 tests/app/sort_merge/sort_merge.h
diff --git a/tests/app/breadth_first_search/breadth_first_search.cpp b/tests/app/breadth_first_search/breadth_first_search.cpp
new file mode 100644
index 00000000..bd84ad00
--- /dev/null
+++ b/tests/app/breadth_first_search/breadth_first_search.cpp
@@ -0,0 +1,114 @@
+// Loom kernel implementation: breadth_first_search
+#include "breadth_first_search.h"
+#include "loom/loom.h"
+#include <cmath>
+#include <cstdlib>
+
+// Full pipeline test from C++ source: Breadth-First Search on CSR graph
+// Tests complete compilation chain with queue-based traversal and irregular memory access
+// Graph (6 nodes, 11 edges):
+//   0 -> 1, 3
+//   1 -> 2, 4
+//   2 -> 3, 5
+//   3 -> 4, 5
+//   4 -> 0, 2
+//   5 -> 1
+// BFS from node 0: distances = [0, 1, 2, 1, 2, 2]
+
+// CPU reference BFS on a CSR graph: distance[v] = hop count from source, or -1 if never reached
+// queue_size should be >= num_nodes: once the queue is full, newly discovered nodes are skipped
+// Note: programmer must guarantee arrays with __restrict__ do not overlap (num_edges is not read)
+void bfs_cpu(const uint32_t* __restrict__ row_ptr,
+             const uint32_t* __restrict__ col_idx,
+             int32_t* __restrict__ distance,
+             uint32_t* __restrict__ queue,
+             uint32_t* __restrict__ visited,
+             const uint32_t num_nodes,
+             const uint32_t num_edges,
+             const uint32_t queue_size,
+             const uint32_t source) {
+    // Mark every node unreached: distance -1, visited flag cleared
+    for (uint32_t i = 0; i < num_nodes; i++) {
+        distance[i] = -1;
+        visited[i] = 0;
+    }
+
+    // Seed the traversal; the writes below are unchecked, so source must index all three arrays
+    uint32_t queue_head = 0;
+    uint32_t queue_tail = 0;
+
+    distance[source] = 0;
+    visited[source] = 1;
+    queue[queue_tail++] = source;
+
+    // BFS traversal in FIFO order (the visited check enqueues each node at most once)
+    while (queue_head < queue_tail && queue_tail <= queue_size) {
+        // Dequeue the oldest pending node
+        uint32_t current = queue[queue_head++];
+        int32_t current_dist = distance[current];
+
+        // Neighbors of current are col_idx[row_ptr[current] .. row_ptr[current + 1])
+        uint32_t start = row_ptr[current];
+        uint32_t end = row_ptr[current + 1];
+
+        for (uint32_t edge = start; edge < end; edge++) {
+            uint32_t neighbor = col_idx[edge];
+
+            if (visited[neighbor] == 0 && queue_tail < queue_size) {  // first visit, queue has room
+                visited[neighbor] = 1;
+                distance[neighbor] = current_dist + 1;
+                queue[queue_tail++] = neighbor;
+            }
+        }
+    }
+}
+
+// Accelerator (LOOM_ACCEL) BFS on a CSR graph: distance[v] = hop count from source, or -1 if never reached
+// queue_size should be >= num_nodes: once the queue is full, newly discovered nodes are skipped
+LOOM_ACCEL()
+void bfs_dsa(const uint32_t* __restrict__ row_ptr,
+             const uint32_t* __restrict__ col_idx,
+             int32_t* __restrict__ distance,
+             uint32_t* __restrict__ queue,
+             uint32_t* __restrict__ visited,
+             const uint32_t num_nodes,
+             const uint32_t num_edges,
+             const uint32_t queue_size,
+             const uint32_t source) {
+    // Mark every node unreached; the two LOOM_* macros (from loom/loom.h) annotate this init loop
+    LOOM_NO_PARALLEL
+    LOOM_NO_UNROLL
+    for (uint32_t i = 0; i < num_nodes; i++) {
+        distance[i] = -1;
+        visited[i] = 0;
+    }
+
+    // Seed the traversal; the writes below are unchecked, so source must index all three arrays
+    uint32_t queue_head = 0;
+    uint32_t queue_tail = 0;
+
+    distance[source] = 0;
+    visited[source] = 1;
+    queue[queue_tail++] = source;
+
+    // BFS traversal in FIFO order (the visited check enqueues each node at most once)
+    while (queue_head < queue_tail && queue_tail <= queue_size) {
+        // Dequeue the oldest pending node
+        uint32_t current = queue[queue_head++];
+        int32_t current_dist = distance[current];
+
+        // Neighbors of current are col_idx[row_ptr[current] .. row_ptr[current + 1])
+        uint32_t start = row_ptr[current];
+        uint32_t end = row_ptr[current + 1];
+
+        for (uint32_t edge = start; edge < end; edge++) {
+            uint32_t neighbor = col_idx[edge];
+
+            if (visited[neighbor] == 0 && queue_tail < queue_size) {  // first visit, queue has room
+                visited[neighbor] = 1;
+                distance[neighbor] = current_dist + 1;
+                queue[queue_tail++] = neighbor;
+            }
+        }
+    }
+}
diff --git a/tests/app/breadth_first_search/breadth_first_search.h b/tests/app/breadth_first_search/breadth_first_search.h
new file mode 100644
index 00000000..78d5e75a
--- /dev/null
+++ b/tests/app/breadth_first_search/breadth_first_search.h
@@ -0,0 +1,12 @@
+// Loom kernel: breadth_first_search
+#ifndef BREADTH_FIRST_SEARCH_H
+#define BREADTH_FIRST_SEARCH_H
+
+#include <cstdint>
+#include <cstddef>
+
+void bfs_cpu(const uint32_t* __restrict__ row_ptr, const uint32_t* __restrict__ col_idx, int32_t* __restrict__ distance, uint32_t* __restrict__ queue, uint32_t* __restrict__ visited, const uint32_t num_nodes, const uint32_t num_edges, const uint32_t queue_size, const uint32_t source);
+
+void bfs_dsa(const uint32_t* __restrict__ row_ptr, const uint32_t* __restrict__ col_idx, int32_t* __restrict__ distance, uint32_t* __restrict__ queue, uint32_t* __restrict__ visited, const uint32_t num_nodes, const uint32_t num_edges, const uint32_t queue_size, const uint32_t source);
+
+#endif // BREADTH_FIRST_SEARCH_H
diff --git a/tests/app/breadth_first_search/main.cpp b/tests/app/breadth_first_search/main.cpp
new file mode 100644
index 00000000..804f786b
--- /dev/null
+++ b/tests/app/breadth_first_search/main.cpp
@@ -0,0 +1,47 @@
+#include <cstdio>
+
+#include "breadth_first_search.h"
+
+int main() {
+    // Graph setup: 6 nodes, 11 edges
+    // 0 -> 1, 3
+    // 1 -> 2, 4
+    // 2 -> 3, 5
+    // 3 -> 4, 5
+    // 4 -> 0, 2
+    // 5 -> 1
+    const uint32_t num_nodes = 6;
+    const uint32_t num_edges = 11;
+    const uint32_t queue_size = 6;
+    const uint32_t source = 0;
+
+    // CSR format: node i owns col_idx[row_ptr[i] .. row_ptr[i + 1]); offsets follow the out-degrees 2,2,2,2,2,1
+    uint32_t row_ptr[7] = {0, 2, 4, 6, 8, 10, 11};  // num_nodes + 1
+    uint32_t col_idx[11] = {1, 3, 2, 4, 3, 5, 4, 5, 0, 2, 1};
+
+    // Output arrays: the CPU and DSA runs each get their own distance/queue/visited buffers
+    int32_t expect_distance[6];
+    int32_t calculated_distance[6];
+    uint32_t expect_queue[6];
+    uint32_t calculated_queue[6];
+    uint32_t expect_visited[6];
+    uint32_t calculated_visited[6];
+
+    // Compute expected result with CPU version
+    bfs_cpu(row_ptr, col_idx, expect_distance, expect_queue, expect_visited, num_nodes, num_edges, queue_size, source);
+
+    // Compute result with DSA version
+    bfs_dsa(row_ptr, col_idx, calculated_distance, calculated_queue, calculated_visited, num_nodes, num_edges, queue_size, source);
+
+    // Compare results (distances only; queue and visited are scratch state)
+    for (uint32_t i = 0; i < num_nodes; i++) {
+        if (expect_distance[i] != calculated_distance[i]) {
+            printf("breadth_first_search: FAILED (distance mismatch at node %u: expected %d, got %d)\n",
+                   i, expect_distance[i], calculated_distance[i]);
+            return 1;
+        }
+    }
+
+    printf("breadth_first_search: PASSED\n");
+    return 0;
+}
diff --git a/tests/app/database_join/database_join.cpp b/tests/app/database_join/database_join.cpp
new file mode 100644
index 00000000..bb879646
--- /dev/null
+++ b/tests/app/database_join/database_join.cpp
@@ -0,0 +1,65 @@
+// Loom kernel implementation: database_join
+#include "database_join.h"
+#include "loom/loom.h"
+#include <cmath>
+#include <cstdlib>
+
+// Full pipeline test from C++ source: Database nested loop join
+// Tests complete compilation chain with nested loops and conditional matching
+// Test: A_ids=[1,2,3], B_ids=[2,3,4], A_vals=[10,20,30], B_vals=[200,300,400]
+//       Matches: (2,20,200), (3,30,300) -> output_ids=[2,3], output_a_values=[20,30], output_b_values=[200,300]
+
+// CPU reference implementation of a nested-loop inner join on ID
+// For every (i, j) with a_ids[i] == b_ids[j], appends (id, a_values[i], b_values[j]) to the output arrays
+// Returns the number of rows written; outputs need room for up to size_a * size_b rows (no capacity check)
+// TODO: Implement O(n + m) version in the future
+uint32_t database_join_cpu(const int32_t* __restrict__ a_ids,
+                           const int32_t* __restrict__ b_ids,
+                           const int32_t* __restrict__ a_values,
+                           const int32_t* __restrict__ b_values,
+                           int32_t* __restrict__ output_ids,
+                           int32_t* __restrict__ output_a_values,
+                           int32_t* __restrict__ output_b_values,
+                           const uint32_t size_a,
+                           const uint32_t size_b) {
+    uint32_t out_idx = 0;  // next free output row; also the running match count
+    for (uint32_t i = 0; i < size_a; i++) {
+        for (uint32_t j = 0; j < size_b; j++) {
+            if (a_ids[i] == b_ids[j]) {
+                output_ids[out_idx] = a_ids[i];
+                output_a_values[out_idx] = a_values[i];
+                output_b_values[out_idx] = b_values[j];
+                out_idx++;
+            }
+        }
+    }
+    return out_idx;
+}
+
+// Accelerator (LOOM_ACCEL) nested-loop inner join: emits one output row per (i, j) with a_ids[i] == b_ids[j]
+// Returns the number of rows written; outputs need room for up to size_a * size_b rows (no capacity check)
+LOOM_ACCEL()
+uint32_t database_join_dsa(const int32_t* __restrict__ a_ids,
+                           const int32_t* __restrict__ b_ids,
+                           const int32_t* __restrict__ a_values,
+                           const int32_t* __restrict__ b_values,
+                           int32_t* __restrict__ output_ids,
+                           int32_t* __restrict__ output_a_values,
+                           int32_t* __restrict__ output_b_values,
+                           const uint32_t size_a,
+                           const uint32_t size_b) {
+    uint32_t out_idx = 0;  // next free output row; also the running match count
+    LOOM_NO_PARALLEL
+    LOOM_NO_UNROLL
+    for (uint32_t i = 0; i < size_a; i++) {
+        for (uint32_t j = 0; j < size_b; j++) {
+            if (a_ids[i] == b_ids[j]) {
+                output_ids[out_idx] = a_ids[i];
+                output_a_values[out_idx] = a_values[i];
+                output_b_values[out_idx] = b_values[j];
+                out_idx++;
+            }
+        }
+    }
+    return out_idx;
+}
diff --git a/tests/app/database_join/database_join.h b/tests/app/database_join/database_join.h
new file mode 100644
index 00000000..c156999b
--- /dev/null
+++ b/tests/app/database_join/database_join.h
@@ -0,0 +1,12 @@
+// Loom kernel: database_join
+#ifndef DATABASE_JOIN_H
+#define DATABASE_JOIN_H
+
+#include <cstdint>
+#include <cstddef>
+
+uint32_t database_join_cpu(const int32_t* __restrict__ a_ids, const int32_t* __restrict__ b_ids, const int32_t* __restrict__ a_values, const int32_t* __restrict__ b_values, int32_t* __restrict__ output_ids, int32_t* __restrict__ output_a_values, int32_t* __restrict__ output_b_values, const uint32_t size_a, const uint32_t size_b);
+
+uint32_t database_join_dsa(const int32_t* __restrict__ a_ids, const int32_t* __restrict__ b_ids, const int32_t* __restrict__ a_values, const int32_t* __restrict__ b_values, int32_t* __restrict__ output_ids, int32_t* __restrict__ output_a_values, int32_t* __restrict__ output_b_values, const uint32_t size_a, const uint32_t size_b);
+
+#endif // DATABASE_JOIN_H
diff --git a/tests/app/database_join/main.cpp b/tests/app/database_join/main.cpp
new file mode 100644
index 00000000..864b10da
--- /dev/null
+++ b/tests/app/database_join/main.cpp
@@ -0,0 +1,52 @@
+#include <cstdio>
+
+#include "database_join.h"
+
+int main() {
+    // Test: A_ids=[1,2,3], B_ids=[2,3,4], A_vals=[10,20,30], B_vals=[200,300,400]
+    // Expected: Matches on 2 and 3 -> output_ids=[2,3], output_a_values=[20,30], output_b_values=[200,300]
+    const uint32_t size_a = 3;
+    const uint32_t size_b = 3;
+    const uint32_t max_output = size_a * size_b; // Worst case: every (a, b) pair matches
+
+    int32_t a_ids[3] = {1, 2, 3};
+    int32_t b_ids[3] = {2, 3, 4};
+    int32_t a_values[3] = {10, 20, 30};
+    int32_t b_values[3] = {200, 300, 400};
+
+    // Output arrays, sized for the worst case so a faulty kernel cannot overrun them
+    int32_t expect_ids[max_output];
+    int32_t expect_a_values[max_output];
+    int32_t expect_b_values[max_output];
+    int32_t calculated_ids[max_output];
+    int32_t calculated_a_values[max_output];
+    int32_t calculated_b_values[max_output];
+
+    // Compute expected result with CPU version
+    uint32_t expect_count = database_join_cpu(a_ids, b_ids, a_values, b_values,
+                                              expect_ids, expect_a_values, expect_b_values, size_a, size_b);
+
+    // Compute result with DSA version
+    uint32_t calculated_count = database_join_dsa(a_ids, b_ids, a_values, b_values,
+                                                  calculated_ids, calculated_a_values, calculated_b_values, size_a, size_b);
+
+    // Compare counts first: rows past the returned count are uninitialized
+    if (expect_count != calculated_count) {
+        printf("database_join: FAILED (count mismatch: expected %u, got %u)\n",
+               expect_count, calculated_count);
+        return 1;
+    }
+
+    // Compare the rows both kernels reported as written
+    for (uint32_t i = 0; i < expect_count; i++) {
+        if (expect_ids[i] != calculated_ids[i] ||
+            expect_a_values[i] != calculated_a_values[i] ||
+            expect_b_values[i] != calculated_b_values[i]) {
+            printf("database_join: FAILED (mismatch at index %u)\n", i);
+            return 1;
+        }
+    }
+
+    printf("database_join: PASSED\n");
+    return 0;
+}
diff --git a/tests/app/sort_merge/main.cpp b/tests/app/sort_merge/main.cpp
new file mode 100644
index 00000000..c3f7eda4
--- /dev/null
+++ b/tests/app/sort_merge/main.cpp
@@ -0,0 +1,36 @@
+#include <cstdio>
+#include <cmath>
+
+#include "sort_merge.h"
+
+int main() {
+    // Test: [3.0, 1.0, 4.0, 1.0, 5.0, 9.0, 2.0, 6.0] -> [1.0, 1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 9.0]
+    const uint32_t N = 8;
+
+    float input[8] = {3.0f, 1.0f, 4.0f, 1.0f, 5.0f, 9.0f, 2.0f, 6.0f};
+
+    // Result and scratch buffers, one pair per implementation
+    float expect_output[8];
+    float expect_temp[8];
+    float calculated_output[8];
+    float calculated_temp[8];
+
+    // Reference run on the CPU
+    sort_merge_cpu(input, expect_output, expect_temp, N);
+
+    // Run under test (DSA)
+    sort_merge_dsa(input, calculated_output, calculated_temp, N);
+
+    // Element-wise comparison within a small floating-point tolerance
+    const float epsilon = 1e-6f;
+    for (uint32_t idx = 0; idx != N; ++idx) {
+        const float delta = expect_output[idx] - calculated_output[idx];
+        if (!(fabsf(delta) > epsilon)) continue;
+        printf("sort_merge: FAILED (mismatch at index %u: expected %f, got %f)\n",
+               idx, expect_output[idx], calculated_output[idx]);
+        return 1;
+    }
+
+    printf("sort_merge: PASSED\n");
+    return 0;
+}
diff --git a/tests/app/sort_merge/sort_merge.cpp b/tests/app/sort_merge/sort_merge.cpp
new file mode 100644
index 00000000..46d8a82e
--- /dev/null
+++ b/tests/app/sort_merge/sort_merge.cpp
@@ -0,0 +1,120 @@
+// Loom kernel implementation: sort_merge
+#include "sort_merge.h"
+#include "loom/loom.h"
+#include <cmath>
+#include <cstdlib>
+
+// Full pipeline test from C++ source: Merge sort (iterative bottom-up)
+// Tests complete compilation chain with nested loops and merge operations
+// Test: [3.0, 1.0, 4.0, 1.0, 5.0, 9.0, 2.0, 6.0] -> [1.0, 1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 9.0]
+
+// CPU reference: bottom-up merge sort of input[0..N) into output (ascending); temp is N-element scratch
+void sort_merge_cpu(const float* __restrict__ input,
+                    float* __restrict__ output,
+                    float* __restrict__ temp,
+                    const uint32_t N) {
+    // Sort in place inside output, starting from a copy of input
+    for (uint32_t i = 0; i < N; i++) {
+        output[i] = input[i];
+    }
+
+    if (N <= 1) return;
+
+    // Bottom-up merge sort: merge subarrays of size 1, 2, 4, 8, ...
+    for (uint32_t width = 1; width < N; width *= 2) {
+        // Merge each adjacent pair of width-sized runs
+        for (uint32_t left = 0; left < N; left += 2 * width) {
+            uint32_t mid = left + width;
+            uint32_t right = left + 2 * width;
+
+            // Clamp to N so a short or missing right-hand run is handled
+            if (mid > N) mid = N;
+            if (right > N) right = N;
+
+            // Merge [left, mid) and [mid, right) into temp
+            uint32_t i = left;
+            uint32_t j = mid;
+            uint32_t k = left;
+
+            while (i < mid && j < right) {
+                if (output[i] <= output[j]) {  // ties take the left run's element first
+                    temp[k++] = output[i++];
+                } else {
+                    temp[k++] = output[j++];
+                }
+            }
+
+            // Copy remaining from left half
+            while (i < mid) {
+                temp[k++] = output[i++];
+            }
+
+            // Copy remaining from right half
+            while (j < right) {
+                temp[k++] = output[j++];
+            }
+
+            // Copy the merged run back so the next pass reads it from output
+            for (uint32_t idx = left; idx < right; idx++) {
+                output[idx] = temp[idx];
+            }
+        }
+    }
+}
+
+// Accelerator (LOOM_ACCEL) bottom-up merge sort of input[0..N) into output (ascending); temp is N-element scratch
+LOOM_ACCEL()
+void sort_merge_dsa(const float* __restrict__ input,
+                    float* __restrict__ output,
+                    float* __restrict__ temp,
+                    const uint32_t N) {
+    // Sort in place inside output, starting from a copy of input (LOOM_* macros annotate this copy loop)
+    LOOM_NO_PARALLEL
+    LOOM_NO_UNROLL
+    for (uint32_t i = 0; i < N; i++) {
+        output[i] = input[i];
+    }
+
+    if (N <= 1) return;
+
+    // Bottom-up merge sort: merge subarrays of size 1, 2, 4, 8, ...
+    for (uint32_t width = 1; width < N; width *= 2) {
+        // Merge each adjacent pair of width-sized runs
+        for (uint32_t left = 0; left < N; left += 2 * width) {
+            uint32_t mid = left + width;
+            uint32_t right = left + 2 * width;
+
+            // Clamp to N so a short or missing right-hand run is handled
+            if (mid > N) mid = N;
+            if (right > N) right = N;
+
+            // Merge [left, mid) and [mid, right) into temp
+            uint32_t i = left;
+            uint32_t j = mid;
+            uint32_t k = left;
+
+            while (i < mid && j < right) {
+                if (output[i] <= output[j]) {  // ties take the left run's element first
+                    temp[k++] = output[i++];
+                } else {
+                    temp[k++] = output[j++];
+                }
+            }
+
+            // Copy remaining from left half
+            while (i < mid) {
+                temp[k++] = output[i++];
+            }
+
+            // Copy remaining from right half
+            while (j < right) {
+                temp[k++] = output[j++];
+            }
+
+            // Copy the merged run back so the next pass reads it from output
+            for (uint32_t idx = left; idx < right; idx++) {
+                output[idx] = temp[idx];
+            }
+        }
+    }
+}
diff --git a/tests/app/sort_merge/sort_merge.h b/tests/app/sort_merge/sort_merge.h
new file mode 100644
index 00000000..bfd3480f
--- /dev/null
+++ b/tests/app/sort_merge/sort_merge.h
@@ -0,0 +1,12 @@
+// Loom kernel: sort_merge
+#ifndef SORT_MERGE_H
+#define SORT_MERGE_H
+
+#include <cstdint>
+#include <cstddef>
+
+void sort_merge_cpu(const float* __restrict__ input, float* __restrict__ output, float* __restrict__ temp, const uint32_t N);
+
+void sort_merge_dsa(const float* __restrict__ input, float* __restrict__ output, float* __restrict__ temp, const uint32_t N);
+
+#endif // SORT_MERGE_H

From a83cafbfba1e52cf9a6e49e0616903dd0e2a4e7c Mon Sep 17 00:00:00 2001
From: Ankai Jin <ankaijin@hexaconta.cs.ucla.edu>
Date: Thu, 19 Feb 2026 17:06:53 -0800
Subject: [PATCH 2/4] Makefile single test

---
 Makefile | 6 ++++++
 1 file changed, 6 insertions(+)

diff --git a/Makefile b/Makefile
index e19a1637..7b2299dc 100644
--- a/Makefile
+++ b/Makefile
@@ -31,6 +31,12 @@ check:
 	@set -e; \
 	ninja -C build check-loom
 
+check-one:  # Run a single app test: make check-one SUITE=<script name> TEST=<app dir>
+	@if [ -z "$(SUITE)" ] || [ -z "$(TEST)" ]; then echo "usage: make check-one SUITE=<script> TEST=<app>" >&2; exit 2; fi; rc=0; \
+	$(CURDIR)/tests/scripts/$(SUITE).sh --single $(CURDIR)/build/bin/loom $(CURDIR)/tests/app/$(TEST) --run || rc=$$?; \
+	log="$(CURDIR)/tests/app/$(TEST)/Output/$(TEST).scf.log"; if [ -f "$$log" ]; then cat "$$log"; fi; \
+	exit $$rc
+
 clean:
 	@set -e; \
 	if [ -d build ]; then ninja -C build clean-loom; fi

From afc2268eb813e576fdde5e84d66ec22210251279 Mon Sep 17 00:00:00 2001
From: Ankai Jin <ankaijin@hexaconta.cs.ucla.edu>
Date: Thu, 5 Mar 2026 22:48:10 -0800
Subject: [PATCH 3/4] Implemented first version of app fuzzer

---
 tests/app/fuzzer/fuzzer.cpp      | 806 +++++++++++++++++++++++++++++++
 tests/app/fuzzer/fuzzer_config.h |  13 +
 tests/scripts/handshake.sh       |   6 +-
 tests/scripts/ll_roundtrip.sh    |   6 +-
 tests/scripts/mlir_roundtrip.sh  |   6 +-
 tests/scripts/scf_roundtrip.sh   |   6 +-
 6 files changed, 839 insertions(+), 4 deletions(-)
 create mode 100644 tests/app/fuzzer/fuzzer.cpp
 create mode 100644 tests/app/fuzzer/fuzzer_config.h

diff --git a/tests/app/fuzzer/fuzzer.cpp b/tests/app/fuzzer/fuzzer.cpp
new file mode 100644
index 00000000..3726eee6
--- /dev/null
+++ b/tests/app/fuzzer/fuzzer.cpp
@@ -0,0 +1,806 @@
+//===-- fuzzer.cpp - APP fuzzer: generates random loom apps -----*- C++ -*-===//
+//
+// Part of the Loom project.
+//
+//===----------------------------------------------------------------------===//
+//
+// Deterministic fuzzer that generates random C++ applications annotated with
+// loom pragmas.  Uses seeded mt19937 for reproducibility.
+//
+// Two-pass architecture (mirrors the ADG fuzzer):
+//   Pass 1 (--gen-cpp): Generate standalone test directories in Output/
+//                        Each contains <name>.h, <name>.cpp, and main.cpp
+//   Pass 2 (default):   Compile and run each generated app via loom --as-clang,
+//                        then verify CPU vs DSA results match.
+//
+// Randomised parameters per test case:
+//   - Data type (uint32_t, int32_t, float)
+//   - Loop nesting depth (1-3)
+//   - Loop body operation (+, -, *, &, |, ^  for integer;  +, -, * for float)
+//   - Pragma combinations on LOOM_ACCEL, LOOM_TARGET, LOOM_PARALLEL,
+//     LOOM_UNROLL, LOOM_TRIPCOUNT, LOOM_REDUCE, LOOM_STREAM, LOOM_MEMORY_BANK
+//   - Number of input arrays (1-3) and whether they are LOOM_STREAM
+//   - Whether the kernel is a reduction (scalar accumulator)
+//   - Array sizes and initialisation seeds
+//
+//===----------------------------------------------------------------------===//
+
+#include "fuzzer_config.h"
+
+#include <algorithm>
+#include <cstdlib>
+#include <fstream>
+#include <random>
+#include <sstream>
+#include <string>
+#include <vector>
+
+// ---------------------------------------------------------------------------
+// Data-type descriptors
+// ---------------------------------------------------------------------------
+
+struct TypeInfo {
+  const char *cType;     // C type name spelled into generated code, e.g. "uint32_t"
+  const char *fmtSpec;   // printf format specifier for one value of this type
+  bool isFloat;          // true for the floating-point entry
+  bool isSigned;         // true for the signed integer entry
+};
+
+static const TypeInfo kTypes[] = { // columns: cType, fmtSpec, isFloat, isSigned
+    {"uint32_t", "%u", false, false},
+    {"int32_t", "%d", false, true},
+    {"float", "%f", true, false}, // NOTE(review): isSigned is false here; presumably only meaningful for integers - confirm
+};
+static constexpr unsigned kNumTypes = sizeof(kTypes) / sizeof(kTypes[0]); // number of rows in kTypes
+
+// ---------------------------------------------------------------------------
+// Binary operators
+// ---------------------------------------------------------------------------
+
+struct OpInfo {
+  const char *symbol;       // operator as written in generated code, e.g. "+"
+  const char *reduceArg;    // LOOM_REDUCE argument, or nullptr when the op has none
+  bool intOnly;             // true for the operators listed only for integer types
+};
+
+// Order matters: indices 0..kNumFloatOps-1 are the float-capable ops; all indices are valid for integers.
+static const OpInfo kOps[] = {
+    {"+", "+", false},
+    {"-", nullptr, false}, // no LOOM_REDUCE argument
+    {"*", "*", false},
+    {"&", "&", true},
+    {"|", "|", true},
+    {"^", "^", true},
+};
+static constexpr unsigned kNumOps = sizeof(kOps) / sizeof(kOps[0]);
+static constexpr unsigned kNumFloatOps = 3; // +, -, * (must match the leading non-intOnly rows above)
+
+// ---------------------------------------------------------------------------
+// Pragma helpers – schedule names for LOOM_PARALLEL
+// ---------------------------------------------------------------------------
+
+static const char *const kSchedules[] = {"contiguous", "interleaved"}; // selected by LoopPragmas::scheduleIdx
+
+// ---------------------------------------------------------------------------
+// Parameters for a single generated app
+// ---------------------------------------------------------------------------
+
+struct FuzzParams {
+  std::string name; // identifier of the generated app
+
+  // Type and operation
+  unsigned typeIdx; // index into kTypes
+  unsigned opIdx;   // index into kOps
+
+  // Loop structure
+  unsigned numLoops; // 1, 2, or 3 nesting levels
+
+  // Array dimensions – outer/mid/inner trip counts
+  unsigned dimOuter;
+  unsigned dimMid;   // only used when numLoops >= 2
+  unsigned dimInner; // only used when numLoops >= 3
+
+  // Input arrays
+  unsigned numInputs; // 1, 2, or 3
+
+  // Reduction: if true the innermost loop accumulates a scalar
+  bool isReduction;
+
+  // Pragma toggles & values
+  bool hasAccelName;
+  bool hasTarget;
+  unsigned targetKind; // 0=spatial, 1=temporal
+
+  // LOOM_STREAM on each input (indexed 0..numInputs-1)
+  std::vector<bool> inputStream;
+
+  // LOOM_MEMORY_BANK on first input
+  bool hasMemoryBank;
+  unsigned memoryBankCount; // 2, 4, or 8
+  bool memoryBankBlock;     // true=block, false=cyclic (default)
+
+  // Per-loop pragmas (indexed 0..numLoops-1, 0=outermost)
+  struct LoopPragmas {
+    bool hasParallel;
+    bool parallelAuto;
+    unsigned parallelDegree; // 2, 4, or 8
+    bool hasSchedule;
+    unsigned scheduleIdx; // index into kSchedules
+
+    bool hasUnroll;
+    bool unrollAuto;
+    unsigned unrollFactor; // 2, 4, or 8
+
+    bool hasTripcount;
+    unsigned tripcountKind; // 0=single, 1=range, 2=full
+    unsigned tripcountVal;  // used for single
+    unsigned tripcountMin;  // lower bound of the advertised trip-count range
+    unsigned tripcountMax;  // upper bound of the advertised trip-count range
+
+    bool hasNoParallel; // LOOM_NO_PARALLEL (alternative to hasParallel)
+    bool hasNoUnroll;   // LOOM_NO_UNROLL (alternative to hasUnroll)
+  };
+  std::vector<LoopPragmas> loopPragmas;
+
+  // Reduction operator (only when isReduction)
+  unsigned reduceOpIdx; // index into kOps (must have reduceArg != nullptr)
+};
+
+// ---------------------------------------------------------------------------
+// Pick a random element from a small set of allowed values
+// ---------------------------------------------------------------------------
+
+static unsigned pickFrom(std::mt19937 &rng, const std::vector<unsigned> &choices) {
+  const std::size_t slot = rng() % choices.size(); // exactly one draw from rng per call
+  return choices[slot];
+}
+
+// ---------------------------------------------------------------------------
+// Generate deterministic random parameters for one app
+// ---------------------------------------------------------------------------
+
+static FuzzParams generateParams(unsigned seed, unsigned idx) {
+  std::mt19937 rng(seed + idx);
+  FuzzParams p;
+  p.name = "fuzz_" + std::to_string(seed) + "_" + std::to_string(idx);
+
+  // Type
+  p.typeIdx = rng() % kNumTypes;
+  bool isFloat = kTypes[p.typeIdx].isFloat;
+
+  // Operation – constrain to float-safe ops when using float
+  p.opIdx = isFloat ? (rng() % kNumFloatOps) : (rng() % kNumOps);
+
+  // Loop nesting
+  p.numLoops = 1 + rng() % 3;
+
+  // Dimensions (small for fast execution)
+  std::vector<unsigned> dimChoices = {4, 8, 16, 32};
+  p.dimOuter = pickFrom(rng, dimChoices);
+  p.dimMid = (p.numLoops >= 2) ? pickFrom(rng, dimChoices) : 1;
+  p.dimInner = (p.numLoops >= 3) ? pickFrom(rng, dimChoices) : 1;
+
+  // Input arrays: single-loop kernels draw 1-3 inputs, nested kernels 1-2.
+  // isReduction is decided below; a non-reduction (element-wise) kernel is
+  // then raised to at least 2 inputs so the operator has two operands, while
+  // a reduction keeps the drawn count (the accumulator is the other operand).
+  if (p.numLoops == 1) {
+    p.numInputs = 1 + rng() % 3; // 1, 2, or 3
+  } else {
+    p.numInputs = 1 + rng() % 2; // 1 or 2
+  }
+
+  // Reduction: more likely on single-loop, possible on innermost of nested
+  if (p.numLoops == 1) {
+    p.isReduction = (rng() % 3) == 0; // 33% chance
+  } else {
+    p.isReduction = (rng() % 4) == 0; // 25% chance
+  }
+
+  // Pick reduction operator (only from ops that have reduceArg)
+  if (p.isReduction) {
+    // Valid reduce ops: + (0), * (2), & (3), | (4), ^ (5)
+    if (isFloat) {
+      // Float reductions: + or *
+      std::vector<unsigned> floatReduce = {0, 2};
+      p.reduceOpIdx = pickFrom(rng, floatReduce);
+    } else {
+      std::vector<unsigned> intReduce = {0, 2, 3, 4, 5};
+      p.reduceOpIdx = pickFrom(rng, intReduce);
+    }
+  } else {
+    p.reduceOpIdx = 0; // unused
+    // Non-reduction element-wise: need at least 2 inputs to exercise the op
+    if (p.numInputs < 2) p.numInputs = 2;
+  }
+
+  // LOOM_ACCEL name
+  p.hasAccelName = (rng() % 2) == 0;
+
+  // LOOM_TARGET
+  p.hasTarget = (rng() % 3) == 0; // 33% chance
+  p.targetKind = rng() % 2;
+
+  // LOOM_STREAM per input
+  p.inputStream.resize(p.numInputs);
+  for (unsigned i = 0; i < p.numInputs; ++i) {
+    p.inputStream[i] = (rng() % 3) == 0; // 33% chance
+  }
+
+  // LOOM_MEMORY_BANK on first input (count and layout are drawn even when unused)
+  p.hasMemoryBank = (rng() % 4) == 0; // 25% chance
+  std::vector<unsigned> bankChoices = {2, 4, 8};
+  p.memoryBankCount = pickFrom(rng, bankChoices);
+  p.memoryBankBlock = (rng() % 2) == 0;
+
+  // Per-loop pragmas
+  p.loopPragmas.resize(p.numLoops);
+  for (unsigned L = 0; L < p.numLoops; ++L) {
+    auto &lp = p.loopPragmas[L];
+
+    // LOOM_PARALLEL vs LOOM_NO_PARALLEL (mutually exclusive)
+    unsigned parallelChoice = rng() % 5;
+    // 0=none, 1=auto, 2=degree-only, 3=degree+schedule, 4=no_parallel
+    lp.hasNoParallel = (parallelChoice == 4);
+    lp.hasParallel = (parallelChoice >= 1 && parallelChoice <= 3);
+    lp.parallelAuto = (parallelChoice == 1);
+    std::vector<unsigned> degreeChoices = {2, 4, 8};
+    lp.parallelDegree = pickFrom(rng, degreeChoices);
+    lp.hasSchedule = (parallelChoice == 3);
+    lp.scheduleIdx = rng() % 2;
+
+    // LOOM_UNROLL vs LOOM_NO_UNROLL (mutually exclusive)
+    unsigned unrollChoice = rng() % 4;
+    // 0=none, 1=auto, 2=factor, 3=no_unroll
+    lp.hasNoUnroll = (unrollChoice == 3);
+    lp.hasUnroll = (unrollChoice >= 1 && unrollChoice <= 2);
+    lp.unrollAuto = (unrollChoice == 1);
+    std::vector<unsigned> unrollChoices = {2, 4, 8};
+    lp.unrollFactor = pickFrom(rng, unrollChoices);
+
+    // LOOM_TRIPCOUNT
+    lp.hasTripcount = (rng() % 2) == 0; // 50% chance
+    lp.tripcountKind = rng() % 3;       // 0=single, 1=range, 2=full
+
+    // Determine the actual trip count for this loop level
+    unsigned actualTrip;
+    if (L == 0)
+      actualTrip = p.dimOuter;
+    else if (L == 1)
+      actualTrip = p.dimMid;
+    else
+      actualTrip = p.dimInner;
+
+    lp.tripcountVal = actualTrip;
+    // Advertised range always brackets the real trip count: 1 <= actualTrip <= 4 * actualTrip
+    lp.tripcountMin = 1;
+    lp.tripcountMax = actualTrip * 4;
+  }
+
+  return p;
+}
+
+// ---------------------------------------------------------------------------
+// Code generation helpers
+// ---------------------------------------------------------------------------
+
+/// Write a generated .cpp file's include lines: its own header, then loom.h.
+static void emitIncludes(std::ostream &os, const std::string &headerName) {
+  os << "#include \"" << headerName << "\"\n"
+     << "#include <loom/loom.h>\n\n";
+}
+
+/// Expression (in terms of N, M, P) for the flattened element count.
+static std::string totalElements(const FuzzParams &p) {
+  switch (p.numLoops) {
+  case 1:  return "N";
+  case 2:  return "N * M";
+  default: return "N * M * P";
+  }
+}
+
+/// Build the "const <type> N[, const <type> M[, const <type> P]]" parameter text.
+static std::string dimParams(const FuzzParams &p, const char *type) {
+  const std::string decl = std::string("const ") + type + " ";
+  std::string params = decl + "N";
+  if (p.numLoops >= 2)
+    params += ", " + decl + "M";
+  if (p.numLoops >= 3)
+    params += ", " + decl + "P";
+  return params;
+}
+
+/// Dimension argument list (N, and optionally M, P): one name per dimParams() entry.
+static std::string dimArgs(const FuzzParams &p) {
+  std::string s = "N";
+  if (p.numLoops >= 2) s += ", M";
+  if (p.numLoops >= 3) s += ", P";  // exactly one argument per loop level
+  return s;
+}
+
+/// Dimension argument list for calling from main (uses actual dim values):
+/// dimOuter, then dimMid and dimInner, one value per dimParams() entry.
+static std::string dimArgsMain(const FuzzParams &p) {
+  std::string s = std::to_string(p.dimOuter);
+  if (p.numLoops >= 2)
+    s += ", " + std::to_string(p.dimMid);
+  if (p.numLoops >= 3)
+    s += ", " + std::to_string(p.dimInner);
+  return s;
+}
+
+/// Assemble the parameter list text used by both the _cpu and _dsa signatures:
+/// the input pointers a, b, ..., then init_value or out, then the dimensions.
+/// Loom parameter annotations are only written when isDsa is set.
+static std::string funcParams(const FuzzParams &p, bool isDsa) {
+  static const char *const kSep = ",\n                ";
+  const char *elemType = kTypes[p.typeIdx].cType;
+  std::ostringstream sig;
+
+  // One restrict-qualified const pointer per input array.
+  for (unsigned n = 0; n < p.numInputs; ++n) {
+    if (n != 0)
+      sig << kSep;
+    if (isDsa) {
+      if (p.inputStream[n])
+        sig << "LOOM_STREAM ";
+      if (n == 0 && p.hasMemoryBank) // bank annotation goes on the first input only
+        sig << "LOOM_MEMORY_BANK(" << p.memoryBankCount
+            << (p.memoryBankBlock ? ", block" : "") << ") ";
+    }
+    sig << "const " << elemType << "* __restrict__ " << (char)('a' + n);
+  }
+
+  // Reductions take a scalar seed; element-wise kernels take an output array.
+  if (p.isReduction)
+    sig << kSep << "const " << elemType << " init_value";
+  else
+    sig << kSep << elemType << "* __restrict__ out";
+
+  // Dimensions always come last.
+  sig << kSep << dimParams(p, "uint32_t");
+
+  return sig.str();
+}
+
+/// Write the Loom annotations chosen for loop nest level `level`.
+/// Each annotation goes on its own line, prefixed with `indent`.
+static void emitLoopPragmas(std::ostream &os, const FuzzParams &p,
+                            unsigned level, const std::string &indent) {
+  const auto &cfg = p.loopPragmas[level];
+
+  // Parallelism: NO_PARALLEL wins; otherwise PARALLEL with no arguments (auto),
+  // a degree, or a degree plus a schedule name.
+  if (cfg.hasNoParallel) {
+    os << indent << "LOOM_NO_PARALLEL\n";
+  } else if (cfg.hasParallel) {
+    os << indent << "LOOM_PARALLEL(";
+    if (!cfg.parallelAuto) {
+      os << cfg.parallelDegree;
+      if (cfg.hasSchedule)
+        os << ", " << kSchedules[cfg.scheduleIdx];
+    }
+    os << ")\n";
+  }
+
+  // Unrolling: NO_UNROLL wins; otherwise UNROLL with no arguments (auto) or
+  // an explicit factor.
+  if (cfg.hasNoUnroll) {
+    os << indent << "LOOM_NO_UNROLL\n";
+  } else if (cfg.hasUnroll) {
+    os << indent << "LOOM_UNROLL(";
+    if (!cfg.unrollAuto)
+      os << cfg.unrollFactor;
+    os << ")\n";
+  }
+
+  // Trip count hint: nothing to write unless one was selected.
+  if (!cfg.hasTripcount)
+    return;
+  if (cfg.tripcountKind == 0) {
+    // Single value.
+    os << indent << "LOOM_TRIPCOUNT(" << cfg.tripcountVal << ")\n";
+  } else if (cfg.tripcountKind == 1) {
+    // Range: min, max.
+    os << indent << "LOOM_TRIPCOUNT_RANGE(" << cfg.tripcountMin << ", "
+       << cfg.tripcountMax << ")\n";
+  } else if (cfg.tripcountKind == 2) {
+    // Full form: the value twice, then min and max.
+    os << indent << "LOOM_TRIPCOUNT_FULL(" << cfg.tripcountVal << ", "
+       << cfg.tripcountVal << ", " << cfg.tripcountMin << ", "
+       << cfg.tripcountMax << ")\n";
+  }
+}
+
+/// Emit the single statement that forms the innermost loop body.
+/// Element-wise: out[idx] = a[idx] <op> b[idx] ... (just a[idx] for one input).
+/// Reduction:    acc = acc <rop> a[idx], plus " <rop> b[idx]" for 2+ inputs.
+static void emitBody(std::ostream &os, const FuzzParams &p,
+                     const std::string &indent, const std::string &idx) {
+  const std::string at = "[" + idx + "]";
+  if (!p.isReduction) {
+    // Chain every input after the first onto a[idx] with the element-wise op.
+    const char *binOp = kOps[p.opIdx].symbol;
+    os << indent << "out" << at << " = a" << at;
+    for (unsigned n = 1; n < p.numInputs; ++n)
+      os << " " << binOp << " " << (char)('a' + n) << at;
+    os << ";\n";
+    return;
+  }
+  // Reductions fold into 'acc' with the reduction operator, not the general op.
+  const char *foldOp = kOps[p.reduceOpIdx].symbol;
+  os << indent << "acc = acc " << foldOp << " a" << at;
+  if (p.numInputs >= 2)
+    os << " " << foldOp << " b" << at;
+  os << ";\n";
+}
+
+/// Flattened index expression over the loop variables i, j, k (bounds N, M, P):
+/// "i" for 1 loop, "i * M + j" for 2, "i * M * P + j * P + k" otherwise.
+static std::string linearIdx(const FuzzParams &p) {
+  switch (p.numLoops) {
+  case 1:  return "i";
+  case 2:  return "i * M + j";
+  default: return "i * M * P + j * P + k";
+  }
+}
+
+/// Emit the full function body (loop nest + body).
+/// Layout: optional reduction prologue, one `for` per loop level (i over N,
+/// j over M, k over P) indented two extra spaces per level, the statement from
+/// emitBody(), the closing braces, and `return acc;` for reductions.
+/// Loom annotations (LOOM_REDUCE, per-loop pragmas) are written only if isDsa.
+static void emitFuncBody(std::ostream &os, const FuzzParams &p, bool isDsa) {
+  const char *cType = kTypes[p.typeIdx].cType;
+  std::string idx = linearIdx(p);
+
+  // Reduction initialisation
+  if (p.isReduction) {
+    if (isDsa) {
+      const char *reduceArg = kOps[p.reduceOpIdx].reduceArg;
+      os << "  LOOM_REDUCE(" << reduceArg << ")\n";
+    }
+    os << "  " << cType << " acc = init_value;\n";
+  }
+
+  // Loop nest
+  const char *loopVars[] = {"i", "j", "k"};
+  const char *loopBounds[] = {"N", "M", "P"};
+  unsigned depth = p.numLoops;
+
+  for (unsigned L = 0; L < depth; ++L) {
+    std::string indent(2 + L * 2, ' ');
+
+    // Emit pragmas only for DSA version
+    if (isDsa) {
+      emitLoopPragmas(os, p, L, indent);
+    }
+
+    os << indent << "for (uint32_t " << loopVars[L] << " = 0; "
+       << loopVars[L] << " < " << loopBounds[L] << "; ++"
+       << loopVars[L] << ") {\n";
+  }
+
+  // Body, one level deeper than the innermost loop header
+  emitBody(os, p, std::string(2 + depth * 2, ' '), idx);
+
+  // Close loops, innermost first
+  for (unsigned L = depth; L > 0; --L) {
+    std::string indent(2 + (L - 1) * 2, ' ');
+    os << indent << "}\n";
+  }
+
+  // Return for reduction
+  if (p.isReduction) {
+    os << "  return acc;\n";
+  }
+}
+
+// ---------------------------------------------------------------------------
+// Write the three source files for a generated test case
+// ---------------------------------------------------------------------------
+
+/// Write <name>.h: an include-guarded header that includes <cstdint> and
+/// declares <name>_cpu and <name>_dsa with the same annotation-free signature.
+static void writeHeader(const FuzzParams &p, const std::string &dir) {
+  std::ofstream out(dir + "/" + p.name + ".h");
+
+  // Guard macro: the case name plus "_H", upper-cased.
+  std::string guardMacro = p.name + "_H";
+  for (char &ch : guardMacro)
+    ch = (char)toupper((unsigned char)ch);
+
+  // Reductions return the element type; element-wise kernels return void.
+  const char *cType = kTypes[p.typeIdx].cType;
+  const std::string retType = p.isReduction ? cType : "void";
+  const std::string params = funcParams(p, false);
+
+  out << "// Generated by loom app fuzzer\n"
+      << "#ifndef " << guardMacro << "\n"
+      << "#define " << guardMacro << "\n\n"
+      << "#include <cstdint>\n\n";
+
+  // Both entry points are declared with one shared parameter list.
+  out << retType << " " << p.name << "_cpu(" << params << ");\n\n";
+  out << retType << " " << p.name << "_dsa(" << params << ");\n\n";
+
+  out << "#endif // " << guardMacro << "\n";
+}
+
+/// Write <name>.cpp: the include preamble, the un-annotated <name>_cpu
+/// reference, then <name>_dsa with its Loom annotations (optional
+/// LOOM_TARGET, LOOM_ACCEL with or without a name, parameter/loop pragmas).
+static void writeImpl(const FuzzParams &p, const std::string &dir) {
+  std::ofstream out(dir + "/" + p.name + ".cpp");
+
+  // Reductions return the element type; element-wise kernels return void.
+  const char *cType = kTypes[p.typeIdx].cType;
+  const std::string retType = p.isReduction ? cType : "void";
+
+  out << "// Generated by loom app fuzzer\n";
+  emitIncludes(out, p.name + ".h");
+
+  // Reference version: signature and body carry no annotations.
+  out << "// CPU reference implementation\n";
+  out << retType << " " << p.name << "_cpu(" << funcParams(p, false)
+      << ") {\n";
+  emitFuncBody(out, p, /*isDsa=*/false);
+  out << "}\n\n";
+
+  // Accelerated version: function-level annotations precede the signature.
+  out << "// DSA accelerated implementation\n";
+  if (p.hasTarget) {
+    const char *target = (p.targetKind == 0) ? "spatial" : "temporal";
+    out << "LOOM_TARGET(\"" << target << "\")\n";
+  }
+
+  // LOOM_ACCEL is always present; only its name argument is optional.
+  if (p.hasAccelName) {
+    out << "LOOM_ACCEL(\"" << p.name << "\")\n";
+  } else {
+    out << "LOOM_ACCEL()\n";
+  }
+
+  out << retType << " " << p.name << "_dsa(" << funcParams(p, true)
+      << ") {\n";
+  emitFuncBody(out, p, /*isDsa=*/true);
+  out << "}\n";
+}
+
+/// Write main.cpp: a driver that fills the inputs with small deterministic
+/// values, calls <name>_cpu and <name>_dsa with the same arguments, and
+/// compares the results (exact match for integers, absolute tolerance for
+/// floats).  The generated program prints "<name>: PASSED" and returns 0, or
+/// prints "<name>: FAILED" and returns 1 on the first mismatch.
+static void writeMain(const FuzzParams &p, const std::string &dir) {
+  std::string path = dir + "/main.cpp";
+  std::ofstream os(path);
+  const char *cType = kTypes[p.typeIdx].cType;
+  bool isFloat = kTypes[p.typeIdx].isFloat;
+
+  os << "// Generated by loom app fuzzer\n";
+  os << "#include \"" << p.name << ".h\"\n";
+  os << "#include <cstdio>\n";
+  if (isFloat) os << "#include <cmath>\n";
+  os << "\n";
+
+  os << "int main() {\n";
+
+  // Dimension constants
+  os << "  const uint32_t N = " << p.dimOuter << ";\n";
+  if (p.numLoops >= 2)
+    os << "  const uint32_t M = " << p.dimMid << ";\n";
+  if (p.numLoops >= 3)
+    os << "  const uint32_t P = " << p.dimInner << ";\n";
+
+  // Total element count
+  std::string total = totalElements(p);
+  os << "  const uint32_t total = " << total << ";\n\n";
+
+  // Input arrays
+  for (unsigned i = 0; i < p.numInputs; ++i) {
+    os << "  " << cType << " " << (char)('a' + i) << "[total];\n";
+  }
+
+  // Initialise inputs with deterministic values
+  os << "\n  // Initialise inputs\n";
+  for (unsigned i = 0; i < p.numInputs; ++i) {
+    char varName = (char)('a' + i);
+    os << "  for (uint32_t idx = 0; idx < total; ++idx) {\n";
+    if (isFloat) {
+      // Small float values to avoid overflow
+      os << "    " << varName << "[idx] = (float)((idx * "
+         << (i * 7 + 3) << " + " << (i + 1) << ") % 10) / 10.0f + 0.1f;\n";
+    } else {
+      // Small integer values to avoid overflow (especially for multiply)
+      os << "    " << varName << "[idx] = (" << cType << ")((idx * "
+         << (i * 7 + 3) << " + " << (i + 1) << ") % 10);\n";
+    }
+    os << "  }\n";
+  }
+  os << "\n";
+
+  if (p.isReduction) {
+    // Reduction test
+    // Seed with 1 for a multiply reduction and 0 for everything else; a zero
+    // seed would force every product to 0 and make the comparison vacuous.
+    const bool isMul = kOps[p.reduceOpIdx].symbol[0] == '*';
+    if (isFloat)
+      os << "  " << cType << " init_value = " << (isMul ? "1.0f" : "0.0f")
+         << ";\n";
+    else
+      os << "  " << cType << " init_value = " << (isMul ? "1" : "0") << ";\n";
+
+    // Dimension args for function call
+    std::string dArgs;
+    dArgs = "N";
+    if (p.numLoops >= 2) dArgs += ", M";
+    if (p.numLoops >= 3) dArgs += ", P";
+
+    // Build input arg list
+    std::string inputArgs;
+    for (unsigned i = 0; i < p.numInputs; ++i) {
+      if (i > 0) inputArgs += ", ";
+      inputArgs += (char)('a' + i);
+    }
+
+    os << "  " << cType << " cpu_result = " << p.name << "_cpu("
+       << inputArgs << ", init_value, " << dArgs << ");\n";
+    os << "  " << cType << " dsa_result = " << p.name << "_dsa("
+       << inputArgs << ", init_value, " << dArgs << ");\n\n";
+
+    // Compare
+    if (isFloat) {
+      os << "  if (std::fabs(cpu_result - dsa_result) > 1e-2f) {\n";
+    } else {
+      os << "  if (cpu_result != dsa_result) {\n";
+    }
+    os << "    printf(\"" << p.name << ": FAILED\\n\");\n";
+    os << "    return 1;\n";
+    os << "  }\n";
+  } else {
+    // Element-wise test
+    os << "  " << cType << " cpu_out[total];\n";
+    os << "  " << cType << " dsa_out[total];\n\n";
+
+    // Initialise output arrays to 0
+    os << "  for (uint32_t idx = 0; idx < total; ++idx) {\n";
+    if (isFloat) {
+      os << "    cpu_out[idx] = 0.0f;\n";
+      os << "    dsa_out[idx] = 0.0f;\n";
+    } else {
+      os << "    cpu_out[idx] = 0;\n";
+      os << "    dsa_out[idx] = 0;\n";
+    }
+    os << "  }\n\n";
+
+    // Dimension args for function call
+    std::string dArgs;
+    dArgs = "N";
+    if (p.numLoops >= 2) dArgs += ", M";
+    if (p.numLoops >= 3) dArgs += ", P";
+
+    // Build input arg list
+    std::string inputArgs;
+    for (unsigned i = 0; i < p.numInputs; ++i) {
+      if (i > 0) inputArgs += ", ";
+      inputArgs += (char)('a' + i);
+    }
+
+    os << "  " << p.name << "_cpu(" << inputArgs << ", cpu_out, "
+       << dArgs << ");\n";
+    os << "  " << p.name << "_dsa(" << inputArgs << ", dsa_out, "
+       << dArgs << ");\n\n";
+
+    // Compare
+    os << "  for (uint32_t idx = 0; idx < total; ++idx) {\n";
+    if (isFloat) {
+      os << "    if (std::fabs(cpu_out[idx] - dsa_out[idx]) > 1e-4f) {\n";
+    } else {
+      os << "    if (cpu_out[idx] != dsa_out[idx]) {\n";
+    }
+    os << "      printf(\"" << p.name << ": FAILED\\n\");\n";
+    os << "      return 1;\n";
+    os << "    }\n";
+    os << "  }\n";
+  }
+
+  os << "\n  printf(\"" << p.name << ": PASSED\\n\");\n";
+  os << "  return 0;\n";
+  os << "}\n";
+}
+
+/// Create <outDir>/<name>/ (via `mkdir -p`) and write the header,
+/// implementation and main.cpp for test case `p` into it.
+static void writeCppSource(const FuzzParams &p, const std::string &outDir) {
+  const std::string target = outDir + "/" + p.name;
+  // Best effort: the mkdir status is deliberately ignored.
+  (void)std::system(("mkdir -p " + target).c_str());
+  writeHeader(p, target);
+  writeImpl(p, target);
+  writeMain(p, target);
+}
+
+// ---------------------------------------------------------------------------
+// Direct compile-and-run (pass 2)
+// ---------------------------------------------------------------------------
+
+/// Build one generated test case with `loom --as-clang` and execute it.
+/// Returns 0 when both steps succeed and 1 otherwise; the failing step and the
+/// raw std::system() status are reported on stderr.
+static int compileAndRun(const FuzzParams &p, const std::string &outDir) {
+  const std::string caseDir = outDir + "/" + p.name;
+  const std::string kernelSrc = caseDir + "/" + p.name + ".cpp";
+
+  // Regenerate the sources when the kernel file cannot be opened.
+  if (!std::ifstream(kernelSrc).good())
+    writeCppSource(p, outDir);
+
+  // The binary goes to <caseDir>/Output/<name>; the mkdir status is ignored.
+  const std::string binary = caseDir + "/Output/" + p.name;
+  (void)std::system(("mkdir -p " + caseDir + "/Output").c_str());
+
+  // Compile kernel and driver together; stderr is redirected to stdout.
+  const std::string compileCmd = "loom --as-clang " + kernelSrc + " " +
+                                 caseDir + "/main.cpp -o " + binary + " 2>&1";
+  const int compileStatus = std::system(compileCmd.c_str());
+  if (compileStatus != 0) {
+    fprintf(stderr, "%s: compilation failed (exit %d)\n", p.name.c_str(),
+            compileStatus);
+    return 1;
+  }
+
+  // Run from inside the case directory so the relative binary path resolves.
+  const std::string runCmd =
+      "cd " + caseDir + " && Output/" + p.name + " 2>&1";
+  const int runStatus = std::system(runCmd.c_str());
+  if (runStatus != 0) {
+    fprintf(stderr, "%s: execution failed (exit %d)\n", p.name.c_str(),
+            runStatus);
+    return 1;
+  }
+
+  return 0;
+}
+
+// ---------------------------------------------------------------------------
+// Main
+// ---------------------------------------------------------------------------
+
+/// Fuzzer entry point: writes `count` generated cases under Output/ and,
+/// unless --gen-cpp is given, compiles and runs each one.  Returns 1 when
+/// any case failed to compile or run, 0 otherwise.
+int main(int argc, char **argv) {
+  unsigned seed = FUZZER_SEED;
+  unsigned count = FUZZER_COUNT;
+
+  // Allow env override.  strtoul, unlike atoi, is well defined on overflow.
+  if (const char *s = std::getenv("LOOM_FUZZER_SEED"))
+    seed = (unsigned)std::strtoul(s, nullptr, 10);
+  if (const char *c = std::getenv("LOOM_FUZZER_COUNT"))
+    count = (unsigned)std::strtoul(c, nullptr, 10);
+
+  // Check for --gen-cpp mode.
+  bool genCpp = false;
+  for (int i = 1; i < argc; ++i) {
+    if (std::string(argv[i]) == "--gen-cpp")
+      genCpp = true;
+  }
+
+  int failures = 0;
+  for (unsigned idx = 0; idx < count; ++idx) {
+    FuzzParams p = generateParams(seed, idx);
+
+    // Sources are always written; --gen-cpp only skips compile+run.
+    writeCppSource(p, "Output");
+    if (!genCpp)
+      failures += compileAndRun(p, "Output");
+  }
+
+  if (!genCpp && failures > 0) {
+    fprintf(stderr, "\n%d / %u test cases FAILED\n", failures, count);
+    return 1;
+  }
+
+  return 0;
+}
diff --git a/tests/app/fuzzer/fuzzer_config.h b/tests/app/fuzzer/fuzzer_config.h
new file mode 100644
index 00000000..595ee4ee
--- /dev/null
+++ b/tests/app/fuzzer/fuzzer_config.h
@@ -0,0 +1,13 @@
+//===-- fuzzer_config.h - APP fuzzer central configuration ------*- C++ -*-===//
+//
+// Part of the Loom project.
+// Compile-time defaults for the test-case fuzzer's seed and case count.
+//===----------------------------------------------------------------------===//
+
+#ifndef FUZZER_CONFIG_H
+#define FUZZER_CONFIG_H
+
+constexpr unsigned FUZZER_SEED = 42;   // default seed; LOOM_FUZZER_SEED overrides it
+constexpr unsigned FUZZER_COUNT = 200; // default case count; LOOM_FUZZER_COUNT overrides it
+
+#endif // FUZZER_CONFIG_H
\ No newline at end of file
diff --git a/tests/scripts/handshake.sh b/tests/scripts/handshake.sh
index 15d18b7e..56187428 100755
--- a/tests/scripts/handshake.sh
+++ b/tests/scripts/handshake.sh
@@ -100,9 +100,13 @@ loom_write_parallel_header "${PARALLEL_FILE}" \
   "Compiles each C++ app to handshake MLIR via loom."
 
 for app_dir in "${app_dirs[@]}"; do
+  app_name=$(basename "${app_dir}")
+
+  # Skip fuzzer directory: it is a test generator, not a loom app.
+  [[ "${app_name}" == "fuzzer" ]] && continue
+
   rel_sources=$(loom_rel_sources "${app_dir}") || continue
 
-  app_name=$(basename "${app_dir}")
   rel_app=$(loom_relpath "${app_dir}")
   rel_out="${rel_app}/Output"
 
diff --git a/tests/scripts/ll_roundtrip.sh b/tests/scripts/ll_roundtrip.sh
index 13a44262..54394204 100755
--- a/tests/scripts/ll_roundtrip.sh
+++ b/tests/scripts/ll_roundtrip.sh
@@ -90,9 +90,13 @@ loom_write_parallel_header "${PARALLEL_FILE}" \
   "Compiles each C++ app to LLVM IR via loom, links with clang++, runs the executable."
 
 for app_dir in "${app_dirs[@]}"; do
+  app_name=$(basename "${app_dir}")
+
+  # Skip fuzzer directory: it is a test generator, not a loom app.
+  [[ "${app_name}" == "fuzzer" ]] && continue
+
   rel_sources=$(loom_rel_sources "${app_dir}") || continue
 
-  app_name=$(basename "${app_dir}")
   rel_app=$(loom_relpath "${app_dir}")
   rel_out="${rel_app}/Output"
 
diff --git a/tests/scripts/mlir_roundtrip.sh b/tests/scripts/mlir_roundtrip.sh
index 59e215fa..e5eb6c67 100755
--- a/tests/scripts/mlir_roundtrip.sh
+++ b/tests/scripts/mlir_roundtrip.sh
@@ -94,9 +94,13 @@ loom_write_parallel_header "${PARALLEL_FILE}" \
   "Roundtrips through mlir-opt/mlir-translate, links with clang++, runs the executable."
 
 for app_dir in "${app_dirs[@]}"; do
+  app_name=$(basename "${app_dir}")
+
+  # Skip fuzzer directory: it is a test generator, not a loom app.
+  [[ "${app_name}" == "fuzzer" ]] && continue
+
   rel_sources=$(loom_rel_sources "${app_dir}") || continue
 
-  app_name=$(basename "${app_dir}")
   rel_app=$(loom_relpath "${app_dir}")
   rel_out="${rel_app}/Output"
 
diff --git a/tests/scripts/scf_roundtrip.sh b/tests/scripts/scf_roundtrip.sh
index 01e23ce0..f20b6e1f 100755
--- a/tests/scripts/scf_roundtrip.sh
+++ b/tests/scripts/scf_roundtrip.sh
@@ -94,9 +94,13 @@ loom_write_parallel_header "${PARALLEL_FILE}" \
   "Lowers SCF MLIR through mlir-opt/mlir-translate, links with clang++, runs the executable."
 
 for app_dir in "${app_dirs[@]}"; do
+  app_name=$(basename "${app_dir}")
+
+  # Skip fuzzer directory: it is a test generator, not a loom app.
+  [[ "${app_name}" == "fuzzer" ]] && continue
+
   rel_sources=$(loom_rel_sources "${app_dir}") || continue
 
-  app_name=$(basename "${app_dir}")
   rel_app=$(loom_relpath "${app_dir}")
   rel_out="${rel_app}/Output"
 

From 985844317d815224deee45d59f5260fb3223a3ec Mon Sep 17 00:00:00 2001
From: Ankai Jin <ankaijin@hexaconta.cs.ucla.edu>
Date: Fri, 6 Mar 2026 13:33:25 -0800
Subject: [PATCH 4/4] Added lz77 and chacha20 kernels

---
 tests/app/chacha20_qr/chacha20_qr.cpp     | 281 ++++++++++++++++++++++
 tests/app/chacha20_qr/chacha20_qr.h       |  12 +
 tests/app/chacha20_qr/main.cpp            |  39 +++
 tests/app/lz77_compress/lz77_compress.cpp | 120 +++++++++
 tests/app/lz77_compress/lz77_compress.h   |  12 +
 tests/app/lz77_compress/main.cpp          |  53 ++++
 6 files changed, 517 insertions(+)
 create mode 100644 tests/app/chacha20_qr/chacha20_qr.cpp
 create mode 100644 tests/app/chacha20_qr/chacha20_qr.h
 create mode 100644 tests/app/chacha20_qr/main.cpp
 create mode 100644 tests/app/lz77_compress/lz77_compress.cpp
 create mode 100644 tests/app/lz77_compress/lz77_compress.h
 create mode 100644 tests/app/lz77_compress/main.cpp

diff --git a/tests/app/chacha20_qr/chacha20_qr.cpp b/tests/app/chacha20_qr/chacha20_qr.cpp
new file mode 100644
index 00000000..84810cba
--- /dev/null
+++ b/tests/app/chacha20_qr/chacha20_qr.cpp
@@ -0,0 +1,281 @@
+// Loom kernel implementation: chacha20_qr
+#include "chacha20_qr.h"
+#include "loom/loom.h"
+#include <cmath>
+#include <cstdlib>
+
+// Full pipeline test from C++ source: ChaCha20 block function
+// Tests complete compilation chain with ARX (add-rotate-xor) operations
+// Operates on a 16-word (512-bit) state, applying quarter-round transforms
+// for the specified number of double-rounds (ChaCha20 uses 10 double-rounds)
+
+// CPU reference implementation of the ChaCha20 block function (all rounds run in place on output_state)
+void chacha20_qr_cpu(const uint32_t* __restrict__ input_state,
+                     uint32_t* __restrict__ output_state,
+                     const uint32_t num_rounds) {
+    // Copy the 16 input words to output; every round below updates output_state in place
+    for (uint32_t i = 0; i < 16; i++) {
+        output_state[i] = input_state[i];
+    }
+
+    // Each double-round consists of 4 column quarter-rounds followed by 4 diagonal
+    // quarter-rounds. QR(a, b, c, d) is four add/xor/rotate-left steps with rotations 16, 12, 8, 7.
+    for (uint32_t round = 0; round < num_rounds; round++) {
+        // Column quarter-rounds
+        // QR(0, 4,  8, 12)
+        output_state[0] += output_state[4];
+        output_state[12] ^= output_state[0];
+        output_state[12] = (output_state[12] << 16) | (output_state[12] >> 16);
+        output_state[8] += output_state[12];
+        output_state[4] ^= output_state[8];
+        output_state[4] = (output_state[4] << 12) | (output_state[4] >> 20);
+        output_state[0] += output_state[4];
+        output_state[12] ^= output_state[0];
+        output_state[12] = (output_state[12] << 8) | (output_state[12] >> 24);
+        output_state[8] += output_state[12];
+        output_state[4] ^= output_state[8];
+        output_state[4] = (output_state[4] << 7) | (output_state[4] >> 25);
+
+        // QR(1, 5,  9, 13)
+        output_state[1] += output_state[5];
+        output_state[13] ^= output_state[1];
+        output_state[13] = (output_state[13] << 16) | (output_state[13] >> 16);
+        output_state[9] += output_state[13];
+        output_state[5] ^= output_state[9];
+        output_state[5] = (output_state[5] << 12) | (output_state[5] >> 20);
+        output_state[1] += output_state[5];
+        output_state[13] ^= output_state[1];
+        output_state[13] = (output_state[13] << 8) | (output_state[13] >> 24);
+        output_state[9] += output_state[13];
+        output_state[5] ^= output_state[9];
+        output_state[5] = (output_state[5] << 7) | (output_state[5] >> 25);
+
+        // QR(2, 6, 10, 14)
+        output_state[2] += output_state[6];
+        output_state[14] ^= output_state[2];
+        output_state[14] = (output_state[14] << 16) | (output_state[14] >> 16);
+        output_state[10] += output_state[14];
+        output_state[6] ^= output_state[10];
+        output_state[6] = (output_state[6] << 12) | (output_state[6] >> 20);
+        output_state[2] += output_state[6];
+        output_state[14] ^= output_state[2];
+        output_state[14] = (output_state[14] << 8) | (output_state[14] >> 24);
+        output_state[10] += output_state[14];
+        output_state[6] ^= output_state[10];
+        output_state[6] = (output_state[6] << 7) | (output_state[6] >> 25);
+
+        // QR(3, 7, 11, 15)
+        output_state[3] += output_state[7];
+        output_state[15] ^= output_state[3];
+        output_state[15] = (output_state[15] << 16) | (output_state[15] >> 16);
+        output_state[11] += output_state[15];
+        output_state[7] ^= output_state[11];
+        output_state[7] = (output_state[7] << 12) | (output_state[7] >> 20);
+        output_state[3] += output_state[7];
+        output_state[15] ^= output_state[3];
+        output_state[15] = (output_state[15] << 8) | (output_state[15] >> 24);
+        output_state[11] += output_state[15];
+        output_state[7] ^= output_state[11];
+        output_state[7] = (output_state[7] << 7) | (output_state[7] >> 25);
+
+        // Diagonal quarter-rounds
+        // QR(0, 5, 10, 15)
+        output_state[0] += output_state[5];
+        output_state[15] ^= output_state[0];
+        output_state[15] = (output_state[15] << 16) | (output_state[15] >> 16);
+        output_state[10] += output_state[15];
+        output_state[5] ^= output_state[10];
+        output_state[5] = (output_state[5] << 12) | (output_state[5] >> 20);
+        output_state[0] += output_state[5];
+        output_state[15] ^= output_state[0];
+        output_state[15] = (output_state[15] << 8) | (output_state[15] >> 24);
+        output_state[10] += output_state[15];
+        output_state[5] ^= output_state[10];
+        output_state[5] = (output_state[5] << 7) | (output_state[5] >> 25);
+
+        // QR(1, 6, 11, 12)
+        output_state[1] += output_state[6];
+        output_state[12] ^= output_state[1];
+        output_state[12] = (output_state[12] << 16) | (output_state[12] >> 16);
+        output_state[11] += output_state[12];
+        output_state[6] ^= output_state[11];
+        output_state[6] = (output_state[6] << 12) | (output_state[6] >> 20);
+        output_state[1] += output_state[6];
+        output_state[12] ^= output_state[1];
+        output_state[12] = (output_state[12] << 8) | (output_state[12] >> 24);
+        output_state[11] += output_state[12];
+        output_state[6] ^= output_state[11];
+        output_state[6] = (output_state[6] << 7) | (output_state[6] >> 25);
+
+        // QR(2, 7,  8, 13)
+        output_state[2] += output_state[7];
+        output_state[13] ^= output_state[2];
+        output_state[13] = (output_state[13] << 16) | (output_state[13] >> 16);
+        output_state[8] += output_state[13];
+        output_state[7] ^= output_state[8];
+        output_state[7] = (output_state[7] << 12) | (output_state[7] >> 20);
+        output_state[2] += output_state[7];
+        output_state[13] ^= output_state[2];
+        output_state[13] = (output_state[13] << 8) | (output_state[13] >> 24);
+        output_state[8] += output_state[13];
+        output_state[7] ^= output_state[8];
+        output_state[7] = (output_state[7] << 7) | (output_state[7] >> 25);
+
+        // QR(3, 4,  9, 14)
+        output_state[3] += output_state[4];
+        output_state[14] ^= output_state[3];
+        output_state[14] = (output_state[14] << 16) | (output_state[14] >> 16);
+        output_state[9] += output_state[14];
+        output_state[4] ^= output_state[9];
+        output_state[4] = (output_state[4] << 12) | (output_state[4] >> 20);
+        output_state[3] += output_state[4];
+        output_state[14] ^= output_state[3];
+        output_state[14] = (output_state[14] << 8) | (output_state[14] >> 24);
+        output_state[9] += output_state[14];
+        output_state[4] ^= output_state[9];
+        output_state[4] = (output_state[4] << 7) | (output_state[4] >> 25);
+    }
+
+    // Add original state word by word (ChaCha20 final addition)
+    for (uint32_t i = 0; i < 16; i++) {
+        output_state[i] += input_state[i];
+    }
+}
+
+// Accelerator implementation of ChaCha20 block function (same statements as the CPU version, plus Loom annotations)
+LOOM_ACCEL()
+void chacha20_qr_dsa(LOOM_MEMORY_BANK(8) const uint32_t* __restrict__ input_state,
+                     uint32_t* __restrict__ output_state,
+                     const uint32_t num_rounds) {
+    // Copy the 16 input words to output (loop annotated LOOM_NO_PARALLEL / LOOM_NO_UNROLL)
+    LOOM_NO_PARALLEL
+    LOOM_NO_UNROLL
+    for (uint32_t i = 0; i < 16; i++) {
+        output_state[i] = input_state[i];
+    }
+
+    // Each double-round consists of 4 column quarter-rounds followed by 4 diagonal
+    // quarter-rounds. QR(a, b, c, d) is four add/xor/rotate-left steps with rotations 16, 12, 8, 7.
+    for (uint32_t round = 0; round < num_rounds; round++) {
+        // Column quarter-rounds
+        // QR(0, 4,  8, 12)
+        output_state[0] += output_state[4];
+        output_state[12] ^= output_state[0];
+        output_state[12] = (output_state[12] << 16) | (output_state[12] >> 16);
+        output_state[8] += output_state[12];
+        output_state[4] ^= output_state[8];
+        output_state[4] = (output_state[4] << 12) | (output_state[4] >> 20);
+        output_state[0] += output_state[4];
+        output_state[12] ^= output_state[0];
+        output_state[12] = (output_state[12] << 8) | (output_state[12] >> 24);
+        output_state[8] += output_state[12];
+        output_state[4] ^= output_state[8];
+        output_state[4] = (output_state[4] << 7) | (output_state[4] >> 25);
+
+        // QR(1, 5,  9, 13)
+        output_state[1] += output_state[5];
+        output_state[13] ^= output_state[1];
+        output_state[13] = (output_state[13] << 16) | (output_state[13] >> 16);
+        output_state[9] += output_state[13];
+        output_state[5] ^= output_state[9];
+        output_state[5] = (output_state[5] << 12) | (output_state[5] >> 20);
+        output_state[1] += output_state[5];
+        output_state[13] ^= output_state[1];
+        output_state[13] = (output_state[13] << 8) | (output_state[13] >> 24);
+        output_state[9] += output_state[13];
+        output_state[5] ^= output_state[9];
+        output_state[5] = (output_state[5] << 7) | (output_state[5] >> 25);
+
+        // QR(2, 6, 10, 14)
+        output_state[2] += output_state[6];
+        output_state[14] ^= output_state[2];
+        output_state[14] = (output_state[14] << 16) | (output_state[14] >> 16);
+        output_state[10] += output_state[14];
+        output_state[6] ^= output_state[10];
+        output_state[6] = (output_state[6] << 12) | (output_state[6] >> 20);
+        output_state[2] += output_state[6];
+        output_state[14] ^= output_state[2];
+        output_state[14] = (output_state[14] << 8) | (output_state[14] >> 24);
+        output_state[10] += output_state[14];
+        output_state[6] ^= output_state[10];
+        output_state[6] = (output_state[6] << 7) | (output_state[6] >> 25);
+
+        // QR(3, 7, 11, 15)
+        output_state[3] += output_state[7];
+        output_state[15] ^= output_state[3];
+        output_state[15] = (output_state[15] << 16) | (output_state[15] >> 16);
+        output_state[11] += output_state[15];
+        output_state[7] ^= output_state[11];
+        output_state[7] = (output_state[7] << 12) | (output_state[7] >> 20);
+        output_state[3] += output_state[7];
+        output_state[15] ^= output_state[3];
+        output_state[15] = (output_state[15] << 8) | (output_state[15] >> 24);
+        output_state[11] += output_state[15];
+        output_state[7] ^= output_state[11];
+        output_state[7] = (output_state[7] << 7) | (output_state[7] >> 25);
+
+        // Diagonal quarter-rounds
+        // QR(0, 5, 10, 15)
+        output_state[0] += output_state[5];
+        output_state[15] ^= output_state[0];
+        output_state[15] = (output_state[15] << 16) | (output_state[15] >> 16);
+        output_state[10] += output_state[15];
+        output_state[5] ^= output_state[10];
+        output_state[5] = (output_state[5] << 12) | (output_state[5] >> 20);
+        output_state[0] += output_state[5];
+        output_state[15] ^= output_state[0];
+        output_state[15] = (output_state[15] << 8) | (output_state[15] >> 24);
+        output_state[10] += output_state[15];
+        output_state[5] ^= output_state[10];
+        output_state[5] = (output_state[5] << 7) | (output_state[5] >> 25);
+
+        // QR(1, 6, 11, 12)
+        output_state[1] += output_state[6];
+        output_state[12] ^= output_state[1];
+        output_state[12] = (output_state[12] << 16) | (output_state[12] >> 16);
+        output_state[11] += output_state[12];
+        output_state[6] ^= output_state[11];
+        output_state[6] = (output_state[6] << 12) | (output_state[6] >> 20);
+        output_state[1] += output_state[6];
+        output_state[12] ^= output_state[1];
+        output_state[12] = (output_state[12] << 8) | (output_state[12] >> 24);
+        output_state[11] += output_state[12];
+        output_state[6] ^= output_state[11];
+        output_state[6] = (output_state[6] << 7) | (output_state[6] >> 25);
+
+        // QR(2, 7,  8, 13)
+        output_state[2] += output_state[7];
+        output_state[13] ^= output_state[2];
+        output_state[13] = (output_state[13] << 16) | (output_state[13] >> 16);
+        output_state[8] += output_state[13];
+        output_state[7] ^= output_state[8];
+        output_state[7] = (output_state[7] << 12) | (output_state[7] >> 20);
+        output_state[2] += output_state[7];
+        output_state[13] ^= output_state[2];
+        output_state[13] = (output_state[13] << 8) | (output_state[13] >> 24);
+        output_state[8] += output_state[13];
+        output_state[7] ^= output_state[8];
+        output_state[7] = (output_state[7] << 7) | (output_state[7] >> 25);
+
+        // QR(3, 4,  9, 14)
+        output_state[3] += output_state[4];
+        output_state[14] ^= output_state[3];
+        output_state[14] = (output_state[14] << 16) | (output_state[14] >> 16);
+        output_state[9] += output_state[14];
+        output_state[4] ^= output_state[9];
+        output_state[4] = (output_state[4] << 12) | (output_state[4] >> 20);
+        output_state[3] += output_state[4];
+        output_state[14] ^= output_state[3];
+        output_state[14] = (output_state[14] << 8) | (output_state[14] >> 24);
+        output_state[9] += output_state[14];
+        output_state[4] ^= output_state[9];
+        output_state[4] = (output_state[4] << 7) | (output_state[4] >> 25);
+    }
+
+    // Add original state word by word (ChaCha20 final addition); annotated LOOM_PARALLEL() / LOOM_UNROLL() with no arguments
+    LOOM_PARALLEL()
+    LOOM_UNROLL()
+    for (uint32_t i = 0; i < 16; i++) {
+        output_state[i] += input_state[i];
+    }
+}
diff --git a/tests/app/chacha20_qr/chacha20_qr.h b/tests/app/chacha20_qr/chacha20_qr.h
new file mode 100644
index 00000000..da2a64ea
--- /dev/null
+++ b/tests/app/chacha20_qr/chacha20_qr.h
@@ -0,0 +1,12 @@
+// Loom kernel: chacha20_qr (declarations of the CPU and accelerator variants)
+#ifndef CHACHA20_QR_H
+#define CHACHA20_QR_H
+
+#include <cstdint>
+#include <cstddef>
+// CPU variant. main.cpp passes 16-word states and num_rounds = 10 (double-rounds) and treats output_state as the expected result.
+void chacha20_qr_cpu(const uint32_t* __restrict__ input_state, uint32_t* __restrict__ output_state, const uint32_t num_rounds);
+// Accelerator variant with the same parameters. main.cpp compares its output_state word by word against the CPU variant.
+void chacha20_qr_dsa(const uint32_t* __restrict__ input_state, uint32_t* __restrict__ output_state, const uint32_t num_rounds);
+
+#endif // CHACHA20_QR_H
diff --git a/tests/app/chacha20_qr/main.cpp b/tests/app/chacha20_qr/main.cpp
new file mode 100644
index 00000000..d3d6b996
--- /dev/null
+++ b/tests/app/chacha20_qr/main.cpp
@@ -0,0 +1,39 @@
+#include <cstdio>
+
+#include "chacha20_qr.h"
+
+int main() {
+    // Input state from RFC 7539 Section 2.3.2: sixteen 32-bit words (512 bits),
+    // laid out as constants(4) | key(8) | counter(1) | nonce(3).
+    const uint32_t kWords = 16;
+    uint32_t state[kWords] = {
+        0x61707865, 0x3320646e, 0x79622d32, 0x6b206574,  // "expand 32-byte k"
+        0x03020100, 0x07060504, 0x0b0a0908, 0x0f0e0d0c,  // key words 0-3
+        0x13121110, 0x17161514, 0x1b1a1918, 0x1f1e1d1c,  // key words 4-7
+        0x00000001, 0x09000000, 0x4a000000, 0x00000000   // counter, then nonce
+    };
+
+    // ChaCha20 performs 10 double-rounds.
+    const uint32_t kDoubleRounds = 10;
+
+    // The CPU kernel supplies the golden output for the accelerator kernel.
+    uint32_t golden[kWords];
+    uint32_t actual[kWords];
+    chacha20_qr_cpu(state, golden, kDoubleRounds);
+    chacha20_qr_dsa(state, actual, kDoubleRounds);
+
+    // Find the first word (if any) on which the two kernels disagree.
+    uint32_t word = 0;
+    while (word < kWords && golden[word] == actual[word]) {
+        word++;
+    }
+
+    if (word < kWords) {
+        printf("chacha20_qr: FAILED (mismatch at word %u: expected 0x%08x, got 0x%08x)\n",
+               word, golden[word], actual[word]);
+        return 1;
+    }
+
+    printf("chacha20_qr: PASSED\n");
+    return 0;
+}
diff --git a/tests/app/lz77_compress/lz77_compress.cpp b/tests/app/lz77_compress/lz77_compress.cpp
new file mode 100644
index 00000000..dfdbfe1d
--- /dev/null
+++ b/tests/app/lz77_compress/lz77_compress.cpp
@@ -0,0 +1,120 @@
+// Loom kernel implementation: lz77_compress
+#include "lz77_compress.h"
+#include "loom/loom.h"
+#include <cmath>
+#include <cstdlib>
+
+// Full pipeline test from C++ source: LZ77 sliding-window compression
+// Tests complete compilation chain with nested loops and data-dependent control flow
+// At each position, searches a lookback window for the longest match.
+// Outputs (offset, length, literal) triples.
+// Test: [1,2,3,1,2,3,4,5,4,5,4,5,6,7,8,6,7,8,9,9] with window_size=10
+
+// Reference (CPU) LZ77 encoder: emits one (offset, length, literal) triple
+// per step over input_data[0..N) and stores the number of triples in
+// *output_count (0 when N == 0). offset == length == 0 means "no match".
+void lz77_compress_cpu(const uint32_t* __restrict__ input_data,
+                       uint32_t* __restrict__ output_offsets,
+                       uint32_t* __restrict__ output_lengths,
+                       uint32_t* __restrict__ output_literals,
+                       uint32_t* __restrict__ output_count,
+                       const uint32_t N,
+                       const uint32_t window_size) {
+    uint32_t emitted = 0;
+    uint32_t cursor = 0;
+
+    while (cursor < N) {
+        // Oldest candidate start: at most window_size elements behind cursor.
+        uint32_t window_begin = 0;
+        if (cursor > window_size) {
+            window_begin = cursor - window_size;
+        }
+
+        uint32_t longest = 0;
+        uint32_t distance = 0;
+        for (uint32_t cand = window_begin; cand < cursor; cand++) {
+            // Grow the run while in bounds and equal, stopping at the 255 cap.
+            uint32_t run = 0;
+            for (; cursor + run < N; run++) {
+                if (input_data[cand + run] != input_data[cursor + run] || run >= 255) {
+                    break;
+                }
+            }
+            // Strict '>' keeps the earliest candidate (largest offset) on ties.
+            if (run > longest) {
+                longest = run;
+                distance = cursor - cand;
+            }
+        }
+
+        // The literal is the element right after the match, or 0 past the end.
+        const uint32_t after_match = cursor + longest;
+        uint32_t literal = 0;
+        if (after_match < N) {
+            literal = input_data[after_match];
+        }
+
+        output_offsets[emitted] = distance;
+        output_lengths[emitted] = longest;
+        output_literals[emitted] = literal;
+        emitted++;
+        cursor = after_match + 1;
+    }
+
+    *output_count = emitted;
+}
+
+// Accelerator LZ77 encoder: same algorithm as lz77_compress_cpu, annotated with LOOM_* macros (presumably from loom/loom.h).
+LOOM_ACCEL()
+void lz77_compress_dsa(LOOM_MEMORY_BANK(8) const uint32_t* __restrict__ input_data,
+                       LOOM_STREAM uint32_t* __restrict__ output_offsets,
+                       LOOM_STREAM uint32_t* __restrict__ output_lengths,
+                       LOOM_STREAM uint32_t* __restrict__ output_literals,
+                       uint32_t* __restrict__ output_count,
+                       const uint32_t N,
+                       const uint32_t window_size) {
+    if (N == 0) {
+        *output_count = 0;
+        return;
+    }
+    // pos: next input index to encode; write_idx: number of triples emitted so far
+    uint32_t pos = 0;
+    uint32_t write_idx = 0;
+
+    while (pos < N) {
+        uint32_t best_offset = 0;
+        uint32_t best_length = 0;
+
+        // Oldest candidate start: at most window_size elements behind pos, clamped to 0
+        uint32_t search_start = (pos > window_size) ? pos - window_size : 0;
+
+        // Search the lookback window for the longest match
+        for (uint32_t s = search_start; s < pos; s++) {
+            uint32_t match_len = 0;
+
+            // Extend while in bounds and equal, capped at 255; s + match_len may pass pos (overlapping match)
+            while (pos + match_len < N &&
+                   input_data[s + match_len] == input_data[pos + match_len] &&
+                   match_len < 255) {
+                match_len++;
+            }
+            // Strict '>' keeps the earliest candidate (largest offset) among equal-length matches
+            if (match_len > best_length) {
+                best_length = match_len;
+                best_offset = pos - s;
+            }
+        }
+
+        // Emit the triple; offset 0 with length 0 means no match was found
+        output_offsets[write_idx] = best_offset;
+        output_lengths[write_idx] = best_length;
+        // Literal is the element after the match, or 0 when the match reaches the end of the input
+        uint32_t next_pos = pos + best_length;
+        output_literals[write_idx] = (next_pos < N) ? input_data[next_pos] : 0;
+        // Resume encoding just past the literal
+        write_idx++;
+        pos = next_pos + 1;
+    }
+
+    *output_count = write_idx;
+}
diff --git a/tests/app/lz77_compress/lz77_compress.h b/tests/app/lz77_compress/lz77_compress.h
new file mode 100644
index 00000000..41df1ec3
--- /dev/null
+++ b/tests/app/lz77_compress/lz77_compress.h
@@ -0,0 +1,12 @@
+// Loom kernel: lz77_compress (declarations of the CPU and accelerator variants)
+#ifndef LZ77_COMPRESS_H
+#define LZ77_COMPRESS_H
+
+#include <cstdint>
+#include <cstddef>
+// CPU variant: writes (offset, length, literal) triples to the three output arrays and the triple count to *output_count.
+void lz77_compress_cpu(const uint32_t* __restrict__ input_data, uint32_t* __restrict__ output_offsets, uint32_t* __restrict__ output_lengths, uint32_t* __restrict__ output_literals, uint32_t* __restrict__ output_count, const uint32_t N, const uint32_t window_size);
+// Accelerator variant with the same parameters. main.cpp compares its count and triples against the CPU variant.
+void lz77_compress_dsa(const uint32_t* __restrict__ input_data, uint32_t* __restrict__ output_offsets, uint32_t* __restrict__ output_lengths, uint32_t* __restrict__ output_literals, uint32_t* __restrict__ output_count, const uint32_t N, const uint32_t window_size);
+
+#endif // LZ77_COMPRESS_H
diff --git a/tests/app/lz77_compress/main.cpp b/tests/app/lz77_compress/main.cpp
new file mode 100644
index 00000000..ece769c0
--- /dev/null
+++ b/tests/app/lz77_compress/main.cpp
@@ -0,0 +1,53 @@
+#include <cstdio>
+
+#include "lz77_compress.h"
+
+int main() {
+    const uint32_t N = 20;
+    const uint32_t window_size = 10;
+
+    // Sample data containing several repeated runs for the matcher to find.
+    uint32_t input[N] = {
+        1, 2, 3, 1, 2, 3, 4, 5, 4, 5,
+        4, 5, 6, 7, 8, 6, 7, 8, 9, 9
+    };
+
+    // Each buffer holds N entries: the worst case is one triple per element.
+    uint32_t ref_offsets[N];
+    uint32_t ref_lengths[N];
+    uint32_t ref_literals[N];
+    uint32_t dsa_offsets[N];
+    uint32_t dsa_lengths[N];
+    uint32_t dsa_literals[N];
+    uint32_t ref_count;
+    uint32_t dsa_count;
+
+    // Golden result from the CPU kernel.
+    lz77_compress_cpu(input, ref_offsets, ref_lengths, ref_literals,
+                      &ref_count, N, window_size);
+
+    // Result under test from the accelerator kernel.
+    lz77_compress_dsa(input, dsa_offsets, dsa_lengths, dsa_literals,
+                      &dsa_count, N, window_size);
+
+    // The triple counts must agree before the triples themselves are compared.
+    if (ref_count != dsa_count) {
+        printf("lz77_compress: FAILED (count mismatch: expected %u, got %u)\n",
+               ref_count, dsa_count);
+        return 1;
+    }
+
+    // Walk the triples and stop at the first one that differs in any field.
+    for (uint32_t t = 0; t < ref_count; t++) {
+        const bool same = ref_offsets[t] == dsa_offsets[t] &&
+                          ref_lengths[t] == dsa_lengths[t] &&
+                          ref_literals[t] == dsa_literals[t];
+        if (!same) {
+            printf("lz77_compress: FAILED (mismatch at triple %u)\n", t);
+            return 1;
+        }
+    }
+
+    printf("lz77_compress: PASSED\n");
+    return 0;
+}