From 8499133d024ecf5de332462068ebdf14411a9d98 Mon Sep 17 00:00:00 2001 From: Ankai Jin Date: Tue, 17 Feb 2026 16:22:05 -0800 Subject: [PATCH 1/4] Add 3 kernels to test suite --- .../breadth_first_search.cpp | 114 +++++++++++++++++ .../breadth_first_search.h | 12 ++ tests/app/breadth_first_search/main.cpp | 47 +++++++ tests/app/database_join/database_join.cpp | 65 ++++++++++ tests/app/database_join/database_join.h | 12 ++ tests/app/database_join/main.cpp | 52 ++++++++ tests/app/sort_merge/main.cpp | 36 ++++++ tests/app/sort_merge/sort_merge.cpp | 120 ++++++++++++++++++ tests/app/sort_merge/sort_merge.h | 12 ++ 9 files changed, 470 insertions(+) create mode 100644 tests/app/breadth_first_search/breadth_first_search.cpp create mode 100644 tests/app/breadth_first_search/breadth_first_search.h create mode 100644 tests/app/breadth_first_search/main.cpp create mode 100644 tests/app/database_join/database_join.cpp create mode 100644 tests/app/database_join/database_join.h create mode 100644 tests/app/database_join/main.cpp create mode 100644 tests/app/sort_merge/main.cpp create mode 100644 tests/app/sort_merge/sort_merge.cpp create mode 100644 tests/app/sort_merge/sort_merge.h diff --git a/tests/app/breadth_first_search/breadth_first_search.cpp b/tests/app/breadth_first_search/breadth_first_search.cpp new file mode 100644 index 00000000..bd84ad00 --- /dev/null +++ b/tests/app/breadth_first_search/breadth_first_search.cpp @@ -0,0 +1,114 @@ +// Loom kernel implementation: breadth_first_search +#include "breadth_first_search.h" +#include "loom/loom.h" +#include +#include + +// Full pipeline test from C++ source: Breadth-First Search on CSR graph +// Tests complete compilation chain with queue-based traversal and irregular memory access +// Graph (6 nodes, 11 edges): +// 0 -> 1, 3 +// 1 -> 2, 4 +// 2 -> 3, 5 +// 3 -> 4, 5 +// 4 -> 0, 2 +// 5 -> 1 +// BFS from node 0: distances = [0, 1, 2, 1, 2, 2] + +// CPU implementation of BFS on CSR graph +// queue_size must be >= num_nodes 
to guarantee no overflow
+// Note: programmer must guarantee arrays with __restrict__ do not overlap; source must be < num_nodes and queue_size >= 1 (both are used unchecked); num_edges is never read
+void bfs_cpu(const uint32_t* __restrict__ row_ptr,
+             const uint32_t* __restrict__ col_idx,
+             int32_t* __restrict__ distance,
+             uint32_t* __restrict__ queue,
+             uint32_t* __restrict__ visited,
+             const uint32_t num_nodes,
+             const uint32_t num_edges,
+             const uint32_t queue_size,
+             const uint32_t source) {
+    // Initialize distances to -1 (unvisited) and clear the visited flags
+    for (uint32_t i = 0; i < num_nodes; i++) {
+        distance[i] = -1;
+        visited[i] = 0;
+    }
+
+    // Initialize BFS from source (queue[] is used as a linear FIFO, never wrapped)
+    uint32_t queue_head = 0;
+    uint32_t queue_tail = 0;
+
+    distance[source] = 0;
+    visited[source] = 1;
+    queue[queue_tail++] = source;
+
+    // BFS traversal (queue_tail bounded by num_nodes due to visited check)
+    while (queue_head < queue_tail && queue_tail <= queue_size) {
+        // dequeue current node
+        uint32_t current = queue[queue_head++];
+        int32_t current_dist = distance[current];
+
+        // Iterate over neighbors: CSR edges [row_ptr[current], row_ptr[current + 1])
+        uint32_t start = row_ptr[current];
+        uint32_t end = row_ptr[current + 1];
+
+        for (uint32_t edge = start; edge < end; edge++) {
+            uint32_t neighbor = col_idx[edge];
+
+            if (visited[neighbor] == 0 && queue_tail < queue_size) { // a full queue silently drops the neighbor
+                visited[neighbor] = 1;
+                distance[neighbor] = current_dist + 1;
+                queue[queue_tail++] = neighbor;
+            }
+        }
+    }
+}
+
+// Accelerator implementation of BFS on CSR graph (same statements as bfs_cpu; only the init loop carries Loom pragmas)
+// queue_size must be >= num_nodes to guarantee no overflow
+LOOM_ACCEL()
+void bfs_dsa(const uint32_t* __restrict__ row_ptr,
+             const uint32_t* __restrict__ col_idx,
+             int32_t* __restrict__ distance,
+             uint32_t* __restrict__ queue,
+             uint32_t* __restrict__ visited,
+             const uint32_t num_nodes,
+             const uint32_t num_edges,
+             const uint32_t queue_size,
+             const uint32_t source) {
+    // Initialize distances to -1 (unvisited) and clear the visited flags
+    LOOM_NO_PARALLEL
+    LOOM_NO_UNROLL
+    for (uint32_t i = 0; i < num_nodes; i++) {
+        distance[i] = -1;
+        visited[i] = 0;
+    }
+
+    // Initialize BFS from source (queue[] is used as a linear FIFO, never wrapped)
+    uint32_t queue_head = 0;
+    uint32_t queue_tail = 0;
+
+    distance[source] = 0;
+    visited[source] = 1;
+    queue[queue_tail++] = source;
+
+    // BFS traversal (queue_tail bounded by num_nodes due to visited check)
+    while (queue_head < queue_tail && queue_tail <= queue_size) {
+        // dequeue current node
+        uint32_t current = queue[queue_head++];
+        int32_t current_dist = distance[current];
+
+        // Iterate over neighbors: CSR edges [row_ptr[current], row_ptr[current + 1])
+        uint32_t start = row_ptr[current];
+        uint32_t end = row_ptr[current + 1];
+
+        for (uint32_t edge = start; edge < end; edge++) {
+            uint32_t neighbor = col_idx[edge];
+
+            if (visited[neighbor] == 0 && queue_tail < queue_size) { // a full queue silently drops the neighbor
+                visited[neighbor] = 1;
+                distance[neighbor] = current_dist + 1;
+                queue[queue_tail++] = neighbor;
+            }
+        }
+    }
+}
diff --git a/tests/app/breadth_first_search/breadth_first_search.h b/tests/app/breadth_first_search/breadth_first_search.h
new file mode 100644
index 00000000..78d5e75a
--- /dev/null
+++ b/tests/app/breadth_first_search/breadth_first_search.h
@@ -0,0 +1,12 @@
+// Loom kernel: breadth_first_search
+#ifndef BREADTH_FIRST_SEARCH_H
+#define BREADTH_FIRST_SEARCH_H
+
+#include
+#include
+
+void bfs_cpu(const uint32_t* __restrict__ row_ptr, const uint32_t* __restrict__ col_idx, int32_t* __restrict__ distance, uint32_t* __restrict__ queue, uint32_t* __restrict__ visited, const uint32_t num_nodes, const uint32_t num_edges, const uint32_t queue_size, const uint32_t source);
+
+void bfs_dsa(const uint32_t* __restrict__ row_ptr, const uint32_t* __restrict__ col_idx, int32_t* __restrict__ distance, uint32_t* __restrict__ queue, uint32_t* __restrict__ visited, const uint32_t num_nodes, const uint32_t num_edges, const uint32_t queue_size, const uint32_t source);
+
+#endif // BREADTH_FIRST_SEARCH_H
diff --git a/tests/app/breadth_first_search/main.cpp b/tests/app/breadth_first_search/main.cpp
new file mode 100644
index 00000000..804f786b
--- /dev/null
+++ b/tests/app/breadth_first_search/main.cpp
@@ -0,0 +1,47 @@
+#include
+
+#include "breadth_first_search.h"
+
+int main() {
+    // Graph setup: 6 nodes, 11 edges
+    //   0 -> 1, 3
+    //   1 -> 2, 4
+    //   2 -> 3, 5
+    //   3 -> 4, 5
+    //   4 -> 0, 2
+    //   5 -> 1
+    const uint32_t num_nodes = 6;
+    const uint32_t num_edges = 11;
+    const uint32_t queue_size = 6;
+    const uint32_t source = 0;
+
+    // CSR format: node v owns col_idx[row_ptr[v] .. row_ptr[v + 1]); out-degrees above are 2,2,2,2,2,1
+    uint32_t row_ptr[7] = {0, 2, 4, 6, 8, 10, 11};  // num_nodes + 1
+    uint32_t col_idx[11] = {1, 3, 2, 4, 3, 5, 4, 5, 0, 2, 1};
+
+    // Output arrays (queue/visited are kernel scratch space and are not compared)
+    int32_t expect_distance[6];
+    int32_t calculated_distance[6];
+    uint32_t expect_queue[6];
+    uint32_t calculated_queue[6];
+    uint32_t expect_visited[6];
+    uint32_t calculated_visited[6];
+
+    // Compute expected result with CPU version
+    bfs_cpu(row_ptr, col_idx, expect_distance, expect_queue, expect_visited, num_nodes, num_edges, queue_size, source);
+
+    // Compute result with DSA version
+    bfs_dsa(row_ptr, col_idx, calculated_distance, calculated_queue, calculated_visited, num_nodes, num_edges, queue_size, source);
+
+    // Compare results (distances)
+    for (uint32_t i = 0; i < num_nodes; i++) {
+        if (expect_distance[i] != calculated_distance[i]) {
+            printf("breadth_first_search: FAILED (distance mismatch at node %u: expected %d, got %d)\n",
+                   i, expect_distance[i], calculated_distance[i]);
+            return 1;
+        }
+    }
+
+    printf("breadth_first_search: PASSED\n");
+    return 0;
+}
diff --git a/tests/app/database_join/database_join.cpp b/tests/app/database_join/database_join.cpp
new file mode 100644
index 00000000..bb879646
--- /dev/null
+++ b/tests/app/database_join/database_join.cpp
@@ -0,0 +1,65 @@
+// Loom kernel implementation: database_join
+#include "database_join.h"
+#include "loom/loom.h"
+#include
+#include
+
+// Full pipeline test from C++ source: Database nested loop join
+// Tests complete compilation chain with nested loops and conditional matching
+// Test: A_ids=[1,2,3], B_ids=[2,3,4], A_vals=[10,20,30], B_vals=[200,300,400]
+// Matches: (2,20,200), (3,30,300) -> output_ids=[2,3],
output_a_values=[20,30], output_b_values=[200,300]
+
+// CPU implementation of database nested loop join
+// Joins two tables on matching IDs, outputting both values side by side
+// Returns the number of matching rows written; the three output arrays are written without a bounds check, so each needs room for up to size_a * size_b rows
+// TODO: Implement O(n + m) version in the future
+uint32_t database_join_cpu(const int32_t* __restrict__ a_ids,
+                           const int32_t* __restrict__ b_ids,
+                           const int32_t* __restrict__ a_values,
+                           const int32_t* __restrict__ b_values,
+                           int32_t* __restrict__ output_ids,
+                           int32_t* __restrict__ output_a_values,
+                           int32_t* __restrict__ output_b_values,
+                           const uint32_t size_a,
+                           const uint32_t size_b) {
+    uint32_t out_idx = 0;  // next free output row; also the running match count
+    for (uint32_t i = 0; i < size_a; i++) {
+        for (uint32_t j = 0; j < size_b; j++) {
+            if (a_ids[i] == b_ids[j]) {  // duplicate IDs yield one row per matching pair
+                output_ids[out_idx] = a_ids[i];
+                output_a_values[out_idx] = a_values[i];
+                output_b_values[out_idx] = b_values[j];
+                out_idx++;
+            }
+        }
+    }
+    return out_idx;
+}
+
+// Database nested loop join: output rows where a_ids[i] == b_ids[j]
+// Accelerator implementation of database nested loop join (same loop nest as database_join_cpu, outer loop marked LOOM_NO_PARALLEL / LOOM_NO_UNROLL)
+LOOM_ACCEL()
+uint32_t database_join_dsa(const int32_t* __restrict__ a_ids,
+                           const int32_t* __restrict__ b_ids,
+                           const int32_t* __restrict__ a_values,
+                           const int32_t* __restrict__ b_values,
+                           int32_t* __restrict__ output_ids,
+                           int32_t* __restrict__ output_a_values,
+                           int32_t* __restrict__ output_b_values,
+                           const uint32_t size_a,
+                           const uint32_t size_b) {
+    uint32_t out_idx = 0;  // next free output row; also the running match count
+    LOOM_NO_PARALLEL
+    LOOM_NO_UNROLL
+    for (uint32_t i = 0; i < size_a; i++) {
+        for (uint32_t j = 0; j < size_b; j++) {
+            if (a_ids[i] == b_ids[j]) {  // duplicate IDs yield one row per matching pair
+                output_ids[out_idx] = a_ids[i];
+                output_a_values[out_idx] = a_values[i];
+                output_b_values[out_idx] = b_values[j];
+                out_idx++;
+            }
+        }
+    }
+    return out_idx;
+}
diff --git a/tests/app/database_join/database_join.h b/tests/app/database_join/database_join.h
new file mode 100644
index 00000000..c156999b
--- /dev/null
+++ b/tests/app/database_join/database_join.h
@@ -0,0 +1,12 @@
+//
Loom kernel: database_join
+#ifndef DATABASE_JOIN_H
+#define DATABASE_JOIN_H
+
+#include
+#include
+
+uint32_t database_join_cpu(const int32_t* __restrict__ a_ids, const int32_t* __restrict__ b_ids, const int32_t* __restrict__ a_values, const int32_t* __restrict__ b_values, int32_t* __restrict__ output_ids, int32_t* __restrict__ output_a_values, int32_t* __restrict__ output_b_values, const uint32_t size_a, const uint32_t size_b);
+
+uint32_t database_join_dsa(const int32_t* __restrict__ a_ids, const int32_t* __restrict__ b_ids, const int32_t* __restrict__ a_values, const int32_t* __restrict__ b_values, int32_t* __restrict__ output_ids, int32_t* __restrict__ output_a_values, int32_t* __restrict__ output_b_values, const uint32_t size_a, const uint32_t size_b);
+
+#endif // DATABASE_JOIN_H
diff --git a/tests/app/database_join/main.cpp b/tests/app/database_join/main.cpp
new file mode 100644
index 00000000..864b10da
--- /dev/null
+++ b/tests/app/database_join/main.cpp
@@ -0,0 +1,52 @@
+#include <cstdio>
+
+#include "database_join.h"
+
+int main() {
+    // Test: A_ids=[1,2,3], B_ids=[2,3,4], A_vals=[10,20,30], B_vals=[200,300,400]
+    // Expected: Matches on 2 and 3 -> output_ids=[2,3], output_a_values=[20,30], output_b_values=[200,300]
+    const uint32_t size_a = 3;
+    const uint32_t size_b = 3;
+    const uint32_t max_output = size_a * size_b;  // Worst case for a nested-loop join: every (a, b) pair matches
+
+    int32_t a_ids[3] = {1, 2, 3};
+    int32_t b_ids[3] = {2, 3, 4};
+    int32_t a_values[3] = {10, 20, 30};
+    int32_t b_values[3] = {200, 300, 400};
+
+    // Output arrays, sized for the worst case because the kernels do not bounds-check their writes
+    int32_t expect_ids[max_output];
+    int32_t expect_a_values[max_output];
+    int32_t expect_b_values[max_output];
+    int32_t calculated_ids[max_output];
+    int32_t calculated_a_values[max_output];
+    int32_t calculated_b_values[max_output];
+
+    // Compute expected result with CPU version
+    uint32_t expect_count = database_join_cpu(a_ids, b_ids, a_values, b_values,
+        expect_ids, expect_a_values, expect_b_values, size_a, size_b);
+
+    // Compute result with DSA version
+    uint32_t calculated_count = database_join_dsa(a_ids, b_ids, a_values, b_values,
+        calculated_ids, calculated_a_values, calculated_b_values, size_a, size_b);
+
+    // Compare counts
+    if (expect_count != calculated_count) {
+        printf("database_join: FAILED (count mismatch: expected %u, got %u)\n",
+               expect_count, calculated_count);
+        return 1;
+    }
+
+    // Compare results (only the rows actually written; the rest of each buffer is uninitialised)
+    for (uint32_t i = 0; i < expect_count; i++) {
+        if (expect_ids[i] != calculated_ids[i] ||
+            expect_a_values[i] != calculated_a_values[i] ||
+            expect_b_values[i] != calculated_b_values[i]) {
+            printf("database_join: FAILED (mismatch at index %u)\n", i);
+            return 1;
+        }
+    }
+
+    printf("database_join: PASSED\n");
+    return 0;
+}
diff --git a/tests/app/sort_merge/main.cpp b/tests/app/sort_merge/main.cpp
new file mode 100644
index 00000000..c3f7eda4
--- /dev/null
+++ b/tests/app/sort_merge/main.cpp
@@ -0,0 +1,36 @@
+#include <cmath>
+#include <cstdio>
+
+#include "sort_merge.h"
+
+int main() {
+    // Test: [3.0, 1.0, 4.0, 1.0, 5.0, 9.0, 2.0, 6.0] -> [1.0, 1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 9.0]
+    const uint32_t N = 8;
+
+    float input[8] = {3.0f, 1.0f, 4.0f, 1.0f, 5.0f, 9.0f, 2.0f, 6.0f};
+
+    // Output arrays (the *_temp buffers are merge scratch space and are not compared)
+    float expect_output[8];
+    float expect_temp[8];
+    float calculated_output[8];
+    float calculated_temp[8];
+
+    // Compute expected result with CPU version
+    sort_merge_cpu(input, expect_output, expect_temp, N);
+
+    // Compute result with DSA version
+    sort_merge_dsa(input, calculated_output, calculated_temp, N);
+
+    // Compare results with tolerance for floating point
+    const float epsilon = 1e-6f;
+    for (uint32_t i = 0; i < N; i++) {
+        if (fabsf(expect_output[i] - calculated_output[i]) > epsilon) {
+            printf("sort_merge: FAILED (mismatch at index %u: expected %f, got %f)\n",
+                   i, expect_output[i], calculated_output[i]);
+            return 1;
+        }
+    }
+
+    printf("sort_merge: PASSED\n");
+    return 0;
+}
diff --git a/tests/app/sort_merge/sort_merge.cpp b/tests/app/sort_merge/sort_merge.cpp
new file mode 100644
index 00000000..46d8a82e
--- /dev/null
+++ b/tests/app/sort_merge/sort_merge.cpp
@@ -0,0
+1,120 @@
+// Loom kernel implementation: sort_merge
+#include "sort_merge.h"
+#include "loom/loom.h"
+#include
+#include
+
+// Full pipeline test from C++ source: Merge sort (iterative bottom-up)
+// Tests complete compilation chain with nested loops and merge operations
+// Test: [3.0, 1.0, 4.0, 1.0, 5.0, 9.0, 2.0, 6.0] -> [1.0, 1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 9.0]
+
+// CPU implementation of merge sort (iterative bottom-up): copies input to output, then sorts output ascending using temp (N floats) as merge scratch
+void sort_merge_cpu(const float* __restrict__ input,
+                    float* __restrict__ output,
+                    float* __restrict__ temp,
+                    const uint32_t N) {
+    // Copy input to output (input itself is never written)
+    for (uint32_t i = 0; i < N; i++) {
+        output[i] = input[i];
+    }
+
+    if (N <= 1) return;
+
+    // Bottom-up merge sort: merge subarrays of size 1, 2, 4, 8, ...
+    for (uint32_t width = 1; width < N; width *= 2) {
+        // Merge pairs of subarrays
+        for (uint32_t left = 0; left < N; left += 2 * width) {
+            uint32_t mid = left + width;
+            uint32_t right = left + 2 * width;
+
+            // Clamp bounds (the last pair may be short or have an empty right run)
+            if (mid > N) mid = N;
+            if (right > N) right = N;
+
+            // Merge [left, mid) and [mid, right) into temp
+            uint32_t i = left;
+            uint32_t j = mid;
+            uint32_t k = left;
+
+            while (i < mid && j < right) {
+                if (output[i] <= output[j]) {  // <= takes the left run first on ties
+                    temp[k++] = output[i++];
+                } else {
+                    temp[k++] = output[j++];
+                }
+            }
+
+            // Copy remaining from left half
+            while (i < mid) {
+                temp[k++] = output[i++];
+            }
+
+            // Copy remaining from right half
+            while (j < right) {
+                temp[k++] = output[j++];
+            }
+
+            // Copy back to output
+            for (uint32_t idx = left; idx < right; idx++) {
+                output[idx] = temp[idx];
+            }
+        }
+    }
+}
+
+// Accelerator implementation of merge sort (iterative bottom-up): same statements as sort_merge_cpu, with Loom pragmas on the copy loop only
+LOOM_ACCEL()
+void sort_merge_dsa(const float* __restrict__ input,
+                    float* __restrict__ output,
+                    float* __restrict__ temp,
+                    const uint32_t N) {
+    // Copy input to output (input itself is never written)
+    LOOM_NO_PARALLEL
+    LOOM_NO_UNROLL
+    for (uint32_t i = 0; i < N; i++) {
+        output[i] = input[i];
+    }
+
+    if (N <= 1) return;
+
+    // Bottom-up merge sort: merge subarrays of size 1, 2, 4, 8, ...
+    for (uint32_t width = 1; width < N; width *= 2) {
+        // Merge pairs of subarrays
+        for (uint32_t left = 0; left < N; left += 2 * width) {
+            uint32_t mid = left + width;
+            uint32_t right = left + 2 * width;
+
+            // Clamp bounds (the last pair may be short or have an empty right run)
+            if (mid > N) mid = N;
+            if (right > N) right = N;
+
+            // Merge [left, mid) and [mid, right) into temp
+            uint32_t i = left;
+            uint32_t j = mid;
+            uint32_t k = left;
+
+            while (i < mid && j < right) {
+                if (output[i] <= output[j]) {  // <= takes the left run first on ties
+                    temp[k++] = output[i++];
+                } else {
+                    temp[k++] = output[j++];
+                }
+            }
+
+            // Copy remaining from left half
+            while (i < mid) {
+                temp[k++] = output[i++];
+            }
+
+            // Copy remaining from right half
+            while (j < right) {
+                temp[k++] = output[j++];
+            }
+
+            // Copy back to output
+            for (uint32_t idx = left; idx < right; idx++) {
+                output[idx] = temp[idx];
+            }
+        }
+    }
+}
diff --git a/tests/app/sort_merge/sort_merge.h b/tests/app/sort_merge/sort_merge.h
new file mode 100644
index 00000000..bfd3480f
--- /dev/null
+++ b/tests/app/sort_merge/sort_merge.h
@@ -0,0 +1,12 @@
+// Loom kernel: sort_merge
+#ifndef SORT_MERGE_H
+#define SORT_MERGE_H
+
+#include
+#include
+
+void sort_merge_cpu(const float* __restrict__ input, float* __restrict__ output, float* __restrict__ temp, const uint32_t N);
+
+void sort_merge_dsa(const float* __restrict__ input, float* __restrict__ output, float* __restrict__ temp, const uint32_t N);
+
+#endif // SORT_MERGE_H
From a83cafbfba1e52cf9a6e49e0616903dd0e2a4e7c Mon Sep 17 00:00:00 2001 From: Ankai Jin Date: Thu, 19 Feb 2026 17:06:53 -0800 Subject: [PATCH 2/4] Makefile single test --- Makefile | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/Makefile b/Makefile index e19a1637..7b2299dc 100644 --- a/Makefile +++ b/Makefile @@ -31,6 +31,12 @@ check: @set -e; \ ninja -C build check-loom +check-one: + @rc=0; \ + $(CURDIR)/tests/scripts/$(SUITE).sh --single $(CURDIR)/build/bin/loom $(CURDIR)/tests/app/$(TEST) --run || rc=$$?; \ + cat 
$(CURDIR)/tests/app/$(TEST)/Output/$(TEST).scf.log; \ + exit $$rc + clean: @set -e; \ if [ -d build ]; then ninja -C build clean-loom; fi From afc2268eb813e576fdde5e84d66ec22210251279 Mon Sep 17 00:00:00 2001 From: Ankai Jin Date: Thu, 5 Mar 2026 22:48:10 -0800 Subject: [PATCH 3/4] Implemented first version of app fuzzer --- tests/app/fuzzer/fuzzer.cpp | 806 +++++++++++++++++++++++++++++++ tests/app/fuzzer/fuzzer_config.h | 13 + tests/scripts/handshake.sh | 6 +- tests/scripts/ll_roundtrip.sh | 6 +- tests/scripts/mlir_roundtrip.sh | 6 +- tests/scripts/scf_roundtrip.sh | 6 +- 6 files changed, 839 insertions(+), 4 deletions(-) create mode 100644 tests/app/fuzzer/fuzzer.cpp create mode 100644 tests/app/fuzzer/fuzzer_config.h diff --git a/tests/app/fuzzer/fuzzer.cpp b/tests/app/fuzzer/fuzzer.cpp new file mode 100644 index 00000000..3726eee6 --- /dev/null +++ b/tests/app/fuzzer/fuzzer.cpp @@ -0,0 +1,806 @@ +//===-- fuzzer.cpp - APP fuzzer: generates random loom apps -----*- C++ -*-===// +// +// Part of the Loom project. +// +//===----------------------------------------------------------------------===// +// +// Deterministic fuzzer that generates random C++ applications annotated with +// loom pragmas. Uses seeded mt19937 for reproducibility. +// +// Two-pass architecture (mirrors the ADG fuzzer): +// Pass 1 (--gen-cpp): Generate standalone test directories in Output/ +// Each contains .h, .cpp, and main.cpp +// Pass 2 (default): Compile and run each generated app via loom --as-clang, +// then verify CPU vs DSA results match. 
+//
+// Randomised parameters per test case:
+//   - Data type (uint32_t, int32_t, float)
+//   - Loop nesting depth (1-3)
+//   - Loop body operation (+, -, *, &, |, ^ for integer; +, -, * for float)
+//   - Pragma combinations on LOOM_ACCEL, LOOM_TARGET, LOOM_PARALLEL,
+//     LOOM_UNROLL, LOOM_TRIPCOUNT, LOOM_REDUCE, LOOM_STREAM, LOOM_MEMORY_BANK
+//   - Number of input arrays (1-3) and whether they are LOOM_STREAM
+//   - Whether the kernel is a reduction (scalar accumulator)
+//   - Array sizes and initialisation seeds
+//
+//===----------------------------------------------------------------------===//
+
+#include "fuzzer_config.h"
+
+#include <cctype>
+#include <fstream>
+#include <iostream>
+#include <random>
+#include <sstream>
+#include <string>
+#include <vector>
+
+// ---------------------------------------------------------------------------
+// Data-type descriptors
+// ---------------------------------------------------------------------------
+
+struct TypeInfo {
+  const char *cType;   // e.g. "uint32_t"
+  const char *fmtSpec; // printf format specifier
+  bool isFloat;
+  bool isSigned;
+};
+
+static const TypeInfo kTypes[] = {
+    {"uint32_t", "%u", false, false},
+    {"int32_t", "%d", false, true},
+    {"float", "%f", true, false},
+};
+static constexpr unsigned kNumTypes = sizeof(kTypes) / sizeof(kTypes[0]);
+
+// ---------------------------------------------------------------------------
+// Binary operators
+// ---------------------------------------------------------------------------
+
+struct OpInfo {
+  const char *symbol;    // e.g. "+"
+  const char *reduceArg; // LOOM_REDUCE argument, or nullptr
+  bool intOnly;
+};
+
+// Index 0-5: available for integers. Index 0-2: available for floats.
+static const OpInfo kOps[] = {
+    {"+", "+", false},
+    {"-", nullptr, false},
+    {"*", "*", false},
+    {"&", "&", true},
+    {"|", "|", true},
+    {"^", "^", true},
+};
+static constexpr unsigned kNumOps = sizeof(kOps) / sizeof(kOps[0]);
+static constexpr unsigned kNumFloatOps = 3; // +, -, *
+
+// ---------------------------------------------------------------------------
+// Pragma helpers – schedule names for LOOM_PARALLEL
+// ---------------------------------------------------------------------------
+
+static const char *const kSchedules[] = {"contiguous", "interleaved"};
+
+// ---------------------------------------------------------------------------
+// Parameters for a single generated app
+// ---------------------------------------------------------------------------
+
+struct FuzzParams {
+  std::string name;
+
+  // Type and operation
+  unsigned typeIdx;
+  unsigned opIdx;
+
+  // Loop structure
+  unsigned numLoops; // 1, 2, or 3 nesting levels
+
+  // Array dimensions – outer/mid/inner trip counts
+  unsigned dimOuter;
+  unsigned dimMid;   // only used when numLoops >= 2
+  unsigned dimInner; // only used when numLoops >= 3
+
+  // Input arrays
+  unsigned numInputs; // 1, 2, or 3
+
+  // Reduction: if true the innermost loop accumulates a scalar
+  bool isReduction;
+
+  // Pragma toggles & values
+  bool hasAccelName;
+  bool hasTarget;
+  unsigned targetKind; // 0=spatial, 1=temporal
+
+  // LOOM_STREAM on each input (indexed 0..numInputs-1)
+  std::vector<bool> inputStream;
+
+  // LOOM_MEMORY_BANK on first input
+  bool hasMemoryBank;
+  unsigned memoryBankCount; // 2, 4, or 8
+  bool memoryBankBlock;     // true=block, false=cyclic (default)
+
+  // Per-loop pragmas (indexed 0..numLoops-1, 0=outermost)
+  struct LoopPragmas {
+    bool hasParallel;
+    bool parallelAuto;
+    unsigned parallelDegree; // 2, 4, or 8
+    bool hasSchedule;
+    unsigned scheduleIdx; // index into kSchedules
+
+    bool hasUnroll;
+    bool unrollAuto;
+    unsigned unrollFactor; // 2, 4, or 8
+
+    bool hasTripcount;
+    unsigned tripcountKind; // 0=single, 1=range, 2=full
+    unsigned tripcountVal;  // used for single
+    unsigned tripcountMin;
+    unsigned tripcountMax;
+
+    bool hasNoParallel;
+    bool hasNoUnroll;
+  };
+  std::vector<LoopPragmas> loopPragmas;
+
+  // Reduction operator (only when isReduction)
+  unsigned reduceOpIdx; // index into kOps (must have reduceArg != nullptr)
+};
+
+// ---------------------------------------------------------------------------
+// Pick a random element from a small set of allowed values
+// ---------------------------------------------------------------------------
+
+static unsigned pickFrom(std::mt19937 &rng,
+                         const std::vector<unsigned> &choices) {
+  return choices[rng() % choices.size()];
+}
+
+// ---------------------------------------------------------------------------
+// Generate deterministic random parameters for one app
+// ---------------------------------------------------------------------------
+
+static FuzzParams generateParams(unsigned seed, unsigned idx) {
+  std::mt19937 rng(seed + idx);
+  FuzzParams p;
+  p.name = "fuzz_" + std::to_string(seed) + "_" + std::to_string(idx);
+
+  // Type
+  p.typeIdx = rng() % kNumTypes;
+  bool isFloat = kTypes[p.typeIdx].isFloat;
+
+  // Operation – constrain to float-safe ops when using float
+  p.opIdx = isFloat ? (rng() % kNumFloatOps) : (rng() % kNumOps);
+
+  // Loop nesting
+  p.numLoops = 1 + rng() % 3;
+
+  // Dimensions (small for fast execution)
+  std::vector<unsigned> dimChoices = {4, 8, 16, 32};
+  p.dimOuter = pickFrom(rng, dimChoices);
+  p.dimMid = (p.numLoops >= 2) ? pickFrom(rng, dimChoices) : 1;
+  p.dimInner = (p.numLoops >= 3) ? pickFrom(rng, dimChoices) : 1;
+
+  // Input arrays: 1-3 for a single loop, 1-2 for nested loops. Element-wise
+  // kernels need >=2 to exercise the operator, but isReduction is decided
+  // below, so numInputs is drawn first (keeping the RNG call order fixed) and
+  // raised to 2 afterwards when the kernel turns out not to be a reduction.
+  if (p.numLoops == 1) {
+    p.numInputs = 1 + rng() % 3; // 1, 2, or 3
+  } else {
+    p.numInputs = 1 + rng() % 2; // 1 or 2
+  }
+
+  // Reduction: more likely on single-loop, possible on innermost of nested
+  if (p.numLoops == 1) {
+    p.isReduction = (rng() % 3) == 0; // 33% chance
+  } else {
+    p.isReduction = (rng() % 4) == 0; // 25% chance
+  }
+
+  // Pick reduction operator (only from ops that have reduceArg)
+  if (p.isReduction) {
+    // Valid reduce ops: + (0), * (2), & (3), | (4), ^ (5)
+    if (isFloat) {
+      // Float reductions: + or *
+      std::vector<unsigned> floatReduce = {0, 2};
+      p.reduceOpIdx = pickFrom(rng, floatReduce);
+    } else {
+      std::vector<unsigned> intReduce = {0, 2, 3, 4, 5};
+      p.reduceOpIdx = pickFrom(rng, intReduce);
+    }
+  } else {
+    p.reduceOpIdx = 0; // unused
+    // Non-reduction element-wise: need at least 2 inputs to exercise the op
+    if (p.numInputs < 2) p.numInputs = 2;
+  }
+
+  // LOOM_ACCEL name
+  p.hasAccelName = (rng() % 2) == 0;
+
+  // LOOM_TARGET
+  p.hasTarget = (rng() % 3) == 0; // 33% chance
+  p.targetKind = rng() % 2;
+
+  // LOOM_STREAM per input
+  p.inputStream.resize(p.numInputs);
+  for (unsigned i = 0; i < p.numInputs; ++i) {
+    p.inputStream[i] = (rng() % 3) == 0; // 33% chance
+  }
+
+  // LOOM_MEMORY_BANK on first input
+  p.hasMemoryBank = (rng() % 4) == 0; // 25% chance
+  std::vector<unsigned> bankChoices = {2, 4, 8};
+  p.memoryBankCount = pickFrom(rng, bankChoices);
+  p.memoryBankBlock = (rng() % 2) == 0;
+
+  // Per-loop pragmas
+  p.loopPragmas.resize(p.numLoops);
+  for (unsigned L = 0; L < p.numLoops; ++L) {
+    auto &lp = p.loopPragmas[L];
+
+    // LOOM_PARALLEL vs LOOM_NO_PARALLEL (mutually exclusive)
+    unsigned parallelChoice = rng() % 5;
+    // 0=none, 1=auto, 2=degree-only, 3=degree+schedule, 4=no_parallel
+    lp.hasNoParallel = (parallelChoice == 4);
+    lp.hasParallel = (parallelChoice >= 1 && parallelChoice <= 3);
+    lp.parallelAuto = (parallelChoice == 1);
+    std::vector<unsigned> degreeChoices = {2, 4, 8};
+    lp.parallelDegree = pickFrom(rng, degreeChoices);
+    lp.hasSchedule = (parallelChoice == 3);
+    lp.scheduleIdx = rng() % 2;
+
+    // LOOM_UNROLL vs LOOM_NO_UNROLL (mutually exclusive)
+    unsigned unrollChoice = rng() % 4;
+    // 0=none, 1=auto, 2=factor, 3=no_unroll
+    lp.hasNoUnroll = (unrollChoice == 3);
+    lp.hasUnroll = (unrollChoice >= 1 && unrollChoice <= 2);
+    lp.unrollAuto = (unrollChoice == 1);
+    std::vector<unsigned> unrollChoices = {2, 4, 8};
+    lp.unrollFactor = pickFrom(rng, unrollChoices);
+
+    // LOOM_TRIPCOUNT
+    lp.hasTripcount = (rng() % 2) == 0; // 50% chance
+    lp.tripcountKind = rng() % 3;       // 0=single, 1=range, 2=full
+
+    // Determine the actual trip count for this loop level
+    unsigned actualTrip;
+    if (L == 0)
+      actualTrip = p.dimOuter;
+    else if (L == 1)
+      actualTrip = p.dimMid;
+    else
+      actualTrip = p.dimInner;
+
+    lp.tripcountVal = actualTrip;
+    // Ensure min <= actualTrip <= max
+    lp.tripcountMin = 1;
+    lp.tripcountMax = actualTrip * 4;
+  }
+
+  return p;
+}
+
+// ---------------------------------------------------------------------------
+// Code generation helpers
+// ---------------------------------------------------------------------------
+
+/// Emit the include block for generated .cpp files. NOTE(review): the second #include target is empty in this copy of the patch - confirm against the repository.
+static void emitIncludes(std::ostream &os, const std::string &headerName) {
+  os << "#include \"" << headerName << "\"\n";
+  os << "#include \n\n";
+}
+
+/// Total number of elements in the output array.
+static std::string totalElements(const FuzzParams &p) {
+  if (p.numLoops == 1)
+    return "N";
+  if (p.numLoops == 2)
+    return "N * M";
+  return "N * M * P";
+}
+
+/// Emit the dimension parameter list (N, and optionally M, P).
+static std::string dimParams(const FuzzParams &p, const char *type) {
+  std::string s;
+  s += std::string("const ") + type + " N";
+  if (p.numLoops >= 2)
+    s += std::string(", const ") + type + " M";
+  if (p.numLoops >= 3)
+    s += std::string(", const ") + type + " P";
+  return s;
+}
+
+/// Emit the dimension argument list (N, and optionally M, P).
+static std::string dimArgs(const FuzzParams &p) {
+  std::string s = "N";
+  if (p.numLoops >= 2) s += ", M";
+  if (p.numLoops >= 3) s += ", P";
+  return s;
+}
+
+/// Dimension argument list for calling from main (uses actual dim values).
+static std::string dimArgsMain(const FuzzParams &p) {
+  std::string s = std::to_string(p.dimOuter);
+  if (p.numLoops >= 2)
+    s += ", " + std::to_string(p.dimMid);
+  // One value per dimParams() parameter, in the same N, M, P order.
+  if (p.numLoops >= 3)
+    s += ", " + std::to_string(p.dimInner);
+  return s;
+}
+
+/// Build the function signature parameter list (shared by CPU and DSA).
+/// Returns e.g.: "const uint32_t* __restrict__ a, const uint32_t* __restrict__
+/// b, uint32_t* __restrict__ out, const uint32_t N"
+static std::string funcParams(const FuzzParams &p, bool isDsa) {
+  const char *cType = kTypes[p.typeIdx].cType;
+  std::ostringstream os;
+
+  // Input arrays (named a, b, c); LOOM_* qualifiers only on the DSA variant
+  for (unsigned i = 0; i < p.numInputs; ++i) {
+    if (i > 0) os << ",\n ";
+    if (isDsa && p.inputStream[i])
+      os << "LOOM_STREAM ";
+    if (isDsa && i == 0 && p.hasMemoryBank) {
+      os << "LOOM_MEMORY_BANK(" << p.memoryBankCount;
+      if (p.memoryBankBlock) os << ", block";
+      os << ") ";
+    }
+    os << "const " << cType << "* __restrict__ "
+       << (char)('a' + i);
+  }
+
+  if (p.isReduction) {
+    // Reduction: takes init_value, returns scalar
+    os << ",\n const " << cType << " init_value";
+  } else {
+    // Element-wise: output array
+    os << ",\n " << cType << "* __restrict__ out";
+  }
+
+  os << ",\n " << dimParams(p, "uint32_t");
+
+  return os.str();
+}
+
+/// Emit the loop pragmas for a given nesting level.
+static void emitLoopPragmas(std::ostream &os, const FuzzParams &p,
+                            unsigned level, const std::string &indent) {
+  const auto &pragmas = p.loopPragmas[level];
+
+  // Parallelism. The opt-out is tested first; a parallel request then picks
+  // the automatic form, or an explicit degree with an optional schedule.
+  if (pragmas.hasNoParallel) {
+    os << indent << "LOOM_NO_PARALLEL\n";
+  } else if (pragmas.hasParallel && pragmas.parallelAuto) {
+    os << indent << "LOOM_PARALLEL()\n";
+  } else if (pragmas.hasParallel) {
+    os << indent << "LOOM_PARALLEL(" << pragmas.parallelDegree;
+    if (pragmas.hasSchedule) {
+      os << ", " << kSchedules[pragmas.scheduleIdx];
+    }
+    os << ")\n";
+  }
+
+  // Unrolling. Same shape as above: the opt-out is tested first, then the
+  // automatic form, then an explicit factor.
+  if (pragmas.hasNoUnroll) {
+    os << indent << "LOOM_NO_UNROLL\n";
+  } else if (pragmas.hasUnroll && pragmas.unrollAuto) {
+    os << indent << "LOOM_UNROLL()\n";
+  } else if (pragmas.hasUnroll) {
+    os << indent << "LOOM_UNROLL(" << pragmas.unrollFactor << ")\n";
+  }
+
+  // Trip-count hint. Nothing is written when the hint is disabled or when
+  // tripcountKind is outside 0..2. The "full" form repeats tripcountVal in
+  // its first two arguments, followed by the min and max bounds.
+  if (!pragmas.hasTripcount) {
+    return;
+  }
+
+  const unsigned kind = pragmas.tripcountKind;
+  if (kind == 0) { // single
+    os << indent << "LOOM_TRIPCOUNT(" << pragmas.tripcountVal << ")\n";
+  } else if (kind == 1) { // range
+    os << indent << "LOOM_TRIPCOUNT_RANGE(" << pragmas.tripcountMin << ", "
+       << pragmas.tripcountMax << ")\n";
+  } else if (kind == 2) { // full
+    os << indent << "LOOM_TRIPCOUNT_FULL(" << pragmas.tripcountVal << ", "
+       << pragmas.tripcountVal << ", " << pragmas.tripcountMin << ", "
+       << pragmas.tripcountMax << ")\n";
+  }
+}
+
+/// Emit the loop body expression (the actual computation).
+/// For element-wise: out[idx] = a[idx] <op> b[idx] (then <op> c[idx] with three inputs)
+/// For reduction: acc = acc <rop> a[idx] (or acc <rop> a[idx] <rop> b[idx]), <rop> being the reduction operator
+static void emitBody(std::ostream &os, const FuzzParams &p,
+                     const std::string &indent, const std::string &idx) {
+  const char *op = kOps[p.opIdx].symbol;
+
+  if (p.isReduction) {
+    // Accumulate into 'acc' – use the reduction operator, not the general op
+    const char *reduceOp = kOps[p.reduceOpIdx].symbol;
+    os << indent << "acc = acc " << reduceOp << " " << "a[" << idx << "]";
+    if (p.numInputs >= 2)
+      os << " " << reduceOp << " b[" << idx << "]";
+    os << ";\n";
+  } else {
+    // Element-wise: chain every input left to right with the same operator
+    os << indent << "out[" << idx << "] = a[" << idx << "]";
+    for (unsigned i = 1; i < p.numInputs; ++i)
+      os << " " << op << " " << (char)('a' + i) << "[" << idx << "]";
+    os << ";\n";
+  }
+}
+
+/// Build the linear index expression for a given nesting depth.
+///   1 loop:  "i"
+///   2 loops: "i * M + j"
+///   3 loops: "i * M * P + j * P + k"
+static std::string linearIdx(const FuzzParams &p) {
+  if (p.numLoops == 1) return "i";
+  if (p.numLoops == 2) return "i * M + j";
+  return "i * M * P + j * P + k";
+}
+
+/// Emit the full function body (loop nest + body).
+static void emitFuncBody(std::ostream &os, const FuzzParams &p, bool isDsa) {
+  // Shared by the _cpu and _dsa variants: isDsa only adds the LOOM_*
+  // annotations, the emitted computation itself is identical.
+  const char *cType = kTypes[p.typeIdx].cType;
+  std::string idx = linearIdx(p);
+
+  // Reduction initialisation
+  if (p.isReduction) {
+    if (isDsa) {
+      const char *reduceArg = kOps[p.reduceOpIdx].reduceArg;
+      os << "  LOOM_REDUCE(" << reduceArg << ")\n";
+    }
+    os << "  " << cType << " acc = init_value;\n";
+  }
+
+  // Loop nest: one `for` per level, each indented two columns deeper.
+  const char *loopVars[] = {"i", "j", "k"};
+  const char *loopBounds[] = {"N", "M", "P"};
+  unsigned depth = p.numLoops;
+
+  for (unsigned L = 0; L < depth; ++L) {
+    std::string indent(2 + L * 2, ' ');
+
+    // Emit pragmas only for DSA version
+    if (isDsa) {
+      emitLoopPragmas(os, p, L, indent);
+    }
+
+    os << indent << "for (uint32_t " << loopVars[L] << " = 0; "
+       << loopVars[L] << " < " << loopBounds[L] << "; ++"
+       << loopVars[L] << ") {\n";
+  }
+
+  // Body
+  {
+    std::string indent(2 + depth * 2, ' ');
+    emitBody(os, p, indent, idx);
+  }
+
+  // Close loops, innermost first
+  for (unsigned L = depth; L > 0; --L) {
+    std::string indent(2 + (L - 1) * 2, ' ');
+    os << indent << "}\n";
+  }
+
+  // Return for reduction
+  if (p.isReduction) {
+    os << "  return acc;\n";
+  }
+}
+
+// ---------------------------------------------------------------------------
+// Write the three source files for a generated test case
+// ---------------------------------------------------------------------------
+
+/// Open `path` for writing through `os`. On failure a diagnostic goes to
+/// stderr and false is returned, so the caller can skip the file instead of
+/// streaming into a stream that never opened.
+static bool openForWrite(std::ofstream &os, const std::string &path) {
+  os.open(path);
+  if (os.is_open())
+    return true;
+  fprintf(stderr, "fuzzer: cannot open '%s' for writing\n", path.c_str());
+  return false;
+}
+
+/// Write <name>.h: include guard, <cstdint>, and the _cpu / _dsa prototypes.
+static void writeHeader(const FuzzParams &p, const std::string &dir) {
+  std::ofstream os;
+  if (!openForWrite(os, dir + "/" + p.name + ".h"))
+    return;
+  // Reductions return the accumulator; element-wise kernels return nothing.
+  const char *retType = p.isReduction ? kTypes[p.typeIdx].cType : "void";
+  std::string guard = p.name + "_H";
+  for (auto &c : guard) c = (char)toupper((unsigned char)c);
+
+  os << "// Generated by loom app fuzzer\n";
+  os << "#ifndef " << guard << "\n";
+  os << "#define " << guard << "\n\n";
+  // The prototypes and the generated driver use uint32_t.
+  os << "#include <cstdint>\n\n";
+
+  // Both declarations use the un-annotated parameter list.
+  os << retType << " " << p.name << "_cpu(" << funcParams(p, false)
+     << ");\n\n";
+  os << retType << " " << p.name << "_dsa(" << funcParams(p, false)
+     << ");\n\n";
+
+  os << "#endif // " << guard << "\n";
+}
+
+/// Write <name>.cpp: the CPU reference followed by the annotated DSA version.
+static void writeImpl(const FuzzParams &p, const std::string &dir) {
+  std::ofstream os;
+  if (!openForWrite(os, dir + "/" + p.name + ".cpp"))
+    return;
+  const char *retType = p.isReduction ? kTypes[p.typeIdx].cType : "void";
+
+  os << "// Generated by loom app fuzzer\n";
+  emitIncludes(os, p.name + ".h");
+
+  // CPU version
+  os << "// CPU reference implementation\n";
+  os << retType << " " << p.name << "_cpu(" << funcParams(p, false) << ") {\n";
+  emitFuncBody(os, p, /*isDsa=*/false);
+  os << "}\n\n";
+
+  // DSA version
+  os << "// DSA accelerated implementation\n";
+  if (p.hasTarget) {
+    os << "LOOM_TARGET(\""
+       << (p.targetKind == 0 ? "spatial" : "temporal") << "\")\n";
+  }
+  if (p.hasAccelName) {
+    os << "LOOM_ACCEL(\"" << p.name << "\")\n";
+  } else {
+    os << "LOOM_ACCEL()\n";
+  }
+
+  os << retType << " " << p.name << "_dsa(" << funcParams(p, true) << ") {\n";
+  emitFuncBody(os, p, /*isDsa=*/true);
+  os << "}\n";
+}
+
+/// Write main.cpp
+///
+/// The generated driver fills the inputs with small deterministic values,
+/// calls <name>_cpu and <name>_dsa, and prints "<name>: PASSED" (exit 0) or
+/// "<name>: FAILED" (exit 1) depending on whether the two results agree.
+static void writeMain(const FuzzParams &p, const std::string &dir) {
+  std::ofstream os;
+  if (!openForWrite(os, dir + "/main.cpp"))
+    return;
+  const char *cType = kTypes[p.typeIdx].cType;
+  bool isFloat = kTypes[p.typeIdx].isFloat;
+
+  os << "// Generated by loom app fuzzer\n";
+  os << "#include \"" << p.name << ".h\"\n";
+  os << "#include <cstdio>\n";             // printf
+  if (isFloat) os << "#include <cmath>\n"; // std::fabs
+  os << "\n";
+
+  os << "int main() {\n";
+
+  // Dimension constants
+  os << "  const uint32_t N = " << p.dimOuter << ";\n";
+  if (p.numLoops >= 2)
+    os << "  const uint32_t M = " << p.dimMid << ";\n";
+  if (p.numLoops >= 3)
+    os << "  const uint32_t P = " << p.dimInner << ";\n";
+
+  // Total element count
+  std::string total = totalElements(p);
+  os << "  const uint32_t total = " << total << ";\n\n";
+
+  // Input arrays
+  for (unsigned i = 0; i < p.numInputs; ++i) {
+    os << "  " << cType << " " << (char)('a' + i) << "[total];\n";
+  }
+
+  // Initialise inputs with deterministic values
+  os << "\n  // Initialise inputs\n";
+  for (unsigned i = 0; i < p.numInputs; ++i) {
+    char varName = (char)('a' + i);
+    os << "  for (uint32_t idx = 0; idx < total; ++idx) {\n";
+    if (isFloat) {
+      // Small float values to avoid overflow
+      os << "    " << varName << "[idx] = (float)((idx * "
+         << (i * 7 + 3) << " + " << (i + 1) << ") % 10) / 10.0f + 0.1f;\n";
+    } else {
+      // Small integer values to avoid overflow (especially for multiply)
+      os << "    " << varName << "[idx] = (" << cType << ")((idx * "
+         << (i * 7 + 3) << " + " << (i + 1) << ") % 10);\n";
+    }
+    os << "  }\n";
+  }
+  os << "\n";
+
+  // Argument lists shared by both calls: inputs first, dimensions last.
+  std::string inputArgs;
+  for (unsigned i = 0; i < p.numInputs; ++i) {
+    if (i > 0) inputArgs += ", ";
+    inputArgs += (char)('a' + i);
+  }
+  std::string dArgs = "N";
+  if (p.numLoops >= 2) dArgs += ", M";
+  if (p.numLoops >= 3) dArgs += ", P";
+
+  if (p.isReduction) {
+    // Reduction test. A multiply reduction has to start from 1 for every
+    // element type: starting a float product at 0.0f would make both
+    // results 0 and the comparison below could never fail.
+    const bool isMulReduce = kOps[p.reduceOpIdx].symbol[0] == '*';
+    const char *initText = isMulReduce ? (isFloat ? "1.0f" : "1")
+                                       : (isFloat ? "0.0f" : "0");
+    os << "  " << cType << " init_value = " << initText << ";\n";
+
+    os << "  " << cType << " cpu_result = " << p.name << "_cpu("
+       << inputArgs << ", init_value, " << dArgs << ");\n";
+    os << "  " << cType << " dsa_result = " << p.name << "_dsa("
+       << inputArgs << ", init_value, " << dArgs << ");\n\n";
+
+    // Compare
+    if (isFloat) {
+      os << "  if (std::fabs(cpu_result - dsa_result) > 1e-2f) {\n";
+    } else {
+      os << "  if (cpu_result != dsa_result) {\n";
+    }
+    os << "    printf(\"" << p.name << ": FAILED\\n\");\n";
+    os << "    return 1;\n";
+    os << "  }\n";
+  } else {
+    // Element-wise test
+    os << "  " << cType << " cpu_out[total];\n";
+    os << "  " << cType << " dsa_out[total];\n\n";
+
+    // Initialise output arrays to 0
+    os << "  for (uint32_t idx = 0; idx < total; ++idx) {\n";
+    if (isFloat) {
+      os << "    cpu_out[idx] = 0.0f;\n";
+      os << "    dsa_out[idx] = 0.0f;\n";
+    } else {
+      os << "    cpu_out[idx] = 0;\n";
+      os << "    dsa_out[idx] = 0;\n";
+    }
+    os << "  }\n\n";
+
+    os << "  " << p.name << "_cpu(" << inputArgs << ", cpu_out, "
+       << dArgs << ");\n";
+    os << "  " << p.name << "_dsa(" << inputArgs << ", dsa_out, "
+       << dArgs << ");\n\n";
+
+    // Compare
+    os << "  for (uint32_t idx = 0; idx < total; ++idx) {\n";
+    if (isFloat) {
+      os << "    if (std::fabs(cpu_out[idx] - dsa_out[idx]) > 1e-4f) {\n";
+    } else {
+      os << "    if (cpu_out[idx] != dsa_out[idx]) {\n";
+    }
+    os << "      printf(\"" << p.name << ": FAILED\\n\");\n";
+    os << "      return 1;\n";
+    os << "    }\n";
+    os << "  }\n";
+  }
+
+  os << "\n  printf(\"" << p.name << ": PASSED\\n\");\n";
+  os << "  return 0;\n";
+  os << "}\n";
+}
+
+/// Write all three files for a test case into <outDir>/<name>/
+static void writeCppSource(const FuzzParams &p, const std::string &outDir) {
+  std::string caseDir = outDir + "/" + p.name;
+  // Best-effort directory creation through the shell; if it fails, the
+  // writers below report that they cannot open their files.
+  std::string mkdirCmd = "mkdir -p " + caseDir;
+  (void)std::system(mkdirCmd.c_str());
+
+  writeHeader(p, caseDir);
+  writeImpl(p, caseDir);
+  writeMain(p, caseDir);
+}
+
+// ---------------------------------------------------------------------------
+// Direct compile-and-run (pass 2)
+// ---------------------------------------------------------------------------
+
+/// Compile and run a generated test case using loom --as-clang.
+/// Returns 0 on success, 1 on failure.
+static int compileAndRun(const FuzzParams &p, const std::string &outDir) {
+  std::string caseDir = outDir + "/" + p.name;
+
+  // Ensure source files exist (generate them if needed)
+  {
+    std::ifstream check(caseDir + "/" + p.name + ".cpp");
+    if (!check.good()) {
+      writeCppSource(p, outDir);
+    }
+  }
+
+  std::string outputBin = caseDir + "/Output/" + p.name;
+  std::string mkdirCmd = "mkdir -p " + caseDir + "/Output";
+  (void)std::system(mkdirCmd.c_str());
+
+  // Compile. rc is the raw std::system() status; only zero / non-zero is
+  // acted on, and that raw value is what the diagnostics print.
+  std::string compileCmd =
+      "loom --as-clang " + caseDir + "/" + p.name + ".cpp " + caseDir +
+      "/main.cpp -o " + outputBin + " 2>&1";
+  int rc = std::system(compileCmd.c_str());
+  if (rc != 0) {
+    fprintf(stderr, "%s: compilation failed (exit %d)\n", p.name.c_str(), rc);
+    return 1;
+  }
+
+  // Run from inside the case directory
+  std::string runCmd = "cd " + caseDir + " && Output/" + p.name + " 2>&1";
+  rc = std::system(runCmd.c_str());
+  if (rc != 0) {
+    fprintf(stderr, "%s: execution failed (exit %d)\n", p.name.c_str(), rc);
+    return 1;
+  }
+
+  return 0;
+}
+
+// ---------------------------------------------------------------------------
+// Main
+// ---------------------------------------------------------------------------
+
+/// Read an unsigned decimal override from environment variable `name`.
+/// Returns `fallback` when the variable is unset. A value that does not
+/// start with a digit, has trailing characters, or does not fit in
+/// `unsigned` is reported on stderr and ignored, instead of being turned
+/// into 0 or wrapped to a huge number as std::atoi plus a cast would do.
+static unsigned envUnsigned(const char *name, unsigned fallback) {
+  const char *text = std::getenv(name);
+  if (!text)
+    return fallback;
+
+  char *end = nullptr;
+  unsigned long value = 0;
+  bool ok = *text >= '0' && *text <= '9';
+  if (ok) {
+    value = std::strtoul(text, &end, 10);
+    ok = *end == '\0' && value <= (unsigned long)~0u;
+  }
+  if (!ok) {
+    fprintf(stderr, "fuzzer: ignoring invalid %s='%s'\n", name, text);
+    return fallback;
+  }
+  return (unsigned)value;
+}
+
+int main(int argc, char **argv) {
+  // FUZZER_SEED / FUZZER_COUNT are the defaults; allow env override.
+  unsigned seed = envUnsigned("LOOM_FUZZER_SEED", FUZZER_SEED);
+  unsigned count = envUnsigned("LOOM_FUZZER_COUNT", FUZZER_COUNT);
+
+  // Check for --gen-cpp mode.
+  bool genCpp = false;
+  for (int i = 1; i < argc; ++i) {
+    if (std::string(argv[i]) == "--gen-cpp")
+      genCpp = true;
+  }
+
+  int failures = 0;
+  for (unsigned idx = 0; idx < count; ++idx) {
+    FuzzParams p = generateParams(seed, idx);
+
+    // Sources are written in both modes; --gen-cpp stops there.
+    writeCppSource(p, "Output");
+    if (!genCpp)
+      failures += compileAndRun(p, "Output");
+  }
+
+  if (!genCpp && failures > 0) {
+    // count is unsigned, so it needs %u rather than %d.
+    fprintf(stderr, "\n%d / %u test cases FAILED\n", failures, count);
+    return 1;
+  }
+
+  return 0;
+}
diff --git a/tests/app/fuzzer/fuzzer_config.h b/tests/app/fuzzer/fuzzer_config.h
new file mode 100644
index 00000000..595ee4ee
--- /dev/null
+++ b/tests/app/fuzzer/fuzzer_config.h
@@ -0,0 +1,13 @@
+//===-- fuzzer_config.h - APP fuzzer central configuration ------*- C++ -*-===//
+//
+// Part of the Loom project.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef FUZZER_CONFIG_H
+#define FUZZER_CONFIG_H
+
+constexpr unsigned FUZZER_SEED = 42;
+constexpr unsigned FUZZER_COUNT = 200;
+
+#endif // FUZZER_CONFIG_H
\ No newline at end of file
diff --git a/tests/scripts/handshake.sh b/tests/scripts/handshake.sh index 15d18b7e..56187428 100755 --- a/tests/scripts/handshake.sh +++ b/tests/scripts/handshake.sh @@ -100,9 +100,13 @@ loom_write_parallel_header "${PARALLEL_FILE}" \ "Compiles each C++ app to handshake MLIR via loom." for app_dir in "${app_dirs[@]}"; do + app_name=$(basename "${app_dir}") + + # Skip fuzzer directory: it is a test generator, not a loom app.
+ [[ "${app_name}" == "fuzzer" ]] && continue + rel_sources=$(loom_rel_sources "${app_dir}") || continue - app_name=$(basename "${app_dir}") rel_app=$(loom_relpath "${app_dir}") rel_out="${rel_app}/Output" diff --git a/tests/scripts/ll_roundtrip.sh b/tests/scripts/ll_roundtrip.sh index 13a44262..54394204 100755 --- a/tests/scripts/ll_roundtrip.sh +++ b/tests/scripts/ll_roundtrip.sh @@ -90,9 +90,13 @@ loom_write_parallel_header "${PARALLEL_FILE}" \ "Compiles each C++ app to LLVM IR via loom, links with clang++, runs the executable." for app_dir in "${app_dirs[@]}"; do + app_name=$(basename "${app_dir}") + + # Skip fuzzer directory: it is a test generator, not a loom app. + [[ "${app_name}" == "fuzzer" ]] && continue + rel_sources=$(loom_rel_sources "${app_dir}") || continue - app_name=$(basename "${app_dir}") rel_app=$(loom_relpath "${app_dir}") rel_out="${rel_app}/Output" diff --git a/tests/scripts/mlir_roundtrip.sh b/tests/scripts/mlir_roundtrip.sh index 59e215fa..e5eb6c67 100755 --- a/tests/scripts/mlir_roundtrip.sh +++ b/tests/scripts/mlir_roundtrip.sh @@ -94,9 +94,13 @@ loom_write_parallel_header "${PARALLEL_FILE}" \ "Roundtrips through mlir-opt/mlir-translate, links with clang++, runs the executable." for app_dir in "${app_dirs[@]}"; do + app_name=$(basename "${app_dir}") + + # Skip fuzzer directory: it is a test generator, not a loom app. + [[ "${app_name}" == "fuzzer" ]] && continue + rel_sources=$(loom_rel_sources "${app_dir}") || continue - app_name=$(basename "${app_dir}") rel_app=$(loom_relpath "${app_dir}") rel_out="${rel_app}/Output" diff --git a/tests/scripts/scf_roundtrip.sh b/tests/scripts/scf_roundtrip.sh index 01e23ce0..f20b6e1f 100755 --- a/tests/scripts/scf_roundtrip.sh +++ b/tests/scripts/scf_roundtrip.sh @@ -94,9 +94,13 @@ loom_write_parallel_header "${PARALLEL_FILE}" \ "Lowers SCF MLIR through mlir-opt/mlir-translate, links with clang++, runs the executable." 
for app_dir in "${app_dirs[@]}"; do + app_name=$(basename "${app_dir}") + + # Skip fuzzer directory: it is a test generator, not a loom app. + [[ "${app_name}" == "fuzzer" ]] && continue + rel_sources=$(loom_rel_sources "${app_dir}") || continue - app_name=$(basename "${app_dir}") rel_app=$(loom_relpath "${app_dir}") rel_out="${rel_app}/Output"
From 985844317d815224deee45d59f5260fb3223a3ec Mon Sep 17 00:00:00 2001
From: Ankai Jin
Date: Fri, 6 Mar 2026 13:33:25 -0800
Subject: [PATCH 4/4] Added lz77 and chacha20 kernels

---
 tests/app/chacha20_qr/chacha20_qr.cpp     | 101 ++++++++
 tests/app/chacha20_qr/chacha20_qr.h       |  12 +
 tests/app/chacha20_qr/main.cpp            |  58 ++++
 tests/app/lz77_compress/lz77_compress.cpp | 120 +++++++++
 tests/app/lz77_compress/lz77_compress.h   |  12 +
 tests/app/lz77_compress/main.cpp          |  53 ++++
 6 files changed, 356 insertions(+)
 create mode 100644 tests/app/chacha20_qr/chacha20_qr.cpp
 create mode 100644 tests/app/chacha20_qr/chacha20_qr.h
 create mode 100644 tests/app/chacha20_qr/main.cpp
 create mode 100644 tests/app/lz77_compress/lz77_compress.cpp
 create mode 100644 tests/app/lz77_compress/lz77_compress.h
 create mode 100644 tests/app/lz77_compress/main.cpp

diff --git a/tests/app/chacha20_qr/chacha20_qr.cpp b/tests/app/chacha20_qr/chacha20_qr.cpp
new file mode 100644
index 00000000..84810cba
--- /dev/null
+++ b/tests/app/chacha20_qr/chacha20_qr.cpp
@@ -0,0 +1,101 @@
+// Loom kernel implementation: chacha20_qr
+#include "chacha20_qr.h"
+#include "loom/loom.h"
+#include <cstddef>
+#include <cstdint>
+
+// Full pipeline test from C++ source: ChaCha20 block function
+// Tests complete compilation chain with ARX (add-rotate-xor) operations
+// Operates on a 16-word (512-bit) state, applying quarter-round transforms
+// for the specified number of double-rounds (ChaCha20 uses 10 double-rounds)
+
+// Rotate the 32-bit word v left by n bits (0 < n < 32). v is expanded twice,
+// so it must not have side effects.
+#define CHACHA20_ROTL32(v, n) (((v) << (n)) | ((v) >> (32 - (n))))
+
+// ChaCha20 quarter-round QR(a, b, c, d) on words of the state array s.
+// A macro rather than a helper function so that both kernels still expand to
+// the same straight-line statement sequence, with no call inside the
+// LOOM_ACCEL body. It is one comma expression, so it is safe wherever a
+// single statement is expected.
+#define CHACHA20_QR(s, a, b, c, d)                                            \
+  ((s)[a] += (s)[b], (s)[d] ^= (s)[a],                                        \
+   (s)[d] = CHACHA20_ROTL32((s)[d], 16),                                      \
+   (s)[c] += (s)[d], (s)[b] ^= (s)[c],                                        \
+   (s)[b] = CHACHA20_ROTL32((s)[b], 12),                                      \
+   (s)[a] += (s)[b], (s)[d] ^= (s)[a],                                        \
+   (s)[d] = CHACHA20_ROTL32((s)[d], 8),                                       \
+   (s)[c] += (s)[d], (s)[b] ^= (s)[c],                                        \
+   (s)[b] = CHACHA20_ROTL32((s)[b], 7))
+
+// CPU implementation of ChaCha20 block function.
+// Reads 16 words from input_state and writes 16 words to output_state:
+// num_rounds double-rounds over a copy of the input, then a word-wise add of
+// the original input.
+void chacha20_qr_cpu(const uint32_t* __restrict__ input_state,
+                     uint32_t* __restrict__ output_state,
+                     const uint32_t num_rounds) {
+  // Copy input state to output
+  for (uint32_t i = 0; i < 16; i++) {
+    output_state[i] = input_state[i];
+  }
+
+  // Each double-round consists of 4 column quarter-rounds
+  // followed by 4 diagonal quarter-rounds
+  for (uint32_t round = 0; round < num_rounds; round++) {
+    // Column quarter-rounds
+    CHACHA20_QR(output_state, 0, 4, 8, 12);
+    CHACHA20_QR(output_state, 1, 5, 9, 13);
+    CHACHA20_QR(output_state, 2, 6, 10, 14);
+    CHACHA20_QR(output_state, 3, 7, 11, 15);
+
+    // Diagonal quarter-rounds
+    CHACHA20_QR(output_state, 0, 5, 10, 15);
+    CHACHA20_QR(output_state, 1, 6, 11, 12);
+    CHACHA20_QR(output_state, 2, 7, 8, 13);
+    CHACHA20_QR(output_state, 3, 4, 9, 14);
+  }
+
+  // Add original state (ChaCha20 final addition)
+  for (uint32_t i = 0; i < 16; i++) {
+    output_state[i] += input_state[i];
+  }
+}
+
+// Accelerator implementation of ChaCha20 block function.
+// Same computation as chacha20_qr_cpu, with Loom annotations on the
+// function, the input pointer and the copy / final-add loops.
+LOOM_ACCEL()
+void chacha20_qr_dsa(LOOM_MEMORY_BANK(8) const uint32_t* __restrict__ input_state,
+                     uint32_t* __restrict__ output_state,
+                     const uint32_t num_rounds) {
+  // Copy input state to output
+  LOOM_NO_PARALLEL
+  LOOM_NO_UNROLL
+  for (uint32_t i = 0; i < 16; i++) {
+    output_state[i] = input_state[i];
+  }
+
+  // Each double-round consists of 4 column quarter-rounds
+  // followed by 4 diagonal quarter-rounds
+  for (uint32_t round = 0; round < num_rounds; round++) {
+    // Column quarter-rounds
+    CHACHA20_QR(output_state, 0, 4, 8, 12);
+    CHACHA20_QR(output_state, 1, 5, 9, 13);
+    CHACHA20_QR(output_state, 2, 6, 10, 14);
+    CHACHA20_QR(output_state, 3, 7, 11, 15);
+
+    // Diagonal quarter-rounds
+    CHACHA20_QR(output_state, 0, 5, 10, 15);
+    CHACHA20_QR(output_state, 1, 6, 11, 12);
+    CHACHA20_QR(output_state, 2, 7, 8, 13);
+    CHACHA20_QR(output_state, 3, 4, 9, 14);
+  }
+
+  // Add original state (ChaCha20 final addition)
+  LOOM_PARALLEL()
+  LOOM_UNROLL()
+  for (uint32_t i = 0; i < 16; i++) {
+    output_state[i] += input_state[i];
+  }
+}
diff --git a/tests/app/chacha20_qr/chacha20_qr.h b/tests/app/chacha20_qr/chacha20_qr.h
new file mode 100644
index 00000000..da2a64ea
--- /dev/null
+++ b/tests/app/chacha20_qr/chacha20_qr.h
@@ -0,0 +1,12 @@
+// Loom kernel: chacha20_qr
+#ifndef CHACHA20_QR_H
+#define CHACHA20_QR_H
+
+#include <cstddef>
+#include <cstdint>
+
+void chacha20_qr_cpu(const uint32_t* __restrict__ input_state, uint32_t* __restrict__ output_state, const uint32_t num_rounds);
+
+void chacha20_qr_dsa(const uint32_t* __restrict__ input_state, uint32_t* __restrict__ output_state, const uint32_t num_rounds);
+
+#endif // CHACHA20_QR_H
diff --git a/tests/app/chacha20_qr/main.cpp b/tests/app/chacha20_qr/main.cpp
new file mode 100644
index 00000000..d3d6b996
--- /dev/null
+++ b/tests/app/chacha20_qr/main.cpp
@@ -0,0 +1,58 @@
+#include <cstdio>
+
+#include "chacha20_qr.h"
+
+int main() {
+  // Standard ChaCha20 test vector (RFC 7539 Section 2.3.2)
+  // State is 16 x uint32_t = 512 bits
+  // Layout: constants(4) | key(8) | counter(1) | nonce(3)
+  uint32_t input_state[16] = {
+    0x61707865, 0x3320646e, 0x79622d32, 0x6b206574,  // "expand 32-byte k"
+    0x03020100, 0x07060504, 0x0b0a0908, 0x0f0e0d0c,  // key (part 1)
+    0x13121110, 0x17161514, 0x1b1a1918, 0x1f1e1d1c,  // key (part 2)
+    0x00000001, 0x09000000, 0x4a000000, 0x00000000   // counter | nonce
+  };
+
+  // Block-function output listed in RFC 7539 Section 2.3.2 for this state
+  // (10 double-rounds followed by the final addition).
+  const uint32_t rfc_output[16] = {
+    0xe4e7f110, 0x15593bd1, 0x1fdd0f50, 0xc47120a3,
+    0xc7f4d1c7, 0x0368c033, 0x9aaa2204, 0x4e6cd4c3,
+    0x466482d2, 0x09aa9f07, 0x05d7c214, 0xa2028bd9,
+    0xd19c12b5, 0xb94e16de, 0xe883d0cb, 0x4e3c50a2
+  };
+
+  // 10 double-rounds for ChaCha20
+  const uint32_t num_rounds = 10;
+
+  uint32_t expect_output[16];
+  uint32_t calculated_output[16];
+
+  // Compute expected result with CPU version
+  chacha20_qr_cpu(input_state, expect_output, num_rounds);
+
+  // Known-answer check on the CPU reference: comparing the two
+  // implementations alone cannot catch a mistake they share.
+  for (uint32_t i = 0; i < 16; i++) {
+    if (expect_output[i] != rfc_output[i]) {
+      printf("chacha20_qr: FAILED (CPU reference differs from RFC 7539 at word %u: expected 0x%08x, got 0x%08x)\n",
+             i, rfc_output[i], expect_output[i]);
+      return 1;
+    }
+  }
+
+  // Compute result with accelerator version
+  chacha20_qr_dsa(input_state, calculated_output, num_rounds);
+
+  // Compare results
+  for (uint32_t i = 0; i < 16; i++) {
+    if (expect_output[i] != calculated_output[i]) {
+      printf("chacha20_qr: FAILED (mismatch at word %u: expected 0x%08x, got 0x%08x)\n",
+             i, expect_output[i], calculated_output[i]);
+      return 1;
+    }
+  }
+
+  printf("chacha20_qr: PASSED\n");
+  return 0;
+}
diff --git a/tests/app/lz77_compress/lz77_compress.cpp b/tests/app/lz77_compress/lz77_compress.cpp
new file mode 100644
index 00000000..dfdbfe1d
--- /dev/null
+++ b/tests/app/lz77_compress/lz77_compress.cpp
@@ -0,0 +1,120 @@
+// Loom kernel implementation: lz77_compress
+#include "lz77_compress.h"
+#include "loom/loom.h"
+#include <cstddef>
+#include <cstdint>
+
+// Full pipeline test from C++ source: LZ77 sliding-window compression
+// Tests complete compilation chain with nested loops and data-dependent control flow
+// At each position, searches a lookback window for the longest match.
+// Outputs (offset, length, literal) triples.
+// Test: [1,2,3,1,2,3,4,5,4,5,4,5,6,7,8,6,7,8,9,9] with window_size=10
+
+// CPU implementation of LZ77 compression (greedy: longest match at each position)
+void lz77_compress_cpu(const uint32_t* __restrict__ input_data,
+                       uint32_t* __restrict__ output_offsets,
+                       uint32_t* __restrict__ output_lengths,
+                       uint32_t* __restrict__ output_literals,
+                       uint32_t* __restrict__ output_count,
+                       const uint32_t N,
+                       const uint32_t window_size) {
+  if (N == 0) {
+    *output_count = 0;
+    return;
+  }
+
+  uint32_t pos = 0;
+  uint32_t write_idx = 0;
+
+  while (pos < N) {
+    uint32_t best_offset = 0;
+    uint32_t best_length = 0;
+
+    // Determine search window start (at most window_size back, clamped to 0)
+    uint32_t search_start = (pos > window_size) ? pos - window_size : 0;
+
+    // Search the lookback window; strict '>' keeps the earliest candidate on ties
+    for (uint32_t s = search_start; s < pos; s++) {
+      uint32_t match_len = 0;
+
+      // Extend match as far as possible (cap at 255); it may run past pos
+      while (pos + match_len < N &&
+             input_data[s + match_len] == input_data[pos + match_len] &&
+             match_len < 255) {
+        match_len++;
+      }
+
+      if (match_len > best_length) {
+        best_length = match_len;
+        best_offset = pos - s;
+      }
+    }
+
+    // Emit the triple; offset 0 / length 0 means no match was found
+    output_offsets[write_idx] = best_offset;
+    output_lengths[write_idx] = best_length;
+
+    uint32_t next_pos = pos + best_length;
+    output_literals[write_idx] = (next_pos < N) ? input_data[next_pos] : 0;
+
+    write_idx++;
+    pos = next_pos + 1;
+  }
+
+  *output_count = write_idx;
+}
+
+// Accelerator implementation of LZ77 compression (same algorithm, Loom-annotated)
+LOOM_ACCEL()
+void lz77_compress_dsa(LOOM_MEMORY_BANK(8) const uint32_t* __restrict__ input_data,
+                       LOOM_STREAM uint32_t* __restrict__ output_offsets,
+                       LOOM_STREAM uint32_t* __restrict__ output_lengths,
+                       LOOM_STREAM uint32_t* __restrict__ output_literals,
+                       uint32_t* __restrict__ output_count,
+                       const uint32_t N,
+                       const uint32_t window_size) {
+  if (N == 0) {
+    *output_count = 0;
+    return;
+  }
+
+  uint32_t pos = 0;
+  uint32_t write_idx = 0;
+
+  while (pos < N) {
+    uint32_t best_offset = 0;
+    uint32_t best_length = 0;
+
+    // Determine search window start (at most window_size back, clamped to 0)
+    uint32_t search_start = (pos > window_size) ? pos - window_size : 0;
+
+    // Search the lookback window; strict '>' keeps the earliest candidate on ties
+    for (uint32_t s = search_start; s < pos; s++) {
+      uint32_t match_len = 0;
+
+      // Extend match as far as possible (cap at 255); it may run past pos
+      while (pos + match_len < N &&
+             input_data[s + match_len] == input_data[pos + match_len] &&
+             match_len < 255) {
+        match_len++;
+      }
+
+      if (match_len > best_length) {
+        best_length = match_len;
+        best_offset = pos - s;
+      }
+    }
+
+    // Emit the triple; offset 0 / length 0 means no match was found
+    output_offsets[write_idx] = best_offset;
+    output_lengths[write_idx] = best_length;
+
+    uint32_t next_pos = pos + best_length;
+    output_literals[write_idx] = (next_pos < N) ? input_data[next_pos] : 0;
+
+    write_idx++;
+    pos = next_pos + 1;
+  }
+
+  *output_count = write_idx;
+}
diff --git a/tests/app/lz77_compress/lz77_compress.h b/tests/app/lz77_compress/lz77_compress.h
new file mode 100644
index 00000000..41df1ec3
--- /dev/null
+++ b/tests/app/lz77_compress/lz77_compress.h
@@ -0,0 +1,12 @@
+// Loom kernel: lz77_compress
+#ifndef LZ77_COMPRESS_H
+#define LZ77_COMPRESS_H
+
+#include <cstddef>
+#include <cstdint>
+
+void lz77_compress_cpu(const uint32_t* __restrict__ input_data, uint32_t* __restrict__ output_offsets, uint32_t* __restrict__ output_lengths, uint32_t* __restrict__ output_literals, uint32_t* __restrict__ output_count, const uint32_t N, const uint32_t window_size);
+
+void lz77_compress_dsa(const uint32_t* __restrict__ input_data, uint32_t* __restrict__ output_offsets, uint32_t* __restrict__ output_lengths, uint32_t* __restrict__ output_literals, uint32_t* __restrict__ output_count, const uint32_t N, const uint32_t window_size);
+
+#endif // LZ77_COMPRESS_H
diff --git a/tests/app/lz77_compress/main.cpp b/tests/app/lz77_compress/main.cpp
new file mode 100644
index 00000000..ece769c0
--- /dev/null
+++ b/tests/app/lz77_compress/main.cpp
@@ -0,0 +1,53 @@
+#include <cstdio>
+
+#include "lz77_compress.h"
+
+int main() {
+  const uint32_t N = 20;
+  const uint32_t window_size = 10;
+
+  // Input array with repeated patterns for compression
+  uint32_t input[N] = {
+    1, 2, 3, 1, 2, 3, 4, 5, 4, 5,
+    4, 5, 6, 7, 8, 6, 7, 8, 9, 9
+  };
+
+  // Output arrays (worst case: each element is a literal, so N triples)
+  uint32_t expect_offsets[N];
+  uint32_t expect_lengths[N];
+  uint32_t expect_literals[N];
+  uint32_t expect_count;
+
+  uint32_t calculated_offsets[N];
+  uint32_t calculated_lengths[N];
+  uint32_t calculated_literals[N];
+  uint32_t calculated_count;
+
+  // Compute expected result with CPU version
+  lz77_compress_cpu(input, expect_offsets, expect_lengths, expect_literals,
+                    &expect_count, N, window_size);
+
+  // Compute result with accelerator version
+  lz77_compress_dsa(input, calculated_offsets, calculated_lengths,
+                    calculated_literals, &calculated_count, N, window_size);
+
+  // Compare output counts
+  if (expect_count != calculated_count) {
+    printf("lz77_compress: FAILED (count mismatch: expected %u, got %u)\n",
+           expect_count, calculated_count);
+    return 1;
+  }
+
+  // Compare only the triples that were written (entries past the count are unset)
+  for (uint32_t i = 0; i < expect_count; i++) {
+    if (expect_offsets[i] != calculated_offsets[i] ||
+        expect_lengths[i] != calculated_lengths[i] ||
+        expect_literals[i] != calculated_literals[i]) {
+      printf("lz77_compress: FAILED (mismatch at triple %u)\n", i);
+      return 1;
+    }
+  }
+
+  printf("lz77_compress: PASSED\n");
+  return 0;
+}