From 1c39d1fc1fcda72635facf5ab61f1110d0091b9c Mon Sep 17 00:00:00 2001 From: Maarten Vandersteegen Date: Fri, 6 Mar 2026 09:12:43 +0100 Subject: [PATCH 1/4] Added unittests folder * Added test cases for pulp_matmul_fp32 * Fixed one bug --- lib/sources/pulp_matmul_fp32.c | 4 +- unittests/README.md | 51 +++ unittests/generic/support/pmsis.h | 45 +++ unittests/generic/support/pmsis_stub.c | 52 +++ unittests/generic/test_pulp_matmul_fp32.c | 455 ++++++++++++++++++++++ unittests/project.yml | 392 +++++++++++++++++++ 6 files changed, 997 insertions(+), 2 deletions(-) create mode 100644 unittests/README.md create mode 100644 unittests/generic/support/pmsis.h create mode 100644 unittests/generic/support/pmsis_stub.c create mode 100644 unittests/generic/test_pulp_matmul_fp32.c create mode 100644 unittests/project.yml diff --git a/lib/sources/pulp_matmul_fp32.c b/lib/sources/pulp_matmul_fp32.c index 7d0f4b76..00e22266 100644 --- a/lib/sources/pulp_matmul_fp32.c +++ b/lib/sources/pulp_matmul_fp32.c @@ -2119,9 +2119,9 @@ void mm_M_u2(void *matMul_args) { // =====> B IS TRANSPOSED <===== else { for (uint32_t i = 0; i < N; i++) { - float temp = 0; - for (uint32_t j = start; j < stop; j++) { + float temp = 0; + for (uint32_t k = 0; k < (K & 0xfffffffe); k = k + 2) { temp += A[i * K + k] * B[k + j * K]; temp += A[i * K + k + 1] * B[k + 1 + j * K]; diff --git a/unittests/README.md b/unittests/README.md new file mode 100644 index 00000000..0802323b --- /dev/null +++ b/unittests/README.md @@ -0,0 +1,51 @@ +# Unit tests + +This folder contains all unit tests + +## Setup + +``` +gem install ceedling +``` + +To support code coverage reports: + +``` +pip install gcovr +``` + +## Run + +Ceedling takes care of building, running and test coverage report generation. + +Running all tests: + +``` +ceedling test:all +``` + +Running all tests with code coverage + +``` +ceedling gcov:all +``` + +HTML report can be found at `build/artifacts/gcov/gcovr/GcovCoverageResults.html`. 
+ +Running one specific test module or test case: + +``` +ceedling test:test_pulp_matmul_fp32 +ceedling test:test_pulp_matmul_fp32 --test-case=test_pulp_matmul_fp32_mm_M_u2_transp +``` + +## Debug + +You can debug a test with gdb in a specific module with: + +``` +ceedling test:test_pulp_matmul_fp32 +gdb --tui --args build/test/out/test_pulp_matmul_fp32/test_pulp_matmul_fp32.out +``` + +Once in the debugger set for example a breakpoint (`b test_pulp_matmul_fp32.c:306`) and hit `r`. diff --git a/unittests/generic/support/pmsis.h b/unittests/generic/support/pmsis.h new file mode 100644 index 00000000..561b3ddd --- /dev/null +++ b/unittests/generic/support/pmsis.h @@ -0,0 +1,45 @@ +#ifndef PMSIS_H +#define PMSIS_H + +#include + +/* + * typedefs + */ +typedef uint16_t float16alt; + +typedef enum { + PI_CL_DMA_DIR_LOC2EXT = 0, + PI_CL_DMA_DIR_EXT2LOC = 1 +} pi_cl_dma_dir_e; + +typedef struct +{ + uint32_t ext; + uint32_t loc; + uint32_t id; + uint16_t size; + pi_cl_dma_dir_e dir; \ + uint8_t merge; + // 2d transfers args + uint32_t stride; + uint32_t length; +} pi_cl_dma_copy_2d_t; + +/* + * functions + */ +int pi_core_id(void); +void pi_cl_team_fork(int nb_cores, void (*entry)(void *), void *arg); +void pi_cl_team_barrier(void); + +void pi_cl_dma_memcpy_2d(pi_cl_dma_copy_2d_t *copy); +void pi_cl_dma_wait(void *copy); + +void pi_perf_conf(unsigned int events); +void pi_perf_start(void); +void pi_perf_stop(void); +void pi_perf_reset(void); +unsigned int pi_perf_read(int event); + +#endif /* PMSIS_H */ diff --git a/unittests/generic/support/pmsis_stub.c b/unittests/generic/support/pmsis_stub.c new file mode 100644 index 00000000..0b05a26d --- /dev/null +++ b/unittests/generic/support/pmsis_stub.c @@ -0,0 +1,52 @@ +#include "pmsis.h" + +static int core_id = 0; + +int pi_core_id(void) +{ + return core_id; +} + +void pi_cl_team_fork(int nb_cores, void (*entry)(void *), void *arg) +{ + // execute in time instead of parallel + for (core_id=0; core_id Date: Mon, 9 Mar 2026 
16:20:53 +0100 Subject: [PATCH 2/4] Added test cases for pulp_conv2d_fp32.c Fixed a few bugs mainly related to bias support Moved project.yml one directory up to support correct gcovr report generation. Updated test pmsis.h with DMA stuff Also fixed bias support for pulp_fp32_linear.c, no test cases yet! --- .gitignore | 1 + lib/sources/pulp_conv2d_fp32.c | 110 +++---- lib/sources/pulp_conv_naive_fp32.c | 5 +- lib/sources/pulp_linear_fp32.c | 24 +- unittests/project.yml => project.yml | 22 +- unittests/README.md | 4 +- unittests/generic/support/pmsis.h | 5 +- unittests/generic/test_pulp_conv2d_fp32.c | 374 ++++++++++++++++++++++ 8 files changed, 462 insertions(+), 83 deletions(-) rename unittests/project.yml => project.yml (97%) create mode 100644 unittests/generic/test_pulp_conv2d_fp32.c diff --git a/.gitignore b/.gitignore index 20d3ea18..396f3c4a 100644 --- a/.gitignore +++ b/.gitignore @@ -2,3 +2,4 @@ **/.vscode/ **/__pycache__/ .idea/ +build diff --git a/lib/sources/pulp_conv2d_fp32.c b/lib/sources/pulp_conv2d_fp32.c index 6166d88c..7abb84ce 100644 --- a/lib/sources/pulp_conv2d_fp32.c +++ b/lib/sources/pulp_conv2d_fp32.c @@ -295,10 +295,10 @@ void pulp_conv2d_fp32_bw_param_grads_cl(void *Conv2D_args) { im2col_args.c = C2D_args->coeff; im2col_args.output = C2D_args->output; im2col_args.pBuffer = i2c_buffer; - im2col_args.Lpad = 0; //Lpad; - im2col_args.Rpad = 0; //Rpad; - im2col_args.Upad = 0; //Upad; - im2col_args.Dpad = 0; //Dpad; + im2col_args.Lpad = Lpad; + im2col_args.Rpad = Rpad; + im2col_args.Upad = Upad; + im2col_args.Dpad = Dpad; im2col_args.mod = 0; im2col_args.stride_w = stride_w; im2col_args.stride_h = stride_h; @@ -319,8 +319,16 @@ void pulp_conv2d_fp32_bw_param_grads_cl(void *Conv2D_args) { matMul_args.bias = biasDiff; matMul_args.USE_BIASES = USE_BIASES; - matMul_args.pH = H_out; - matMul_args.pW = W_out; + matMul_args.H = H_in; + matMul_args.W = W_in; + matMul_args.pH = pH; + matMul_args.pW = pW; + matMul_args.Lpad = Lpad; + matMul_args.Rpad = 
Rpad; + matMul_args.Upad = Upad; + matMul_args.Dpad = Dpad; + matMul_args.stride_h = stride_h; + matMul_args.stride_w = stride_w; matMul_args.bias_dim = bias_dim; @@ -366,7 +374,7 @@ void pulp_conv2d_fp32_bw_param_grads_cl(void *Conv2D_args) { pi_cl_team_fork(NUM_CORES, transpose, &tr_args); - matMul_args.A = tr_buffer; // outDiff; + matMul_args.A = tr_buffer; // outDiff transposed; matMul_args.B = i2c_buffer; matMul_args.C = coeffDiff; matMul_args.N = C_out; @@ -378,8 +386,16 @@ void pulp_conv2d_fp32_bw_param_grads_cl(void *Conv2D_args) { matMul_args.bias = biasDiff; matMul_args.USE_BIASES = USE_BIASES; - matMul_args.pH = H_out; - matMul_args.pW = W_out; + matMul_args.H = H_in; + matMul_args.W = W_in; + matMul_args.pH = pH; + matMul_args.pW = pW; + matMul_args.Lpad = Lpad; + matMul_args.Rpad = Rpad; + matMul_args.Upad = Upad; + matMul_args.Dpad = Dpad; + matMul_args.stride_h = stride_h; + matMul_args.stride_w = stride_w; matMul_args.bias_dim = bias_dim; @@ -691,9 +707,6 @@ void im2col_conv2d_fw_kernel(void *void_args) { uint32_t Upad = args->Upad; uint32_t Dpad = args->Dpad; - // const uint32_t H_out = (H_in - pH + Upad + Dpad) / h_str + 1; - // const uint32_t W_out = (W_in - pW + Lpad + Rpad) / w_str + 1; - const uint32_t H_out = pH; const uint32_t W_out = pW; @@ -747,38 +760,8 @@ void im2col_conv2d_param_grad_kernel(void *void_args) { struct mm_manager_args *man_args = (struct mm_manager_args *) void_args; struct matMul_args *args = man_args->mm_args; - float *__restrict__ inData = args->A; - float *__restrict__ coeffDiff = args->B; - float *__restrict__ outDiff = args->C; - - float *__restrict__ biasDiff = args->bias; const uint32_t USE_BIASES = args->USE_BIASES; - const uint32_t H_in = args->H; - const uint32_t W_in = args->W; - const uint32_t pW = args->pW; - const uint32_t pH = args->pH; - const uint32_t C_in = args->pCin; - const uint32_t C_out = args->N; - - uint32_t h_str = args->stride_h; - uint32_t w_str = args->stride_w; - uint32_t Lpad = args->Lpad; - 
uint32_t Rpad = args->Rpad; - uint32_t Upad = args->Upad; - uint32_t Dpad = args->Dpad; - - const uint32_t H_out = (H_in - pH + Upad + Dpad) / h_str + 1; - const uint32_t W_out = (W_in - pW + Lpad + Rpad) / w_str + 1; - - const uint32_t blockSize = (C_out + NUM_CORES - 1) / NUM_CORES; - const uint32_t start = pi_core_id() * blockSize; - const uint32_t stop = start + blockSize > C_out ? C_out : start + blockSize; - - const uint32_t HWC = args->HWC; - - int padding = Lpad + Rpad + Upad + Dpad; - // Perform simple matrix multiplication #ifndef OPTIMIZE mm(args); @@ -787,36 +770,41 @@ void im2col_conv2d_param_grad_kernel(void *void_args) { #endif // Handle biases - if (USE_BIASES == 1 && HWC == 0) { - for (uint32_t co = start; co < stop; co++) { - float temp = 0; - for (uint32_t ho = 0; ho < H_out; ho++) { - for (uint32_t wo = 0; wo < W_out; wo++) { - temp += outDiff[wo + ho * H_out + co * H_out * W_out]; - } - } - biasDiff[co] = temp; - } - } + if (USE_BIASES == 1) { + float *__restrict__ outDiff = args->A; + float *__restrict__ biasDiff = args->bias; + + const uint32_t H_in = args->H; + const uint32_t W_in = args->W; + const uint32_t pW = args->pW; + const uint32_t pH = args->pH; + const uint32_t C_out = args->N; + + uint32_t h_str = args->stride_h; + uint32_t w_str = args->stride_w; + uint32_t Lpad = args->Lpad; + uint32_t Rpad = args->Rpad; + uint32_t Upad = args->Upad; + uint32_t Dpad = args->Dpad; + + const uint32_t H_out = (H_in - pH + Upad + Dpad) / h_str + 1; + const uint32_t W_out = (W_in - pW + Lpad + Rpad) / w_str + 1; + + const uint32_t blockSize = (C_out + NUM_CORES - 1) / NUM_CORES; + const uint32_t start = pi_core_id() * blockSize; + const uint32_t stop = start + blockSize > C_out ? 
C_out : start + blockSize; - else if (USE_BIASES == 1 && HWC == 1) { for (uint32_t co = start; co < stop; co++) { float temp = 0; for (uint32_t ho = 0; ho < H_out; ho++) { for (uint32_t wo = 0; wo < W_out; wo++) { - temp += outDiff[wo * C_out + ho * C_out * W_out + co]; + temp += outDiff[wo + ho * W_out + co * H_out * W_out]; } } biasDiff[co] = temp; } } - if (HWC != 0 && HWC != 1) { - // Unsupported layout - printf("[im2col_conv2d_param_grad_kernel:] Invalid selection of the HWC layout (1 for HWC, 0 for CHW). Actual value: %d. Biases not used, even if provided!\n", - HWC); - } - if (USE_BIASES != 0 && USE_BIASES != 1) { printf("[im2col_conv2d_param_grad_kernel:] Invalid selection of the bias option (1 or 0 - use biases or not). Actual value: %d. Biases not used, even if provided!\n", USE_BIASES); diff --git a/lib/sources/pulp_conv_naive_fp32.c b/lib/sources/pulp_conv_naive_fp32.c index 9930bb16..b7c97e79 100644 --- a/lib/sources/pulp_conv_naive_fp32.c +++ b/lib/sources/pulp_conv_naive_fp32.c @@ -376,17 +376,18 @@ void naive_conv2d_param_grad_kernel_CHW(void *matMul_args) { // Pad conditions int pad_cond_h = h_str * ho + hk - Upad; int pad_cond_w = w_str * wo + wk - Lpad; + int out_idx = wo + ho * W_out + co * H_out * W_out; if ((pad_cond_h >= 0) && (pad_cond_w >= 0) && (pad_cond_h < H_in) && (pad_cond_w < W_in)) { - int out_idx = wo + ho * W_out + co * H_out * W_out; int in_idx = (w_str * wo + wk - Lpad) + (h_str * ho + hk - Upad) * W_in + ci * H_in * W_in; temp += outDiff[out_idx] * inData[in_idx]; - if (USE_BIASES == 1) bias_temp += outDiff[out_idx]; } + + if (USE_BIASES == 1) bias_temp += outDiff[out_idx]; } } coeffDiff[wk + hk * pW + ci * pH * pW + co * pH * pW * C_in] = temp; diff --git a/lib/sources/pulp_linear_fp32.c b/lib/sources/pulp_linear_fp32.c index 69c5b88a..99b5e72b 100644 --- a/lib/sources/pulp_linear_fp32.c +++ b/lib/sources/pulp_linear_fp32.c @@ -217,18 +217,18 @@ void pulp_linear_fp32_fw_cl_kernel( void * man_args ) { mm_manager(manager_args); 
#endif - // pi_cl_team_barrier(); - - // if (USE_BIASES == 1) { - // const uint32_t outputSize = N * M; - // const uint32_t blockSize = (outputSize + NUM_CORES - 1) / NUM_CORES; - // const uint32_t start = pi_core_id() * blockSize; - // const uint32_t stop = start + blockSize > outputSize ? outputSize : start + blockSize; - - // for (uint32_t i = start; i < stop; i++) { - // outData[i] += biasData[i % M]; - // } - // } + pi_cl_team_barrier(); + + if (USE_BIASES == 1) { + const uint32_t outputSize = N * M; + const uint32_t blockSize = (outputSize + NUM_CORES - 1) / NUM_CORES; + const uint32_t start = pi_core_id() * blockSize; + const uint32_t stop = start + blockSize > outputSize ? outputSize : start + blockSize; + + for (uint32_t i = start; i < stop; i++) { + outData[i] += biasData[i]; + } + } } void pulp_linear_fp32_bw_param_grads_cl_kernel( void * man_args ) diff --git a/unittests/project.yml b/project.yml similarity index 97% rename from unittests/project.yml rename to project.yml index b840aaa1..742001f6 100644 --- a/unittests/project.yml +++ b/project.yml @@ -97,14 +97,14 @@ # see documentation for the many options for specifying this. :paths: :test: - - +:generic/** - - -:generic/support + - +:unittests/generic/** + - -:unittests/generic/support :source: - - ../lib/sources/** + - lib/sources/** :include: - - ../lib/include/** + - lib/include/** :support: - - generic/support + - unittests/generic/support # You can even specify specific files to add or remove from your test # and release collections. Usually it's better to use paths and let @@ -127,6 +127,16 @@ # Enable to inject name of a test as a unique compilation symbol into its respective executable build. :use_test_definition: FALSE +# Enable address sanitizer +#:flags: +# :test: +# :compile: +# - -fsanitize=address +# - -fno-omit-frame-pointer +# - -g +# :link: +# - -fsanitize=address + # Configuration Options specific to CMock. 
See CMock docs for details :cmock: # Core conffiguration @@ -187,6 +197,7 @@ # Configuration options specific to Unity. :unity: :defines: + :use_param_tests: true # You can optionally have ceedling create environment variables for you before # performing the rest of its tasks. @@ -203,6 +214,7 @@ # These libraries are automatically injected into the build process. Those specified as # common will be used in all types of builds. Otherwise, libraries can be injected in just # tests or releases. These options are MERGED with the options in supplemental yaml files. + :libraries: :placement: :end :flag: "-l${1}" diff --git a/unittests/README.md b/unittests/README.md index 0802323b..86a19e76 100644 --- a/unittests/README.md +++ b/unittests/README.md @@ -18,6 +18,8 @@ pip install gcovr Ceedling takes care of building, running and test coverage report generation. +NOTE: all cmds below must be executed from the root directory of this git repository. + Running all tests: ``` @@ -48,4 +50,4 @@ ceedling test:test_pulp_matmul_fp32 gdb --tui --args build/test/out/test_pulp_matmul_fp32/test_pulp_matmul_fp32.out ``` -Once in the debugger set for example a breakpoint (`b test_pulp_matmul_fp32.c:306`) and hit `r`. +Once in the debugger set for example a breakpoint (`b test_pulp_matmul_fp32_mm_M_u2_transp`) and hit `r`. 
diff --git a/unittests/generic/support/pmsis.h b/unittests/generic/support/pmsis.h index 561b3ddd..1817807b 100644 --- a/unittests/generic/support/pmsis.h +++ b/unittests/generic/support/pmsis.h @@ -2,8 +2,9 @@ #define PMSIS_H #include +#include -/* +/* * typedefs */ typedef uint16_t float16alt; @@ -17,7 +18,7 @@ typedef struct { uint32_t ext; uint32_t loc; - uint32_t id; + uint32_t id; uint16_t size; pi_cl_dma_dir_e dir; \ uint8_t merge; diff --git a/unittests/generic/test_pulp_conv2d_fp32.c b/unittests/generic/test_pulp_conv2d_fp32.c new file mode 100644 index 00000000..f2fe03cb --- /dev/null +++ b/unittests/generic/test_pulp_conv2d_fp32.c @@ -0,0 +1,374 @@ +#ifdef TEST + +#include +#include +#include "unity.h" + +#include "pmsis.h" +#include "pulp_train_defines.h" +#include "pulp_train_utils_fp32.h" +#include "pulp_conv2d_fp32.h" +#include "pulp_matmul_fp32.h" +#include "pulp_conv_naive_fp32.h" +#include "pulp_im2col_fp32.h" + +#define DELTA 1e-12 + +// known parameters +static float WEIGHTS[] = {0.0f, 0.009999999776482582f, 0.019999999552965164f, 0.009999999776482582f, 0.019999999552965164f, 0.029999999329447746f, 0.019999999552965164f, 0.029999999329447746f, 0.03999999910593033f, 0.009999999776482582f, 0.019999999552965164f, 0.029999999329447746f, 0.019999999552965164f, 0.029999999329447746f, 0.03999999910593033f, 0.029999999329447746f, 0.03999999910593033f, 0.05000000074505806f, 0.019999999552965164f, 0.029999999329447746f, 0.03999999910593033f, 0.029999999329447746f, 0.03999999910593033f, 0.05000000074505806f, 0.03999999910593033f, 0.05000000074505806f, 0.05999999865889549f, 0.009999999776482582f, 0.019999999552965164f, 0.029999999329447746f, 0.019999999552965164f, 0.029999999329447746f, 0.03999999910593033f, 0.029999999329447746f, 0.03999999910593033f, 0.05000000074505806f, 0.019999999552965164f, 0.029999999329447746f, 0.03999999910593033f, 0.029999999329447746f, 0.03999999910593033f, 0.05000000074505806f, 0.03999999910593033f, 0.05000000074505806f, 
0.05999999865889549f, 0.029999999329447746f, 0.03999999910593033f, 0.05000000074505806f, 0.03999999910593033f, 0.05000000074505806f, 0.05999999865889549f, 0.05000000074505806f, 0.05999999865889549f, 0.07000000029802322f, 0.019999999552965164f, 0.029999999329447746f, 0.03999999910593033f, 0.029999999329447746f, 0.03999999910593033f, 0.05000000074505806f, 0.03999999910593033f, 0.05000000074505806f, 0.05999999865889549f, 0.029999999329447746f, 0.03999999910593033f, 0.05000000074505806f, 0.03999999910593033f, 0.05000000074505806f, 0.05999999865889549f, 0.05000000074505806f, 0.05999999865889549f, 0.07000000029802322f, 0.03999999910593033f, 0.05000000074505806f, 0.05999999865889549f, 0.05000000074505806f, 0.05999999865889549f, 0.07000000029802322f, 0.05999999865889549f, 0.07000000029802322f, 0.07999999821186066f, 0.029999999329447746f, 0.03999999910593033f, 0.05000000074505806f, 0.03999999910593033f, 0.05000000074505806f, 0.05999999865889549f, 0.05000000074505806f, 0.05999999865889549f, 0.07000000029802322f, 0.03999999910593033f, 0.05000000074505806f, 0.05999999865889549f, 0.05000000074505806f, 0.05999999865889549f, 0.07000000029802322f, 0.05999999865889549f, 0.07000000029802322f, 0.07999999821186066f, 0.05000000074505806f, 0.05999999865889549f, 0.07000000029802322f, 0.05999999865889549f, 0.07000000029802322f, 0.07999999821186066f, 0.07000000029802322f, 0.07999999821186066f, 0.09000000357627869f, }; +static float BIASES[] = {0.0f, 0.009999999776482582f, 0.019999999552965164f, 0.029999999329447746f, }; + +// known input and output gradient data +static float INPUT_HWC[] = {0.0010000000474974513f, 0.0010100000072270632f, 0.0010400000028312206f, 0.0009900000877678394f, 0.0010000000474974513f, 0.0010300000431016088f, 0.0009600000339560211f, 0.0009700000518932939f, 0.0010000000474974513f, 0.000910000060684979f, 0.0009200000204145908f, 0.0009500000742264092f, 0.0010100000072270632f, 0.0010400000028312206f, 0.0010900000343099236f, 0.0010000000474974513f, 0.0010300000431016088f, 
0.0010800000745803118f, 0.0009700000518932939f, 0.0010000000474974513f, 0.0010500000789761543f, 0.0009200000204145908f, 0.0009500000742264092f, 0.0010000000474974513f, 0.0010400000028312206f, 0.0010900000343099236f, 0.0011599999852478504f, 0.0010300000431016088f, 0.0010800000745803118f, 0.0011500000255182385f, 0.0010000000474974513f, 0.0010500000789761543f, 0.001120000029914081f, 0.0009500000742264092f, 0.0010000000474974513f, 0.001069999998435378f, 0.0010900000343099236f, 0.0011599999852478504f, 0.0012500000884756446f, 0.0010800000745803118f, 0.0011500000255182385f, 0.0012400000123307109f, 0.0010500000789761543f, 0.001120000029914081f, 0.0012100000167265534f, 0.0010000000474974513f, 0.001069999998435378f, 0.0011599999852478504f, }; +static float INPUT_CHW[] = {0.0010000000474974513f, 0.0009900000877678394f, 0.0009600000339560211f, 0.000910000060684979f, 0.0010100000072270632f, 0.0010000000474974513f, 0.0009700000518932939f, 0.0009200000204145908f, 0.0010400000028312206f, 0.0010300000431016088f, 0.0010000000474974513f, 0.0009500000742264092f, 0.0010900000343099236f, 0.0010800000745803118f, 0.0010500000789761543f, 0.0010000000474974513f, 0.0010100000072270632f, 0.0010000000474974513f, 0.0009700000518932939f, 0.0009200000204145908f, 0.0010400000028312206f, 0.0010300000431016088f, 0.0010000000474974513f, 0.0009500000742264092f, 0.0010900000343099236f, 0.0010800000745803118f, 0.0010500000789761543f, 0.0010000000474974513f, 0.0011599999852478504f, 0.0011500000255182385f, 0.001120000029914081f, 0.001069999998435378f, 0.0010400000028312206f, 0.0010300000431016088f, 0.0010000000474974513f, 0.0009500000742264092f, 0.0010900000343099236f, 0.0010800000745803118f, 0.0010500000789761543f, 0.0010000000474974513f, 0.0011599999852478504f, 0.0011500000255182385f, 0.001120000029914081f, 0.001069999998435378f, 0.0012500000884756446f, 0.0012400000123307109f, 0.0012100000167265534f, 0.0011599999852478504f, }; +static float OUTPUT_GRAD_HWC_PAD0[] = {-0.1248936876654625f, 
-0.12360870093107224f, -0.12232371419668198f, -0.12103872746229172f, -0.12502218782901764f, -0.12373820692300797f, -0.12245423346757889f, -0.12117025256156921f, -0.12488772720098495f, -0.1236010491847992f, -0.12231437861919403f, -0.12102770060300827f, -0.1250162124633789f, -0.12373055517673492f, -0.12244489043951035f, -0.12115923315286636f, }; +static float OUTPUT_GRAD_CHW_PAD0[] = {-0.1248936876654625f, -0.12502218782901764f, -0.12488772720098495f, -0.1250162124633789f, -0.12360870093107224f, -0.12373820692300797f, -0.1236010491847992f, -0.12373055517673492f, -0.12232371419668198f, -0.12245423346757889f, -0.12231437861919403f, -0.12244489043951035f, -0.12103872746229172f, -0.12117025256156921f, -0.12102770060300827f, -0.12115923315286636f, }; +static float OUTPUT_GRAD_HWC_PAD1[] = {-0.031234506517648697f, -0.03091815672814846f, -0.030601806938648224f, -0.030285457149147987f, -0.03126119077205658f, -0.03094298020005226f, -0.030624769628047943f, -0.030306560918688774f, -0.031293101608753204f, -0.030975062400102615f, -0.030657021328806877f, -0.03033898025751114f, -0.031332820653915405f, -0.031016694381833076f, -0.030700569972395897f, -0.030384443700313568f, -0.03122907504439354f, -0.030910678207874298f, -0.030592281371355057f, -0.030273884534835815f, -0.03125467151403427f, -0.030933426693081856f, -0.03061218000948429f, -0.030290933325886726f, -0.03128679469227791f, -0.03096579946577549f, -0.03064480610191822f, -0.030323810875415802f, -0.031329624354839325f, -0.031011562794446945f, -0.030693503096699715f, -0.030375445261597633f, -0.031227940693497658f, -0.030909262597560883f, -0.030590584501624107f, -0.030271906405687332f, -0.031253181397914886f, -0.030931513756513596f, -0.030609846115112305f, -0.030288176611065865f, -0.031285300850868225f, -0.03096388652920723f, -0.030642470344901085f, -0.03032105602324009f, -0.03132877126336098f, -0.031010428443551064f, -0.0306920874863863f, -0.030373748391866684f, -0.03123709373176098f, -0.030920369550585747f, 
-0.030603643506765366f, -0.030286919325590134f, -0.03126528486609459f, -0.030946513637900352f, -0.030627742409706116f, -0.03030896931886673f, -0.03129703179001808f, -0.030978428199887276f, -0.03065982460975647f, -0.030341221019625664f, -0.031335558742284775f, -0.0310190562158823f, -0.030702557414770126f, -0.030386056751012802f, }; +static float OUTPUT_GRAD_CHW_PAD1[] = {-0.031234506517648697f, -0.03126119077205658f, -0.031293101608753204f, -0.031332820653915405f, -0.03122907504439354f, -0.03125467151403427f, -0.03128679469227791f, -0.031329624354839325f, -0.031227940693497658f, -0.031253181397914886f, -0.031285300850868225f, -0.03132877126336098f, -0.03123709373176098f, -0.03126528486609459f, -0.03129703179001808f, -0.031335558742284775f, -0.03091815672814846f, -0.03094298020005226f, -0.030975062400102615f, -0.031016694381833076f, -0.030910678207874298f, -0.030933426693081856f, -0.03096579946577549f, -0.031011562794446945f, -0.030909262597560883f, -0.030931513756513596f, -0.03096388652920723f, -0.031010428443551064f, -0.030920369550585747f, -0.030946513637900352f, -0.030978428199887276f, -0.0310190562158823f, -0.030601806938648224f, -0.030624769628047943f, -0.030657021328806877f, -0.030700569972395897f, -0.030592281371355057f, -0.03061218000948429f, -0.03064480610191822f, -0.030693503096699715f, -0.030590584501624107f, -0.030609846115112305f, -0.030642470344901085f, -0.0306920874863863f, -0.030603643506765366f, -0.030627742409706116f, -0.03065982460975647f, -0.030702557414770126f, -0.030285457149147987f, -0.030306560918688774f, -0.03033898025751114f, -0.030384443700313568f, -0.030273884534835815f, -0.030290933325886726f, -0.030323810875415802f, -0.030375445261597633f, -0.030271906405687332f, -0.030288176611065865f, -0.03032105602324009f, -0.030373748391866684f, -0.030286919325590134f, -0.03030896931886673f, -0.030341221019625664f, -0.030386056751012802f, }; + +// expected output and expected weight/bias/input gradients +static float EXPECTED_OUTPUT_HWC_PAD0[] = 
{0.0008505000150762498f, 0.011130400002002716f, 0.021410299465060234f, 0.03169019892811775f, 0.0008226000354625285f, 0.011094399727880955f, 0.021366199478507042f, 0.031638000160455704f, 0.0008981999708339572f, 0.011191600002348423f, 0.021484998986124992f, 0.031778398901224136f, 0.0008703000494278967f, 0.011155599728226662f, 0.0214408989995718f, 0.03172620013356209f, }; +static float EXPECTED_OUTPUT_CHW_PAD0[] = {0.0008505000150762498f, 0.0008226000354625285f, 0.0008981999708339572f, 0.0008703000494278967f, 0.011130400002002716f, 0.011094399727880955f, 0.011191600002348423f, 0.011155599728226662f, 0.021410299465060234f, 0.021366199478507042f, 0.021484998986124992f, 0.0214408989995718f, 0.03169019892811775f, 0.031638000160455704f, 0.031778398901224136f, 0.03172620013356209f, }; +static float EXPECTED_OUTPUT_HWC_PAD1[] = {0.0004958000499755144f, 0.010618999600410461f, 0.020742200314998627f, 0.030865399166941643f, 0.0006419999990612268f, 0.010824699886143208f, 0.021007400006055832f, 0.03119009919464588f, 0.0006207000697031617f, 0.010797999799251556f, 0.020975299179553986f, 0.031152598559856415f, 0.0003498000151012093f, 0.010465799830853939f, 0.020581800490617752f, 0.03069780021905899f, 0.0006695999763906002f, 0.010858300141990185f, 0.021046999841928482f, 0.031235698610544205f, 0.0008505000732839108f, 0.011130400002002716f, 0.021410299465060234f, 0.03169019892811775f, 0.0008226001518778503f, 0.011094399727880955f, 0.021366199478507042f, 0.031638000160455704f, 0.00045210003736428916f, 0.010629999451339245f, 0.0208078995347023f, 0.030985798686742783f, 0.0007059000199660659f, 0.010903599672019482f, 0.021101299673318863f, 0.03129899874329567f, 0.000898200087249279f, 0.011191600002348423f, 0.021484998986124992f, 0.031778398901224136f, 0.0008702999912202358f, 0.011155599728226662f, 0.0214408989995718f, 0.03172620013356209f, 0.0004794000124093145f, 0.010666299611330032f, 0.020853199064731598f, 0.031040098518133163f, 0.0004130000015720725f, 0.010548199526965618f, 
0.020683400332927704f, 0.030818600207567215f, 0.0005108999903313816f, 0.010711600072681904f, 0.020912298932671547f, 0.031112998723983765f, 0.0004949999856762588f, 0.010690299794077873f, 0.02088559977710247f, 0.031080899760127068f, 0.0002621999883558601f, 0.010390199720859528f, 0.020518198609352112f, 0.030646199360489845f, }; +static float EXPECTED_OUTPUT_CHW_PAD1[] = {0.0004958000499755144f, 0.0006419999990612268f, 0.0006207000697031617f, 0.0003498000151012093f, 0.0006695999763906002f, 0.0008505000732839108f, 0.0008226001518778503f, 0.00045210003736428916f, 0.0007059000199660659f, 0.000898200087249279f, 0.0008702999912202358f, 0.0004794000124093145f, 0.0004130000015720725f, 0.0005108999903313816f, 0.0004949999856762588f, 0.0002621999883558601f, 0.010618999600410461f, 0.010824699886143208f, 0.010797999799251556f, 0.010465799830853939f, 0.010858300141990185f, 0.011130400002002716f, 0.011094399727880955f, 0.010629999451339245f, 0.010903599672019482f, 0.011191600002348423f, 0.011155599728226662f, 0.010666299611330032f, 0.010548199526965618f, 0.010711600072681904f, 0.010690299794077873f, 0.010390199720859528f, 0.020742200314998627f, 0.021007400006055832f, 0.020975299179553986f, 0.020581800490617752f, 0.021046999841928482f, 0.021410299465060234f, 0.021366199478507042f, 0.0208078995347023f, 0.021101299673318863f, 0.021484998986124992f, 0.0214408989995718f, 0.020853199064731598f, 0.020683400332927704f, 0.020912298932671547f, 0.02088559977710247f, 0.020518198609352112f, 0.030865399166941643f, 0.03119009919464588f, 0.031152598559856415f, 0.03069780021905899f, 0.031235698610544205f, 0.03169019892811775f, 0.031638000160455704f, 0.030985798686742783f, 0.03129899874329567f, 0.031778398901224136f, 0.03172620013356209f, 0.031040098518133163f, 0.030818600207567215f, 0.031112998723983765f, 0.031080899760127068f, 0.030646199360489845f, }; +static float EXPECTED_WEIGHT_GRAD_HWC_PAD0[] = {-0.0004998184740543365f, -0.0005098147667013109f, -0.0005298074684105814f, 
-0.0004898195038549602f, -0.0004998157965019345f, -0.000519808498211205f, -0.00046982415369711816f, -0.00047982041724026203f, -0.0004998131189495325f, -0.0005098147667013109f, -0.0005298074684105814f, -0.0005597965209744871f, -0.0004998157965019345f, -0.000519808498211205f, -0.0005497975507751107f, -0.00047982041724026203f, -0.0004998131189495325f, -0.0005298021715134382f, -0.0005298074684105814f, -0.0005597965209744871f, -0.0005997819826006889f, -0.000519808498211205f, -0.0005497975507751107f, -0.0005897830124013126f, -0.0004998131189495325f, -0.0005298021715134382f, -0.0005697876331396401f, -0.0004946771659888327f, -0.0005045706056989729f, -0.0005243575433269143f, -0.00048478099051862955f, -0.0004946744302287698f, -0.0005144614260643721f, -0.0004649912880267948f, -0.0004748846695292741f, -0.0004946716944687068f, -0.0005045706056989729f, -0.0005243575433269143f, -0.0005540381534956396f, -0.0004946744302287698f, -0.0005144614260643721f, -0.0005441419780254364f, -0.0004748846695292741f, -0.0004946716944687068f, -0.0005243522464297712f, -0.0005243575433269143f, -0.0005540381534956396f, -0.000593612261582166f, -0.0005144614260643721f, -0.0005441419780254364f, -0.0005837160861119628f, -0.0004946716944687068f, -0.0005243522464297712f, -0.0005639263545162976f, -0.0004895358579233289f, -0.0004993263864889741f, -0.0005189076764509082f, -0.0004797424771822989f, -0.000489533063955605f, -0.0005091143539175391f, -0.00046015839325264096f, -0.00046994895092211664f, -0.0004895302699878812f, -0.0004993263864889741f, -0.0005189076764509082f, -0.0005482797278091311f, -0.000489533063955605f, -0.0005091143539175391f, -0.0005384864052757621f, -0.00046994895092211664f, -0.0004895302699878812f, -0.0005189022631384432f, -0.0005189076764509082f, -0.0005482797278091311f, -0.000587442540563643f, -0.0005091143539175391f, -0.0005384864052757621f, -0.0005776492180302739f, -0.0004895302699878812f, -0.0005189022631384432f, -0.0005580650758929551f, -0.0004843945207539946f, -0.0004940821672789752f, 
-0.0005134578095749021f, -0.00047470396384596825f, -0.0004843916685786098f, -0.0005037672817707062f, -0.00045532549847848713f, -0.0004650132032111287f, -0.00048438881640322506f, -0.0004940821672789752f, -0.0005134578095749021f, -0.0005425213603302836f, -0.0004843916685786098f, -0.0005037672817707062f, -0.0005328307743184268f, -0.0004650132032111287f, -0.00048438881640322506f, -0.0005134523380547762f, -0.0005134578095749021f, -0.0005425213603302836f, -0.00058127281954512f, -0.0005037672817707062f, -0.0005328307743184268f, -0.0005715822335332632f, -0.00048438881640322506f, -0.0005134523380547762f, -0.0005522037972696126f, }; +static float EXPECTED_WEIGHT_GRAD_CHW_PAD0[] = {-0.0004998184740543365f, -0.0004898195038549602f, -0.00046982415369711816f, -0.0005098147667013109f, -0.0004998157965019345f, -0.00047982041724026203f, -0.0005298074684105814f, -0.000519808498211205f, -0.0004998131189495325f, -0.0005098147667013109f, -0.0004998157965019345f, -0.00047982041724026203f, -0.0005298074684105814f, -0.000519808498211205f, -0.0004998131189495325f, -0.0005597965209744871f, -0.0005497975507751107f, -0.0005298021715134382f, -0.0005298074684105814f, -0.000519808498211205f, -0.0004998131189495325f, -0.0005597965209744871f, -0.0005497975507751107f, -0.0005298021715134382f, -0.0005997819826006889f, -0.0005897830124013126f, -0.0005697876331396401f, -0.0004946771659888327f, -0.00048478099051862955f, -0.0004649912880267948f, -0.0005045706056989729f, -0.0004946744302287698f, -0.0004748846695292741f, -0.0005243575433269143f, -0.0005144614260643721f, -0.0004946716944687068f, -0.0005045706056989729f, -0.0004946744302287698f, -0.0004748846695292741f, -0.0005243575433269143f, -0.0005144614260643721f, -0.0004946716944687068f, -0.0005540381534956396f, -0.0005441419780254364f, -0.0005243522464297712f, -0.0005243575433269143f, -0.0005144614260643721f, -0.0004946716944687068f, -0.0005540381534956396f, -0.0005441419780254364f, -0.0005243522464297712f, -0.000593612261582166f, 
-0.0005837160861119628f, -0.0005639263545162976f, -0.0004895358579233289f, -0.0004797424771822989f, -0.00046015839325264096f, -0.0004993263864889741f, -0.000489533063955605f, -0.00046994895092211664f, -0.0005189076764509082f, -0.0005091143539175391f, -0.0004895302699878812f, -0.0004993263864889741f, -0.000489533063955605f, -0.00046994895092211664f, -0.0005189076764509082f, -0.0005091143539175391f, -0.0004895302699878812f, -0.0005482797278091311f, -0.0005384864052757621f, -0.0005189022631384432f, -0.0005189076764509082f, -0.0005091143539175391f, -0.0004895302699878812f, -0.0005482797278091311f, -0.0005384864052757621f, -0.0005189022631384432f, -0.000587442540563643f, -0.0005776492180302739f, -0.0005580650758929551f, -0.0004843945207539946f, -0.00047470396384596825f, -0.00045532549847848713f, -0.0004940821672789752f, -0.0004843916685786098f, -0.0004650132032111287f, -0.0005134578095749021f, -0.0005037672817707062f, -0.00048438881640322506f, -0.0004940821672789752f, -0.0004843916685786098f, -0.0004650132032111287f, -0.0005134578095749021f, -0.0005037672817707062f, -0.00048438881640322506f, -0.0005425213603302836f, -0.0005328307743184268f, -0.0005134523380547762f, -0.0005134578095749021f, -0.0005037672817707062f, -0.00048438881640322506f, -0.0005425213603302836f, -0.0005328307743184268f, -0.0005134523380547762f, -0.00058127281954512f, -0.0005715822335332632f, -0.0005522037972696126f, }; +static float EXPECTED_WEIGHT_GRAD_HWC_PAD1[] = {-0.00028163238312117755f, -0.0002900820109061897f, -0.00030416433583013713f, -0.00036843473208136857f, -0.0003796953533310443f, -0.00039846255094744265f, -0.00027288994169794023f, -0.0002813306055031717f, -0.0002953980292659253f, -0.0003824028535746038f, -0.00039742409717291594f, -0.00041995575884357095f, -0.0005004326812922955f, -0.0005204511107876897f, -0.0005504785222001374f, -0.0003707402211148292f, -0.00038574551581405103f, -0.0004082533123437315f, -0.0002900689432863146f, -0.00030414978391490877f, -0.00032386320526711643f, 
-0.00037967925891280174f, -0.00039844459388405085f, -0.0004247162432875484f, -0.00028131791623309255f, -0.0002953837683890015f, -0.000315076089464128f, -0.00027875672094523907f, -0.0002871202304959297f, -0.00030105889891274273f, -0.000364675564924255f, -0.0003758214006666094f, -0.00039439729880541563f, -0.00027010016492567956f, -0.0002784546813927591f, -0.0002923783322330564f, -0.00037850451190024614f, -0.0003933725238312036f, -0.00041567449807189405f, -0.0004953343886882067f, -0.0005151488003320992f, -0.0005448703304864466f, -0.00036695622839033604f, -0.0003818082623183727f, -0.0004040862841065973f, -0.0002871080650947988f, -0.00030104504548944533f, -0.0003205570683348924f, -0.0003758064704015851f, -0.00039438018575310707f, -0.0004203836724627763f, -0.0002784428361337632f, -0.0002923647698480636f, -0.00031185566331259906f, -0.00027588108787313104f, -0.00028415845008566976f, -0.0002979534910991788f, -0.000360916368663311f, -0.0003719475062098354f, -0.0003903320466633886f, -0.00026731035904958844f, -0.00027557872817851603f, -0.00028935872251167893f, -0.00037460619932971895f, -0.0003893210960086435f, -0.00041139329550787807f, -0.0004902360960841179f, -0.0005098464898765087f, -0.000539262022357434f, -0.000363172177458182f, -0.0003778710088226944f, -0.00039991919766180217f, -0.0002841472451109439f, -0.0002979403070639819f, -0.0003172509605064988f, -0.000371933652786538f, -0.0003903157194145024f, -0.0004160511016380042f, -0.0002755677269306034f, -0.0002893457130994648f, -0.00030863520805723965f, -0.00027300542569719255f, -0.00028119664057157934f, -0.0002948480541817844f, -0.0003571571141947061f, -0.0003680735535454005f, -0.00038626688183285296f, -0.0002645205822773278f, -0.000272702774964273f, -0.000286339083686471f, -0.00037070788675919175f, -0.00038526958087459207f, -0.0004071120056323707f, -0.00048513777437619865f, -0.0005045442376285791f, -0.0005336537724360824f, -0.0003593881556298584f, -0.0003739337553270161f, -0.0003957521403208375f, -0.0002811863669194281f, 
-0.00029483556863851845f, -0.00031394485267810524f, -0.0003680608351714909f, -0.0003862513112835586f, -0.00041171853081323206f, -0.00027269264683127403f, -0.0002863266854546964f, -0.0003054147236980498f, }; +static float EXPECTED_WEIGHT_GRAD_CHW_PAD1[] = {-0.00028163238312117755f, -0.00036843473208136857f, -0.00027288994169794023f, -0.0003824028535746038f, -0.0005004326812922955f, -0.0003707402211148292f, -0.0002900689432863146f, -0.00037967925891280174f, -0.00028131791623309255f, -0.0002900820109061897f, -0.0003796953533310443f, -0.0002813306055031717f, -0.00039742409717291594f, -0.0005204511107876897f, -0.00038574551581405103f, -0.00030414978391490877f, -0.00039844459388405085f, -0.0002953837683890015f, -0.00030416433583013713f, -0.00039846255094744265f, -0.0002953980292659253f, -0.00041995575884357095f, -0.0005504785222001374f, -0.0004082533123437315f, -0.00032386320526711643f, -0.0004247162432875484f, -0.000315076089464128f, -0.00027875672094523907f, -0.000364675564924255f, -0.00027010016492567956f, -0.00037850451190024614f, -0.0004953343886882067f, -0.00036695622839033604f, -0.0002871080650947988f, -0.0003758064704015851f, -0.0002784428361337632f, -0.0002871202304959297f, -0.0003758214006666094f, -0.0002784546813927591f, -0.0003933725238312036f, -0.0005151488003320992f, -0.0003818082623183727f, -0.00030104504548944533f, -0.00039438018575310707f, -0.0002923647698480636f, -0.00030105889891274273f, -0.00039439729880541563f, -0.0002923783322330564f, -0.00041567449807189405f, -0.0005448703304864466f, -0.0004040862841065973f, -0.0003205570683348924f, -0.0004203836724627763f, -0.00031185566331259906f, -0.00027588108787313104f, -0.000360916368663311f, -0.00026731035904958844f, -0.00037460619932971895f, -0.0004902360960841179f, -0.000363172177458182f, -0.0002841472451109439f, -0.000371933652786538f, -0.0002755677269306034f, -0.00028415845008566976f, -0.0003719475062098354f, -0.00027557872817851603f, -0.0003893210960086435f, -0.0005098464898765087f, 
-0.0003778710088226944f, -0.0002979403070639819f, -0.0003903157194145024f, -0.0002893457130994648f, -0.0002979534910991788f, -0.0003903320466633886f, -0.00028935872251167893f, -0.00041139329550787807f, -0.000539262022357434f, -0.00039991919766180217f, -0.0003172509605064988f, -0.0004160511016380042f, -0.00030863520805723965f, -0.00027300542569719255f, -0.0003571571141947061f, -0.0002645205822773278f, -0.00037070788675919175f, -0.00048513777437619865f, -0.0003593881556298584f, -0.0002811863669194281f, -0.0003680608351714909f, -0.00027269264683127403f, -0.00028119664057157934f, -0.0003680735535454005f, -0.000272702774964273f, -0.00038526958087459207f, -0.0005045442376285791f, -0.0003739337553270161f, -0.00029483556863851845f, -0.0003862513112835586f, -0.0002863266854546964f, -0.0002948480541817844f, -0.00038626688183285296f, -0.000286339083686471f, -0.0004071120056323707f, -0.0005336537724360824f, -0.0003957521403208375f, -0.00031394485267810524f, -0.00041171853081323206f, -0.0003054147236980498f, }; +static float EXPECTED_BIAS_GRAD_PAD0[] = {-0.499819815158844f, -0.4946784973144531f, -0.48953720927238464f, -0.48439592123031616f, }; +static float EXPECTED_BIAS_GRAD_PAD1[] = {-0.5004519820213318f, -0.4953538179397583f, -0.4902556836605072f, -0.4851575791835785f, }; +static float EXPECTED_INPUT_GRAD_HWC_PAD0[] = {-0.0073137227445840836f, -0.012232370674610138f, -0.01715102046728134f, -0.01955394446849823f, -0.02939644455909729f, -0.03923893719911575f, -0.02939644455909729f, -0.03923893719911575f, -0.04908143728971481f, -0.017169270664453506f, -0.022093120962381363f, -0.02701696939766407f, -0.019545499235391617f, -0.029382457956671715f, -0.039219412952661514f, -0.04894886165857315f, -0.06863316893577576f, -0.08831749111413956f, -0.06863316893577576f, -0.08831749111413956f, -0.10800180584192276f, -0.039261117577552795f, -0.04910847544670105f, -0.058955833315849304f, -0.029382457956671715f, -0.039219412952661514f, -0.04905637353658676f, -0.06863316893577576f, 
-0.08831749111413956f, -0.10800180584192276f, -0.08831749111413956f, -0.10800180584192276f, -0.12768611311912537f, -0.04910847544670105f, -0.058955833315849304f, -0.06880319118499756f, -0.017149746417999268f, -0.022068055346608162f, -0.026986364275217056f, -0.039236053824424744f, -0.049077872186899185f, -0.05891969054937363f, -0.049077872186899185f, -0.05891969054937363f, -0.06876150518655777f, -0.027015015482902527f, -0.031938523054122925f, -0.03686203435063362f, }; +static float EXPECTED_INPUT_GRAD_CHW_PAD0[] = {-0.0073137227445840836f, -0.01955394446849823f, -0.02939644455909729f, -0.017169270664453506f, -0.019545499235391617f, -0.04894886165857315f, -0.06863316893577576f, -0.039261117577552795f, -0.029382457956671715f, -0.06863316893577576f, -0.08831749111413956f, -0.04910847544670105f, -0.017149746417999268f, -0.039236053824424744f, -0.049077872186899185f, -0.027015015482902527f, -0.012232370674610138f, -0.02939644455909729f, -0.03923893719911575f, -0.022093120962381363f, -0.029382457956671715f, -0.06863316893577576f, -0.08831749111413956f, -0.04910847544670105f, -0.039219412952661514f, -0.08831749111413956f, -0.10800180584192276f, -0.058955833315849304f, -0.022068055346608162f, -0.049077872186899185f, -0.05891969054937363f, -0.031938523054122925f, -0.01715102046728134f, -0.03923893719911575f, -0.04908143728971481f, -0.02701696939766407f, -0.039219412952661514f, -0.08831749111413956f, -0.10800180584192276f, -0.058955833315849304f, -0.04905637353658676f, -0.10800180584192276f, -0.12768611311912537f, -0.06880319118499756f, -0.026986364275217056f, -0.05891969054937363f, -0.06876150518655777f, -0.03686203435063362f, }; +static float EXPECTED_INPUT_GRAD_HWC_PAD1[] = {-0.012242590077221394f, -0.01716531626880169f, -0.02208804152905941f, -0.02206328697502613f, -0.02945086546242237f, -0.03683844581246376f, -0.02208544686436653f, -0.029481014236807823f, -0.036876581609249115f, -0.01720142923295498f, -0.022134728729724884f, -0.02706803008913994f, -0.022054532542824745f, 
-0.02943808026611805f, -0.0368216335773468f, -0.03863302618265152f, -0.049713555723428726f, -0.06079408526420593f, -0.038672227412462234f, -0.04976480081677437f, -0.0608573742210865f, -0.029500313103199005f, -0.03689979016780853f, -0.04429926723241806f, -0.022053170949220657f, -0.029436931014060974f, -0.03682069107890129f, -0.03863108903169632f, -0.04971195012331009f, -0.060792818665504456f, -0.03867059573531151f, -0.0497635118663311f, -0.060856424272060394f, -0.029499433934688568f, -0.03689911961555481f, -0.04429881274700165f, -0.017164483666419983f, -0.022087272256612778f, -0.027010060846805573f, -0.029449626803398132f, -0.03683731332421303f, -0.04422499239444733f, -0.0294799767434597f, -0.036875661462545395f, -0.04427134618163109f, -0.022134138271212578f, -0.027067512273788452f, -0.032000888139009476f, }; +static float EXPECTED_INPUT_GRAD_CHW_PAD1[] = {-0.012242590077221394f, -0.02206328697502613f, -0.02208544686436653f, -0.01720142923295498f, -0.022054532542824745f, -0.03863302618265152f, -0.038672227412462234f, -0.029500313103199005f, -0.022053170949220657f, -0.03863108903169632f, -0.03867059573531151f, -0.029499433934688568f, -0.017164483666419983f, -0.029449626803398132f, -0.0294799767434597f, -0.022134138271212578f, -0.01716531626880169f, -0.02945086546242237f, -0.029481014236807823f, -0.022134728729724884f, -0.02943808026611805f, -0.049713555723428726f, -0.04976480081677437f, -0.03689979016780853f, -0.029436931014060974f, -0.04971195012331009f, -0.0497635118663311f, -0.03689911961555481f, -0.022087272256612778f, -0.03683731332421303f, -0.036875661462545395f, -0.027067512273788452f, -0.02208804152905941f, -0.03683844581246376f, -0.036876581609249115f, -0.02706803008913994f, -0.0368216335773468f, -0.06079408526420593f, -0.0608573742210865f, -0.04429926723241806f, -0.03682069107890129f, -0.060792818665504456f, -0.060856424272060394f, -0.04429881274700165f, -0.027010060846805573f, -0.04422499239444733f, -0.04427134618163109f, -0.032000888139009476f, }; + + 
+struct TestVector +{ + struct blob in; // in + expected in grad + struct blob out; // out_grad + expected out + struct blob weight; // weight + expected weight grad + struct blob bias; // bias + expected bias grad +}; + +static struct TestVector test_vectors[] = { + { // HWC = 0, PAD = 0 + .in = { + .data = INPUT_CHW, + .diff = EXPECTED_INPUT_GRAD_CHW_PAD0, + .H = 4, .W = 4, .C = 3 + }, + .out = { + .diff = OUTPUT_GRAD_CHW_PAD0, + .data = EXPECTED_OUTPUT_CHW_PAD0, + .H = 2, .W = 2, .C = 4 + }, + .weight = { + .data = WEIGHTS, + .diff = EXPECTED_WEIGHT_GRAD_CHW_PAD0, + .H = 3, .W = 3, .C = 3 + }, + .bias = { + .data = BIASES, + .diff = EXPECTED_BIAS_GRAD_PAD0, + .dim = 4 + } + }, + { // HWC = 1, PAD = 0 + .in = { + .data = INPUT_HWC, + .diff = EXPECTED_INPUT_GRAD_HWC_PAD0, + .H = 4, .W = 4, .C = 3 + }, + .out = { + .diff = OUTPUT_GRAD_HWC_PAD0, + .data = EXPECTED_OUTPUT_HWC_PAD0, + .H = 2, .W = 2, .C = 4 + }, + .weight = { + .data = WEIGHTS, + .diff = EXPECTED_WEIGHT_GRAD_HWC_PAD0, + .H = 3, .W = 3, .C = 3 + }, + .bias = { + .data = BIASES, + .diff = EXPECTED_BIAS_GRAD_PAD0, + .dim = 4 + } + }, + { // HWC = 0, PAD = 1 + .in = { + .data = INPUT_CHW, + .diff = EXPECTED_INPUT_GRAD_CHW_PAD1, + .H = 4, .W = 4, .C = 3 + }, + .out = { + .diff = OUTPUT_GRAD_CHW_PAD1, + .data = EXPECTED_OUTPUT_CHW_PAD1, + .H = 4, .W = 4, .C = 4 + }, + .weight = { + .data = WEIGHTS, + .diff = EXPECTED_WEIGHT_GRAD_CHW_PAD1, + .H = 3, .W = 3, .C = 3 + }, + .bias = { + .data = BIASES, + .diff = EXPECTED_BIAS_GRAD_PAD1, + .dim = 4 + } + }, + { // HWC = 1, PAD = 1 + .in = { + .data = INPUT_HWC, + .diff = EXPECTED_INPUT_GRAD_HWC_PAD1, + .H = 4, .W = 4, .C = 3 + }, + .out = { + .diff = OUTPUT_GRAD_HWC_PAD1, + .data = EXPECTED_OUTPUT_HWC_PAD1, + .H = 4, .W = 4, .C = 4 + }, + .weight = { + .data = WEIGHTS, + .diff = EXPECTED_WEIGHT_GRAD_HWC_PAD1, + .H = 3, .W = 3, .C = 3 + }, + .bias = { + .data = BIASES, + .diff = EXPECTED_BIAS_GRAD_PAD1, + .dim = 4 + } + } +}; + +void set_array(float *array, size_t 
size, float value) +{ + for (int i = 0; i < size; i++) { + array[i] = value; + } +} + +// create a deep copy of a test vector +void copy_test_vector(const struct TestVector *src, struct TestVector* dst) +{ + *dst = *src; + + dst->in.data = malloc(src->in.dim * sizeof(float)); + dst->in.diff = malloc(src->in.dim * sizeof(float)); + dst->out.data = malloc(src->out.dim * sizeof(float)); + dst->out.diff = malloc(src->out.dim * sizeof(float)); + dst->weight.data = malloc(src->weight.dim * sizeof(float)); + dst->weight.diff = malloc(src->weight.dim * sizeof(float)); + dst->bias.data = malloc(src->bias.dim * sizeof(float)); + dst->bias.diff = malloc(src->bias.dim * sizeof(float)); + memcpy(dst->in.data, src->in.data, src->in.dim * sizeof(float)); + memcpy(dst->in.diff, src->in.diff, src->in.dim * sizeof(float)); + memcpy(dst->out.diff, src->out.diff, src->out.dim * sizeof(float)); + memcpy(dst->out.data, src->out.data, src->out.dim * sizeof(float)); + memcpy(dst->weight.data, src->weight.data, src->weight.dim * sizeof(float)); + memcpy(dst->weight.diff, src->weight.diff, src->weight.dim * sizeof(float)); + memcpy(dst->bias.data, src->bias.data, src->bias.dim * sizeof(float)); + memcpy(dst->bias.diff, src->bias.diff, src->bias.dim * sizeof(float)); +} + +// free a copied test vector +void free_test_vector(struct TestVector *v) +{ + free(v->in.data); + free(v->in.diff); + free(v->out.data); + free(v->out.diff); + free(v->weight.data); + free(v->weight.diff); + free(v->bias.data); + free(v->bias.diff); +} + +void create_test_vectors(int hwc, int use_im2col, int padding, struct Conv2D_args* args, struct TestVector* expected) +{ + int idx = hwc + 2*padding; + static struct TestVector a; + struct TestVector v = test_vectors[idx]; + + // populate .dim fields + v.in.dim = v.in.C * v.in.H * v.in.W; + v.out.dim = v.out.C * v.out.H * v.out.W; + v.weight.dim = v.weight.C * v.weight.H * v.weight.W * v.out.C; + + // create two deep copies of the test vector so that we don't overwrite 
the + // original one. One copy to populate args and one as expected values + copy_test_vector(&v, expected); + copy_test_vector(&v, &a); + + // set some buffers to zero for the argument blobs + set_array(a.in.diff, a.in.dim, 0.0); + set_array(a.out.data, a.out.dim, 0.0); + set_array(a.weight.diff, a.weight.dim, 0.0); + set_array(a.bias.diff, a.bias.dim, 0.0); + + // allocate workspace buffers in case needed + int im2col_buffer_size = v.in.dim * v.out.C * v.weight.W * v.weight.H; + int bt_buffer_size = (v.weight.H * v.weight.W * v.weight.C * v.out.C) > + (v.out.H * v.out.W * v.out.C) ? + (v.weight.H * v.weight.W * v.weight.C * v.out.C) : + (v.out.H * v.out.W * v.out.C); + float *im2col_buffer = calloc(im2col_buffer_size, sizeof(float)); + float *bt_buffer = calloc(bt_buffer_size, sizeof(float)); + + // populate conv2d parameter struct + args->input = &a.in; + args->coeff = &a.weight; + args->bias = &a.bias; + args->output = &a.out; + args->Lpad = padding; + args->Rpad = padding; + args->Upad = padding; + args->Dpad = padding; + args->stride_h = 1; + args->stride_w = 1; + args->i2c_buffer = im2col_buffer; + args->bt_buffer = bt_buffer; + args->skip_wg_grad = 0; + args->skip_in_grad = 0; + args->HWC = hwc; + args->opt_matmul_type_fw = 0; + args->opt_matmul_type_wg = 0; + args->opt_matmul_type_ig = 0; + args->USE_IM2COL = use_im2col; + args->USE_DMA_IM2COL = 0; + args->USE_BIASES = 1; +} + +void free_test_vectors(struct Conv2D_args* args, struct TestVector* expected) +{ + free(args->input->data); + free(args->input->diff); + free(args->output->data); + free(args->output->diff); + free(args->coeff->data); + free(args->coeff->diff); + free(args->bias->data); + free(args->bias->diff); + free(args->i2c_buffer); + free(args->bt_buffer); + free_test_vector(expected); +} + +// called before each test +void setUp(void) +{ +} + +// called after each test +void tearDown(void) +{ +} + +TEST_CASE(1, 1, 0) // HWC, im2col, no padding +TEST_CASE(0, 1, 0) // CHW, im2col, no padding 
+TEST_CASE(0, 0, 0) // CHW, naive, no padding +TEST_CASE(1, 0, 0) // HWC, naive, no padding +//TEST_CASE(1, 1, 1) // HWC, im2col, same padding --> fails: Padding not implemented for HWC im2col without DMA! +TEST_CASE(0, 1, 1) // CHW, im2col, same padding +TEST_CASE(0, 0, 1) // CHW, naive, same padding +TEST_CASE(1, 0, 1) // HWC, naive, same padding +void test_pulp_conv2d_fp32_fw_cl(int hwc, int use_im2col, int padding) +{ + struct Conv2D_args args; + struct TestVector expected; + create_test_vectors(hwc, use_im2col, padding, &args, &expected); + + // for the naive HWC case, we haven't implemented it yet so expect zeros + if (hwc == 1 && use_im2col == 0) { + set_array(expected.out.data, expected.out.dim, 0.0); + } + + pulp_conv2d_fp32_fw_cl(&args); + TEST_ASSERT_FLOAT_ARRAY_WITHIN(DELTA, expected.out.data, args.output->data, args.output->dim); + + free_test_vectors(&args, &expected); +} + +TEST_CASE(1, 1, 0) // HWC, im2col, no padding +TEST_CASE(0, 1, 0) // CHW, im2col, no padding +TEST_CASE(0, 0, 0) // CHW, naive, no padding +TEST_CASE(1, 0, 0) // HWC, naive, no padding +//TEST_CASE(1, 1, 1) // HWC, im2col, same padding --> fails: Padding not implemented for HWC im2col without DMA! 
+TEST_CASE(0, 1, 1) // CHW, im2col, same padding +TEST_CASE(0, 0, 1) // CHW, naive, same padding +TEST_CASE(1, 0, 1) // HWC, naive, same padding +void test_pulp_conv2d_fp32_bw_param_grads_cl(int hwc, int use_im2col, int padding) +{ + struct Conv2D_args args; + struct TestVector expected; + create_test_vectors(hwc, use_im2col, padding, &args, &expected); + + // for the naive HWC case, we haven't implemented it yet so expect zeros + if (hwc == 1 && use_im2col == 0) { + set_array(expected.weight.diff, expected.weight.dim, 0.0); + set_array(expected.bias.diff, expected.bias.dim, 0.0); + } + + pulp_conv2d_fp32_bw_param_grads_cl(&args); + TEST_ASSERT_FLOAT_ARRAY_WITHIN(DELTA, expected.weight.diff, args.coeff->diff, args.coeff->dim); + TEST_ASSERT_FLOAT_ARRAY_WITHIN(DELTA, expected.bias.diff, args.bias->diff, args.bias->dim); + + free_test_vectors(&args, &expected); +} + +TEST_CASE(1, 1, 0) // HWC, im2col, no padding +TEST_CASE(0, 1, 0) // CHW, im2col, no padding +TEST_CASE(0, 0, 0) // CHW, naive, no padding +TEST_CASE(1, 0, 0) // HWC, naive, no padding +//TEST_CASE(1, 1, 1) // HWC, im2col, same padding --> fails +//TEST_CASE(0, 1, 1) // CHW, im2col, same padding --> fails +TEST_CASE(0, 0, 1) // CHW, naive, same padding +TEST_CASE(1, 0, 1) // HWC, naive, same padding +void test_pulp_conv2d_fp32_bw_input_grads_cl(int hwc, int use_im2col, int padding) +{ + struct Conv2D_args args; + struct TestVector expected; + create_test_vectors(hwc, use_im2col, padding, &args, &expected); + + // for the naive HWC case, we haven't implemented it yet so expect zeros + if (hwc == 1 && use_im2col == 0) { + set_array(expected.in.diff, expected.in.dim, 0.0); + } + + pulp_conv2d_fp32_bw_input_grads_cl(&args); + TEST_ASSERT_FLOAT_ARRAY_WITHIN(DELTA, expected.in.diff, args.input->diff, args.input->dim); + + free_test_vectors(&args, &expected); +} + +TEST_CASE(0, 0) // calculate both weight and input gradients +TEST_CASE(1, 0) // skip weight gradient calculation +TEST_CASE(0, 1) // skip input 
gradient calculation +void test_pulp_conv2d_fp32_bw_cl(int skip_wg_grad, int skip_in_grad) +{ + struct Conv2D_args args; + struct TestVector expected; + create_test_vectors(0, 0, 0, &args, &expected); + + // test skip grad calculations + args.skip_wg_grad = skip_wg_grad; + args.skip_in_grad = skip_in_grad; + + if (skip_wg_grad) { + set_array(expected.weight.diff, expected.weight.dim, 0); + set_array(expected.bias.diff, expected.bias.dim, 0); + } + if (skip_in_grad) { + set_array(expected.in.diff, expected.in.dim, 0); + } + + pulp_conv2d_fp32_bw_cl(&args); + TEST_ASSERT_FLOAT_ARRAY_WITHIN(DELTA, expected.in.diff, args.input->diff, args.input->dim); + TEST_ASSERT_FLOAT_ARRAY_WITHIN(DELTA, expected.weight.diff, args.coeff->diff, args.coeff->dim); + TEST_ASSERT_FLOAT_ARRAY_WITHIN(DELTA, expected.bias.diff, args.bias->diff, args.bias->dim); + + free_test_vectors(&args, &expected); +} + +#endif // TEST From 660226e9a18c5c1c8806f5d60283b03868c5392c Mon Sep 17 00:00:00 2001 From: Maarten Vandersteegen Date: Tue, 10 Mar 2026 10:56:12 +0100 Subject: [PATCH 3/4] Added unit tests for pulp_linear_fp32 Also introduced test_utils.c for common unittest functions Refactored pulp_matmul_fp32 tests with conditional test cases --- unittests/generic/support/test_utils.c | 10 + unittests/generic/support/test_utils.h | 8 + unittests/generic/test_pulp_conv2d_fp32.c | 30 +- unittests/generic/test_pulp_linear_fp32.c | 219 +++++++++ unittests/generic/test_pulp_matmul_fp32.c | 521 ++++++++-------------- 5 files changed, 447 insertions(+), 341 deletions(-) create mode 100644 unittests/generic/support/test_utils.c create mode 100644 unittests/generic/support/test_utils.h create mode 100644 unittests/generic/test_pulp_linear_fp32.c diff --git a/unittests/generic/support/test_utils.c b/unittests/generic/support/test_utils.c new file mode 100644 index 00000000..a0c97ab5 --- /dev/null +++ b/unittests/generic/support/test_utils.c @@ -0,0 +1,10 @@ +#include "test_utils.h" + + +void 
set_array_fp32(float *array, size_t size, float value) +{ + for (int i = 0; i < size; i++) { + array[i] = value; + } +} + diff --git a/unittests/generic/support/test_utils.h b/unittests/generic/support/test_utils.h new file mode 100644 index 00000000..16786cb8 --- /dev/null +++ b/unittests/generic/support/test_utils.h @@ -0,0 +1,8 @@ +#ifndef TEST_UTILS_H +#define TEST_UTILS_H + +#include + +void set_array_fp32(float *array, size_t size, float value); + +#endif /* TEST_UTILS_H */ diff --git a/unittests/generic/test_pulp_conv2d_fp32.c b/unittests/generic/test_pulp_conv2d_fp32.c index f2fe03cb..63487054 100644 --- a/unittests/generic/test_pulp_conv2d_fp32.c +++ b/unittests/generic/test_pulp_conv2d_fp32.c @@ -3,6 +3,7 @@ #include #include #include "unity.h" +#include "test_utils.h" #include "pmsis.h" #include "pulp_train_defines.h" @@ -142,13 +143,6 @@ static struct TestVector test_vectors[] = { } }; -void set_array(float *array, size_t size, float value) -{ - for (int i = 0; i < size; i++) { - array[i] = value; - } -} - // create a deep copy of a test vector void copy_test_vector(const struct TestVector *src, struct TestVector* dst) { @@ -202,10 +196,10 @@ void create_test_vectors(int hwc, int use_im2col, int padding, struct Conv2D_arg copy_test_vector(&v, &a); // set some buffers to zero for the argument blobs - set_array(a.in.diff, a.in.dim, 0.0); - set_array(a.out.data, a.out.dim, 0.0); - set_array(a.weight.diff, a.weight.dim, 0.0); - set_array(a.bias.diff, a.bias.dim, 0.0); + set_array_fp32(a.in.diff, a.in.dim, 0.0); + set_array_fp32(a.out.data, a.out.dim, 0.0); + set_array_fp32(a.weight.diff, a.weight.dim, 0.0); + set_array_fp32(a.bias.diff, a.bias.dim, 0.0); // allocate workspace buffers in case needed int im2col_buffer_size = v.in.dim * v.out.C * v.weight.W * v.weight.H; @@ -281,7 +275,7 @@ void test_pulp_conv2d_fp32_fw_cl(int hwc, int use_im2col, int padding) // for the naive HWC case, we haven't implemented it yet so expect zeros if (hwc == 1 && use_im2col 
== 0) { - set_array(expected.out.data, expected.out.dim, 0.0); + set_array_fp32(expected.out.data, expected.out.dim, 0.0); } pulp_conv2d_fp32_fw_cl(&args); @@ -306,8 +300,8 @@ void test_pulp_conv2d_fp32_bw_param_grads_cl(int hwc, int use_im2col, int paddin // for the naive HWC case, we haven't implemented it yet so expect zeros if (hwc == 1 && use_im2col == 0) { - set_array(expected.weight.diff, expected.weight.dim, 0.0); - set_array(expected.bias.diff, expected.bias.dim, 0.0); + set_array_fp32(expected.weight.diff, expected.weight.dim, 0.0); + set_array_fp32(expected.bias.diff, expected.bias.dim, 0.0); } pulp_conv2d_fp32_bw_param_grads_cl(&args); @@ -333,7 +327,7 @@ void test_pulp_conv2d_fp32_bw_input_grads_cl(int hwc, int use_im2col, int paddin // for the naive HWC case, we haven't implemented it yet so expect zeros if (hwc == 1 && use_im2col == 0) { - set_array(expected.in.diff, expected.in.dim, 0.0); + set_array_fp32(expected.in.diff, expected.in.dim, 0.0); } pulp_conv2d_fp32_bw_input_grads_cl(&args); @@ -356,11 +350,11 @@ void test_pulp_conv2d_fp32_bw_cl(int skip_wg_grad, int skip_in_grad) args.skip_in_grad = skip_in_grad; if (skip_wg_grad) { - set_array(expected.weight.diff, expected.weight.dim, 0); - set_array(expected.bias.diff, expected.bias.dim, 0); + set_array_fp32(expected.weight.diff, expected.weight.dim, 0); + set_array_fp32(expected.bias.diff, expected.bias.dim, 0); } if (skip_in_grad) { - set_array(expected.in.diff, expected.in.dim, 0); + set_array_fp32(expected.in.diff, expected.in.dim, 0); } pulp_conv2d_fp32_bw_cl(&args); diff --git a/unittests/generic/test_pulp_linear_fp32.c b/unittests/generic/test_pulp_linear_fp32.c new file mode 100644 index 00000000..c526dd51 --- /dev/null +++ b/unittests/generic/test_pulp_linear_fp32.c @@ -0,0 +1,219 @@ +#ifdef TEST + +#include +#include +#include "unity.h" +#include "test_utils.h" + +#include "pmsis.h" +#include "pulp_train_defines.h" +#include "pulp_train_utils_fp32.h" +#include "pulp_linear_fp32.h" 
+#include "pulp_matmul_fp32.h" + +#define DELTA 1e-12 + + +// known parameters +static float WEIGHTS[] = {0.009999999776482582f, 0.019999999552965164f, 0.029999999329447746f, 0.03999999910593033f, 0.05000000074505806f, 0.05999999865889549f, 0.07000000029802322f, 0.07999999821186066f, 0.09000000357627869f, 0.10000000149011612f, 0.10999999940395355f, 0.11999999731779099f, 0.12999999523162842f, 0.14000000059604645f, 0.15000000596046448f, 0.1599999964237213f, 0.17000000178813934f, 0.18000000715255737f, 0.1899999976158142f, 0.20000000298023224f, 0.20999999344348907f, 0.2199999988079071f, 0.23000000417232513f, 0.23999999463558197f, 0.25f, 0.25999999046325684f, 0.27000001072883606f, 0.2800000011920929f, 0.28999999165534973f, 0.30000001192092896f, 0.3100000023841858f, 0.3199999928474426f, 0.33000001311302185f, 0.3400000035762787f, 0.3499999940395355f, 0.36000001430511475f, 0.3700000047683716f, 0.3799999952316284f, 0.38999998569488525f, 0.4000000059604645f, 0.4099999964237213f, 0.41999998688697815f, 0.4300000071525574f, 0.4399999976158142f, 0.44999998807907104f, 0.46000000834465027f, 0.4699999988079071f, 0.47999998927116394f, 0.49000000953674316f, 0.5f, 0.5099999904632568f, 0.5199999809265137f, 0.5299999713897705f, 0.5400000214576721f, 0.550000011920929f, 0.5600000023841858f, 0.5699999928474426f, 0.5799999833106995f, 0.5899999737739563f, 0.6000000238418579f, 0.6100000143051147f, 0.6200000047683716f, 0.6299999952316284f, 0.6399999856948853f}; +static float BIASES[] = {0.5f, 1.0f, 1.5f, 2.0f, 2.5f, 3.0f, 3.5f, 4.0f}; + +// known input and output gradient data +static float INPUT[] = {0.6613521575927734f, 0.266924113035202f, 0.06167725846171379f, 0.6213173270225525f, -0.4519059658050537f, -0.16613022983074188f, -1.522768497467041f, 0.38168391585350037f}; +static float OUTPUT_GRAD[] = {-0.14249178767204285f, -0.02044878900051117f, 0.10159420967102051f, 0.223637193441391f, 0.34568023681640625f, 0.4677232503890991f, 0.5897662043571472f, 0.7118092179298401f}; + +// expected output 
and expected weight/bias/input gradients +static float EXPECTED_OUTPUT[] = {0.4300328195095062f, 0.9182048439979553f, 1.406376838684082f, 1.894548773765564f, 2.382720947265625f, 2.8708930015563965f, 3.359064817428589f, 3.8472368717193604f}; +static float EXPECTED_WEIGHT_GRAD[] = {-0.09423725306987762f, -0.03803449496626854f, -0.008788502775132656f, -0.08853261917829514f, 0.06439288705587387f, 0.0236721932888031f, 0.21698200702667236f, -0.05438682436943054f, -0.013523850589990616f, -0.005458274856209755f, -0.0012612252030521631f, -0.012705187313258648f, 0.00924092996865511f, 0.0033971620723605156f, 0.031138772144913673f, -0.007804973982274532f, 0.06718955188989639f, 0.02711794339120388f, 0.00626605236902833f, 0.06312224268913269f, -0.0459110289812088f, -0.01687786914408207f, -0.15470446646213531f, 0.03877687454223633f, 0.1479029357433319f, 0.059694159775972366f, 0.013793328776955605f, 0.1389496624469757f, -0.10106298327445984f, -0.03715289756655693f, -0.34054768085479736f, 0.08535871654748917f, 0.2286163717508316f, 0.0922703891992569f, 0.021320609375834465f, 0.2147771269083023f, -0.15621496737003326f, -0.057427939027547836f, -0.5263909697532654f, 0.1319405883550644f, 0.3093297779560089f, 0.12484661489725113f, 0.028847888112068176f, 0.2906045615673065f, -0.2113669216632843f, -0.07770296931266785f, -0.7122342586517334f, 0.17852243781089783f, 0.39004313945770264f, 0.15742282569408417f, 0.03637516126036644f, 0.36643195152282715f, -0.26651886105537415f, -0.09797799587249756f, -0.8980773687362671f, 0.22510427236557007f, 0.47075656056404114f, 0.1899990439414978f, 0.0439024418592453f, 0.44225940108299255f, -0.3216708302497864f, -0.11825302988290787f, -1.0839205980300903f, 0.2716861367225647f}; +static float EXPECTED_BIAS_GRAD[] = {-0.14249178767204285f, -0.02044878900051117f, 0.10159420967102051f, 0.223637193441391f, 0.34568023681640625f, 0.4677232503890991f, 0.5897662043571472f, 0.7118092179298401f}; +static float EXPECTED_INPUT_GRAD[] = {1.0704727172851562f, 
1.0932453870773315f, 1.1160180568695068f, 1.1387908458709717f, 1.161563515663147f, 1.1843361854553223f, 1.2071088552474976f, 1.2298816442489624f}; + +struct TestVector +{ + struct blob in; // in + expected in grad + struct blob out; // out_grad + expected out + struct blob weight; // weight + expected weight grad + struct blob bias; // bias + expected bias grad +}; + +static struct TestVector test_vectors[] = { + { // in = 8, out = 8 + .in = { + .data = INPUT, + .diff = EXPECTED_INPUT_GRAD, + .dim = 8 + }, + .out = { + .diff = OUTPUT_GRAD, + .data = EXPECTED_OUTPUT, + .dim = 8 + }, + .weight = { + .data = WEIGHTS, + .diff = EXPECTED_WEIGHT_GRAD, + .dim = 8*8 + }, + .bias = { + .data = BIASES, + .diff = EXPECTED_BIAS_GRAD, + .dim = 8 + } + }, +}; + +// create a deep copy of a test vector +void copy_test_vector(const struct TestVector *src, struct TestVector* dst) +{ + *dst = *src; + + dst->in.data = malloc(src->in.dim * sizeof(float)); + dst->in.diff = malloc(src->in.dim * sizeof(float)); + dst->out.data = malloc(src->out.dim * sizeof(float)); + dst->out.diff = malloc(src->out.dim * sizeof(float)); + dst->weight.data = malloc(src->weight.dim * sizeof(float)); + dst->weight.diff = malloc(src->weight.dim * sizeof(float)); + dst->bias.data = malloc(src->bias.dim * sizeof(float)); + dst->bias.diff = malloc(src->bias.dim * sizeof(float)); + memcpy(dst->in.data, src->in.data, src->in.dim * sizeof(float)); + memcpy(dst->in.diff, src->in.diff, src->in.dim * sizeof(float)); + memcpy(dst->out.diff, src->out.diff, src->out.dim * sizeof(float)); + memcpy(dst->out.data, src->out.data, src->out.dim * sizeof(float)); + memcpy(dst->weight.data, src->weight.data, src->weight.dim * sizeof(float)); + memcpy(dst->weight.diff, src->weight.diff, src->weight.dim * sizeof(float)); + memcpy(dst->bias.data, src->bias.data, src->bias.dim * sizeof(float)); + memcpy(dst->bias.diff, src->bias.diff, src->bias.dim * sizeof(float)); +} + +// free a copied test vector +void free_test_vector(struct 
TestVector *v) +{ + free(v->in.data); + free(v->in.diff); + free(v->out.data); + free(v->out.diff); + free(v->weight.data); + free(v->weight.diff); + free(v->bias.data); + free(v->bias.diff); +} + +void create_test_vectors(struct Linear_args* args, struct TestVector* expected) +{ + static struct TestVector a; + struct TestVector v = test_vectors[0]; + + // create two deep copies of the test vector so that we don't overwrite the + // original one. One copy to populate args and one as expected values + copy_test_vector(&v, expected); + copy_test_vector(&v, &a); + + // set some buffers to zero for the argument blobs + set_array_fp32(a.in.diff, a.in.dim, 0.0); + set_array_fp32(a.out.data, a.out.dim, 0.0); + set_array_fp32(a.weight.diff, a.weight.dim, 0.0); + set_array_fp32(a.bias.diff, a.bias.dim, 0.0); + + // potulate linear parameter struct + args->input = &a.in; + args->coeff = &a.weight; + args->bias = &a.bias; + args->output = &a.out; + args->skip_wg_grad = 0; + args->skip_in_grad = 0; + args->opt_matmul_type_fw = 0; + args->opt_matmul_type_wg = 0; + args->opt_matmul_type_ig = 0; + args->use_biases = 1; +} + +void free_test_vectors(struct Linear_args* args, struct TestVector* expected) +{ + free(args->input->data); + free(args->input->diff); + free(args->output->data); + free(args->output->diff); + free(args->coeff->data); + free(args->coeff->diff); + free(args->bias->data); + free(args->bias->diff); + free_test_vector(expected); +} + +// called before each test +void setUp(void) +{ +} + +// called after each test +void tearDown(void) +{ +} + +void test_pulp_linear_fp32_fw_cl(void) +{ + struct Linear_args args; + struct TestVector expected; + create_test_vectors(&args, &expected); + + pulp_linear_fp32_fw_cl(&args); + TEST_ASSERT_FLOAT_ARRAY_WITHIN(DELTA, expected.out.data, args.output->data, args.output->dim); + + free_test_vectors(&args, &expected); +} + +void test_pulp_linear_fp32_bw_param_grads_cl(void) +{ + struct Linear_args args; + struct TestVector 
expected; + create_test_vectors(&args, &expected); + + pulp_linear_fp32_bw_param_grads_cl(&args); + TEST_ASSERT_FLOAT_ARRAY_WITHIN(DELTA, expected.weight.diff, args.coeff->diff, args.coeff->dim); + TEST_ASSERT_FLOAT_ARRAY_WITHIN(DELTA, expected.bias.diff, args.bias->diff, args.bias->dim); + + free_test_vectors(&args, &expected); +} + +void test_pulp_linear_fp32_bw_input_grads_cl(void) +{ + struct Linear_args args; + struct TestVector expected; + create_test_vectors(&args, &expected); + + pulp_linear_fp32_bw_input_grads_cl(&args); + TEST_ASSERT_FLOAT_ARRAY_WITHIN(DELTA, expected.in.diff, args.input->diff, args.input->dim); + + free_test_vectors(&args, &expected); +} + +TEST_CASE(0, 0) // calculate both weight and input gradients +TEST_CASE(1, 0) // skip weight gradient calculation +TEST_CASE(0, 1) // skip input gradient calculation +void test_pulp_linear_fp32_bw_cl(int skip_wg_grad, int skip_in_grad) +{ + struct Linear_args args; + struct TestVector expected; + create_test_vectors(&args, &expected); + + // test skip grad calculations + args.skip_wg_grad = skip_wg_grad; + args.skip_in_grad = skip_in_grad; + + if (skip_wg_grad) { + set_array_fp32(expected.weight.diff, expected.weight.dim, 0); + set_array_fp32(expected.bias.diff, expected.bias.dim, 0); + } + if (skip_in_grad) { + set_array_fp32(expected.in.diff, expected.in.dim, 0); + } + + pulp_linear_fp32_bw_cl(&args); + TEST_ASSERT_FLOAT_ARRAY_WITHIN(DELTA, expected.in.diff, args.input->diff, args.input->dim); + TEST_ASSERT_FLOAT_ARRAY_WITHIN(DELTA, expected.weight.diff, args.coeff->diff, args.coeff->dim); + TEST_ASSERT_FLOAT_ARRAY_WITHIN(DELTA, expected.bias.diff, args.bias->diff, args.bias->dim); + + free_test_vectors(&args, &expected); +} + + +#endif // TEST diff --git a/unittests/generic/test_pulp_matmul_fp32.c b/unittests/generic/test_pulp_matmul_fp32.c index 6079af30..74290b68 100644 --- a/unittests/generic/test_pulp_matmul_fp32.c +++ b/unittests/generic/test_pulp_matmul_fp32.c @@ -1,6 +1,8 @@ #ifdef TEST 
+#include #include "unity.h" +#include "test_utils.h" #include "pmsis.h" #include "pulp_train_defines.h" @@ -9,447 +11,320 @@ #define DELTA 1e-12 -#define IN_CH 9 -#define MID_CH 9 -#define OUT_CH 9 - -static float A[IN_CH*MID_CH] = { 1.0, 4.3, 2.1, 0.9, -1.5, 2.9, -1.2, 7.8, 9.3, -2.3, 5.8, 0.6, 1.4, 8.5, -8.6, -8.3, -9.6, 6.7, 5.6, 7.4, 9.6, 6.0, -0.8, 5.6, -7.6, 2.8, -7.1, 8.9, 0.4, -1.7, -4.7, 5.5, -0.9, 1.4, -9.6, 2.4, 2.2, 2.3, 8.9, 3.6, -2.8, -1.3, 4.0, -8.8, 3.3, 3.4, -5.8, -7.4, -3.7, -2.7, 1.4, -1.2, 9.8, -8.0, -5.8, -6.8, 3.1, -4.9, -0.7, -5.1, -6.8, -7.8, 3.1, -7.2, -6.1, -2.6, 6.4, -8.1, 6.8, -8.1, 9.5, -0.6, 9.5, 2.1, 4.8, -9.2, -4.3, -7.6, -4.1, -7.6, -3.6 }; -static float B[MID_CH*OUT_CH] = { -1.7, -8.7, 3.8, 1.3, -4.7, 0.5, -8.1, 1.5, 8.6, -3.6, 3.3, -7.4, 4.3, -4.2, -6.3, 1.7, -9.6, 6.6, -9.9, 3.6, -4.6, 4.7, 9.2, -5.0, 1.5, 1.8, 1.4, -5.5, 9.1, -1.1, 6.9, 4.0, -4.1, 6.3, -2.1, 7.6, 1.6, 7.6, 3.9, 4.5, 0.0, 9.1, 2.9, -1.5, 2.1, -9.6, -4.0, 3.2, -4.2, 2.4, -1.4, -7.3, -4.0, 1.4, 1.8, 1.5, 3.1, 3.0, -1.4, 7.9, -2.6, -1.3, 7.8, 6.1, 4.1, -8.0, 8.4, 4.3, 10.0, -7.0, 7.4, -6.8, 2.3, -7.5, 7.0, 6.1, 1.4, -1.9, -8.6, 3.9, -0.9 }; -static float Bt[MID_CH*OUT_CH] = { -1.7, -3.6, -9.9, -5.5, 1.6, -9.6, 1.8, 6.1, 2.3, -8.7, 3.3, 3.6, 9.1, 7.6, -4.0, 1.5, 4.1, -7.5, 3.8, -7.4, -4.6, -1.1, 3.9, 3.2, 3.1, -8.0, 7.0, 1.3, 4.3, 4.7, 6.9, 4.5, -4.2, 3.0, 8.4, 6.1, -4.7, -4.2, 9.2, 4.0, 0.0, 2.4, -1.4, 4.3, 1.4, 0.5, -6.3, -5.0, -4.1, 9.1, -1.4, 7.9, 10.0, -1.9, -8.1, 1.7, 1.5, 6.3, 2.9, -7.3, -2.6, -7.0, -8.6, 1.5, -9.6, 1.8, -2.1, -1.5, -4.0, -1.3, 7.4, 3.9, 8.6, 6.6, 1.4, 7.6, 2.1, 1.4, 7.8, -6.8, -0.9 }; -static float C[IN_CH*OUT_CH] = { -6.35, -41.33, -36.26, 135.59, 55.36, -7.64, -148.95, 48.31, -23.1, 7.46, 50.99, 47.64, 44.13, -43.35, -131.34, 156.8, -73.46, 30.3, -232.17, 89.71, -165.1, 55.91, 81.1, -150.09, 37.44, -76.25, 90.27, -6.97, -134.86, 160.36, -60.4, -119.99, -8.49, -38.13, -51.83, 125.41, -150.82, -17.66, 37.26, 30.49, 34.34, -158.98, 46.31, 
-58.37, 154.29, 130.17, -36.77, -50.22, -68.94, -38.52, 167.6, -86.54, 96.56, -120.03, 25.75, -55.76, 63.38, -104.76, 32.82, -92.31, 90.02, 57.11, -139.06, -11.51, 33.93, -92.44, -16.82, 128.81, -29.49, -29.04, 93.08, -191.91, -16.57, -154.75, -9.6, -105.07, -96.33, -124.8, 13.69, -6.05, 35.63 }; - -static float A_k1[IN_CH*MID_CH] = { 4.4, 7.3, 9.5, 7.1, -9.8, -2.8, 4.6, -6.6, 0.4 }; -static float B_k1[MID_CH*OUT_CH] = { -8.9, -6.0, -9.6, 5.9, -5.5, -3.1, 8.6, 4.1, -9.4 }; -static float C_k1[IN_CH*OUT_CH] = { -39.16, -26.4, -42.24, 25.96, -24.2, -13.64, 37.84, 18.04, -41.36, -64.97, -43.8, -70.08, 43.07, -40.15, -22.63, 62.78, 29.93, -68.62, -84.55, -57.0, -91.2, 56.05, -52.25, -29.45, 81.7, 38.95, -89.3, -63.19, -42.6, -68.16, 41.89, -39.05, -22.01, 61.06, 29.11, -66.74, 87.22, 58.8, 94.08, -57.82, 53.9, 30.38, -84.28, -40.18, 92.12, 24.92, 16.8, 26.88, -16.52, 15.4, 8.68, -24.08, -11.48, 26.32, -40.94, -27.6, -44.16, 27.14, -25.3, -14.26, 39.56, 18.86, -43.24, 58.74, 39.6, 63.36, -38.94, 36.3, 20.46, -56.76, -27.06, 62.04, -3.56, -2.4, -3.84, 2.36, -2.2, -1.24, 3.44, 1.64, -3.76 }; +static float A[] = { 1.0, 4.3, 2.1, 0.9, -1.5, 2.9, -1.2, 7.8, 9.3, -2.3, 5.8, 0.6, 1.4, 8.5, -8.6, -8.3, -9.6, 6.7, 5.6, 7.4, 9.6, 6.0, -0.8, 5.6, -7.6, 2.8, -7.1, 8.9, 0.4, -1.7, -4.7, 5.5, -0.9, 1.4, -9.6, 2.4, 2.2, 2.3, 8.9, 3.6, -2.8, -1.3, 4.0, -8.8, 3.3, 3.4, -5.8, -7.4, -3.7, -2.7, 1.4, -1.2, 9.8, -8.0, -5.8, -6.8, 3.1, -4.9, -0.7, -5.1, -6.8, -7.8, 3.1, -7.2, -6.1, -2.6, 6.4, -8.1, 6.8, -8.1, 9.5, -0.6, 9.5, 2.1, 4.8, -9.2, -4.3, -7.6, -4.1, -7.6, -3.6 }; +static float B[] = { -1.7, -8.7, 3.8, 1.3, -4.7, 0.5, -8.1, 1.5, 8.6, -3.6, 3.3, -7.4, 4.3, -4.2, -6.3, 1.7, -9.6, 6.6, -9.9, 3.6, -4.6, 4.7, 9.2, -5.0, 1.5, 1.8, 1.4, -5.5, 9.1, -1.1, 6.9, 4.0, -4.1, 6.3, -2.1, 7.6, 1.6, 7.6, 3.9, 4.5, 0.0, 9.1, 2.9, -1.5, 2.1, -9.6, -4.0, 3.2, -4.2, 2.4, -1.4, -7.3, -4.0, 1.4, 1.8, 1.5, 3.1, 3.0, -1.4, 7.9, -2.6, -1.3, 7.8, 6.1, 4.1, -8.0, 8.4, 4.3, 10.0, -7.0, 7.4, -6.8, 2.3, -7.5, 7.0, 
6.1, 1.4, -1.9, -8.6, 3.9, -0.9 }; +static float Bt[] = { -1.7, -3.6, -9.9, -5.5, 1.6, -9.6, 1.8, 6.1, 2.3, -8.7, 3.3, 3.6, 9.1, 7.6, -4.0, 1.5, 4.1, -7.5, 3.8, -7.4, -4.6, -1.1, 3.9, 3.2, 3.1, -8.0, 7.0, 1.3, 4.3, 4.7, 6.9, 4.5, -4.2, 3.0, 8.4, 6.1, -4.7, -4.2, 9.2, 4.0, 0.0, 2.4, -1.4, 4.3, 1.4, 0.5, -6.3, -5.0, -4.1, 9.1, -1.4, 7.9, 10.0, -1.9, -8.1, 1.7, 1.5, 6.3, 2.9, -7.3, -2.6, -7.0, -8.6, 1.5, -9.6, 1.8, -2.1, -1.5, -4.0, -1.3, 7.4, 3.9, 8.6, 6.6, 1.4, 7.6, 2.1, 1.4, 7.8, -6.8, -0.9 }; +static float C[] = { -6.35, -41.33, -36.26, 135.59, 55.36, -7.64, -148.95, 48.31, -23.1, 7.46, 50.99, 47.64, 44.13, -43.35, -131.34, 156.8, -73.46, 30.3, -232.17, 89.71, -165.1, 55.91, 81.1, -150.09, 37.44, -76.25, 90.27, -6.97, -134.86, 160.36, -60.4, -119.99, -8.49, -38.13, -51.83, 125.41, -150.82, -17.66, 37.26, 30.49, 34.34, -158.98, 46.31, -58.37, 154.29, 130.17, -36.77, -50.22, -68.94, -38.52, 167.6, -86.54, 96.56, -120.03, 25.75, -55.76, 63.38, -104.76, 32.82, -92.31, 90.02, 57.11, -139.06, -11.51, 33.93, -92.44, -16.82, 128.81, -29.49, -29.04, 93.08, -191.91, -16.57, -154.75, -9.6, -105.07, -96.33, -124.8, 13.69, -6.05, 35.63 }; + +static float A_k1[] = { 4.4, 7.3, 9.5, 7.1, -9.8, -2.8, 4.6, -6.6, 0.4 }; +static float B_k1[] = { -8.9, -6.0, -9.6, 5.9, -5.5, -3.1, 8.6, 4.1, -9.4 }; +static float C_k1[] = { -39.16, -26.4, -42.24, 25.96, -24.2, -13.64, 37.84, 18.04, -41.36, -64.97, -43.8, -70.08, 43.07, -40.15, -22.63, 62.78, 29.93, -68.62, -84.55, -57.0, -91.2, 56.05, -52.25, -29.45, 81.7, 38.95, -89.3, -63.19, -42.6, -68.16, 41.89, -39.05, -22.01, 61.06, 29.11, -66.74, 87.22, 58.8, 94.08, -57.82, 53.9, 30.38, -84.28, -40.18, 92.12, 24.92, 16.8, 26.88, -16.52, 15.4, 8.68, -24.08, -11.48, 26.32, -40.94, -27.6, -44.16, 27.14, -25.3, -14.26, 39.56, 18.86, -43.24, 58.74, 39.6, 63.36, -38.94, 36.3, 20.46, -56.76, -27.06, 62.04, -3.56, -2.4, -3.84, 2.36, -2.2, -1.24, 3.44, 1.64, -3.76 }; + +static struct matMul_args test_vectors[] = { + { // 9x9 x 9x9 + .A = A, + .B = B, + 
.C = C, + .N = 9, + .K = 9, + .M = 9, + .trans_B = 0 + }, + { // 9x9 x 9x9 transposed + .A = A, + .B = Bt, + .C = C, + .N = 9, + .K = 9, + .M = 9, + .trans_B = 1 + }, + { // 9x1 x 1x9 + .A = A_k1, + .B = B_k1, + .C = C_k1, + .N = 9, + .K = 1, + .M = 9, + .trans_B = 0 + }, + { // 9x1 x 1x9 transposed + .A = A_k1, + .B = B_k1, + .C = C_k1, + .N = 9, + .K = 1, + .M = 9, + .trans_B = 1 + } +}; static struct matMul_args mm_args; -static struct matMul_args mm_args_k1; -static float result[IN_CH*OUT_CH]; -static float result_k1[IN_CH*OUT_CH]; +static struct matMul_args mm_expected; + +void create_test_vectors(int transp, int use_k1) +{ + int idx = transp + use_k1*2; + mm_expected = test_vectors[idx]; + mm_args = mm_expected; + mm_args.C = malloc(mm_args.N * mm_args.M * sizeof(float)); + set_array_fp32(mm_args.C, mm_args.N * mm_args.M, 0.0); +} -void set_array(float *array, size_t size, float value) +void free_test_vectors(struct matMul_args* args) { - for (int i = 0; i < size; i++) { - array[i] = value; + if (args->C != NULL) { + free(args->C); + args->C = NULL; } } // called before each test void setUp(void) { - mm_args.A = A; - mm_args.B = B; - mm_args.C = result; - mm_args.N = IN_CH; - mm_args.K = MID_CH; - mm_args.M = OUT_CH; - mm_args.trans_B = 0; - - mm_args_k1.A = A_k1; - mm_args_k1.B = B_k1; - mm_args_k1.C = result_k1; - mm_args_k1.N = IN_CH; - mm_args_k1.K = 1; - mm_args_k1.M = OUT_CH; - mm_args_k1.trans_B = 0; - - /* make sure the result buffers always start from a known state */ - set_array(result, IN_CH*OUT_CH, 0.0); - set_array(result_k1, OUT_CH, 0.0); } // called after each test void tearDown(void) { + free_test_vectors(&mm_args); } -void test_pulp_matmul_fp32_mm(void) +TEST_CASE(0,0) +TEST_CASE(1,0) +TEST_CASE(0,1) +TEST_CASE(1,1) +void test_pulp_matmul_fp32_mm(int transp, int use_k1) { + create_test_vectors(transp, use_k1); pi_cl_team_fork(NUM_CORES, mm, &mm_args); - TEST_ASSERT_FLOAT_ARRAY_WITHIN(DELTA, C, result, IN_CH*OUT_CH); - - 
pi_cl_team_fork(NUM_CORES, mm, &mm_args_k1); - TEST_ASSERT_FLOAT_ARRAY_WITHIN(DELTA, C_k1, result_k1, IN_CH*OUT_CH); -} - -void test_pulp_matmul_fp32_mm_transp(void) -{ - mm_args.trans_B = 1; - mm_args.B = Bt; - pi_cl_team_fork(NUM_CORES, mm, &mm_args); - TEST_ASSERT_FLOAT_ARRAY_WITHIN(DELTA, C, result, IN_CH*OUT_CH); - - mm_args_k1.trans_B = 1; - pi_cl_team_fork(NUM_CORES, mm, &mm_args_k1); - TEST_ASSERT_FLOAT_ARRAY_WITHIN(DELTA, C_k1, result_k1, IN_CH*OUT_CH); -} - -void test_pulp_matmul_fp32_mm_u2(void) -{ - pi_cl_team_fork(NUM_CORES, mm_u2, &mm_args); - TEST_ASSERT_FLOAT_ARRAY_WITHIN(DELTA, C, result, IN_CH*OUT_CH); + TEST_ASSERT_FLOAT_ARRAY_WITHIN(DELTA, mm_expected.C, mm_args.C, mm_args.N*mm_args.M); } -void test_pulp_matmul_fp32_mm_u2_transp(void) +TEST_CASE(0,0) +TEST_CASE(1,0) +void test_pulp_matmul_fp32_mm_u2(int transp, int use_k1) { - mm_args.trans_B = 1; - mm_args.B = Bt; + create_test_vectors(transp, use_k1); pi_cl_team_fork(NUM_CORES, mm_u2, &mm_args); - TEST_ASSERT_FLOAT_ARRAY_WITHIN(DELTA, C, result, IN_CH*OUT_CH); + TEST_ASSERT_FLOAT_ARRAY_WITHIN(DELTA, mm_expected.C, mm_args.C, mm_args.N*mm_args.M); } -void test_pulp_matmul_fp32_mm_add(void) +TEST_CASE(0,0) +TEST_CASE(1,0) +TEST_CASE(0,1) +TEST_CASE(1,1) +void test_pulp_matmul_fp32_mm_add(int transp, int use_k1) { - float expected[IN_CH*OUT_CH]; + create_test_vectors(transp, use_k1); + set_array_fp32(mm_args.C, mm_args.N * mm_args.M, 1.0); - set_array(result, IN_CH*OUT_CH, 1.0); // set result buffer to non-zero - for (int i=0; i Date: Tue, 10 Mar 2026 11:18:42 +0100 Subject: [PATCH 4/4] Simplified conv2d_fp32 and linear_fp32 unittests Moved test vectors to static global variables, which shortens the test cases itself --- unittests/generic/test_pulp_conv2d_fp32.c | 102 ++++++++++------------ unittests/generic/test_pulp_linear_fp32.c | 73 ++++++---------- unittests/generic/test_pulp_matmul_fp32.c | 10 +-- 3 files changed, 77 insertions(+), 108 deletions(-) diff --git 
a/unittests/generic/test_pulp_conv2d_fp32.c b/unittests/generic/test_pulp_conv2d_fp32.c index 63487054..69f40490 100644 --- a/unittests/generic/test_pulp_conv2d_fp32.c +++ b/unittests/generic/test_pulp_conv2d_fp32.c @@ -143,6 +143,9 @@ static struct TestVector test_vectors[] = { } }; +static struct Conv2D_args args; +static struct TestVector expected; + // create a deep copy of a test vector void copy_test_vector(const struct TestVector *src, struct TestVector* dst) { @@ -179,7 +182,7 @@ void free_test_vector(struct TestVector *v) free(v->bias.diff); } -void create_test_vectors(int hwc, int use_im2col, int padding, struct Conv2D_args* args, struct TestVector* expected) +void create_test_vectors(int hwc, int use_im2col, int padding) { int idx = hwc + 2*padding; static struct TestVector a; @@ -190,9 +193,9 @@ void create_test_vectors(int hwc, int use_im2col, int padding, struct Conv2D_arg v.out.dim = v.out.C * v.out.H * v.out.W; v.weight.dim = v.weight.C * v.weight.H * v.weight.W * v.out.C; - // create two deep copies of the test vector so that we don't overwrite the - // original one. One copy to populate args and one as expected values - copy_test_vector(&v, expected); + // create two deep copies of the test vector so that the original one + // cannot be overwritten. 
One copy to populate args and one as expected values + copy_test_vector(&v, &expected); copy_test_vector(&v, &a); // set some buffers to zero for the argument blobs @@ -211,42 +214,42 @@ void create_test_vectors(int hwc, int use_im2col, int padding, struct Conv2D_arg float *bt_buffer = calloc(bt_buffer_size, sizeof(float)); // potulate conv2d parameter struct - args->input = &a.in; - args->coeff = &a.weight; - args->bias = &a.bias; - args->output = &a.out; - args->Lpad = padding; - args->Rpad = padding; - args->Upad = padding; - args->Dpad = padding; - args->stride_h = 1; - args->stride_w = 1; - args->i2c_buffer = im2col_buffer; - args->bt_buffer = bt_buffer; - args->skip_wg_grad = 0; - args->skip_in_grad = 0; - args->HWC = hwc; - args->opt_matmul_type_fw = 0; - args->opt_matmul_type_wg = 0; - args->opt_matmul_type_ig = 0; - args->USE_IM2COL = use_im2col; - args->USE_DMA_IM2COL = 0; - args->USE_BIASES = 1; + args.input = &a.in; + args.coeff = &a.weight; + args.bias = &a.bias; + args.output = &a.out; + args.Lpad = padding; + args.Rpad = padding; + args.Upad = padding; + args.Dpad = padding; + args.stride_h = 1; + args.stride_w = 1; + args.i2c_buffer = im2col_buffer; + args.bt_buffer = bt_buffer; + args.skip_wg_grad = 0; + args.skip_in_grad = 0; + args.HWC = hwc; + args.opt_matmul_type_fw = 0; + args.opt_matmul_type_wg = 0; + args.opt_matmul_type_ig = 0; + args.USE_IM2COL = use_im2col; + args.USE_DMA_IM2COL = 0; + args.USE_BIASES = 1; } -void free_test_vectors(struct Conv2D_args* args, struct TestVector* expected) +void free_test_vectors(void) { - free(args->input->data); - free(args->input->diff); - free(args->output->data); - free(args->output->diff); - free(args->coeff->data); - free(args->coeff->diff); - free(args->bias->data); - free(args->bias->diff); - free(args->i2c_buffer); - free(args->bt_buffer); - free_test_vector(expected); + free(args.input->data); + free(args.input->diff); + free(args.output->data); + free(args.output->diff); + free(args.coeff->data); 
+ free(args.coeff->diff); + free(args.bias->data); + free(args.bias->diff); + free(args.i2c_buffer); + free(args.bt_buffer); + free_test_vector(&expected); } // called before each test @@ -257,6 +260,7 @@ void setUp(void) // called after each test void tearDown(void) { + free_test_vectors(); } TEST_CASE(1, 1, 0) // HWC, im2col, no padding @@ -269,9 +273,7 @@ TEST_CASE(0, 0, 1) // CHW, naive, same padding TEST_CASE(1, 0, 1) // HWC, naive, same padding void test_pulp_conv2d_fp32_fw_cl(int hwc, int use_im2col, int padding) { - struct Conv2D_args args; - struct TestVector expected; - create_test_vectors(hwc, use_im2col, padding, &args, &expected); + create_test_vectors(hwc, use_im2col, padding); // for the naive HWC case, we haven't implemented it yet so expect zeros if (hwc == 1 && use_im2col == 0) { @@ -280,8 +282,6 @@ void test_pulp_conv2d_fp32_fw_cl(int hwc, int use_im2col, int padding) pulp_conv2d_fp32_fw_cl(&args); TEST_ASSERT_FLOAT_ARRAY_WITHIN(DELTA, expected.out.data, args.output->data, args.output->dim); - - free_test_vectors(&args, &expected); } TEST_CASE(1, 1, 0) // HWC, im2col, no padding @@ -294,9 +294,7 @@ TEST_CASE(0, 0, 1) // CHW, naive, same padding TEST_CASE(1, 0, 1) // HWC, naive, same padding void test_pulp_conv2d_fp32_bw_param_grads_cl(int hwc, int use_im2col, int padding) { - struct Conv2D_args args; - struct TestVector expected; - create_test_vectors(hwc, use_im2col, padding, &args, &expected); + create_test_vectors(hwc, use_im2col, padding); // for the naive HWC case, we haven't implemented it yet so expect zeros if (hwc == 1 && use_im2col == 0) { @@ -307,8 +305,6 @@ void test_pulp_conv2d_fp32_bw_param_grads_cl(int hwc, int use_im2col, int paddin pulp_conv2d_fp32_bw_param_grads_cl(&args); TEST_ASSERT_FLOAT_ARRAY_WITHIN(DELTA, expected.weight.diff, args.coeff->diff, args.coeff->dim); TEST_ASSERT_FLOAT_ARRAY_WITHIN(DELTA, expected.bias.diff, args.bias->diff, args.bias->dim); - - free_test_vectors(&args, &expected); } TEST_CASE(1, 1, 0) // HWC, 
im2col, no padding @@ -321,9 +317,7 @@ TEST_CASE(0, 0, 1) // CHW, naive, same padding TEST_CASE(1, 0, 1) // HWC, naive, same padding void test_pulp_conv2d_fp32_bw_input_grads_cl(int hwc, int use_im2col, int padding) { - struct Conv2D_args args; - struct TestVector expected; - create_test_vectors(hwc, use_im2col, padding, &args, &expected); + create_test_vectors(hwc, use_im2col, padding); // for the naive HWC case, we haven't implemented it yet so expect zeros if (hwc == 1 && use_im2col == 0) { @@ -332,8 +326,6 @@ void test_pulp_conv2d_fp32_bw_input_grads_cl(int hwc, int use_im2col, int paddin pulp_conv2d_fp32_bw_input_grads_cl(&args); TEST_ASSERT_FLOAT_ARRAY_WITHIN(DELTA, expected.in.diff, args.input->diff, args.input->dim); - - free_test_vectors(&args, &expected); } TEST_CASE(0, 0) // calculate both weight and input gradients @@ -341,9 +333,7 @@ TEST_CASE(1, 0) // skip weight gradient calculation TEST_CASE(0, 1) // skip input gradient calculation void test_pulp_conv2d_fp32_bw_cl(int skip_wg_grad, int skip_in_grad) { - struct Conv2D_args args; - struct TestVector expected; - create_test_vectors(0, 0, 0, &args, &expected); + create_test_vectors(0, 0, 0); // test skip grad calculations args.skip_wg_grad = skip_wg_grad; @@ -361,8 +351,6 @@ void test_pulp_conv2d_fp32_bw_cl(int skip_wg_grad, int skip_in_grad) TEST_ASSERT_FLOAT_ARRAY_WITHIN(DELTA, expected.in.diff, args.input->diff, args.input->dim); TEST_ASSERT_FLOAT_ARRAY_WITHIN(DELTA, expected.weight.diff, args.coeff->diff, args.coeff->dim); TEST_ASSERT_FLOAT_ARRAY_WITHIN(DELTA, expected.bias.diff, args.bias->diff, args.bias->dim); - - free_test_vectors(&args, &expected); } #endif // TEST diff --git a/unittests/generic/test_pulp_linear_fp32.c b/unittests/generic/test_pulp_linear_fp32.c index c526dd51..ff8c3c37 100644 --- a/unittests/generic/test_pulp_linear_fp32.c +++ b/unittests/generic/test_pulp_linear_fp32.c @@ -61,6 +61,9 @@ static struct TestVector test_vectors[] = { }, }; +static struct Linear_args args; +static 
struct TestVector expected; + // create a deep copy of a test vector void copy_test_vector(const struct TestVector *src, struct TestVector* dst) { @@ -97,14 +100,14 @@ void free_test_vector(struct TestVector *v) free(v->bias.diff); } -void create_test_vectors(struct Linear_args* args, struct TestVector* expected) +void create_test_vectors(void) { static struct TestVector a; struct TestVector v = test_vectors[0]; // create two deep copies of the test vector so that we don't overwrite the // original one. One copy to populate args and one as expected values - copy_test_vector(&v, expected); + copy_test_vector(&v, &expected); copy_test_vector(&v, &a); // set some buffers to zero for the argument blobs @@ -114,76 +117,60 @@ void create_test_vectors(struct Linear_args* args, struct TestVector* expected) set_array_fp32(a.bias.diff, a.bias.dim, 0.0); // potulate linear parameter struct - args->input = &a.in; - args->coeff = &a.weight; - args->bias = &a.bias; - args->output = &a.out; - args->skip_wg_grad = 0; - args->skip_in_grad = 0; - args->opt_matmul_type_fw = 0; - args->opt_matmul_type_wg = 0; - args->opt_matmul_type_ig = 0; - args->use_biases = 1; + args.input = &a.in; + args.coeff = &a.weight; + args.bias = &a.bias; + args.output = &a.out; + args.skip_wg_grad = 0; + args.skip_in_grad = 0; + args.opt_matmul_type_fw = 0; + args.opt_matmul_type_wg = 0; + args.opt_matmul_type_ig = 0; + args.use_biases = 1; } -void free_test_vectors(struct Linear_args* args, struct TestVector* expected) +void free_test_vectors(void) { - free(args->input->data); - free(args->input->diff); - free(args->output->data); - free(args->output->diff); - free(args->coeff->data); - free(args->coeff->diff); - free(args->bias->data); - free(args->bias->diff); - free_test_vector(expected); + free(args.input->data); + free(args.input->diff); + free(args.output->data); + free(args.output->diff); + free(args.coeff->data); + free(args.coeff->diff); + free(args.bias->data); + free(args.bias->diff); + 
free_test_vector(&expected); } // called before each test void setUp(void) { + create_test_vectors(); } // called after each test void tearDown(void) { + free_test_vectors(); } void test_pulp_linear_fp32_fw_cl(void) { - struct Linear_args args; - struct TestVector expected; - create_test_vectors(&args, &expected); - pulp_linear_fp32_fw_cl(&args); TEST_ASSERT_FLOAT_ARRAY_WITHIN(DELTA, expected.out.data, args.output->data, args.output->dim); - - free_test_vectors(&args, &expected); } void test_pulp_linear_fp32_bw_param_grads_cl(void) { - struct Linear_args args; - struct TestVector expected; - create_test_vectors(&args, &expected); - pulp_linear_fp32_bw_param_grads_cl(&args); TEST_ASSERT_FLOAT_ARRAY_WITHIN(DELTA, expected.weight.diff, args.coeff->diff, args.coeff->dim); TEST_ASSERT_FLOAT_ARRAY_WITHIN(DELTA, expected.bias.diff, args.bias->diff, args.bias->dim); - - free_test_vectors(&args, &expected); } void test_pulp_linear_fp32_bw_input_grads_cl(void) { - struct Linear_args args; - struct TestVector expected; - create_test_vectors(&args, &expected); - pulp_linear_fp32_bw_input_grads_cl(&args); TEST_ASSERT_FLOAT_ARRAY_WITHIN(DELTA, expected.in.diff, args.input->diff, args.input->dim); - - free_test_vectors(&args, &expected); } TEST_CASE(0, 0) // calculate both weight and input gradients @@ -191,10 +178,6 @@ TEST_CASE(1, 0) // skip weight gradient calculation TEST_CASE(0, 1) // skip input gradient calculation void test_pulp_linear_fp32_bw_cl(int skip_wg_grad, int skip_in_grad) { - struct Linear_args args; - struct TestVector expected; - create_test_vectors(&args, &expected); - // test skip grad calculations args.skip_wg_grad = skip_wg_grad; args.skip_in_grad = skip_in_grad; @@ -211,8 +194,6 @@ void test_pulp_linear_fp32_bw_cl(int skip_wg_grad, int skip_in_grad) TEST_ASSERT_FLOAT_ARRAY_WITHIN(DELTA, expected.in.diff, args.input->diff, args.input->dim); TEST_ASSERT_FLOAT_ARRAY_WITHIN(DELTA, expected.weight.diff, args.coeff->diff, args.coeff->dim); 
TEST_ASSERT_FLOAT_ARRAY_WITHIN(DELTA, expected.bias.diff, args.bias->diff, args.bias->dim); - - free_test_vectors(&args, &expected); } diff --git a/unittests/generic/test_pulp_matmul_fp32.c b/unittests/generic/test_pulp_matmul_fp32.c index 74290b68..7b6c91f3 100644 --- a/unittests/generic/test_pulp_matmul_fp32.c +++ b/unittests/generic/test_pulp_matmul_fp32.c @@ -72,11 +72,11 @@ void create_test_vectors(int transp, int use_k1) set_array_fp32(mm_args.C, mm_args.N * mm_args.M, 0.0); } -void free_test_vectors(struct matMul_args* args) +void free_test_vectors(void) { - if (args->C != NULL) { - free(args->C); - args->C = NULL; + if (mm_args.C != NULL) { + free(mm_args.C); + mm_args.C = NULL; } } @@ -88,7 +88,7 @@ void setUp(void) // called after each test void tearDown(void) { - free_test_vectors(&mm_args); + free_test_vectors(); } TEST_CASE(0,0)