pulp-platform · maartenvds · Mar 6, 2026 · Mar 9, 2026 · Mar 10, 2026 · Mar 10, 2026
diff --git a/.gitignore b/.gitignore
@@ -2,3 +2,4 @@
 **/.vscode/
 **/__pycache__/
 .idea/
+build
diff --git a/lib/sources/pulp_conv2d_fp32.c b/lib/sources/pulp_conv2d_fp32.c
@@ -295,10 +295,10 @@ void pulp_conv2d_fp32_bw_param_grads_cl(void *Conv2D_args) {
             im2col_args.c = C2D_args->coeff;
             im2col_args.output = C2D_args->output;
             im2col_args.pBuffer = i2c_buffer;
-            im2col_args.Lpad = 0; //Lpad;
-            im2col_args.Rpad = 0; //Rpad;
-            im2col_args.Upad = 0; //Upad;
-            im2col_args.Dpad = 0; //Dpad;
+            im2col_args.Lpad = Lpad;
+            im2col_args.Rpad = Rpad;
+            im2col_args.Upad = Upad;
+            im2col_args.Dpad = Dpad;
             im2col_args.mod = 0;
             im2col_args.stride_w = stride_w;
             im2col_args.stride_h = stride_h;
@@ -319,8 +319,16 @@ void pulp_conv2d_fp32_bw_param_grads_cl(void *Conv2D_args) {
             matMul_args.bias = biasDiff;
             matMul_args.USE_BIASES = USE_BIASES;
 
-            matMul_args.pH = H_out;
-            matMul_args.pW = W_out;
+            matMul_args.H = H_in;
+            matMul_args.W = W_in;
+            matMul_args.pH = pH;
+            matMul_args.pW = pW;
+            matMul_args.Lpad = Lpad;
+            matMul_args.Rpad = Rpad;
+            matMul_args.Upad = Upad;
+            matMul_args.Dpad = Dpad;
+            matMul_args.stride_h = stride_h;
+            matMul_args.stride_w = stride_w;
 
             matMul_args.bias_dim = bias_dim;
 
@@ -366,7 +374,7 @@ void pulp_conv2d_fp32_bw_param_grads_cl(void *Conv2D_args) {
 
             pi_cl_team_fork(NUM_CORES, transpose, &tr_args);
 
-            matMul_args.A = tr_buffer; // outDiff;
+            matMul_args.A = tr_buffer; // transposed outDiff
             matMul_args.B = i2c_buffer;
             matMul_args.C = coeffDiff;
             matMul_args.N = C_out;
@@ -378,8 +386,16 @@ void pulp_conv2d_fp32_bw_param_grads_cl(void *Conv2D_args) {
             matMul_args.bias = biasDiff;
             matMul_args.USE_BIASES = USE_BIASES;
 
-            matMul_args.pH = H_out;
-            matMul_args.pW = W_out;
+            matMul_args.H = H_in;
+            matMul_args.W = W_in;
+            matMul_args.pH = pH;
+            matMul_args.pW = pW;
+            matMul_args.Lpad = Lpad;
+            matMul_args.Rpad = Rpad;
+            matMul_args.Upad = Upad;
+            matMul_args.Dpad = Dpad;
+            matMul_args.stride_h = stride_h;
+            matMul_args.stride_w = stride_w;
 
             matMul_args.bias_dim = bias_dim;
 
@@ -691,9 +707,6 @@ void im2col_conv2d_fw_kernel(void *void_args) {
     uint32_t Upad = args->Upad;
     uint32_t Dpad = args->Dpad;
 
-    // const uint32_t H_out = (H_in - pH + Upad + Dpad) / h_str + 1;
-    // const uint32_t W_out = (W_in - pW + Lpad + Rpad) / w_str + 1;
-
     const uint32_t H_out = pH;
     const uint32_t W_out = pW;
 
@@ -747,38 +760,8 @@ void im2col_conv2d_param_grad_kernel(void *void_args) {
     struct mm_manager_args *man_args = (struct mm_manager_args *) void_args;
     struct matMul_args *args = man_args->mm_args;
 
-    float *__restrict__ inData = args->A;
-    float *__restrict__ coeffDiff = args->B;
-    float *__restrict__ outDiff = args->C;
-
-    float *__restrict__ biasDiff = args->bias;
     const uint32_t USE_BIASES = args->USE_BIASES;
 
-    const uint32_t H_in = args->H;
-    const uint32_t W_in = args->W;
-    const uint32_t pW = args->pW;
-    const uint32_t pH = args->pH;
-    const uint32_t C_in = args->pCin;
-    const uint32_t C_out = args->N;
-
-    uint32_t h_str = args->stride_h;
-    uint32_t w_str = args->stride_w;
-    uint32_t Lpad = args->Lpad;
-    uint32_t Rpad = args->Rpad;
-    uint32_t Upad = args->Upad;
-    uint32_t Dpad = args->Dpad;
-
-    const uint32_t H_out = (H_in - pH + Upad + Dpad) / h_str + 1;
-    const uint32_t W_out = (W_in - pW + Lpad + Rpad) / w_str + 1;
-
-    const uint32_t blockSize = (C_out + NUM_CORES - 1) / NUM_CORES;
-    const uint32_t start = pi_core_id() * blockSize;
-    const uint32_t stop = start + blockSize > C_out ? C_out : start + blockSize;
-
-    const uint32_t HWC = args->HWC;
-
-    int padding = Lpad + Rpad + Upad + Dpad;
-
     // Perform simple matrix multiplication
 #ifndef OPTIMIZE
     mm(args);
@@ -787,36 +770,41 @@ void im2col_conv2d_param_grad_kernel(void *void_args) {
 #endif
 
     // Handle biases
-    if (USE_BIASES == 1 && HWC == 0) {
-        for (uint32_t co = start; co < stop; co++) {
-            float temp = 0;
-            for (uint32_t ho = 0; ho < H_out; ho++) {
-                for (uint32_t wo = 0; wo < W_out; wo++) {
-                    temp += outDiff[wo + ho * H_out + co * H_out * W_out];
-                }
-            }
-            biasDiff[co] = temp;
-        }
-    }
+    if (USE_BIASES == 1) {
+        float *__restrict__ outDiff = args->A;
+        float *__restrict__ biasDiff = args->bias;
+
+        const uint32_t H_in = args->H;
+        const uint32_t W_in = args->W;
+        const uint32_t pW = args->pW;
+        const uint32_t pH = args->pH;
+        const uint32_t C_out = args->N;
+
+        uint32_t h_str = args->stride_h;
+        uint32_t w_str = args->stride_w;
+        uint32_t Lpad = args->Lpad;
+        uint32_t Rpad = args->Rpad;
+        uint32_t Upad = args->Upad;
+        uint32_t Dpad = args->Dpad;
+
+        const uint32_t H_out = (H_in - pH + Upad + Dpad) / h_str + 1;
+        const uint32_t W_out = (W_in - pW + Lpad + Rpad) / w_str + 1;
+
+        const uint32_t blockSize = (C_out + NUM_CORES - 1) / NUM_CORES;
+        const uint32_t start = pi_core_id() * blockSize;
+        const uint32_t stop = start + blockSize > C_out ? C_out : start + blockSize;
 
-    else if (USE_BIASES == 1 && HWC == 1) {
         for (uint32_t co = start; co < stop; co++) {
             float temp = 0;
             for (uint32_t ho = 0; ho < H_out; ho++) {
                 for (uint32_t wo = 0; wo < W_out; wo++) {
-                    temp += outDiff[wo * C_out + ho * C_out * W_out + co];
+                    temp += outDiff[wo + ho * W_out + co * H_out * W_out];
                 }
             }
             biasDiff[co] = temp;
         }
     }
 
-    if (HWC != 0 && HWC != 1) {
-        // Unsupported layout
-        printf("[im2col_conv2d_param_grad_kernel:] Invalid selection of the HWC layout (1 for HWC, 0 for CHW). Actual value: %d. Biases not used, even if provided!\n",
-               HWC);
-    }
-
     if (USE_BIASES != 0 && USE_BIASES != 1) {
         printf("[im2col_conv2d_param_grad_kernel:] Invalid selection of the bias option (1 or 0 - use biases or not). Actual value: %d. Biases not used, even if provided!\n",
                USE_BIASES);

diff --git a/lib/sources/pulp_conv_naive_fp32.c b/lib/sources/pulp_conv_naive_fp32.c
@@ -376,17 +376,18 @@ void naive_conv2d_param_grad_kernel_CHW(void *matMul_args) {
                                 // Pad conditions
                                 int pad_cond_h = h_str * ho + hk - Upad;
                                 int pad_cond_w = w_str * wo + wk - Lpad;
+                                int out_idx = wo + ho * W_out + co * H_out * W_out;
 
                                 if ((pad_cond_h >= 0) && (pad_cond_w >= 0) && (pad_cond_h < H_in) &&
                                     (pad_cond_w < W_in)) {
-                                    int out_idx = wo + ho * W_out + co * H_out * W_out;
                                     int in_idx = (w_str * wo + wk - Lpad) + (h_str * ho + hk - Upad) * W_in +
                                                  ci * H_in * W_in;
 
                                     temp += outDiff[out_idx] * inData[in_idx];
 
-                                    if (USE_BIASES == 1) bias_temp += outDiff[out_idx];
                                 }
+
+                                if (USE_BIASES == 1) bias_temp += outDiff[out_idx];
                             }
                         }
                         coeffDiff[wk + hk * pW + ci * pH * pW + co * pH * pW * C_in] = temp;

diff --git a/lib/sources/pulp_linear_fp32.c b/lib/sources/pulp_linear_fp32.c
@@ -217,18 +217,18 @@ void pulp_linear_fp32_fw_cl_kernel( void * man_args ) {
     mm_manager(manager_args);
     #endif
 
-    // pi_cl_team_barrier();
-
-    // if (USE_BIASES == 1) {
-    //     const uint32_t outputSize = N * M;
-    //     const uint32_t blockSize = (outputSize + NUM_CORES - 1) / NUM_CORES;
-    //     const uint32_t start = pi_core_id() * blockSize;
-    //     const uint32_t stop = start + blockSize > outputSize ? outputSize : start + blockSize;
-
-    //     for (uint32_t i = start; i < stop; i++) {
-    //         outData[i] += biasData[i % M];
-    //     }
-    // }
+    pi_cl_team_barrier();
+
+    if (USE_BIASES == 1) {
+        const uint32_t outputSize = N * M;
+        const uint32_t blockSize = (outputSize + NUM_CORES - 1) / NUM_CORES;
+        const uint32_t start = pi_core_id() * blockSize;
+        const uint32_t stop = start + blockSize > outputSize ? outputSize : start + blockSize;
+
+        for (uint32_t i = start; i < stop; i++) {
+            outData[i] += biasData[i];
+        }
+    }
 }
 
 void pulp_linear_fp32_bw_param_grads_cl_kernel( void * man_args )

diff --git a/lib/sources/pulp_matmul_fp32.c b/lib/sources/pulp_matmul_fp32.c
@@ -2119,9 +2119,9 @@ void mm_M_u2(void *matMul_args) {
         // =====> B IS TRANSPOSED <=====
     else {
         for (uint32_t i = 0; i < N; i++) {
-            float temp = 0;
-
             for (uint32_t j = start; j < stop; j++) {
+                float temp = 0;
+
                 for (uint32_t k = 0; k < (K & 0xfffffffe); k = k + 2) {
                     temp += A[i * K + k] * B[k + j * K];
                     temp += A[i * K + k + 1] * B[k + 1 + j * K];
-Original file line number
+Diff line change
@@ Expand Up / @@ -2,3 +2,4 @@ @@
     **/.vscode/
     **/__pycache__/
     .idea/
+    build