Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions .gitignore
Original file line number Diff line number Diff line change
Expand Up @@ -2,3 +2,4 @@
**/.vscode/
**/__pycache__/
.idea/
build
110 changes: 49 additions & 61 deletions lib/sources/pulp_conv2d_fp32.c
Original file line number Diff line number Diff line change
Expand Up @@ -295,10 +295,10 @@ void pulp_conv2d_fp32_bw_param_grads_cl(void *Conv2D_args) {
im2col_args.c = C2D_args->coeff;
im2col_args.output = C2D_args->output;
im2col_args.pBuffer = i2c_buffer;
im2col_args.Lpad = 0; //Lpad;
im2col_args.Rpad = 0; //Rpad;
im2col_args.Upad = 0; //Upad;
im2col_args.Dpad = 0; //Dpad;
im2col_args.Lpad = Lpad;
im2col_args.Rpad = Rpad;
im2col_args.Upad = Upad;
im2col_args.Dpad = Dpad;
im2col_args.mod = 0;
im2col_args.stride_w = stride_w;
im2col_args.stride_h = stride_h;
Expand All @@ -319,8 +319,16 @@ void pulp_conv2d_fp32_bw_param_grads_cl(void *Conv2D_args) {
matMul_args.bias = biasDiff;
matMul_args.USE_BIASES = USE_BIASES;

matMul_args.pH = H_out;
matMul_args.pW = W_out;
matMul_args.H = H_in;
matMul_args.W = W_in;
matMul_args.pH = pH;
matMul_args.pW = pW;
matMul_args.Lpad = Lpad;
matMul_args.Rpad = Rpad;
matMul_args.Upad = Upad;
matMul_args.Dpad = Dpad;
matMul_args.stride_h = stride_h;
matMul_args.stride_w = stride_w;

matMul_args.bias_dim = bias_dim;

Expand Down Expand Up @@ -366,7 +374,7 @@ void pulp_conv2d_fp32_bw_param_grads_cl(void *Conv2D_args) {

pi_cl_team_fork(NUM_CORES, transpose, &tr_args);

matMul_args.A = tr_buffer; // outDiff;
matMul_args.A = tr_buffer; // outDiff transposed;
matMul_args.B = i2c_buffer;
matMul_args.C = coeffDiff;
matMul_args.N = C_out;
Expand All @@ -378,8 +386,16 @@ void pulp_conv2d_fp32_bw_param_grads_cl(void *Conv2D_args) {
matMul_args.bias = biasDiff;
matMul_args.USE_BIASES = USE_BIASES;

matMul_args.pH = H_out;
matMul_args.pW = W_out;
matMul_args.H = H_in;
matMul_args.W = W_in;
matMul_args.pH = pH;
matMul_args.pW = pW;
matMul_args.Lpad = Lpad;
matMul_args.Rpad = Rpad;
matMul_args.Upad = Upad;
matMul_args.Dpad = Dpad;
matMul_args.stride_h = stride_h;
matMul_args.stride_w = stride_w;

matMul_args.bias_dim = bias_dim;

Expand Down Expand Up @@ -691,9 +707,6 @@ void im2col_conv2d_fw_kernel(void *void_args) {
uint32_t Upad = args->Upad;
uint32_t Dpad = args->Dpad;

// const uint32_t H_out = (H_in - pH + Upad + Dpad) / h_str + 1;
// const uint32_t W_out = (W_in - pW + Lpad + Rpad) / w_str + 1;

const uint32_t H_out = pH;
const uint32_t W_out = pW;

Expand Down Expand Up @@ -747,38 +760,8 @@ void im2col_conv2d_param_grad_kernel(void *void_args) {
struct mm_manager_args *man_args = (struct mm_manager_args *) void_args;
struct matMul_args *args = man_args->mm_args;

float *__restrict__ inData = args->A;
float *__restrict__ coeffDiff = args->B;
float *__restrict__ outDiff = args->C;

float *__restrict__ biasDiff = args->bias;
const uint32_t USE_BIASES = args->USE_BIASES;

const uint32_t H_in = args->H;
const uint32_t W_in = args->W;
const uint32_t pW = args->pW;
const uint32_t pH = args->pH;
const uint32_t C_in = args->pCin;
const uint32_t C_out = args->N;

uint32_t h_str = args->stride_h;
uint32_t w_str = args->stride_w;
uint32_t Lpad = args->Lpad;
uint32_t Rpad = args->Rpad;
uint32_t Upad = args->Upad;
uint32_t Dpad = args->Dpad;

const uint32_t H_out = (H_in - pH + Upad + Dpad) / h_str + 1;
const uint32_t W_out = (W_in - pW + Lpad + Rpad) / w_str + 1;

const uint32_t blockSize = (C_out + NUM_CORES - 1) / NUM_CORES;
const uint32_t start = pi_core_id() * blockSize;
const uint32_t stop = start + blockSize > C_out ? C_out : start + blockSize;

const uint32_t HWC = args->HWC;

int padding = Lpad + Rpad + Upad + Dpad;

// Perform simple matrix multiplication
#ifndef OPTIMIZE
mm(args);
Expand All @@ -787,36 +770,41 @@ void im2col_conv2d_param_grad_kernel(void *void_args) {
#endif

// Handle biases
if (USE_BIASES == 1 && HWC == 0) {
for (uint32_t co = start; co < stop; co++) {
float temp = 0;
for (uint32_t ho = 0; ho < H_out; ho++) {
for (uint32_t wo = 0; wo < W_out; wo++) {
temp += outDiff[wo + ho * H_out + co * H_out * W_out];
}
}
biasDiff[co] = temp;
}
}
if (USE_BIASES == 1) {
float *__restrict__ outDiff = args->A;
float *__restrict__ biasDiff = args->bias;

const uint32_t H_in = args->H;
const uint32_t W_in = args->W;
const uint32_t pW = args->pW;
const uint32_t pH = args->pH;
const uint32_t C_out = args->N;

uint32_t h_str = args->stride_h;
uint32_t w_str = args->stride_w;
uint32_t Lpad = args->Lpad;
uint32_t Rpad = args->Rpad;
uint32_t Upad = args->Upad;
uint32_t Dpad = args->Dpad;

const uint32_t H_out = (H_in - pH + Upad + Dpad) / h_str + 1;
const uint32_t W_out = (W_in - pW + Lpad + Rpad) / w_str + 1;

const uint32_t blockSize = (C_out + NUM_CORES - 1) / NUM_CORES;
const uint32_t start = pi_core_id() * blockSize;
const uint32_t stop = start + blockSize > C_out ? C_out : start + blockSize;

else if (USE_BIASES == 1 && HWC == 1) {
for (uint32_t co = start; co < stop; co++) {
float temp = 0;
for (uint32_t ho = 0; ho < H_out; ho++) {
for (uint32_t wo = 0; wo < W_out; wo++) {
temp += outDiff[wo * C_out + ho * C_out * W_out + co];
temp += outDiff[wo + ho * W_out + co * H_out * W_out];
}
}
biasDiff[co] = temp;
}
}

if (HWC != 0 && HWC != 1) {
// Unsupported layout
printf("[im2col_conv2d_param_grad_kernel:] Invalid selection of the HWC layout (1 for HWC, 0 for CHW). Actual value: %d. Biases not used, even if provided!\n",
HWC);
}

if (USE_BIASES != 0 && USE_BIASES != 1) {
printf("[im2col_conv2d_param_grad_kernel:] Invalid selection of the bias option (1 or 0 - use biases or not). Actual value: %d. Biases not used, even if provided!\n",
USE_BIASES);
Expand Down
5 changes: 3 additions & 2 deletions lib/sources/pulp_conv_naive_fp32.c
Original file line number Diff line number Diff line change
Expand Up @@ -376,17 +376,18 @@ void naive_conv2d_param_grad_kernel_CHW(void *matMul_args) {
// Pad conditions
int pad_cond_h = h_str * ho + hk - Upad;
int pad_cond_w = w_str * wo + wk - Lpad;
int out_idx = wo + ho * W_out + co * H_out * W_out;

if ((pad_cond_h >= 0) && (pad_cond_w >= 0) && (pad_cond_h < H_in) &&
(pad_cond_w < W_in)) {
int out_idx = wo + ho * W_out + co * H_out * W_out;
int in_idx = (w_str * wo + wk - Lpad) + (h_str * ho + hk - Upad) * W_in +
ci * H_in * W_in;

temp += outDiff[out_idx] * inData[in_idx];

if (USE_BIASES == 1) bias_temp += outDiff[out_idx];
}

if (USE_BIASES == 1) bias_temp += outDiff[out_idx];
}
}
coeffDiff[wk + hk * pW + ci * pH * pW + co * pH * pW * C_in] = temp;
Expand Down
24 changes: 12 additions & 12 deletions lib/sources/pulp_linear_fp32.c
Original file line number Diff line number Diff line change
Expand Up @@ -217,18 +217,18 @@ void pulp_linear_fp32_fw_cl_kernel( void * man_args ) {
mm_manager(manager_args);
#endif

// pi_cl_team_barrier();

// if (USE_BIASES == 1) {
// const uint32_t outputSize = N * M;
// const uint32_t blockSize = (outputSize + NUM_CORES - 1) / NUM_CORES;
// const uint32_t start = pi_core_id() * blockSize;
// const uint32_t stop = start + blockSize > outputSize ? outputSize : start + blockSize;

// for (uint32_t i = start; i < stop; i++) {
// outData[i] += biasData[i % M];
// }
// }
pi_cl_team_barrier();

if (USE_BIASES == 1) {
const uint32_t outputSize = N * M;
const uint32_t blockSize = (outputSize + NUM_CORES - 1) / NUM_CORES;
const uint32_t start = pi_core_id() * blockSize;
const uint32_t stop = start + blockSize > outputSize ? outputSize : start + blockSize;

for (uint32_t i = start; i < stop; i++) {
outData[i] += biasData[i];
}
}
}

void pulp_linear_fp32_bw_param_grads_cl_kernel( void * man_args )
Expand Down
4 changes: 2 additions & 2 deletions lib/sources/pulp_matmul_fp32.c
Original file line number Diff line number Diff line change
Expand Up @@ -2119,9 +2119,9 @@ void mm_M_u2(void *matMul_args) {
// =====> B IS TRANSPOSED <=====
else {
for (uint32_t i = 0; i < N; i++) {
float temp = 0;

for (uint32_t j = start; j < stop; j++) {
float temp = 0;

for (uint32_t k = 0; k < (K & 0xfffffffe); k = k + 2) {
temp += A[i * K + k] * B[k + j * K];
temp += A[i * K + k + 1] * B[k + 1 + j * K];
Expand Down
Loading