From e26250718fd4951ea21535ef0244cdcedf07c937 Mon Sep 17 00:00:00 2001 From: cx922 Date: Thu, 13 Mar 2025 00:20:07 +0000 Subject: [PATCH 1/6] first try --- .../graph/transforms/verilog/emit_bram.py | 49 ++++++++----------- 1 file changed, 20 insertions(+), 29 deletions(-) diff --git a/src/chop/passes/graph/transforms/verilog/emit_bram.py b/src/chop/passes/graph/transforms/verilog/emit_bram.py index a6a000c79..589962274 100644 --- a/src/chop/passes/graph/transforms/verilog/emit_bram.py +++ b/src/chop/passes/graph/transforms/verilog/emit_bram.py @@ -7,7 +7,7 @@ import torch from chop.passes.graph.utils import vf, v2p, get_module_by_name, init_project -from chop.nn.quantizers import integer_quantizer_for_hw, integer_floor_quantizer_for_hw +from chop.nn.quantizers import integer_quantizer_for_hw logger = logging.getLogger(__name__) from pathlib import Path @@ -49,7 +49,7 @@ def emit_parameters_in_mem_internal(node, param_name, file_name, data_name): f"{_cap(verilog_param_name)}_PARALLELISM_DIM_1" ] ) - out_depth = int((total_size + out_size - 1) / out_size) + out_depth = int(total_size / out_size) out_width = int( node.meta["mase"].parameters["common"]["args"][verilog_param_name]["precision"][ 0 @@ -126,7 +126,7 @@ def emit_parameters_in_mem_internal(node, param_name, file_name, data_name): parameter {_cap(verilog_param_name)}_PARALLELISM_DIM_0 = 1, parameter {_cap(verilog_param_name)}_PARALLELISM_DIM_1 = 1, - parameter OUT_DEPTH = (({_cap(verilog_param_name)}_TENSOR_SIZE_DIM_0 + {_cap(verilog_param_name)}_PARALLELISM_DIM_0 - 1) / {_cap(verilog_param_name)}_PARALLELISM_DIM_0) * (({_cap(verilog_param_name)}_TENSOR_SIZE_DIM_1 + {_cap(verilog_param_name)}_PARALLELISM_DIM_1 - 1) / {_cap(verilog_param_name)}_PARALLELISM_DIM_1) + parameter OUT_DEPTH = {_cap(verilog_param_name)}_TENSOR_SIZE_DIM_0 * {_cap(verilog_param_name)}_TENSOR_SIZE_DIM_1 / ({_cap(verilog_param_name)}_PARALLELISM_DIM_0 * {_cap(verilog_param_name)}_PARALLELISM_DIM_1) ) ( input clk, input rst, @@ -148,14 +148,10 @@ def emit_parameters_in_mem_internal(node, param_name, file_name, data_name): end end - logic [1:0] clear; - always_ff @(posedge clk) - if (rst) clear <= 0; - else if ((data_out_ready == 1) && (clear != 2)) clear <= clear + 1; logic ce0; - assign ce0 = data_out_ready; + assign ce0 = 1; - logic [{_cap(verilog_param_name)}_PRECISION_0*{_cap(verilog_param_name)}_PARALLELISM_DIM_0*{_cap(verilog_param_name)}_PARALLELISM_DIM_1-1:0] data_vector; + logic [{_cap(verilog_param_name)}_PRECISION_0*{_cap(verilog_param_name)}_PARALLELISM_DIM_0 * {_cap(verilog_param_name)}_PARALLELISM_DIM_1-1:0] data_vector; {node_param_name} #( .DATA_WIDTH({_cap(verilog_param_name)}_PRECISION_0 * {_cap(verilog_param_name)}_PARALLELISM_DIM_0 * {_cap(verilog_param_name)}_PARALLELISM_DIM_1), .ADDR_RANGE(OUT_DEPTH) @@ -172,7 +168,7 @@ def emit_parameters_in_mem_internal(node, param_name, file_name, data_name): for (genvar j = 0; j < {_cap(verilog_param_name)}_PARALLELISM_DIM_0 * {_cap(verilog_param_name)}_PARALLELISM_DIM_1; j++) assign data_out[j] = data_vector[{_cap(verilog_param_name)}_PRECISION_0*j+{_cap(verilog_param_name)}_PRECISION_0-1:{_cap(verilog_param_name)}_PRECISION_0*j]; - assign data_out_valid = clear == 2; + assign data_out_valid = 1; endmodule """ @@ -205,10 +201,11 @@ def emit_parameters_in_dat_internal(node, param_name, file_name): f"{_cap(verilog_param_name)}_PARALLELISM_DIM_1" ] ) - out_depth = int((total_size + out_size - 1) / out_size) + out_depth = int(total_size / out_size) data_buff = "" param_data = node.meta["mase"].module.get_parameter(param_name).data + print("param_data", param_data) if node.meta["mase"].parameters["hardware"]["interface"][verilog_param_name][ "transpose" ]: @@ -240,33 +237,27 @@ def emit_parameters_in_dat_internal(node, param_name, file_name): "precision" ][1] - if node.meta["mase"].module.config.get("floor", False): - base_quantizer = integer_floor_quantizer_for_hw - else: - base_quantizer = integer_quantizer_for_hw - scale = 2**frac_width thresh = 2**width for i in range(0, out_depth): line_buff = "" for j in range(0, out_size): - if i * out_size + out_size - 1 - j >= len(param_data): - value = 0 - else: - value = param_data[i * out_size + out_size - 1 - j] - - # TODO: please clear this up later - value = base_quantizer(torch.tensor(value), width, frac_width).item() - value = str(bin(value)) + value = param_data[i * out_size + out_size - 1 - j] + value = integer_quantizer_for_hw( + torch.tensor(value), width, frac_width + ).item() + print(width, frac_width) + print("value", value) + breakpoint() + value = str(bin(int(value * scale) % thresh)) value_bits = value[value.find("0b") + 2 :] value_bits = "0" * (width - len(value_bits)) + value_bits assert len(value_bits) == width - value_bits = hex(int(value_bits, 2)) - value_bits = value_bits[value_bits.find("0x") + 2 :] - value_bits = "0" * (width // 4 - len(value_bits)) + value_bits - line_buff = value_bits + line_buff + line_buff += value_bits + print("line_buff", line_buff) - data_buff += line_buff + "\n" + hex_buff = hex(int(line_buff, 2)) + data_buff += hex_buff[hex_buff.find("0x") + 2 :] + "\n" else: assert False, "Emitting non-fixed parameters is not supported." From a2b2038f76e6ca84e3503042c06c8438841e80d8 Mon Sep 17 00:00:00 2001 From: cx922 Date: Thu, 13 Mar 2025 01:07:21 +0000 Subject: [PATCH 2/6] fix bram bugs, and align floor linear --- src/chop/nn/quantizers/quantizers_for_hw.py | 8 +- .../graph/transforms/verilog/emit_bram.py | 76 ++-- .../graph/transforms/verilog/emit_tb.py | 4 +- src/mase_cocotb/utils.py | 36 +- src/mase_components/cast/rtl/fixed_round.sv | 11 +- .../fixed_linear_layer/rtl/fixed_linear.sv | 56 ++- .../rtl/fixed_linear_with_input_circular.sv | 386 ++++++++++++++++++ .../test/fixed_linear_tb.py | 237 ++++++----- .../fixed_linear_with_input_circular_tb.py | 287 +++++++++++++ .../linear_layers/matmul/rtl/matmul.sv | 23 +- .../memory/rtl/blk_mem_gen_0.sv | 22 +- .../memory/rtl/fifo_for_autogen.sv | 42 ++ .../memory/rtl/input_buffer.sv | 4 +- src/mase_components/memory/rtl/matrix_bank.sv | 72 +++- .../memory/rtl/matrix_bank_pkg.sv | 1 + 15 files changed, 1067 insertions(+), 198 deletions(-) create mode 100644 src/mase_components/linear_layers/fixed_linear_layer/rtl/fixed_linear_with_input_circular.sv create mode 100644 src/mase_components/linear_layers/fixed_linear_layer/test/fixed_linear_with_input_circular_tb.py create mode 100644 src/mase_components/memory/rtl/fifo_for_autogen.sv diff --git a/src/chop/nn/quantizers/quantizers_for_hw.py b/src/chop/nn/quantizers/quantizers_for_hw.py index d5ca3d8cf..4a90e2a38 100644 --- a/src/chop/nn/quantizers/quantizers_for_hw.py +++ b/src/chop/nn/quantizers/quantizers_for_hw.py @@ -3,15 +3,15 @@ import torch.nn.functional as F from torch import Tensor -# from .quantizers import integer_quantizer +from .integer import integer_quantizer, integer_floor_quantizer from .utils import block, my_clamp, my_round, unblock, my_floor -def integer_quantizer_for_hw(x: Tensor, width: int, frac_width: int): +def integer_quantizer_for_hw(x: Tensor, width: int, frac_width: int, floor=False): thresh = 2 ** (width - 1) scale = 2**frac_width - - fixed_point_value = my_clamp(my_round(x.mul(scale)), -thresh, thresh - 1) + base_quantizer = integer_floor_quantizer if floor else integer_quantizer + fixed_point_value = base_quantizer(x, width, frac_width) * scale fixed_point_value = fixed_point_value.to(torch.int) fixed_point_value = fixed_point_value % (2**width) return fixed_point_value diff --git a/src/chop/passes/graph/transforms/verilog/emit_bram.py b/src/chop/passes/graph/transforms/verilog/emit_bram.py index 589962274..6060776fb 100644 --- a/src/chop/passes/graph/transforms/verilog/emit_bram.py +++ b/src/chop/passes/graph/transforms/verilog/emit_bram.py @@ -84,7 +84,7 @@ def emit_parameters_in_mem_internal(node, param_name, file_name, data_name): logic [DWIDTH-1:0] q0_t1; initial begin - $readmemh("{data_name}", ram); + $readmemb("{data_name}", ram); end assign q0 = q0_t1; @@ -119,14 +119,14 @@ def emit_parameters_in_mem_internal(node, param_name, file_name, data_name): `timescale 1ns / 1ps module {node_param_name}_source #( - parameter {_cap(verilog_param_name)}_TENSOR_SIZE_DIM_0 = 32, - parameter {_cap(verilog_param_name)}_TENSOR_SIZE_DIM_1 = 1, - parameter {_cap(verilog_param_name)}_PRECISION_0 = 16, - parameter {_cap(verilog_param_name)}_PRECISION_1 = 3, - - parameter {_cap(verilog_param_name)}_PARALLELISM_DIM_0 = 1, - parameter {_cap(verilog_param_name)}_PARALLELISM_DIM_1 = 1, - parameter OUT_DEPTH = {_cap(verilog_param_name)}_TENSOR_SIZE_DIM_0 * {_cap(verilog_param_name)}_TENSOR_SIZE_DIM_1 / ({_cap(verilog_param_name)}_PARALLELISM_DIM_0 * {_cap(verilog_param_name)}_PARALLELISM_DIM_1) + parameter {_cap(verilog_param_name)}_TENSOR_SIZE_DIM_0 = -1, + parameter {_cap(verilog_param_name)}_TENSOR_SIZE_DIM_1 = -1, + parameter {_cap(verilog_param_name)}_PRECISION_0 = -1, + parameter {_cap(verilog_param_name)}_PRECISION_1 = -1, + + parameter {_cap(verilog_param_name)}_PARALLELISM_DIM_0 = -1, + parameter {_cap(verilog_param_name)}_PARALLELISM_DIM_1 = -1, + parameter OUT_DEPTH = ({_cap(verilog_param_name)}_TENSOR_SIZE_DIM_0 / {_cap(verilog_param_name)}_PARALLELISM_DIM_0) * ({_cap(verilog_param_name)}_TENSOR_SIZE_DIM_1 / {_cap(verilog_param_name)}_PARALLELISM_DIM_1) ) ( input clk, input rst, @@ -138,7 +138,6 @@ def emit_parameters_in_mem_internal(node, param_name, file_name, data_name): // 1-bit wider so IN_DEPTH also fits. localparam COUNTER_WIDTH = $clog2(OUT_DEPTH); logic [COUNTER_WIDTH:0] counter; - always_ff @(posedge clk) if (rst) counter <= 0; else begin @@ -147,11 +146,14 @@ def emit_parameters_in_mem_internal(node, param_name, file_name, data_name): else counter <= counter + 1; end end - + logic [1:0] clear; + always_ff @(posedge clk) + if (rst) clear <= 0; + else if ((data_out_ready == 1) && (clear != 2)) clear <= clear + 1; logic ce0; - assign ce0 = 1; + assign ce0 = data_out_ready; - logic [{_cap(verilog_param_name)}_PRECISION_0*{_cap(verilog_param_name)}_PARALLELISM_DIM_0 * {_cap(verilog_param_name)}_PARALLELISM_DIM_1-1:0] data_vector; + logic [{_cap(verilog_param_name)}_PRECISION_0*{_cap(verilog_param_name)}_PARALLELISM_DIM_0*{_cap(verilog_param_name)}_PARALLELISM_DIM_1-1:0] data_vector; {node_param_name} #( .DATA_WIDTH({_cap(verilog_param_name)}_PRECISION_0 * {_cap(verilog_param_name)}_PARALLELISM_DIM_0 * {_cap(verilog_param_name)}_PARALLELISM_DIM_1), .ADDR_RANGE(OUT_DEPTH) @@ -168,7 +170,7 @@ def emit_parameters_in_mem_internal(node, param_name, file_name, data_name): for (genvar j = 0; j < {_cap(verilog_param_name)}_PARALLELISM_DIM_0 * {_cap(verilog_param_name)}_PARALLELISM_DIM_1; j++) assign data_out[j] = data_vector[{_cap(verilog_param_name)}_PRECISION_0*j+{_cap(verilog_param_name)}_PRECISION_0-1:{_cap(verilog_param_name)}_PRECISION_0*j]; - assign data_out_valid = 1; + assign data_out_valid = clear == 2; endmodule """ @@ -205,27 +207,35 @@ def emit_parameters_in_dat_internal(node, param_name, file_name): data_buff = "" param_data = node.meta["mase"].module.get_parameter(param_name).data - print("param_data", param_data) + param_meta = node.meta["mase"].parameters["hardware"]["verilog_param"] if node.meta["mase"].parameters["hardware"]["interface"][verilog_param_name][ "transpose" ]: + raise NotImplementedError("only support linear with not tranposed weight") + else: + assert ( + param_meta[f"{_cap(verilog_param_name)}_TENSOR_SIZE_DIM_1"] + % param_meta[f"{_cap(verilog_param_name)}_PARALLELISM_DIM_1"] + == 0 + ) and ( + param_meta[f"{_cap(verilog_param_name)}_TENSOR_SIZE_DIM_0"] + % param_meta[f"{_cap(verilog_param_name)}_PARALLELISM_DIM_0"] + == 0 + ), "The parallesim parameter must be divisible by the tensor size parameter." param_data = torch.reshape( param_data, ( - node.meta["mase"].parameters["hardware"]["verilog_param"][ - "DATA_OUT_0_SIZE" - ], - node.meta["mase"].parameters["hardware"]["verilog_param"][ - "DATA_IN_0_DEPTH" - ], - node.meta["mase"].parameters["hardware"]["verilog_param"][ - "DATA_IN_0_SIZE" - ], + -1, + param_meta[f"{_cap(verilog_param_name)}_TENSOR_SIZE_DIM_1"] + // param_meta[f"{_cap(verilog_param_name)}_PARALLELISM_DIM_1"], + param_meta[f"{_cap(verilog_param_name)}_PARALLELISM_DIM_1"], + param_meta[f"{_cap(verilog_param_name)}_TENSOR_SIZE_DIM_0"] + // param_meta[f"{_cap(verilog_param_name)}_PARALLELISM_DIM_0"], + param_meta[f"{_cap(verilog_param_name)}_PARALLELISM_DIM_0"], ), ) - param_data = torch.transpose(param_data, 0, 1) + param_data = param_data.permute(0, 1, 3, 2, 4) param_data = torch.flatten(param_data).tolist() - if ( node.meta["mase"].parameters["common"]["args"][verilog_param_name]["type"] == "fixed" @@ -242,22 +252,18 @@ def emit_parameters_in_dat_internal(node, param_name, file_name): for i in range(0, out_depth): line_buff = "" for j in range(0, out_size): - value = param_data[i * out_size + out_size - 1 - j] + value = param_data[i * out_size + j] value = integer_quantizer_for_hw( - torch.tensor(value), width, frac_width + torch.tensor(value), width, frac_width, floor=True ).item() - print(width, frac_width) - print("value", value) - breakpoint() - value = str(bin(int(value * scale) % thresh)) + value = str(bin(value)) value_bits = value[value.find("0b") + 2 :] value_bits = "0" * (width - len(value_bits)) + value_bits assert len(value_bits) == width line_buff += value_bits - print("line_buff", line_buff) - hex_buff = hex(int(line_buff, 2)) - data_buff += hex_buff[hex_buff.find("0x") + 2 :] + "\n" + + data_buff += line_buff + "\n" else: assert False, "Emitting non-fixed parameters is not supported." diff --git a/src/chop/passes/graph/transforms/verilog/emit_tb.py b/src/chop/passes/graph/transforms/verilog/emit_tb.py index ad72954f8..83857a1b4 100644 --- a/src/chop/passes/graph/transforms/verilog/emit_tb.py +++ b/src/chop/passes/graph/transforms/verilog/emit_tb.py @@ -12,8 +12,6 @@ from pathlib import Path -torch.manual_seed(0) - import cocotb from mase_cocotb.testbench import Testbench from mase_cocotb.interfaces.streaming import StreamDriver, StreamMonitor @@ -147,6 +145,7 @@ def load_drivers(self, in_tensors): self.get_parameter(f"{_cap(arg)}_PARALLELISM_DIM_1"), self.get_parameter(f"{_cap(arg)}_PARALLELISM_DIM_0"), ], + floor=True, ) else: @@ -177,6 +176,7 @@ def load_monitors(self, expectation): self.get_parameter(f"DATA_OUT_0_PARALLELISM_DIM_1"), self.get_parameter(f"DATA_OUT_0_PARALLELISM_DIM_0"), ], + floor=True, ) # Set expectation for each monitor diff --git a/src/mase_cocotb/utils.py b/src/mase_cocotb/utils.py index 7271389f2..d62d37694 100644 --- a/src/mase_cocotb/utils.py +++ b/src/mase_cocotb/utils.py @@ -101,9 +101,11 @@ def product_dict(**kwargs): yield dict(zip(keys, instance)) -def fixed_preprocess_tensor(tensor: Tensor, q_config: dict, parallelism: list) -> list: +def fixed_preprocess_tensor( + tensor: Tensor, q_config: dict, parallelism: list, floor=False +) -> list: """Preprocess a tensor before driving it into the DUT. - 1. Quantize to requested fixed-point precision using floor rounding. + 1. Quantize to requested fixed-point precision. 2. Convert to integer format to be compatible with Cocotb drivers. 3. Split into blocks according to parallelism in each dimension. @@ -125,11 +127,13 @@ def fixed_preprocess_tensor(tensor: Tensor, q_config: dict, parallelism: list) - tensor = tensor.view((-1, tensor.shape[-1])) # Quantize - quantizer = partial(integer_floor_quantizer, **q_config) + base_quantizer = integer_floor_quantizer if floor else integer_quantizer + quantizer = partial(base_quantizer, **q_config) q_tensor = quantizer(tensor) - + # breakpoint() # Convert to integer format q_tensor = (q_tensor * 2 ** q_config["frac_width"]).int() + # q_tensor = signed_to_unsigned(q_tensor, bits=q_config["width"]) # Split into chunks according to parallelism in each dimension # parallelism[0]: along rows, parallelism[1]: along columns @@ -174,3 +178,27 @@ def fixed_cast(val, in_width, in_frac_width, out_width, out_frac_width): val = val # val = int(val % (1 << out_width)) return val # << out_frac_width # treat data as data + + +async def check_signal(dut, log, signal_list): + # TODO: support count start + # TODO: support checking signal with different name in valid and ready signal + def handshake_signal_check( + dut, log, signal_base, valid=None, ready=None, count_start: dict = {} + ): + data_valid = getattr(dut, f"{signal_base}_valid") if valid is None else valid + data_ready = getattr(dut, f"{signal_base}_ready") if ready is None else ready + data = getattr(dut, signal_base) + svalue = [i.signed_integer for i in data.value] + if data_valid.value & data_ready.value: + count_start[signal_base] = ( + count_start[signal_base] + 1 + if count_start.get(signal_base) is not None + else " " + ) + log.debug(f"handshake {count_start[signal_base]} {signal_base} = {svalue}") + + while True: + await RisingEdge(dut.clk) + for signal in signal_list: + handshake_signal_check(dut, log, signal) diff --git a/src/mase_components/cast/rtl/fixed_round.sv b/src/mase_components/cast/rtl/fixed_round.sv index a49b43001..2a5291053 100644 --- a/src/mase_components/cast/rtl/fixed_round.sv +++ b/src/mase_components/cast/rtl/fixed_round.sv @@ -24,12 +24,13 @@ module fixed_round #( logic carry_in, input_sign; assign input_sign = data_in[IN_WIDTH-1]; assign input_data = (input_sign) ? ~(data_in[IN_WIDTH-2:0] - 1) : data_in[IN_WIDTH-2:0]; - /* verilator lint_off SELRANGE */ + logic [IN_WIDTH + OUT_FRAC_WIDTH - 1:0] lsb_check; + assign lsb_check = {input_data, {(OUT_FRAC_WIDTH) {1'b0}}}; always_comb begin - lsb_below[2] = (IN_FRAC_WIDTH >= OUT_FRAC_WIDTH) ? input_data[IN_FRAC_WIDTH-OUT_FRAC_WIDTH] : 0; - lsb_below[1] = (IN_FRAC_WIDTH-1 >= OUT_FRAC_WIDTH) ? input_data[IN_FRAC_WIDTH-OUT_FRAC_WIDTH-1] : 0; - // lsb_below[0] = (IN_FRAC_WIDTH-2 >= OUT_FRAC_WIDTH) ? |(input_data[IN_FRAC_WIDTH-OUT_FRAC_WIDTH-2:0]): 0; - lsb_below[0] = '0; // to do: fix + lsb_below[2] = (IN_FRAC_WIDTH >= OUT_FRAC_WIDTH) ? lsb_check[IN_FRAC_WIDTH] : 0; + lsb_below[1] = (IN_FRAC_WIDTH-1 >= OUT_FRAC_WIDTH) ? lsb_check[IN_FRAC_WIDTH-1] : 0; + lsb_below[0] = (IN_FRAC_WIDTH-2 >= OUT_FRAC_WIDTH) ? |(lsb_check[IN_FRAC_WIDTH-2:0]): 0; + // lsb_below[0] = '0; // to do: fix end always_comb begin if ((IN_FRAC_WIDTH - OUT_FRAC_WIDTH) >= 0) diff --git a/src/mase_components/linear_layers/fixed_linear_layer/rtl/fixed_linear.sv b/src/mase_components/linear_layers/fixed_linear_layer/rtl/fixed_linear.sv index 1e0edce53..e6b1aeac6 100644 --- a/src/mase_components/linear_layers/fixed_linear_layer/rtl/fixed_linear.sv +++ b/src/mase_components/linear_layers/fixed_linear_layer/rtl/fixed_linear.sv @@ -19,6 +19,7 @@ module fixed_linear #( /* verilator lint_off UNUSEDPARAM */ parameter HAS_BIAS = 1, parameter WEIGHTS_PRE_TRANSPOSED = 0, + parameter FIFO = 1, parameter DATA_IN_0_PRECISION_0 = 16, parameter DATA_IN_0_PRECISION_1 = 3, @@ -112,7 +113,10 @@ module fixed_linear #( logic [DATA_OUT_0_PRECISION_0 - 1:0] add_bias_in_casted [DATA_OUT_0_PARALLELISM_DIM_0*DATA_OUT_0_PARALLELISM_DIM_1-1:0]; logic add_bias_in_valid; logic add_bias_in_ready; - + + logic [DATA_OUT_0_PRECISION_0 - 1:0] rounding_out [DATA_OUT_0_PARALLELISM_DIM_0*DATA_OUT_0_PARALLELISM_DIM_1-1:0]; + logic rounding_out_valid; + logic rounding_out_ready; // * Instances // * --------------------------------------------------------------------------------------------------- @@ -223,9 +227,9 @@ module fixed_linear #( .data_in_valid(add_bias_in_valid), .data_in_ready(add_bias_in_ready), - .data_out(data_out_0), - .data_out_valid(data_out_0_valid), - .data_out_ready(data_out_0_ready) + .data_out(rounding_out), + .data_out_valid(rounding_out_valid), + .data_out_ready(rounding_out_ready) ); end @@ -245,7 +249,7 @@ module fixed_linear #( // * Add bias if (HAS_BIAS == 1) begin - fixed_cast #( + fixed_rounding #( .IN_SIZE (BIAS_PARALLELISM_DIM_0 * BIAS_PARALLELISM_DIM_1), .IN_WIDTH (BIAS_PRECISION_0), .IN_FRAC_WIDTH (BIAS_PRECISION_1), @@ -275,10 +279,46 @@ module fixed_linear #( .OUT_FRAC_WIDTH(DATA_OUT_0_PRECISION_1) ) output_cast ( .data_in (matmul_out), - .data_out(data_out_0) + .data_out(rounding_out) ); - assign data_out_0_valid = matmul_out_valid; - assign matmul_out_ready = data_out_0_ready; + assign rounding_out_valid = matmul_out_valid; + assign matmul_out_ready = rounding_out_ready; end + if (FIFO == 1) begin + localparam FIFO_DEPTH = DATA_OUT_0_TENSOR_SIZE_DIM_0 / DATA_OUT_0_PARALLELISM_DIM_0; + + fifo_for_autogen #( + .DATA_IN_0_PRECISION_0(DATA_OUT_0_PRECISION_0), // = 8 + .DATA_IN_0_PRECISION_1(DATA_OUT_0_PRECISION_1), // = 4 + .DATA_IN_0_TENSOR_SIZE_DIM_0(DATA_OUT_0_TENSOR_SIZE_DIM_0), // = 20 + .DATA_IN_0_PARALLELISM_DIM_0(DATA_OUT_0_PARALLELISM_DIM_0), // = 2 + .DATA_IN_0_TENSOR_SIZE_DIM_1(DATA_OUT_0_TENSOR_SIZE_DIM_1), // = 4 + .DATA_IN_0_PARALLELISM_DIM_1(DATA_OUT_0_PARALLELISM_DIM_1), // = 2 + .DEPTH(FIFO_DEPTH), + .DATA_OUT_0_PRECISION_0(DATA_OUT_0_PRECISION_0), + .DATA_OUT_0_PRECISION_1(DATA_OUT_0_PRECISION_1), + .DATA_OUT_0_TENSOR_SIZE_DIM_0(DATA_OUT_0_TENSOR_SIZE_DIM_0), + .DATA_OUT_0_PARALLELISM_DIM_0(DATA_OUT_0_PARALLELISM_DIM_0), + .DATA_OUT_0_TENSOR_SIZE_DIM_1(DATA_OUT_0_TENSOR_SIZE_DIM_1), + .DATA_OUT_0_PARALLELISM_DIM_1(DATA_OUT_0_PARALLELISM_DIM_1) + ) fifo_1_inst ( + .clk(clk), + .rst(rst), + + .data_in_0(rounding_out), + .data_in_0_valid(rounding_out_valid), + .data_in_0_ready(rounding_out_ready), + .data_out_0(data_out_0), + .data_out_0_valid(data_out_0_valid), + .data_out_0_ready(data_out_0_ready) + ); + end + else begin + always_comb begin + data_out_0 = rounding_out; + data_out_0_valid = rounding_out_valid; + rounding_out_ready = data_out_0_ready; + end + end endmodule diff --git a/src/mase_components/linear_layers/fixed_linear_layer/rtl/fixed_linear_with_input_circular.sv b/src/mase_components/linear_layers/fixed_linear_layer/rtl/fixed_linear_with_input_circular.sv new file mode 100644 index 000000000..eb253bec5 --- /dev/null +++ b/src/mase_components/linear_layers/fixed_linear_layer/rtl/fixed_linear_with_input_circular.sv @@ -0,0 +1,386 @@ +`timescale 1ns / 1ps + +/* + * +*/ + +module fixed_linear_with_input_circular #( + /* verilator lint_off UNUSEDPARAM */ + parameter HAS_BIAS = 1, + parameter FIFO = 1, + + parameter DATA_IN_0_PRECISION_0 = 16, + parameter DATA_IN_0_PRECISION_1 = 3, + parameter DATA_IN_0_TENSOR_SIZE_DIM_0 = 20, + parameter DATA_IN_0_TENSOR_SIZE_DIM_1 = 20, + parameter DATA_IN_0_PARALLELISM_DIM_0 = 4, // must equal WEIGHT_PARALLELISM_DIM_1 + parameter DATA_IN_0_PARALLELISM_DIM_1 = 4, + localparam IN_0_DEPTH_DIM_0 = DATA_IN_0_TENSOR_SIZE_DIM_0 / DATA_IN_0_PARALLELISM_DIM_0, + localparam IN_0_DEPTH_DIM_1 = DATA_IN_0_TENSOR_SIZE_DIM_1 / DATA_IN_0_PARALLELISM_DIM_1, + + parameter WEIGHT_PRECISION_0 = 16, + parameter WEIGHT_PRECISION_1 = 3, + parameter WEIGHT_TENSOR_SIZE_DIM_0 = 20, + parameter WEIGHT_TENSOR_SIZE_DIM_1 = 20, + parameter WEIGHT_PARALLELISM_DIM_0 = 4, + parameter WEIGHT_PARALLELISM_DIM_1 = 4, + + // Inferred precision of the output data + // if the data out precision will be replaced by the setting + parameter DATA_OUT_0_PRECISION_0 = DATA_IN_0_PRECISION_0 + WEIGHT_PRECISION_0 + $clog2( + DATA_IN_0_TENSOR_SIZE_DIM_0 + ) + HAS_BIAS, + parameter DATA_OUT_0_PRECISION_1 = DATA_IN_0_PRECISION_1 + WEIGHT_PRECISION_1, + parameter DATA_OUT_0_TENSOR_SIZE_DIM_0 = WEIGHT_TENSOR_SIZE_DIM_1, + parameter DATA_OUT_0_TENSOR_SIZE_DIM_1 = DATA_IN_0_TENSOR_SIZE_DIM_1, + parameter DATA_OUT_0_PARALLELISM_DIM_0 = WEIGHT_PARALLELISM_DIM_1, + parameter DATA_OUT_0_PARALLELISM_DIM_1 = DATA_IN_0_PARALLELISM_DIM_1, + + parameter BIAS_PRECISION_0 = 16, + parameter BIAS_PRECISION_1 = 3, + parameter BIAS_TENSOR_SIZE_DIM_0 = DATA_OUT_0_TENSOR_SIZE_DIM_0, + parameter BIAS_TENSOR_SIZE_DIM_1 = 1, + parameter BIAS_PARALLELISM_DIM_0 = DATA_OUT_0_PARALLELISM_DIM_0, + parameter BIAS_PARALLELISM_DIM_1 = 1 +) ( + input clk, + input rst, + + // input port for data_inivations + input logic [DATA_IN_0_PRECISION_0-1:0] data_in_0 [DATA_IN_0_PARALLELISM_DIM_0*DATA_IN_0_PARALLELISM_DIM_1-1:0], + input logic data_in_0_valid, + output logic data_in_0_ready, + + // input port for weight + input logic [WEIGHT_PRECISION_0-1:0] weight [WEIGHT_PARALLELISM_DIM_0 * WEIGHT_PARALLELISM_DIM_1-1:0], + input logic weight_valid, + output logic weight_ready, + + input logic [BIAS_PRECISION_0-1:0] bias[BIAS_PARALLELISM_DIM_0 * BIAS_PARALLELISM_DIM_1 -1:0], + input logic bias_valid, + output logic bias_ready, + + output logic [DATA_OUT_0_PRECISION_0-1:0] data_out_0 [DATA_OUT_0_PARALLELISM_DIM_0*DATA_OUT_0_PARALLELISM_DIM_1-1:0], + output logic data_out_0_valid, + input logic data_out_0_ready +); + logic [DATA_IN_0_PRECISION_0-1:0]circular_data_in_0[DATA_IN_0_PARALLELISM_DIM_0 * DATA_IN_0_PARALLELISM_DIM_1-1:0]; + logic circular_data_in_0_valid, circular_data_in_0_ready; + logic [WEIGHT_PRECISION_0-1:0]circular_weight[WEIGHT_PARALLELISM_DIM_0 * WEIGHT_PARALLELISM_DIM_1-1:0]; + logic circular_weight_valid, circular_weight_ready; + logic [BIAS_PRECISION_0-1:0]circular_bias[BIAS_PARALLELISM_DIM_0 * BIAS_PARALLELISM_DIM_1-1:0]; + logic circular_bias_valid, circular_bias_ready; + logic [DATA_IN_0_PRECISION_0-1:0] data_in_0_reg [DATA_IN_0_PARALLELISM_DIM_0*DATA_IN_0_PARALLELISM_DIM_1-1:0]; + logic data_in_0_reg_valid, data_in_0_reg_ready; + if (FIFO == 1) begin + localparam FIFO_DEPTH = DATA_IN_0_TENSOR_SIZE_DIM_0 / DATA_IN_0_PARALLELISM_DIM_0; + + fifo_for_autogen #( + .DATA_IN_0_PRECISION_0(DATA_IN_0_PRECISION_0), // = 8 + .DATA_IN_0_PRECISION_1(DATA_IN_0_PRECISION_1), // = 4 + .DATA_IN_0_TENSOR_SIZE_DIM_0(DATA_IN_0_TENSOR_SIZE_DIM_0), // = 20 + .DATA_IN_0_PARALLELISM_DIM_0(DATA_IN_0_PARALLELISM_DIM_0), // = 2 + .DATA_IN_0_TENSOR_SIZE_DIM_1(DATA_IN_0_TENSOR_SIZE_DIM_1), // = 4 + .DATA_IN_0_PARALLELISM_DIM_1(DATA_IN_0_PARALLELISM_DIM_1), // = 2 + .DEPTH(FIFO_DEPTH), + .DATA_OUT_0_PRECISION_0(DATA_IN_0_PRECISION_0), + .DATA_OUT_0_PRECISION_1(DATA_IN_0_PRECISION_1), + .DATA_OUT_0_TENSOR_SIZE_DIM_0(DATA_IN_0_TENSOR_SIZE_DIM_0), + .DATA_OUT_0_PARALLELISM_DIM_0(DATA_IN_0_PARALLELISM_DIM_0), + .DATA_OUT_0_TENSOR_SIZE_DIM_1(DATA_IN_0_TENSOR_SIZE_DIM_1), + .DATA_OUT_0_PARALLELISM_DIM_1(DATA_IN_0_PARALLELISM_DIM_1) + ) fifo_1_inst ( + .clk(clk), + .rst(rst), + .data_in_0(data_in_0), + .data_in_0_valid(data_in_0_valid), + .data_in_0_ready(data_in_0_ready), + .data_out_0(data_in_0_reg), + .data_out_0_valid(data_in_0_reg_valid), + .data_out_0_ready(data_in_0_reg_ready) + ); + end + else begin + always_comb begin + data_in_0_reg = data_in_0; + data_in_0_reg_valid = data_in_0_valid; + data_in_0_ready = data_in_0_reg_ready; + end + end + input_buffer #( + .DATA_WIDTH (DATA_IN_0_PRECISION_0), + .IN_NUM (DATA_IN_0_PARALLELISM_DIM_0 * DATA_IN_0_PARALLELISM_DIM_1), + .REPEAT (WEIGHT_TENSOR_SIZE_DIM_1 / WEIGHT_PARALLELISM_DIM_1), + .BUFFER_SIZE(IN_0_DEPTH_DIM_0) + ) data_in_0_buffer ( + .clk, + .rst, + // Input streaming port + .data_in(data_in_0_reg), + .data_in_valid(data_in_0_reg_valid), + .data_in_ready(data_in_0_reg_ready), + // Output streaming port + .data_out(circular_data_in_0), + .data_out_valid(circular_data_in_0_valid), + .data_out_ready(circular_data_in_0_ready) + ); + input_buffer #( + .DATA_WIDTH(WEIGHT_PRECISION_0), + .IN_NUM(WEIGHT_PARALLELISM_DIM_0 * WEIGHT_PARALLELISM_DIM_1), + .REPEAT(IN_0_DEPTH_DIM_1), + .BUFFER_SIZE(WEIGHT_TENSOR_SIZE_DIM_0*WEIGHT_TENSOR_SIZE_DIM_1 / (WEIGHT_PARALLELISM_DIM_0*WEIGHT_PARALLELISM_DIM_1)) + ) weight_buffer ( + .clk, + .rst, + // Input streaming port + .data_in(weight), + .data_in_valid(weight_valid), + .data_in_ready(weight_ready), + // Output streaming port + .data_out(circular_weight), + .data_out_valid(circular_weight_valid), + .data_out_ready(circular_weight_ready) + ); + input_buffer #( + .DATA_WIDTH (BIAS_PRECISION_0), + .IN_NUM (BIAS_PARALLELISM_DIM_0), + .REPEAT (IN_0_DEPTH_DIM_1), + .BUFFER_SIZE(BIAS_TENSOR_SIZE_DIM_0 / (BIAS_PARALLELISM_DIM_0)) + ) bias_buffer ( + .clk, + .rst, + // Input streaming port + .data_in(bias), + .data_in_valid(bias_valid), + .data_in_ready(bias_ready), + // Output streaming port + .data_out(circular_bias), + .data_out_valid(circular_bias_valid), + .data_out_ready(circular_bias_ready) + ); + logic [DATA_OUT_0_PARALLELISM_DIM_1 - 1:0] + linear_1d_data_in_0_ready, + linear_1d_weight_ready, + linear_1d_bias_ready, + linear_1d_data_out_valid; + always_comb begin + circular_data_in_0_ready = linear_1d_data_in_0_ready[0]; + circular_weight_ready = linear_1d_weight_ready[0]; + circular_bias_ready = linear_1d_bias_ready[0]; + data_out_0_valid = linear_1d_data_out_valid[0]; + end + for (genvar i = 0; i < DATA_OUT_0_PARALLELISM_DIM_1; i = i + 1) begin + linear_1d #( + .HAS_BIAS(HAS_BIAS), + + .DATA_IN_0_PRECISION_0 (DATA_IN_0_PRECISION_0), + .DATA_IN_0_PRECISION_1 (DATA_IN_0_PRECISION_1), + .DATA_IN_0_TENSOR_SIZE_DIM_0(DATA_IN_0_TENSOR_SIZE_DIM_0), + .DATA_IN_0_PARALLELISM_DIM_0(DATA_IN_0_PARALLELISM_DIM_0), + + .WEIGHT_PRECISION_0 (WEIGHT_PRECISION_0), + .WEIGHT_PRECISION_1 (WEIGHT_PRECISION_1), + .WEIGHT_TENSOR_SIZE_DIM_0(WEIGHT_TENSOR_SIZE_DIM_0), + .WEIGHT_TENSOR_SIZE_DIM_1(WEIGHT_TENSOR_SIZE_DIM_1), + .WEIGHT_PARALLELISM_DIM_0(WEIGHT_PARALLELISM_DIM_0), + .WEIGHT_PARALLELISM_DIM_1(WEIGHT_PARALLELISM_DIM_1), + + .BIAS_PRECISION_0 (BIAS_PRECISION_0), + .BIAS_PRECISION_1 (BIAS_PRECISION_1), + .BIAS_TENSOR_SIZE_DIM_0(BIAS_TENSOR_SIZE_DIM_0), + .BIAS_PARALLELISM_DIM_0(BIAS_PARALLELISM_DIM_0), + + .DATA_OUT_0_PRECISION_0(DATA_OUT_0_PRECISION_0), + .DATA_OUT_0_PRECISION_1(DATA_OUT_0_PRECISION_1) + ) fixed_linear ( + .clk, + .rst, + .data_in_0 (circular_data_in_0[(i+1) * DATA_IN_0_PARALLELISM_DIM_0 - 1:i * DATA_IN_0_PARALLELISM_DIM_0]), + .data_in_0_valid(circular_data_in_0_valid), + .data_in_0_ready(linear_1d_data_in_0_ready[i]), + .weight(circular_weight), + .weight_valid(circular_weight_valid), + .weight_ready(linear_1d_weight_ready[i]), + .bias(circular_bias), + .bias_valid(circular_bias_valid), + .bias_ready(linear_1d_bias_ready[i]), + .data_out_0 (data_out_0[(i+1) * DATA_OUT_0_PARALLELISM_DIM_0- 1:i * DATA_OUT_0_PARALLELISM_DIM_0]), + .data_out_0_valid(linear_1d_data_out_valid[i]), + .data_out_0_ready(data_out_0_ready) + ); + end +endmodule +module linear_1d #( + /* verilator lint_off UNUSEDPARAM */ + parameter HAS_BIAS = 0, + + parameter DATA_IN_0_PRECISION_0 = 16, + parameter DATA_IN_0_PRECISION_1 = 3, + parameter DATA_IN_0_TENSOR_SIZE_DIM_0 = 1, + parameter DATA_IN_0_PARALLELISM_DIM_0 = 1, + parameter IN_0_DEPTH = DATA_IN_0_TENSOR_SIZE_DIM_0 / DATA_IN_0_PARALLELISM_DIM_0, + + parameter WEIGHT_PRECISION_0 = 16, + parameter WEIGHT_PRECISION_1 = 3, + parameter WEIGHT_TENSOR_SIZE_DIM_1 = 32, + parameter WEIGHT_TENSOR_SIZE_DIM_0 = 1, + parameter WEIGHT_PARALLELISM_DIM_1 = 4, + parameter WEIGHT_PARALLELISM_DIM_0 = 1, + + parameter DATA_OUT_0_PRECISION_0 = 8, + parameter DATA_OUT_0_PRECISION_1 = 4, + parameter DATA_OUT_0_TENSOR_SIZE_DIM_0 = WEIGHT_TENSOR_SIZE_DIM_1, + parameter DATA_OUT_0_PARALLELISM_DIM_0 = WEIGHT_PARALLELISM_DIM_1, + + parameter BIAS_PRECISION_0 = 16, + parameter BIAS_PRECISION_1 = 3, + parameter BIAS_TENSOR_SIZE_DIM_0 = DATA_OUT_0_TENSOR_SIZE_DIM_0, + parameter BIAS_PARALLELISM_DIM_0 = DATA_OUT_0_PARALLELISM_DIM_0 + +) ( + input clk, + input rst, + + // input port for data_inivations + input [DATA_IN_0_PRECISION_0-1:0] data_in_0 [DATA_IN_0_PARALLELISM_DIM_0-1:0], + input data_in_0_valid, + output data_in_0_ready, + + // input port for weight + input [WEIGHT_PRECISION_0-1:0] weight [WEIGHT_PARALLELISM_DIM_1 * WEIGHT_PARALLELISM_DIM_0-1:0], + input weight_valid, + output weight_ready, + + /* verilator lint_off UNUSEDSIGNAL */ + input [BIAS_PRECISION_0-1:0] bias[BIAS_PARALLELISM_DIM_0-1:0], + input bias_valid, + /* verilator lint_on UNUSEDSIGNAL */ + output bias_ready, + + output [DATA_OUT_0_PRECISION_0-1:0] data_out_0 [DATA_OUT_0_PARALLELISM_DIM_0-1:0], + output data_out_0_valid, + input data_out_0_ready +); + + localparam FDP_WIDTH = DATA_IN_0_PRECISION_0 + WEIGHT_PRECISION_0 + $clog2( + DATA_IN_0_PARALLELISM_DIM_0 + ); + localparam ACC_WIDTH = FDP_WIDTH + $clog2(IN_0_DEPTH); + localparam LOSSLESS_OUT_WIDTH = ACC_WIDTH + HAS_BIAS; + logic fdp_join_valid, fdp_join_ready; + join2 #() fdp_join_inst ( + .data_in_ready ({weight_ready, data_in_0_ready}), + .data_in_valid ({weight_valid, data_in_0_valid}), + .data_out_valid(fdp_join_valid), + .data_out_ready(fdp_join_ready) + ); + + /* verilator lint_off UNUSEDSIGNAL */ + // Assume the parallelised hardware above have the same arrival time + // which means that they always have the same state. So we can just + // pick one of the valid signal to use. + logic [WEIGHT_PARALLELISM_DIM_1-1:0] fdp_data_ready, fdp_weight_ready; + assign fdp_join_ready = fdp_data_ready[0]; + /* verilator lint_on UNUSEDSIGNAL */ + + logic acc_ready; + logic [ ACC_WIDTH-1:0] acc_data_out [DATA_OUT_0_PARALLELISM_DIM_0-1:0]; + logic [LOSSLESS_OUT_WIDTH-1:0] cast_data_out_0[DATA_OUT_0_PARALLELISM_DIM_0-1:0]; + // There are WEIGHT_PARALLELISM_DIM_0 number of dot product instances with DATA_IN_0_TENSOR_SIZE_DIM_0 inputs + // and each one computes for IN_0_DEPTH iterations for each inputs. + for (genvar i = 0; i < WEIGHT_PARALLELISM_DIM_1; i = i + 1) begin : linear + // Assume the weight are transposed and partitioned + logic [WEIGHT_PRECISION_0-1:0] current_weight[DATA_IN_0_PARALLELISM_DIM_0-1:0]; + assign current_weight = weight[DATA_IN_0_PARALLELISM_DIM_0*(i+1)-1:DATA_IN_0_PARALLELISM_DIM_0*i]; + + logic [FDP_WIDTH-1:0] fdp_data_out; + logic fdp_data_out_valid, fdp_data_out_ready; + + // The inputs are already sync-ed by the previous join + fixed_dot_product #( + .IN_WIDTH(DATA_IN_0_PRECISION_0), + .WEIGHT_WIDTH(WEIGHT_PRECISION_0), + .IN_SIZE(DATA_IN_0_PARALLELISM_DIM_0) + ) fdp_inst ( + .clk(clk), + .rst(rst), + .data_in(data_in_0), + .data_in_valid(fdp_join_valid), + .data_in_ready(fdp_data_ready[i]), + .weight(current_weight), + .weight_valid(fdp_join_valid), + .weight_ready(fdp_weight_ready[i]), + .data_out(fdp_data_out), + .data_out_valid(fdp_data_out_valid), + .data_out_ready(fdp_data_out_ready) + ); + + /* verilator lint_off UNUSEDSIGNAL */ + logic acc_data_out_valid, acc_data_out_ready; + /* verilator lint_on UNUSEDSIGNAL */ + + fixed_accumulator #( + .IN_WIDTH(FDP_WIDTH), + .IN_DEPTH(IN_0_DEPTH) + ) fixed_accumulator_inst ( + .clk(clk), + .rst(rst), + .data_in(fdp_data_out), + .data_in_valid(fdp_data_out_valid), + .data_in_ready(fdp_data_out_ready), + .data_out(acc_data_out[i]), + .data_out_valid(acc_data_out_valid), + .data_out_ready(acc_data_out_ready) + ); + + // Assume the parallelised hardware above have the same arrival time + // which means that they always have the same state. So we can just + // pick one of the valid signal to use. + assign acc_data_out_ready = acc_ready; + end + + + if (HAS_BIAS == 1) begin + logic [ACC_WIDTH-1:0] bias_sext[BIAS_PARALLELISM_DIM_0-1:0]; + logic acc_join_valid, acc_join_ready; + logic [DATA_OUT_0_PARALLELISM_DIM_0-1:0] reg_ready; + + join2 #() acc_join_inst ( + .data_in_ready ({bias_ready, acc_ready}), + .data_in_valid ({bias_valid, linear[0].acc_data_out_valid}), + .data_out_valid(data_out_0_valid), + .data_out_ready(data_out_0_ready) + ); + + fixed_rounding #( + .IN_SIZE(DATA_OUT_0_PARALLELISM_DIM_0), + .IN_WIDTH(BIAS_PRECISION_0), + .IN_FRAC_WIDTH(BIAS_PRECISION_1), + .OUT_WIDTH(ACC_WIDTH), + .OUT_FRAC_WIDTH(DATA_IN_0_PRECISION_1 + WEIGHT_PRECISION_1) + ) bias_cast ( + .data_in (bias), + .data_out(bias_sext) + ); + + for (genvar i = 0; i < DATA_OUT_0_PARALLELISM_DIM_0; i = i + 1) begin : add_bias + assign cast_data_out_0[i] = $signed(acc_data_out[i]) + $signed(bias_sext[i]); + end + end else begin + assign acc_ready = data_out_0_ready; + assign data_out_0_valid = linear[0].acc_data_out_valid; + assign cast_data_out_0 = acc_data_out; + assign bias_ready = 1; + end + fixed_rounding #( + .IN_SIZE(DATA_OUT_0_PARALLELISM_DIM_0), + .IN_WIDTH(LOSSLESS_OUT_WIDTH), + .IN_FRAC_WIDTH(DATA_IN_0_PRECISION_1 + WEIGHT_PRECISION_1), + .OUT_WIDTH(DATA_OUT_0_PRECISION_0), + .OUT_FRAC_WIDTH(DATA_OUT_0_PRECISION_1) + ) bias_cast ( + .data_in (cast_data_out_0), + .data_out(data_out_0) + ); + +endmodule diff --git a/src/mase_components/linear_layers/fixed_linear_layer/test/fixed_linear_tb.py b/src/mase_components/linear_layers/fixed_linear_layer/test/fixed_linear_tb.py index c7683cbe3..a0f6fcaa7 100644 --- a/src/mase_components/linear_layers/fixed_linear_layer/test/fixed_linear_tb.py +++ b/src/mase_components/linear_layers/fixed_linear_layer/test/fixed_linear_tb.py @@ -109,80 +109,80 @@ def preprocess_tensor(self, tensor, config, parallelism): blocks.append(dim_1_split[i][j].flatten().tolist()) return blocks - async def run_test(self, us): + async def run_test(self, batches=1, us=100): await self.reset() self.log.info(f"Reset finished") self.data_out_0_monitor.ready.value = 1 + for _ in range(batches): + inputs = self.generate_inputs() + exp_out = self.model(inputs) + + # * Load the inputs driver + self.log.info(f"Processing inputs: {inputs}") + inputs = self.preprocess_tensor( + tensor=inputs, + config={ + "width": self.get_parameter("DATA_IN_0_PRECISION_0"), + "frac_width": self.get_parameter("DATA_IN_0_PRECISION_1"), + }, + parallelism=[ + self.get_parameter("DATA_IN_0_PARALLELISM_DIM_1"), + self.get_parameter("DATA_IN_0_PARALLELISM_DIM_0"), + ], + ) + self.data_in_0_driver.load_driver(inputs) - inputs = self.generate_inputs() - exp_out = self.model(inputs) - - # * Load the inputs driver - self.log.info(f"Processing inputs: {inputs}") - inputs = self.preprocess_tensor( - tensor=inputs, - config={ - "width": self.get_parameter("DATA_IN_0_PRECISION_0"), - "frac_width": self.get_parameter("DATA_IN_0_PRECISION_1"), - }, - parallelism=[ - self.get_parameter("DATA_IN_0_PARALLELISM_DIM_1"), - self.get_parameter("DATA_IN_0_PARALLELISM_DIM_0"), - ], - ) - self.data_in_0_driver.load_driver(inputs) - - # * Load the weights driver - if self.get_parameter("WEIGHTS_PRE_TRANSPOSED") == 1: - weights = self.model.weight.transpose(0, 1) - else: - weights = self.model.weight - - self.log.info(f"Processing weights: {weights}") - weights = self.preprocess_tensor( - tensor=weights, - config={ - "width": self.get_parameter("WEIGHT_PRECISION_0"), - "frac_width": self.get_parameter("WEIGHT_PRECISION_1"), - }, - parallelism=[ - self.get_parameter("WEIGHT_PARALLELISM_DIM_1"), - self.get_parameter("WEIGHT_PARALLELISM_DIM_0"), - ], - ) - self.weight_driver.load_driver(weights) + # * Load the weights driver + if self.get_parameter("WEIGHTS_PRE_TRANSPOSED") == 1: + weights = self.model.weight.transpose(0, 1) + else: + weights = self.model.weight - # * Load the bias driver - if self.get_parameter("HAS_BIAS") == 1: - bias = self.model.bias - self.log.info(f"Processing bias: {bias}") - bias = self.preprocess_tensor( - tensor=bias, + self.log.info(f"Processing weights: {weights}") + weights = self.preprocess_tensor( + tensor=weights, config={ - "width": self.get_parameter("BIAS_PRECISION_0"), - "frac_width": self.get_parameter("BIAS_PRECISION_1"), + "width": self.get_parameter("WEIGHT_PRECISION_0"), + "frac_width": self.get_parameter("WEIGHT_PRECISION_1"), }, parallelism=[ - self.get_parameter("BIAS_PARALLELISM_DIM_1"), - self.get_parameter("BIAS_PARALLELISM_DIM_0"), + self.get_parameter("WEIGHT_PARALLELISM_DIM_1"), + self.get_parameter("WEIGHT_PARALLELISM_DIM_0"), ], ) - self.bias_driver.load_driver(bias) - - # * Load the output monitor - self.log.info(f"Processing outputs: {exp_out}") - outs = self.preprocess_tensor( - tensor=exp_out, - config={ - "width": self.get_parameter("DATA_OUT_0_PRECISION_0"), - "frac_width": self.get_parameter("DATA_OUT_0_PRECISION_1"), - }, - parallelism=[ - self.get_parameter("DATA_OUT_0_PARALLELISM_DIM_1"), - self.get_parameter("DATA_OUT_0_PARALLELISM_DIM_0"), - ], - ) - self.data_out_0_monitor.load_monitor(outs) + self.weight_driver.load_driver(weights) + + # * Load the bias driver + if self.get_parameter("HAS_BIAS") == 1: + bias = self.model.bias + self.log.info(f"Processing bias: {bias}") + bias = self.preprocess_tensor( + tensor=bias, + config={ + "width": self.get_parameter("BIAS_PRECISION_0"), + "frac_width": self.get_parameter("BIAS_PRECISION_1"), + }, + parallelism=[ + self.get_parameter("BIAS_PARALLELISM_DIM_1"), + self.get_parameter("BIAS_PARALLELISM_DIM_0"), + ], + ) + self.bias_driver.load_driver(bias) + + # * Load the output monitor + self.log.info(f"Processing outputs: {exp_out}") + outs = self.preprocess_tensor( + tensor=exp_out, + config={ + "width": self.get_parameter("DATA_OUT_0_PRECISION_0"), + "frac_width": self.get_parameter("DATA_OUT_0_PRECISION_1"), + }, + parallelism=[ + self.get_parameter("DATA_OUT_0_PARALLELISM_DIM_1"), + self.get_parameter("DATA_OUT_0_PARALLELISM_DIM_0"), + ], + ) + self.data_out_0_monitor.load_monitor(outs) await Timer(us, units="us") assert self.data_out_0_monitor.exp_queue.empty() @@ -191,9 +191,20 @@ async def run_test(self, us): @cocotb.test() async def cocotb_test(dut): tb = LinearTB(dut) - await tb.run_test(us=100) + await tb.run_test(batches=10, us=100) + +async def check_signal(dut, log): + num = {"data_out_0": 0, "data_in_0": 0} + while True: + await RisingEdge(dut.clk) + +# verified case +# weight per transpoed = 0 +# weight pre transposed = 1 +# has bias = 0 +# has bias = 1 def get_fixed_linear_config(kwargs={}): # if pretranspose # weight1 = in0 @@ -201,22 +212,22 @@ def get_fixed_linear_config(kwargs={}): # weight0 = in0 config = { "HAS_BIAS": 1, - "WEIGHTS_PRE_TRANSPOSED": 1, + "WEIGHTS_PRE_TRANSPOSED": 0, "DATA_IN_0_TENSOR_SIZE_DIM_0": 32, "DATA_IN_0_TENSOR_SIZE_DIM_1": 16, - "DATA_IN_0_PARALLELISM_DIM_0": 4, + "DATA_IN_0_PARALLELISM_DIM_0": 8, "DATA_IN_0_PARALLELISM_DIM_1": 4, - "WEIGHT_TENSOR_SIZE_DIM_0": 16, - "WEIGHT_TENSOR_SIZE_DIM_1": 32, - "WEIGHT_PARALLELISM_DIM_0": 2, + "WEIGHT_TENSOR_SIZE_DIM_0": 32, + "WEIGHT_TENSOR_SIZE_DIM_1": 16, + "WEIGHT_PARALLELISM_DIM_0": 8, "WEIGHT_PARALLELISM_DIM_1": 4, "DATA_IN_0_PRECISION_0": 8, "DATA_IN_0_PRECISION_1": 4, - "WEIGHT_PRECISION_0": 8, - "WEIGHT_PRECISION_1": 4, - "BIAS_PRECISION_0": 8, - "BIAS_PRECISION_1": 4, - "DATA_OUT_0_PRECISION_0": 10, + "WEIGHT_PRECISION_0": 10, + "WEIGHT_PRECISION_1": 3, + "BIAS_PRECISION_0": 5, + "BIAS_PRECISION_1": 2, + "DATA_OUT_0_PRECISION_0": 8, "DATA_OUT_0_PRECISION_1": 4, } config.update(kwargs) @@ -246,44 +257,44 @@ def test_fixed_linear_smoke(): ) -@pytest.mark.dev -def test_fixed_linear_regression(): - """ - More extensive tests to check realistic parameter sizes. - """ - mase_runner( - trace=True, - module_param_list=[ - get_fixed_linear_config( - { - "DATA_IN_0_TENSOR_SIZE_DIM_0": 768, - "DATA_IN_0_PARALLELISM_DIM_0": 32, - "WEIGHT_TENSOR_SIZE_DIM_0": 768, - "WEIGHT_TENSOR_SIZE_DIM_1": 768, - "WEIGHT_PARALLELISM_DIM_0": 32, - "WEIGHT_PARALLELISM_DIM_1": 32, - "BIAS_TENSOR_SIZE_DIM_0": 768, - "BIAS_PARALLELISM_DIM_0": 32, - } - ), - get_fixed_linear_config( - { - "HAS_BIAS": 1, - "WEIGHTS_PRE_TRANSPOSED": 0, - "DATA_IN_0_TENSOR_SIZE_DIM_0": 768, - "DATA_IN_0_PARALLELISM_DIM_0": 32, - "WEIGHT_TENSOR_SIZE_DIM_0": 768, - "WEIGHT_TENSOR_SIZE_DIM_1": 768, - "WEIGHT_PARALLELISM_DIM_0": 32, - "WEIGHT_PARALLELISM_DIM_1": 32, - "BIAS_TENSOR_SIZE_DIM_0": 768, - "BIAS_PARALLELISM_DIM_0": 32, - } - ), - ], - ) - - +# @pytest.mark.dev +# def test_fixed_linear_regression(): +# """ +# More extensive tests to check realistic parameter sizes. +# """ +# mase_runner( +# trace=True, +# module_param_list=[ +# get_fixed_linear_config( +# { +# "DATA_IN_0_TENSOR_SIZE_DIM_0": 768, +# "DATA_IN_0_PARALLELISM_DIM_0": 32, +# "WEIGHT_TENSOR_SIZE_DIM_0": 768, +# "WEIGHT_TENSOR_SIZE_DIM_1": 768, +# "WEIGHT_PARALLELISM_DIM_0": 32, +# "WEIGHT_PARALLELISM_DIM_1": 32, +# "BIAS_TENSOR_SIZE_DIM_0": 768, +# "BIAS_PARALLELISM_DIM_0": 32, +# } +# ), +# get_fixed_linear_config( +# { +# "HAS_BIAS": 1, +# "WEIGHTS_PRE_TRANSPOSED": 0, +# "DATA_IN_0_TENSOR_SIZE_DIM_0": 768, +# "DATA_IN_0_PARALLELISM_DIM_0": 32, +# "WEIGHT_TENSOR_SIZE_DIM_0": 768, +# "WEIGHT_TENSOR_SIZE_DIM_1": 768, +# "WEIGHT_PARALLELISM_DIM_0": 32, +# "WEIGHT_PARALLELISM_DIM_1": 32, +# "BIAS_TENSOR_SIZE_DIM_0": 768, +# "BIAS_PARALLELISM_DIM_0": 32, +# } +# ), +# ], +# ) + +torch.manual_seed(3) if __name__ == "__main__": test_fixed_linear_smoke() # test_fixed_linear_regression() diff --git a/src/mase_components/linear_layers/fixed_linear_layer/test/fixed_linear_with_input_circular_tb.py b/src/mase_components/linear_layers/fixed_linear_layer/test/fixed_linear_with_input_circular_tb.py new file mode 100644 index 000000000..9976fe97b --- /dev/null +++ b/src/mase_components/linear_layers/fixed_linear_layer/test/fixed_linear_with_input_circular_tb.py @@ -0,0 +1,287 @@ +#!/usr/bin/env python3 + +import os, pytest + +import torch +import logging +from functools import partial + +import cocotb +from cocotb.log import SimLog +from cocotb.triggers import Timer, RisingEdge + +from mase_cocotb.testbench import Testbench +from mase_cocotb.interfaces.streaming import ( + StreamDriver, + StreamMonitor, + ErrorThresholdStreamMonitor, +) +from mase_cocotb.runner import mase_runner + +from mase_cocotb.utils import bit_driver + +# from mase_cocotb import Testbench, StreamDriver, StreamMonitor, mase_runner +from chop.nn.quantized.modules.linear import LinearInteger +from chop.nn.quantizers import integer_floor_quantizer + + +class LinearTB(Testbench): + def __init__(self, dut) -> None: + super().__init__(dut, dut.clk, dut.rst) + + if not hasattr(self, "log"): + self.log = SimLog("%s" % (type(self).__qualname__)) + self.log.setLevel(logging.DEBUG) + + self.data_in_0_driver = StreamDriver( + dut.clk, dut.data_in_0, dut.data_in_0_valid, dut.data_in_0_ready + ) + self.weight_driver = StreamDriver( + dut.clk, dut.weight, dut.weight_valid, dut.weight_ready + ) + + if self.get_parameter("HAS_BIAS") == 1: + self.bias_driver = StreamDriver( + dut.clk, dut.bias, dut.bias_valid, dut.bias_ready + ) + self.bias_driver.log.setLevel(logging.DEBUG) + + self.data_out_0_monitor = StreamMonitor( + dut.clk, + dut.data_out_0, + dut.data_out_0_valid, + dut.data_out_0_ready, + check=True, + ) + + # Model + self.model = LinearInteger( + in_features=self.get_parameter("DATA_IN_0_TENSOR_SIZE_DIM_0"), + out_features=self.get_parameter("DATA_OUT_0_TENSOR_SIZE_DIM_0"), + bias=True if self.get_parameter("HAS_BIAS") == 1 else False, + config={ + "data_in_width": self.get_parameter("DATA_IN_0_PRECISION_0"), + "data_in_frac_width": self.get_parameter("DATA_IN_0_PRECISION_1"), + "weight_width": self.get_parameter("WEIGHT_PRECISION_0"), + "weight_frac_width": self.get_parameter("WEIGHT_PRECISION_1"), + "bias_width": self.get_parameter("BIAS_PRECISION_0"), + "bias_frac_width": self.get_parameter("BIAS_PRECISION_1"), + }, + out_config={ + "data_out_width": self.get_parameter("DATA_OUT_0_PRECISION_0"), + "data_out_frac_width": self.get_parameter("DATA_OUT_0_PRECISION_1"), + }, + floor=True, + ) + + # Set verbosity of driver and monitor loggers to debug + self.data_in_0_driver.log.setLevel(logging.DEBUG) + self.weight_driver.log.setLevel(logging.DEBUG) + self.data_out_0_monitor.log.setLevel(logging.DEBUG) + + def generate_inputs(self): + return torch.randn( + ( + self.get_parameter("DATA_IN_0_TENSOR_SIZE_DIM_1"), + self.get_parameter("DATA_IN_0_TENSOR_SIZE_DIM_0"), + ) + ) + + def preprocess_tensor(self, tensor, config, parallelism): + if len(tensor.shape) == 1: + tensor = tensor.unsqueeze(0) + + # Quantize + quantizer = partial(integer_floor_quantizer, **config) + q_tensor = quantizer(tensor) + self.log.debug(f"Quantized tensor: {q_tensor}") + + # Convert to integer format + q_tensor = (q_tensor * 2 ** config["frac_width"]).int() + self.log.debug(f"Tensor in integer format: {q_tensor}") + + # Split into chunks according to parallelism in each dimension + # parallelism[0]: along rows, parallelism[1]: along columns + dim_0_split = q_tensor.split(parallelism[0], dim=0) + dim_1_split = [x.split(parallelism[1], dim=1) for x in dim_0_split] + blocks = [] + # Flatten the list of blocks + for i in range(len(dim_1_split)): + for j in range(len(dim_1_split[i])): + blocks.append(dim_1_split[i][j].flatten().tolist()) + return blocks + + async def run_test(self, us): + await self.reset() + self.log.info(f"Reset finished") + self.data_out_0_monitor.ready.value = 1 + + inputs = self.generate_inputs() + exp_out = self.model(inputs) + + # * Load the inputs driver + self.log.info(f"Processing inputs: {inputs}") + inputs = self.preprocess_tensor( + tensor=inputs, + config={ + "width": self.get_parameter("DATA_IN_0_PRECISION_0"), + "frac_width": self.get_parameter("DATA_IN_0_PRECISION_1"), + }, + parallelism=[ + self.get_parameter("DATA_IN_0_PARALLELISM_DIM_1"), + self.get_parameter("DATA_IN_0_PARALLELISM_DIM_0"), + ], + ) + self.data_in_0_driver.load_driver(inputs) + + # * Load the weights driver + weights = self.model.weight + + self.log.info(f"Processing weights: {weights}") + weights = self.preprocess_tensor( + tensor=weights, + config={ + "width": self.get_parameter("WEIGHT_PRECISION_0"), + "frac_width": self.get_parameter("WEIGHT_PRECISION_1"), + }, + parallelism=[ + self.get_parameter("WEIGHT_PARALLELISM_DIM_1"), + self.get_parameter("WEIGHT_PARALLELISM_DIM_0"), + ], + ) + self.weight_driver.load_driver(weights) + + # * Load the bias driver + if self.get_parameter("HAS_BIAS") == 1: + bias = self.model.bias + self.log.info(f"Processing bias: {bias}") + bias = self.preprocess_tensor( + tensor=bias, + config={ + "width": self.get_parameter("BIAS_PRECISION_0"), + "frac_width": self.get_parameter("BIAS_PRECISION_1"), + }, + parallelism=[ + self.get_parameter("BIAS_PARALLELISM_DIM_1"), + self.get_parameter("BIAS_PARALLELISM_DIM_0"), + ], + ) + self.bias_driver.load_driver(bias) + + # * Load the output monitor + self.log.info(f"Processing outputs: {exp_out}") + outs = self.preprocess_tensor( + tensor=exp_out, + config={ + "width": self.get_parameter("DATA_OUT_0_PRECISION_0"), + "frac_width": self.get_parameter("DATA_OUT_0_PRECISION_1"), + }, + parallelism=[ + self.get_parameter("DATA_OUT_0_PARALLELISM_DIM_1"), + self.get_parameter("DATA_OUT_0_PARALLELISM_DIM_0"), + ], + ) + self.data_out_0_monitor.load_monitor(outs) + + await Timer(us, units="us") + assert self.data_out_0_monitor.exp_queue.empty() + + +@cocotb.test() +async def cocotb_test(dut): + tb = LinearTB(dut) + await tb.run_test(us=100) + + +@cocotb.test() +async def repeated_mult_valid_backpressure(dut): + tb = LinearTB(dut) + tb.data_in_0_driver.set_valid_prob(0.7) + tb.weight_driver.set_valid_prob(0.7) + cocotb.start_soon(bit_driver(dut.data_out_0_ready, dut.clk, 0.6)) + await tb.run_test(us=200) + + +def get_fixed_linear_config(kwargs={}): + # if pretranspose + # weight1 = in0 + # else + # weight0 = in0 + config = { + "HAS_BIAS": 1, + "DATA_IN_0_TENSOR_SIZE_DIM_0": 32, + "DATA_IN_0_TENSOR_SIZE_DIM_1": 16, + "DATA_IN_0_PARALLELISM_DIM_0": 8, + "DATA_IN_0_PARALLELISM_DIM_1": 4, + "WEIGHT_TENSOR_SIZE_DIM_0": 32, + "WEIGHT_TENSOR_SIZE_DIM_1": 24, + "WEIGHT_PARALLELISM_DIM_0": 8, + "WEIGHT_PARALLELISM_DIM_1": 2, + "DATA_IN_0_PRECISION_0": 8, + "DATA_IN_0_PRECISION_1": 4, + "WEIGHT_PRECISION_0": 10, + "WEIGHT_PRECISION_1": 3, + "BIAS_PRECISION_0": 5, + "BIAS_PRECISION_1": 2, + "DATA_OUT_0_PRECISION_0": 8, + "DATA_OUT_0_PRECISION_1": 4, + } + config.update(kwargs) + return config + + +@pytest.mark.dev +def test_fixed_linear_smoke(): + """ + Some quick tests to check if the module is working. + """ + mase_runner( + trace=True, + module_param_list=[ + get_fixed_linear_config(), + # noticed here if change WEIGHT_PRE_TRANSPOSED also need to change the DIM_SIZE to match ACTIVATION + ], + ) + + +@pytest.mark.dev +def test_fixed_linear_regression(): + """ + More extensive tests to check realistic parameter sizes. + """ + mase_runner( + trace=True, + module_param_list=[ + get_fixed_linear_config( + { + "DATA_IN_0_TENSOR_SIZE_DIM_0": 768, + "DATA_IN_0_PARALLELISM_DIM_0": 32, + "WEIGHT_TENSOR_SIZE_DIM_0": 768, + "WEIGHT_TENSOR_SIZE_DIM_1": 768, + "WEIGHT_PARALLELISM_DIM_0": 32, + "WEIGHT_PARALLELISM_DIM_1": 32, + "BIAS_TENSOR_SIZE_DIM_0": 768, + "BIAS_PARALLELISM_DIM_0": 32, + } + ), + get_fixed_linear_config( + { + "HAS_BIAS": 1, + "WEIGHTS_PRE_TRANSPOSED": 0, + "DATA_IN_0_TENSOR_SIZE_DIM_0": 768, + "DATA_IN_0_PARALLELISM_DIM_0": 32, + "WEIGHT_TENSOR_SIZE_DIM_0": 768, + "WEIGHT_TENSOR_SIZE_DIM_1": 768, + "WEIGHT_PARALLELISM_DIM_0": 32, + "WEIGHT_PARALLELISM_DIM_1": 32, + "BIAS_TENSOR_SIZE_DIM_0": 768, + "BIAS_PARALLELISM_DIM_0": 32, + } + ), + ], + ) + + +if __name__ == "__main__": + test_fixed_linear_smoke() + # test_fixed_linear_regression() diff --git a/src/mase_components/linear_layers/matmul/rtl/matmul.sv b/src/mase_components/linear_layers/matmul/rtl/matmul.sv index e8443426a..ba418bd55 100644 --- a/src/mase_components/linear_layers/matmul/rtl/matmul.sv +++ b/src/mase_components/linear_layers/matmul/rtl/matmul.sv @@ -134,6 +134,9 @@ module matmul #( // Matrix unflatten output logic [B_WIDTH-1:0] b_buffer_out_data[B_COMPUTE_DIM0*B_COMPUTE_DIM1-1:0]; + logic [SM_OUT_WIDTH-1:0] buffered_sm_out_data[C_COMPUTE_DIM0*C_COMPUTE_DIM1]; + logic buffered_sm_out_valid, buffered_sm_out_ready; + logic [SM_OUT_WIDTH-1:0] sm_out_data[C_COMPUTE_DIM0*C_COMPUTE_DIM1]; logic sm_out_valid, sm_out_ready; @@ -275,10 +278,24 @@ module matmul #( .y_data (b_buffer_out_data), .y_valid (b_buffer_out_valid), .y_ready (b_buffer_out_ready), - .out_data (sm_out_data), - .out_valid(sm_out_valid), - .out_ready(sm_out_ready) + .out_data (buffered_sm_out_data), + .out_valid(buffered_sm_out_valid), + .out_ready(buffered_sm_out_ready) ); + //cut the long ready path + unpacked_skid_buffer #( + .DATA_WIDTH(SM_OUT_WIDTH), + .IN_NUM (C_COMPUTE_DIM0 * C_COMPUTE_DIM1) + ) sm_out_reg_slice ( + .clk (clk), + .rst (rst), + .data_in (buffered_sm_out_data), + .data_in_valid (buffered_sm_out_valid), + .data_in_ready (buffered_sm_out_ready), + .data_out (sm_out_data), + .data_out_valid(sm_out_valid), + .data_out_ready(sm_out_ready) + ); // Direct the result of the simple matmul to the correct matrix_accumulator diff --git a/src/mase_components/memory/rtl/blk_mem_gen_0.sv b/src/mase_components/memory/rtl/blk_mem_gen_0.sv index 55ca60c9c..20dad70f7 100644 --- a/src/mase_components/memory/rtl/blk_mem_gen_0.sv +++ b/src/mase_components/memory/rtl/blk_mem_gen_0.sv @@ -3,21 +3,13 @@ module blk_mem_gen_0 #( parameter DATA_WIDTH = 8, parameter MEM_SIZE = 1 ) ( - clka, - ena, - wea, - addra, - dina, - douta -) -/* synthesis syn_black_box black_box_pad_pin="ena,wea[0:0],addra[9:0],dina[7:0],douta[7:0]" */ -/* synthesis syn_force_seq_prim="clka" */; - input logic clka /* synthesis syn_isclock = 1 */; - input logic ena; - input logic [0:0] wea; - input logic [$clog2(MEM_SIZE):0] addra; - input logic [DATA_WIDTH - 1:0] dina; - output logic [DATA_WIDTH - 1:0] douta; + input logic clka, + input logic ena, + input logic wea, + input logic [$clog2(MEM_SIZE):0] addra, + input logic [DATA_WIDTH - 1:0] dina, + output logic [DATA_WIDTH - 1:0] douta +); logic [DATA_WIDTH - 1:0] ram[0:MEM_SIZE-1]; logic [DATA_WIDTH - 1:0] douta_t1; diff --git a/src/mase_components/memory/rtl/fifo_for_autogen.sv b/src/mase_components/memory/rtl/fifo_for_autogen.sv new file mode 100644 index 000000000..d0e11f93d --- /dev/null +++ b/src/mase_components/memory/rtl/fifo_for_autogen.sv @@ -0,0 +1,42 @@ +`timescale 1 ns / 1 ps +/* verilator lint_off PINMISSING */ +module fifo_for_autogen #( + parameter DATA_IN_0_PRECISION_0 = 16, + parameter DATA_IN_0_PRECISION_1 = 3, + parameter DATA_IN_0_TENSOR_SIZE_DIM_0 = 20, + parameter DATA_IN_0_TENSOR_SIZE_DIM_1 = 20, + parameter DATA_IN_0_PARALLELISM_DIM_0 = 4, // must equal WEIGHT_PARALLELISM_DIM_1 + parameter DATA_IN_0_PARALLELISM_DIM_1 = 4, + parameter DATA_OUT_0_PRECISION_0 = DATA_IN_0_PRECISION_0, + parameter DATA_OUT_0_PRECISION_1 = DATA_IN_0_PRECISION_1, + parameter DATA_OUT_0_TENSOR_SIZE_DIM_0 = DATA_IN_0_TENSOR_SIZE_DIM_0, + parameter DATA_OUT_0_TENSOR_SIZE_DIM_1 = DATA_IN_0_TENSOR_SIZE_DIM_1, + parameter DATA_OUT_0_PARALLELISM_DIM_0 = DATA_IN_0_PARALLELISM_DIM_0, + parameter DATA_OUT_0_PARALLELISM_DIM_1 = DATA_IN_0_PARALLELISM_DIM_1, + parameter DEPTH = DATA_IN_0_TENSOR_SIZE_DIM_0 * DATA_IN_0_TENSOR_SIZE_DIM_1 / (DATA_IN_0_PARALLELISM_DIM_0 * DATA_IN_0_PARALLELISM_DIM_1) +) ( + input clk, + input rst, + input logic [DATA_IN_0_PRECISION_0-1:0] data_in_0 [DATA_IN_0_PARALLELISM_DIM_0*DATA_IN_0_PARALLELISM_DIM_1-1:0], + input logic data_in_0_valid, + output logic data_in_0_ready, + + output logic [DATA_OUT_0_PRECISION_0-1:0] data_out_0 [DATA_OUT_0_PARALLELISM_DIM_0*DATA_OUT_0_PARALLELISM_DIM_1-1:0], + output logic data_out_0_valid, + input logic data_out_0_ready +); + unpacked_fifo #( + .DEPTH(DEPTH), + .DATA_WIDTH(DATA_IN_0_PRECISION_0), + .IN_NUM(DATA_IN_0_PARALLELISM_DIM_0*DATA_IN_0_PARALLELISM_DIM_1) + ) ff_inst ( + .clk(clk), + .rst(rst), + .data_in(data_in_0), + .data_in_valid(data_in_0_valid), + .data_in_ready(data_in_0_ready), + .data_out(data_out_0), + .data_out_valid(data_out_0_valid), + .data_out_ready(data_out_0_ready) + ); +endmodule diff --git a/src/mase_components/memory/rtl/input_buffer.sv b/src/mase_components/memory/rtl/input_buffer.sv index 66c1c4c3c..fc6311810 100644 --- a/src/mase_components/memory/rtl/input_buffer.sv +++ b/src/mase_components/memory/rtl/input_buffer.sv @@ -101,9 +101,9 @@ module input_buffer #( end assign reg_in_valid = (mode == STRAIGHT) ? data_in_valid : (!(delay2_bos || delay1_bos)); assign data_in_ready = (mode == STRAIGHT) ? reg_in_ready : 0; - unpacked_skid_buffer #( + unpacked_register_slice #( .DATA_WIDTH(DATA_WIDTH), - .IN_NUM(IN_NUM) + .IN_SIZE(IN_NUM) ) reg_inst ( .data_in(reg_in), .data_in_valid(reg_in_valid), diff --git a/src/mase_components/memory/rtl/matrix_bank.sv b/src/mase_components/memory/rtl/matrix_bank.sv index 81d2abc22..47ebce0b7 100644 --- a/src/mase_components/memory/rtl/matrix_bank.sv +++ b/src/mase_components/memory/rtl/matrix_bank.sv @@ -5,8 +5,65 @@ Features to implement: clear all counters and flags and transition to REQ_FETCH */ - -import matrix_bank_pkg::*; +package matrix_bank_pkg; + + parameter AXI_ADDRESS_WIDTH = 32; + parameter MAX_DIMENSION = 1024; + parameter MAX_FEATURE_COUNT = 16; + + + typedef struct packed { + logic [AXI_ADDRESS_WIDTH-1:0] start_address; + logic [$clog2(MAX_DIMENSION):0] columns; + logic [$clog2(MAX_DIMENSION):0] rows; + } REQ_t; + + typedef struct packed {logic partial;} RESP_t; + + typedef struct packed { + // Check request payloads match NSB payloads + logic [$clog2(MAX_FEATURE_COUNT):0] columns; + logic [$clog2(MAX_FEATURE_COUNT):0] rows; + } ROW_CHANNEL_REQ_t; + + typedef struct packed { + logic [MAX_FEATURE_COUNT-1:0][31:0] data; + logic [MAX_FEATURE_COUNT-1:0] valid_mask; + logic done; + } ROW_CHANNEL_RESP_t; + +endpackage + +// package matrix_bank_pkg; + +parameter AXI_ADDRESS_WIDTH = 32; +parameter MAX_DIMENSION = 1024; +parameter MAX_FEATURE_COUNT = 32; + +typedef struct packed { + logic [AXI_ADDRESS_WIDTH-1:0] start_address; + logic [$clog2(MAX_DIMENSION):0] columns; + logic [$clog2(MAX_DIMENSION):0] rows; +} REQ_t; + +typedef struct packed {logic partial;} RESP_t; + +typedef struct packed { + // Check request payloads match NSB payloads + logic [$clog2( +MAX_FEATURE_COUNT +):0] columns; + logic [$clog2(MAX_FEATURE_COUNT):0] rows; +} ROW_CHANNEL_REQ_t; + +typedef struct packed { + logic [MAX_FEATURE_COUNT-1:0][31:0] data; + logic [MAX_FEATURE_COUNT-1:0] valid_mask; + logic done; +} ROW_CHANNEL_RESP_t; + +// endpackage +// import matrix_bank_pkg::*; module matrix_bank #( parameter PRECISION = 0, // 0 = FP32, 1 = FP16 @@ -221,8 +278,9 @@ module matrix_bank #( axi_rm_fetch_byte_count = matrix_bank_req_q.columns * 4; bytes_per_row = matrix_bank_req_q.columns * 4; - bytes_per_row_padded = {bytes_per_row[$clog2(MAX_DIMENSION*4)-1:6], 6'b0} + - (|bytes_per_row[5:0] ? 'd64 : '0); // round up to nearest multiple of 64 + bytes_per_row_padded = { + bytes_per_row[$clog2(MAX_DIMENSION*4)-1:6], 6'b0 + } + (|bytes_per_row[5:0] ? 'd64 : '0); // round up to nearest multiple of 64 axi_rm_fetch_start_address = matrix_bank_req_q.start_address + rows_fetched * bytes_per_row_padded; end @@ -316,9 +374,9 @@ module matrix_bank #( end // Round up in features to the nearest multiple of 16 - assign required_pulses = {matrix_bank_req_q.columns[$clog2( - MAX_DIMENSION - )-1:4], 4'd0} + (|matrix_bank_req_q.columns[3:0] ? 'd16 : '0); + assign required_pulses = { + matrix_bank_req_q.columns[$clog2(MAX_DIMENSION)-1:4], 4'd0 + } + (|matrix_bank_req_q.columns[3:0] ? 'd16 : '0); always_ff @(posedge core_clk or negedge resetn) begin if (!resetn) begin diff --git a/src/mase_components/memory/rtl/matrix_bank_pkg.sv b/src/mase_components/memory/rtl/matrix_bank_pkg.sv index 791d48871..e56bac906 100644 --- a/src/mase_components/memory/rtl/matrix_bank_pkg.sv +++ b/src/mase_components/memory/rtl/matrix_bank_pkg.sv @@ -2,6 +2,7 @@ package matrix_bank_pkg; parameter AXI_ADDRESS_WIDTH = 32; parameter MAX_DIMENSION = 1024; + parameter MAX_FEATURE_COUNT = 32; typedef struct packed { logic [AXI_ADDRESS_WIDTH-1:0] start_address; From d7f7c2fe6eeb9d9809c7730b3e0d2444a3d4176d Mon Sep 17 00:00:00 2001 From: cx922 Date: Thu, 13 Mar 2025 01:08:25 +0000 Subject: [PATCH 3/6] rm un-nessesary file --- .../rtl/fixed_linear_with_input_circular.sv | 386 ------------------ .../fixed_linear_with_input_circular_tb.py | 287 ------------- 2 files changed, 673 deletions(-) delete mode 100644 src/mase_components/linear_layers/fixed_linear_layer/rtl/fixed_linear_with_input_circular.sv delete mode 100644 src/mase_components/linear_layers/fixed_linear_layer/test/fixed_linear_with_input_circular_tb.py diff --git a/src/mase_components/linear_layers/fixed_linear_layer/rtl/fixed_linear_with_input_circular.sv b/src/mase_components/linear_layers/fixed_linear_layer/rtl/fixed_linear_with_input_circular.sv deleted file mode 100644 index eb253bec5..000000000 --- a/src/mase_components/linear_layers/fixed_linear_layer/rtl/fixed_linear_with_input_circular.sv +++ /dev/null @@ -1,386 +0,0 @@ -`timescale 1ns / 1ps - -/* - * -*/ - -module fixed_linear_with_input_circular #( - /* verilator lint_off UNUSEDPARAM */ - parameter HAS_BIAS = 1, - parameter FIFO = 1, - - parameter DATA_IN_0_PRECISION_0 = 16, - parameter DATA_IN_0_PRECISION_1 = 3, - parameter DATA_IN_0_TENSOR_SIZE_DIM_0 = 20, - parameter DATA_IN_0_TENSOR_SIZE_DIM_1 = 20, - parameter DATA_IN_0_PARALLELISM_DIM_0 = 4, // must equal WEIGHT_PARALLELISM_DIM_1 - parameter DATA_IN_0_PARALLELISM_DIM_1 = 4, - localparam IN_0_DEPTH_DIM_0 = DATA_IN_0_TENSOR_SIZE_DIM_0 / DATA_IN_0_PARALLELISM_DIM_0, - localparam IN_0_DEPTH_DIM_1 = DATA_IN_0_TENSOR_SIZE_DIM_1 / DATA_IN_0_PARALLELISM_DIM_1, - - parameter WEIGHT_PRECISION_0 = 16, - parameter WEIGHT_PRECISION_1 = 3, - parameter WEIGHT_TENSOR_SIZE_DIM_0 = 20, - parameter WEIGHT_TENSOR_SIZE_DIM_1 = 20, - parameter WEIGHT_PARALLELISM_DIM_0 = 4, - parameter WEIGHT_PARALLELISM_DIM_1 = 4, - - // Inferred precision of the output data - // if the data out precision will be replaced by the setting - parameter DATA_OUT_0_PRECISION_0 = DATA_IN_0_PRECISION_0 + WEIGHT_PRECISION_0 + $clog2( - DATA_IN_0_TENSOR_SIZE_DIM_0 - ) + HAS_BIAS, - parameter DATA_OUT_0_PRECISION_1 = DATA_IN_0_PRECISION_1 + WEIGHT_PRECISION_1, - parameter DATA_OUT_0_TENSOR_SIZE_DIM_0 = WEIGHT_TENSOR_SIZE_DIM_1, - parameter DATA_OUT_0_TENSOR_SIZE_DIM_1 = DATA_IN_0_TENSOR_SIZE_DIM_1, - parameter DATA_OUT_0_PARALLELISM_DIM_0 = WEIGHT_PARALLELISM_DIM_1, - parameter DATA_OUT_0_PARALLELISM_DIM_1 = DATA_IN_0_PARALLELISM_DIM_1, - - parameter BIAS_PRECISION_0 = 16, - parameter BIAS_PRECISION_1 = 3, - parameter BIAS_TENSOR_SIZE_DIM_0 = DATA_OUT_0_TENSOR_SIZE_DIM_0, - parameter BIAS_TENSOR_SIZE_DIM_1 = 1, - parameter BIAS_PARALLELISM_DIM_0 = DATA_OUT_0_PARALLELISM_DIM_0, - parameter BIAS_PARALLELISM_DIM_1 = 1 -) ( - input clk, - input rst, - - // input port for data_inivations - input logic [DATA_IN_0_PRECISION_0-1:0] data_in_0 [DATA_IN_0_PARALLELISM_DIM_0*DATA_IN_0_PARALLELISM_DIM_1-1:0], - input logic data_in_0_valid, - output logic data_in_0_ready, - - // input port for weight - input logic [WEIGHT_PRECISION_0-1:0] weight [WEIGHT_PARALLELISM_DIM_0 * WEIGHT_PARALLELISM_DIM_1-1:0], - input logic weight_valid, - output logic weight_ready, - - input logic [BIAS_PRECISION_0-1:0] bias[BIAS_PARALLELISM_DIM_0 * BIAS_PARALLELISM_DIM_1 -1:0], - input logic bias_valid, - output logic bias_ready, - - output logic [DATA_OUT_0_PRECISION_0-1:0] data_out_0 [DATA_OUT_0_PARALLELISM_DIM_0*DATA_OUT_0_PARALLELISM_DIM_1-1:0], - output logic data_out_0_valid, - input logic data_out_0_ready -); - logic [DATA_IN_0_PRECISION_0-1:0]circular_data_in_0[DATA_IN_0_PARALLELISM_DIM_0 * DATA_IN_0_PARALLELISM_DIM_1-1:0]; - logic circular_data_in_0_valid, circular_data_in_0_ready; - logic [WEIGHT_PRECISION_0-1:0]circular_weight[WEIGHT_PARALLELISM_DIM_0 * WEIGHT_PARALLELISM_DIM_1-1:0]; - logic circular_weight_valid, circular_weight_ready; - logic [BIAS_PRECISION_0-1:0]circular_bias[BIAS_PARALLELISM_DIM_0 * BIAS_PARALLELISM_DIM_1-1:0]; - logic circular_bias_valid, circular_bias_ready; - logic [DATA_IN_0_PRECISION_0-1:0] data_in_0_reg [DATA_IN_0_PARALLELISM_DIM_0*DATA_IN_0_PARALLELISM_DIM_1-1:0]; - logic data_in_0_reg_valid, data_in_0_reg_ready; - if (FIFO == 1) begin - localparam FIFO_DEPTH = DATA_IN_0_TENSOR_SIZE_DIM_0 / DATA_IN_0_PARALLELISM_DIM_0; - - fifo_for_autogen #( - .DATA_IN_0_PRECISION_0(DATA_IN_0_PRECISION_0), // = 8 - .DATA_IN_0_PRECISION_1(DATA_IN_0_PRECISION_1), // = 4 - .DATA_IN_0_TENSOR_SIZE_DIM_0(DATA_IN_0_TENSOR_SIZE_DIM_0), // = 20 - .DATA_IN_0_PARALLELISM_DIM_0(DATA_IN_0_PARALLELISM_DIM_0), // = 2 - .DATA_IN_0_TENSOR_SIZE_DIM_1(DATA_IN_0_TENSOR_SIZE_DIM_1), // = 4 - .DATA_IN_0_PARALLELISM_DIM_1(DATA_IN_0_PARALLELISM_DIM_1), // = 2 - .DEPTH(FIFO_DEPTH), - .DATA_OUT_0_PRECISION_0(DATA_IN_0_PRECISION_0), - .DATA_OUT_0_PRECISION_1(DATA_IN_0_PRECISION_1), - .DATA_OUT_0_TENSOR_SIZE_DIM_0(DATA_IN_0_TENSOR_SIZE_DIM_0), - .DATA_OUT_0_PARALLELISM_DIM_0(DATA_IN_0_PARALLELISM_DIM_0), - .DATA_OUT_0_TENSOR_SIZE_DIM_1(DATA_IN_0_TENSOR_SIZE_DIM_1), - .DATA_OUT_0_PARALLELISM_DIM_1(DATA_IN_0_PARALLELISM_DIM_1) - ) fifo_1_inst ( - .clk(clk), - .rst(rst), - .data_in_0(data_in_0), - .data_in_0_valid(data_in_0_valid), - .data_in_0_ready(data_in_0_ready), - .data_out_0(data_in_0_reg), - .data_out_0_valid(data_in_0_reg_valid), - .data_out_0_ready(data_in_0_reg_ready) - ); - end - else begin - always_comb begin - data_in_0_reg = data_in_0; - data_in_0_reg_valid = data_in_0_valid; - data_in_0_ready = data_in_0_reg_ready; - end - end - input_buffer #( - .DATA_WIDTH (DATA_IN_0_PRECISION_0), - .IN_NUM (DATA_IN_0_PARALLELISM_DIM_0 * DATA_IN_0_PARALLELISM_DIM_1), - .REPEAT (WEIGHT_TENSOR_SIZE_DIM_1 / WEIGHT_PARALLELISM_DIM_1), - .BUFFER_SIZE(IN_0_DEPTH_DIM_0) - ) data_in_0_buffer ( - .clk, - .rst, - // Input streaming port - .data_in(data_in_0_reg), - .data_in_valid(data_in_0_reg_valid), - .data_in_ready(data_in_0_reg_ready), - // Output streaming port - .data_out(circular_data_in_0), - .data_out_valid(circular_data_in_0_valid), - .data_out_ready(circular_data_in_0_ready) - ); - input_buffer #( - .DATA_WIDTH(WEIGHT_PRECISION_0), - .IN_NUM(WEIGHT_PARALLELISM_DIM_0 * WEIGHT_PARALLELISM_DIM_1), - .REPEAT(IN_0_DEPTH_DIM_1), - .BUFFER_SIZE(WEIGHT_TENSOR_SIZE_DIM_0*WEIGHT_TENSOR_SIZE_DIM_1 / (WEIGHT_PARALLELISM_DIM_0*WEIGHT_PARALLELISM_DIM_1)) - ) weight_buffer ( - .clk, - .rst, - // Input streaming port - .data_in(weight), - .data_in_valid(weight_valid), - .data_in_ready(weight_ready), - // Output streaming port - .data_out(circular_weight), - .data_out_valid(circular_weight_valid), - .data_out_ready(circular_weight_ready) - ); - input_buffer #( - .DATA_WIDTH (BIAS_PRECISION_0), - .IN_NUM (BIAS_PARALLELISM_DIM_0), - .REPEAT (IN_0_DEPTH_DIM_1), - .BUFFER_SIZE(BIAS_TENSOR_SIZE_DIM_0 / (BIAS_PARALLELISM_DIM_0)) - ) bias_buffer ( - .clk, - .rst, - // Input streaming port - .data_in(bias), - .data_in_valid(bias_valid), - .data_in_ready(bias_ready), - // Output streaming port - .data_out(circular_bias), - .data_out_valid(circular_bias_valid), - .data_out_ready(circular_bias_ready) - ); - logic [DATA_OUT_0_PARALLELISM_DIM_1 - 1:0] - linear_1d_data_in_0_ready, - linear_1d_weight_ready, - linear_1d_bias_ready, - linear_1d_data_out_valid; - always_comb begin - circular_data_in_0_ready = linear_1d_data_in_0_ready[0]; - circular_weight_ready = linear_1d_weight_ready[0]; - circular_bias_ready = linear_1d_bias_ready[0]; - data_out_0_valid = linear_1d_data_out_valid[0]; - end - for (genvar i = 0; i < DATA_OUT_0_PARALLELISM_DIM_1; i = i + 1) begin - linear_1d #( - .HAS_BIAS(HAS_BIAS), - - .DATA_IN_0_PRECISION_0 (DATA_IN_0_PRECISION_0), - .DATA_IN_0_PRECISION_1 (DATA_IN_0_PRECISION_1), - .DATA_IN_0_TENSOR_SIZE_DIM_0(DATA_IN_0_TENSOR_SIZE_DIM_0), - .DATA_IN_0_PARALLELISM_DIM_0(DATA_IN_0_PARALLELISM_DIM_0), - - .WEIGHT_PRECISION_0 (WEIGHT_PRECISION_0), - .WEIGHT_PRECISION_1 (WEIGHT_PRECISION_1), - .WEIGHT_TENSOR_SIZE_DIM_0(WEIGHT_TENSOR_SIZE_DIM_0), - .WEIGHT_TENSOR_SIZE_DIM_1(WEIGHT_TENSOR_SIZE_DIM_1), - .WEIGHT_PARALLELISM_DIM_0(WEIGHT_PARALLELISM_DIM_0), - .WEIGHT_PARALLELISM_DIM_1(WEIGHT_PARALLELISM_DIM_1), - - .BIAS_PRECISION_0 (BIAS_PRECISION_0), - .BIAS_PRECISION_1 (BIAS_PRECISION_1), - .BIAS_TENSOR_SIZE_DIM_0(BIAS_TENSOR_SIZE_DIM_0), - .BIAS_PARALLELISM_DIM_0(BIAS_PARALLELISM_DIM_0), - - .DATA_OUT_0_PRECISION_0(DATA_OUT_0_PRECISION_0), - .DATA_OUT_0_PRECISION_1(DATA_OUT_0_PRECISION_1) - ) fixed_linear ( - .clk, - .rst, - .data_in_0 (circular_data_in_0[(i+1) * DATA_IN_0_PARALLELISM_DIM_0 - 1:i * DATA_IN_0_PARALLELISM_DIM_0]), - .data_in_0_valid(circular_data_in_0_valid), - .data_in_0_ready(linear_1d_data_in_0_ready[i]), - .weight(circular_weight), - .weight_valid(circular_weight_valid), - .weight_ready(linear_1d_weight_ready[i]), - .bias(circular_bias), - .bias_valid(circular_bias_valid), - .bias_ready(linear_1d_bias_ready[i]), - .data_out_0 (data_out_0[(i+1) * DATA_OUT_0_PARALLELISM_DIM_0- 1:i * DATA_OUT_0_PARALLELISM_DIM_0]), - .data_out_0_valid(linear_1d_data_out_valid[i]), - .data_out_0_ready(data_out_0_ready) - ); - end -endmodule -module linear_1d #( - /* verilator lint_off UNUSEDPARAM */ - parameter HAS_BIAS = 0, - - parameter DATA_IN_0_PRECISION_0 = 16, - parameter DATA_IN_0_PRECISION_1 = 3, - parameter DATA_IN_0_TENSOR_SIZE_DIM_0 = 1, - parameter DATA_IN_0_PARALLELISM_DIM_0 = 1, - parameter IN_0_DEPTH = DATA_IN_0_TENSOR_SIZE_DIM_0 / DATA_IN_0_PARALLELISM_DIM_0, - - parameter WEIGHT_PRECISION_0 = 16, - parameter WEIGHT_PRECISION_1 = 3, - parameter WEIGHT_TENSOR_SIZE_DIM_1 = 32, - parameter WEIGHT_TENSOR_SIZE_DIM_0 = 1, - parameter WEIGHT_PARALLELISM_DIM_1 = 4, - parameter WEIGHT_PARALLELISM_DIM_0 = 1, - - parameter DATA_OUT_0_PRECISION_0 = 8, - parameter DATA_OUT_0_PRECISION_1 = 4, - parameter DATA_OUT_0_TENSOR_SIZE_DIM_0 = WEIGHT_TENSOR_SIZE_DIM_1, - parameter DATA_OUT_0_PARALLELISM_DIM_0 = WEIGHT_PARALLELISM_DIM_1, - - parameter BIAS_PRECISION_0 = 16, - parameter BIAS_PRECISION_1 = 3, - parameter BIAS_TENSOR_SIZE_DIM_0 = DATA_OUT_0_TENSOR_SIZE_DIM_0, - parameter BIAS_PARALLELISM_DIM_0 = DATA_OUT_0_PARALLELISM_DIM_0 - -) ( - input clk, - input rst, - - // input port for data_inivations - input [DATA_IN_0_PRECISION_0-1:0] data_in_0 [DATA_IN_0_PARALLELISM_DIM_0-1:0], - input data_in_0_valid, - output data_in_0_ready, - - // input port for weight - input [WEIGHT_PRECISION_0-1:0] weight [WEIGHT_PARALLELISM_DIM_1 * WEIGHT_PARALLELISM_DIM_0-1:0], - input weight_valid, - output weight_ready, - - /* verilator lint_off UNUSEDSIGNAL */ - input [BIAS_PRECISION_0-1:0] bias[BIAS_PARALLELISM_DIM_0-1:0], - input bias_valid, - /* verilator lint_on UNUSEDSIGNAL */ - output bias_ready, - - output [DATA_OUT_0_PRECISION_0-1:0] data_out_0 [DATA_OUT_0_PARALLELISM_DIM_0-1:0], - output data_out_0_valid, - input data_out_0_ready -); - - localparam FDP_WIDTH = DATA_IN_0_PRECISION_0 + WEIGHT_PRECISION_0 + $clog2( - DATA_IN_0_PARALLELISM_DIM_0 - ); - localparam ACC_WIDTH = FDP_WIDTH + $clog2(IN_0_DEPTH); - localparam LOSSLESS_OUT_WIDTH = ACC_WIDTH + HAS_BIAS; - logic fdp_join_valid, fdp_join_ready; - join2 #() fdp_join_inst ( - .data_in_ready ({weight_ready, data_in_0_ready}), - .data_in_valid ({weight_valid, data_in_0_valid}), - .data_out_valid(fdp_join_valid), - .data_out_ready(fdp_join_ready) - ); - - /* verilator lint_off UNUSEDSIGNAL */ - // Assume the parallelised hardware above have the same arrival time - // which means that they always have the same state. So we can just - // pick one of the valid signal to use. - logic [WEIGHT_PARALLELISM_DIM_1-1:0] fdp_data_ready, fdp_weight_ready; - assign fdp_join_ready = fdp_data_ready[0]; - /* verilator lint_on UNUSEDSIGNAL */ - - logic acc_ready; - logic [ ACC_WIDTH-1:0] acc_data_out [DATA_OUT_0_PARALLELISM_DIM_0-1:0]; - logic [LOSSLESS_OUT_WIDTH-1:0] cast_data_out_0[DATA_OUT_0_PARALLELISM_DIM_0-1:0]; - // There are WEIGHT_PARALLELISM_DIM_0 number of dot product instances with DATA_IN_0_TENSOR_SIZE_DIM_0 inputs - // and each one computes for IN_0_DEPTH iterations for each inputs. - for (genvar i = 0; i < WEIGHT_PARALLELISM_DIM_1; i = i + 1) begin : linear - // Assume the weight are transposed and partitioned - logic [WEIGHT_PRECISION_0-1:0] current_weight[DATA_IN_0_PARALLELISM_DIM_0-1:0]; - assign current_weight = weight[DATA_IN_0_PARALLELISM_DIM_0*(i+1)-1:DATA_IN_0_PARALLELISM_DIM_0*i]; - - logic [FDP_WIDTH-1:0] fdp_data_out; - logic fdp_data_out_valid, fdp_data_out_ready; - - // The inputs are already sync-ed by the previous join - fixed_dot_product #( - .IN_WIDTH(DATA_IN_0_PRECISION_0), - .WEIGHT_WIDTH(WEIGHT_PRECISION_0), - .IN_SIZE(DATA_IN_0_PARALLELISM_DIM_0) - ) fdp_inst ( - .clk(clk), - .rst(rst), - .data_in(data_in_0), - .data_in_valid(fdp_join_valid), - .data_in_ready(fdp_data_ready[i]), - .weight(current_weight), - .weight_valid(fdp_join_valid), - .weight_ready(fdp_weight_ready[i]), - .data_out(fdp_data_out), - .data_out_valid(fdp_data_out_valid), - .data_out_ready(fdp_data_out_ready) - ); - - /* verilator lint_off UNUSEDSIGNAL */ - logic acc_data_out_valid, acc_data_out_ready; - /* verilator lint_on UNUSEDSIGNAL */ - - fixed_accumulator #( - .IN_WIDTH(FDP_WIDTH), - .IN_DEPTH(IN_0_DEPTH) - ) fixed_accumulator_inst ( - .clk(clk), - .rst(rst), - .data_in(fdp_data_out), - .data_in_valid(fdp_data_out_valid), - .data_in_ready(fdp_data_out_ready), - .data_out(acc_data_out[i]), - .data_out_valid(acc_data_out_valid), - .data_out_ready(acc_data_out_ready) - ); - - // Assume the parallelised hardware above have the same arrival time - // which means that they always have the same state. So we can just - // pick one of the valid signal to use. - assign acc_data_out_ready = acc_ready; - end - - - if (HAS_BIAS == 1) begin - logic [ACC_WIDTH-1:0] bias_sext[BIAS_PARALLELISM_DIM_0-1:0]; - logic acc_join_valid, acc_join_ready; - logic [DATA_OUT_0_PARALLELISM_DIM_0-1:0] reg_ready; - - join2 #() acc_join_inst ( - .data_in_ready ({bias_ready, acc_ready}), - .data_in_valid ({bias_valid, linear[0].acc_data_out_valid}), - .data_out_valid(data_out_0_valid), - .data_out_ready(data_out_0_ready) - ); - - fixed_rounding #( - .IN_SIZE(DATA_OUT_0_PARALLELISM_DIM_0), - .IN_WIDTH(BIAS_PRECISION_0), - .IN_FRAC_WIDTH(BIAS_PRECISION_1), - .OUT_WIDTH(ACC_WIDTH), - .OUT_FRAC_WIDTH(DATA_IN_0_PRECISION_1 + WEIGHT_PRECISION_1) - ) bias_cast ( - .data_in (bias), - .data_out(bias_sext) - ); - - for (genvar i = 0; i < DATA_OUT_0_PARALLELISM_DIM_0; i = i + 1) begin : add_bias - assign cast_data_out_0[i] = $signed(acc_data_out[i]) + $signed(bias_sext[i]); - end - end else begin - assign acc_ready = data_out_0_ready; - assign data_out_0_valid = linear[0].acc_data_out_valid; - assign cast_data_out_0 = acc_data_out; - assign bias_ready = 1; - end - fixed_rounding #( - .IN_SIZE(DATA_OUT_0_PARALLELISM_DIM_0), - .IN_WIDTH(LOSSLESS_OUT_WIDTH), - .IN_FRAC_WIDTH(DATA_IN_0_PRECISION_1 + WEIGHT_PRECISION_1), - .OUT_WIDTH(DATA_OUT_0_PRECISION_0), - .OUT_FRAC_WIDTH(DATA_OUT_0_PRECISION_1) - ) bias_cast ( - .data_in (cast_data_out_0), - .data_out(data_out_0) - ); - -endmodule diff --git a/src/mase_components/linear_layers/fixed_linear_layer/test/fixed_linear_with_input_circular_tb.py b/src/mase_components/linear_layers/fixed_linear_layer/test/fixed_linear_with_input_circular_tb.py deleted file mode 100644 index 9976fe97b..000000000 --- a/src/mase_components/linear_layers/fixed_linear_layer/test/fixed_linear_with_input_circular_tb.py +++ /dev/null @@ -1,287 +0,0 @@ -#!/usr/bin/env python3 - -import os, pytest - -import torch -import logging -from functools import partial - -import cocotb -from cocotb.log import SimLog -from cocotb.triggers import Timer, RisingEdge - -from mase_cocotb.testbench import Testbench -from mase_cocotb.interfaces.streaming import ( - StreamDriver, - StreamMonitor, - ErrorThresholdStreamMonitor, -) -from mase_cocotb.runner import mase_runner - -from mase_cocotb.utils import bit_driver - -# from mase_cocotb import Testbench, StreamDriver, StreamMonitor, mase_runner -from chop.nn.quantized.modules.linear import LinearInteger -from chop.nn.quantizers import integer_floor_quantizer - - -class LinearTB(Testbench): - def __init__(self, dut) -> None: - super().__init__(dut, dut.clk, dut.rst) - - if not hasattr(self, "log"): - self.log = SimLog("%s" % (type(self).__qualname__)) - self.log.setLevel(logging.DEBUG) - - self.data_in_0_driver = StreamDriver( - dut.clk, dut.data_in_0, dut.data_in_0_valid, dut.data_in_0_ready - ) - self.weight_driver = StreamDriver( - dut.clk, dut.weight, dut.weight_valid, dut.weight_ready - ) - - if self.get_parameter("HAS_BIAS") == 1: - self.bias_driver = StreamDriver( - dut.clk, dut.bias, dut.bias_valid, dut.bias_ready - ) - self.bias_driver.log.setLevel(logging.DEBUG) - - self.data_out_0_monitor = StreamMonitor( - dut.clk, - dut.data_out_0, - dut.data_out_0_valid, - dut.data_out_0_ready, - check=True, - ) - - # Model - self.model = LinearInteger( - in_features=self.get_parameter("DATA_IN_0_TENSOR_SIZE_DIM_0"), - out_features=self.get_parameter("DATA_OUT_0_TENSOR_SIZE_DIM_0"), - bias=True if self.get_parameter("HAS_BIAS") == 1 else False, - config={ - "data_in_width": self.get_parameter("DATA_IN_0_PRECISION_0"), - "data_in_frac_width": self.get_parameter("DATA_IN_0_PRECISION_1"), - "weight_width": self.get_parameter("WEIGHT_PRECISION_0"), - "weight_frac_width": self.get_parameter("WEIGHT_PRECISION_1"), - "bias_width": self.get_parameter("BIAS_PRECISION_0"), - "bias_frac_width": self.get_parameter("BIAS_PRECISION_1"), - }, - out_config={ - "data_out_width": self.get_parameter("DATA_OUT_0_PRECISION_0"), - "data_out_frac_width": self.get_parameter("DATA_OUT_0_PRECISION_1"), - }, - floor=True, - ) - - # Set verbosity of driver and monitor loggers to debug - self.data_in_0_driver.log.setLevel(logging.DEBUG) - self.weight_driver.log.setLevel(logging.DEBUG) - self.data_out_0_monitor.log.setLevel(logging.DEBUG) - - def generate_inputs(self): - return torch.randn( - ( - self.get_parameter("DATA_IN_0_TENSOR_SIZE_DIM_1"), - self.get_parameter("DATA_IN_0_TENSOR_SIZE_DIM_0"), - ) - ) - - def preprocess_tensor(self, tensor, config, parallelism): - if len(tensor.shape) == 1: - tensor = tensor.unsqueeze(0) - - # Quantize - quantizer = partial(integer_floor_quantizer, **config) - q_tensor = quantizer(tensor) - self.log.debug(f"Quantized tensor: {q_tensor}") - - # Convert to integer format - q_tensor = (q_tensor * 2 ** config["frac_width"]).int() - self.log.debug(f"Tensor in integer format: {q_tensor}") - - # Split into chunks according to parallelism in each dimension - # parallelism[0]: along rows, parallelism[1]: along columns - dim_0_split = q_tensor.split(parallelism[0], dim=0) - dim_1_split = [x.split(parallelism[1], dim=1) for x in dim_0_split] - blocks = [] - # Flatten the list of blocks - for i in range(len(dim_1_split)): - for j in range(len(dim_1_split[i])): - blocks.append(dim_1_split[i][j].flatten().tolist()) - return blocks - - async def run_test(self, us): - await self.reset() - self.log.info(f"Reset finished") - self.data_out_0_monitor.ready.value = 1 - - inputs = self.generate_inputs() - exp_out = self.model(inputs) - - # * Load the inputs driver - self.log.info(f"Processing inputs: {inputs}") - inputs = self.preprocess_tensor( - tensor=inputs, - config={ - "width": self.get_parameter("DATA_IN_0_PRECISION_0"), - "frac_width": self.get_parameter("DATA_IN_0_PRECISION_1"), - }, - parallelism=[ - self.get_parameter("DATA_IN_0_PARALLELISM_DIM_1"), - self.get_parameter("DATA_IN_0_PARALLELISM_DIM_0"), - ], - ) - self.data_in_0_driver.load_driver(inputs) - - # * Load the weights driver - weights = self.model.weight - - self.log.info(f"Processing weights: {weights}") - weights = self.preprocess_tensor( - tensor=weights, - config={ - "width": self.get_parameter("WEIGHT_PRECISION_0"), - "frac_width": self.get_parameter("WEIGHT_PRECISION_1"), - }, - parallelism=[ - self.get_parameter("WEIGHT_PARALLELISM_DIM_1"), - self.get_parameter("WEIGHT_PARALLELISM_DIM_0"), - ], - ) - self.weight_driver.load_driver(weights) - - # * Load the bias driver - if self.get_parameter("HAS_BIAS") == 1: - bias = self.model.bias - self.log.info(f"Processing bias: {bias}") - bias = self.preprocess_tensor( - tensor=bias, - config={ - "width": self.get_parameter("BIAS_PRECISION_0"), - "frac_width": self.get_parameter("BIAS_PRECISION_1"), - }, - parallelism=[ - self.get_parameter("BIAS_PARALLELISM_DIM_1"), - self.get_parameter("BIAS_PARALLELISM_DIM_0"), - ], - ) - self.bias_driver.load_driver(bias) - - # * Load the output monitor - self.log.info(f"Processing outputs: {exp_out}") - outs = self.preprocess_tensor( - tensor=exp_out, - config={ - "width": self.get_parameter("DATA_OUT_0_PRECISION_0"), - "frac_width": self.get_parameter("DATA_OUT_0_PRECISION_1"), - }, - parallelism=[ - self.get_parameter("DATA_OUT_0_PARALLELISM_DIM_1"), - self.get_parameter("DATA_OUT_0_PARALLELISM_DIM_0"), - ], - ) - self.data_out_0_monitor.load_monitor(outs) - - await Timer(us, units="us") - assert self.data_out_0_monitor.exp_queue.empty() - - -@cocotb.test() -async def cocotb_test(dut): - tb = LinearTB(dut) - await tb.run_test(us=100) - - -@cocotb.test() -async def repeated_mult_valid_backpressure(dut): - tb = LinearTB(dut) - tb.data_in_0_driver.set_valid_prob(0.7) - tb.weight_driver.set_valid_prob(0.7) - cocotb.start_soon(bit_driver(dut.data_out_0_ready, dut.clk, 0.6)) - await tb.run_test(us=200) - - -def get_fixed_linear_config(kwargs={}): - # if pretranspose - # weight1 = in0 - # else - # weight0 = in0 - config = { - "HAS_BIAS": 1, - "DATA_IN_0_TENSOR_SIZE_DIM_0": 32, - "DATA_IN_0_TENSOR_SIZE_DIM_1": 16, - "DATA_IN_0_PARALLELISM_DIM_0": 8, - "DATA_IN_0_PARALLELISM_DIM_1": 4, - "WEIGHT_TENSOR_SIZE_DIM_0": 32, - "WEIGHT_TENSOR_SIZE_DIM_1": 24, - "WEIGHT_PARALLELISM_DIM_0": 8, - "WEIGHT_PARALLELISM_DIM_1": 2, - "DATA_IN_0_PRECISION_0": 8, - "DATA_IN_0_PRECISION_1": 4, - "WEIGHT_PRECISION_0": 10, - "WEIGHT_PRECISION_1": 3, - "BIAS_PRECISION_0": 5, - "BIAS_PRECISION_1": 2, - "DATA_OUT_0_PRECISION_0": 8, - "DATA_OUT_0_PRECISION_1": 4, - } - config.update(kwargs) - return config - - -@pytest.mark.dev -def test_fixed_linear_smoke(): - """ - Some quick tests to check if the module is working. - """ - mase_runner( - trace=True, - module_param_list=[ - get_fixed_linear_config(), - # noticed here if change WEIGHT_PRE_TRANSPOSED also need to change the DIM_SIZE to match ACTIVATION - ], - ) - - -@pytest.mark.dev -def test_fixed_linear_regression(): - """ - More extensive tests to check realistic parameter sizes. - """ - mase_runner( - trace=True, - module_param_list=[ - get_fixed_linear_config( - { - "DATA_IN_0_TENSOR_SIZE_DIM_0": 768, - "DATA_IN_0_PARALLELISM_DIM_0": 32, - "WEIGHT_TENSOR_SIZE_DIM_0": 768, - "WEIGHT_TENSOR_SIZE_DIM_1": 768, - "WEIGHT_PARALLELISM_DIM_0": 32, - "WEIGHT_PARALLELISM_DIM_1": 32, - "BIAS_TENSOR_SIZE_DIM_0": 768, - "BIAS_PARALLELISM_DIM_0": 32, - } - ), - get_fixed_linear_config( - { - "HAS_BIAS": 1, - "WEIGHTS_PRE_TRANSPOSED": 0, - "DATA_IN_0_TENSOR_SIZE_DIM_0": 768, - "DATA_IN_0_PARALLELISM_DIM_0": 32, - "WEIGHT_TENSOR_SIZE_DIM_0": 768, - "WEIGHT_TENSOR_SIZE_DIM_1": 768, - "WEIGHT_PARALLELISM_DIM_0": 32, - "WEIGHT_PARALLELISM_DIM_1": 32, - "BIAS_TENSOR_SIZE_DIM_0": 768, - "BIAS_PARALLELISM_DIM_0": 32, - } - ), - ], - ) - - -if __name__ == "__main__": - test_fixed_linear_smoke() - # test_fixed_linear_regression() From ab657bf87285f83d155e7aca04c611535bab6a11 Mon Sep 17 00:00:00 2001 From: cx922 Date: Thu, 13 Mar 2025 14:29:41 +0000 Subject: [PATCH 4/6] quantization config cannot parse floor argument --- src/chop/nn/quantized/functional/linear.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/chop/nn/quantized/functional/linear.py b/src/chop/nn/quantized/functional/linear.py index 2c9f4caec..48f46ab9e 100644 --- a/src/chop/nn/quantized/functional/linear.py +++ b/src/chop/nn/quantized/functional/linear.py @@ -44,7 +44,7 @@ def linearInteger( config["data_out_frac_width"], ) - floor = config.get("floor", False) + floor = config.get("floor", True) base_quantizer = integer_floor_quantizer if floor else integer_quantizer w_quantizer = partial(base_quantizer, width=w_width, frac_width=w_frac_width) x_quantizer = partial(base_quantizer, width=x_width, frac_width=x_frac_width) From 79a9b9772f6b3ddc2d3ac2c5a9a63f7dce44a87f Mon Sep 17 00:00:00 2001 From: cx922 Date: Thu, 13 Mar 2025 15:39:17 +0000 Subject: [PATCH 5/6] formatting --- src/mase_components/cast/rtl/fixed_round.sv | 4 +- .../fixed_linear_layer/rtl/fixed_linear.sv | 37 +++++++++---------- .../linear_layers/matmul/rtl/matmul.sv | 28 +++++++------- .../memory/rtl/fifo_for_autogen.sv | 2 +- src/mase_components/memory/rtl/matrix_bank.sv | 17 ++++----- 5 files changed, 43 insertions(+), 45 deletions(-) diff --git a/src/mase_components/cast/rtl/fixed_round.sv b/src/mase_components/cast/rtl/fixed_round.sv index 2a5291053..49d6a5395 100644 --- a/src/mase_components/cast/rtl/fixed_round.sv +++ b/src/mase_components/cast/rtl/fixed_round.sv @@ -28,8 +28,8 @@ module fixed_round #( assign lsb_check = {input_data, {(OUT_FRAC_WIDTH) {1'b0}}}; always_comb begin lsb_below[2] = (IN_FRAC_WIDTH >= OUT_FRAC_WIDTH) ? lsb_check[IN_FRAC_WIDTH] : 0; - lsb_below[1] = (IN_FRAC_WIDTH-1 >= OUT_FRAC_WIDTH) ? lsb_check[IN_FRAC_WIDTH-1] : 0; - lsb_below[0] = (IN_FRAC_WIDTH-2 >= OUT_FRAC_WIDTH) ? |(lsb_check[IN_FRAC_WIDTH-2:0]): 0; + lsb_below[1] = (IN_FRAC_WIDTH - 1 >= OUT_FRAC_WIDTH) ? lsb_check[IN_FRAC_WIDTH-1] : 0; + lsb_below[0] = (IN_FRAC_WIDTH - 2 >= OUT_FRAC_WIDTH) ? |(lsb_check[IN_FRAC_WIDTH-2:0]) : 0; // lsb_below[0] = '0; // to do: fix end always_comb begin diff --git a/src/mase_components/linear_layers/fixed_linear_layer/rtl/fixed_linear.sv b/src/mase_components/linear_layers/fixed_linear_layer/rtl/fixed_linear.sv index e6b1aeac6..8f8de0c55 100644 --- a/src/mase_components/linear_layers/fixed_linear_layer/rtl/fixed_linear.sv +++ b/src/mase_components/linear_layers/fixed_linear_layer/rtl/fixed_linear.sv @@ -113,7 +113,7 @@ module fixed_linear #( logic [DATA_OUT_0_PRECISION_0 - 1:0] add_bias_in_casted [DATA_OUT_0_PARALLELISM_DIM_0*DATA_OUT_0_PARALLELISM_DIM_1-1:0]; logic add_bias_in_valid; logic add_bias_in_ready; - + logic [DATA_OUT_0_PRECISION_0 - 1:0] rounding_out [DATA_OUT_0_PARALLELISM_DIM_0*DATA_OUT_0_PARALLELISM_DIM_1-1:0]; logic rounding_out_valid; logic rounding_out_ready; @@ -282,25 +282,25 @@ module fixed_linear #( .data_out(rounding_out) ); assign rounding_out_valid = matmul_out_valid; - assign matmul_out_ready = rounding_out_ready; + assign matmul_out_ready = rounding_out_ready; end if (FIFO == 1) begin localparam FIFO_DEPTH = DATA_OUT_0_TENSOR_SIZE_DIM_0 / DATA_OUT_0_PARALLELISM_DIM_0; - + fifo_for_autogen #( - .DATA_IN_0_PRECISION_0(DATA_OUT_0_PRECISION_0), // = 8 - .DATA_IN_0_PRECISION_1(DATA_OUT_0_PRECISION_1), // = 4 - .DATA_IN_0_TENSOR_SIZE_DIM_0(DATA_OUT_0_TENSOR_SIZE_DIM_0), // = 20 - .DATA_IN_0_PARALLELISM_DIM_0(DATA_OUT_0_PARALLELISM_DIM_0), // = 2 - .DATA_IN_0_TENSOR_SIZE_DIM_1(DATA_OUT_0_TENSOR_SIZE_DIM_1), // = 4 - .DATA_IN_0_PARALLELISM_DIM_1(DATA_OUT_0_PARALLELISM_DIM_1), // = 2 - .DEPTH(FIFO_DEPTH), - .DATA_OUT_0_PRECISION_0(DATA_OUT_0_PRECISION_0), + .DATA_IN_0_PRECISION_0(DATA_OUT_0_PRECISION_0), // = 8 + .DATA_IN_0_PRECISION_1(DATA_OUT_0_PRECISION_1), // = 4 + .DATA_IN_0_TENSOR_SIZE_DIM_0(DATA_OUT_0_TENSOR_SIZE_DIM_0), // = 20 + .DATA_IN_0_PARALLELISM_DIM_0(DATA_OUT_0_PARALLELISM_DIM_0), // = 2 + .DATA_IN_0_TENSOR_SIZE_DIM_1(DATA_OUT_0_TENSOR_SIZE_DIM_1), // = 4 + .DATA_IN_0_PARALLELISM_DIM_1(DATA_OUT_0_PARALLELISM_DIM_1), // = 2 + .DEPTH(FIFO_DEPTH), + .DATA_OUT_0_PRECISION_0(DATA_OUT_0_PRECISION_0), .DATA_OUT_0_PRECISION_1(DATA_OUT_0_PRECISION_1), - .DATA_OUT_0_TENSOR_SIZE_DIM_0(DATA_OUT_0_TENSOR_SIZE_DIM_0), - .DATA_OUT_0_PARALLELISM_DIM_0(DATA_OUT_0_PARALLELISM_DIM_0), - .DATA_OUT_0_TENSOR_SIZE_DIM_1(DATA_OUT_0_TENSOR_SIZE_DIM_1), + .DATA_OUT_0_TENSOR_SIZE_DIM_0(DATA_OUT_0_TENSOR_SIZE_DIM_0), + .DATA_OUT_0_PARALLELISM_DIM_0(DATA_OUT_0_PARALLELISM_DIM_0), + .DATA_OUT_0_TENSOR_SIZE_DIM_1(DATA_OUT_0_TENSOR_SIZE_DIM_1), .DATA_OUT_0_PARALLELISM_DIM_1(DATA_OUT_0_PARALLELISM_DIM_1) ) fifo_1_inst ( .clk(clk), @@ -313,12 +313,11 @@ module fixed_linear #( .data_out_0_valid(data_out_0_valid), .data_out_0_ready(data_out_0_ready) ); - end - else begin + end else begin always_comb begin - data_out_0 = rounding_out; - data_out_0_valid = rounding_out_valid; - rounding_out_ready = data_out_0_ready; + data_out_0 = rounding_out; + data_out_0_valid = rounding_out_valid; + rounding_out_ready = data_out_0_ready; end end endmodule diff --git a/src/mase_components/linear_layers/matmul/rtl/matmul.sv b/src/mase_components/linear_layers/matmul/rtl/matmul.sv index ba418bd55..d7509d02c 100644 --- a/src/mase_components/linear_layers/matmul/rtl/matmul.sv +++ b/src/mase_components/linear_layers/matmul/rtl/matmul.sv @@ -282,20 +282,20 @@ module matmul #( .out_valid(buffered_sm_out_valid), .out_ready(buffered_sm_out_ready) ); - //cut the long ready path - unpacked_skid_buffer #( - .DATA_WIDTH(SM_OUT_WIDTH), - .IN_NUM (C_COMPUTE_DIM0 * C_COMPUTE_DIM1) - ) sm_out_reg_slice ( - .clk (clk), - .rst (rst), - .data_in (buffered_sm_out_data), - .data_in_valid (buffered_sm_out_valid), - .data_in_ready (buffered_sm_out_ready), - .data_out (sm_out_data), - .data_out_valid(sm_out_valid), - .data_out_ready(sm_out_ready) - ); + //cut the long ready path + unpacked_skid_buffer #( + .DATA_WIDTH(SM_OUT_WIDTH), + .IN_NUM (C_COMPUTE_DIM0 * C_COMPUTE_DIM1) + ) sm_out_reg_slice ( + .clk (clk), + .rst (rst), + .data_in (buffered_sm_out_data), + .data_in_valid (buffered_sm_out_valid), + .data_in_ready (buffered_sm_out_ready), + .data_out (sm_out_data), + .data_out_valid(sm_out_valid), + .data_out_ready(sm_out_ready) + ); // Direct the result of the simple matmul to the correct matrix_accumulator diff --git a/src/mase_components/memory/rtl/fifo_for_autogen.sv b/src/mase_components/memory/rtl/fifo_for_autogen.sv index d0e11f93d..6db2b000f 100644 --- a/src/mase_components/memory/rtl/fifo_for_autogen.sv +++ b/src/mase_components/memory/rtl/fifo_for_autogen.sv @@ -28,7 +28,7 @@ module fifo_for_autogen #( unpacked_fifo #( .DEPTH(DEPTH), .DATA_WIDTH(DATA_IN_0_PRECISION_0), - .IN_NUM(DATA_IN_0_PARALLELISM_DIM_0*DATA_IN_0_PARALLELISM_DIM_1) + .IN_NUM(DATA_IN_0_PARALLELISM_DIM_0 * DATA_IN_0_PARALLELISM_DIM_1) ) ff_inst ( .clk(clk), .rst(rst), diff --git a/src/mase_components/memory/rtl/matrix_bank.sv b/src/mase_components/memory/rtl/matrix_bank.sv index 47ebce0b7..707bcd221 100644 --- a/src/mase_components/memory/rtl/matrix_bank.sv +++ b/src/mase_components/memory/rtl/matrix_bank.sv @@ -10,7 +10,7 @@ package matrix_bank_pkg; parameter AXI_ADDRESS_WIDTH = 32; parameter MAX_DIMENSION = 1024; parameter MAX_FEATURE_COUNT = 16; - + typedef struct packed { logic [AXI_ADDRESS_WIDTH-1:0] start_address; @@ -41,7 +41,7 @@ parameter MAX_DIMENSION = 1024; parameter MAX_FEATURE_COUNT = 32; typedef struct packed { - logic [AXI_ADDRESS_WIDTH-1:0] start_address; + logic [AXI_ADDRESS_WIDTH-1:0] start_address; logic [$clog2(MAX_DIMENSION):0] columns; logic [$clog2(MAX_DIMENSION):0] rows; } REQ_t; @@ -53,7 +53,7 @@ typedef struct packed { logic [$clog2( MAX_FEATURE_COUNT ):0] columns; - logic [$clog2(MAX_FEATURE_COUNT):0] rows; + logic [$clog2(MAX_FEATURE_COUNT):0] rows; } ROW_CHANNEL_REQ_t; typedef struct packed { @@ -278,9 +278,8 @@ module matrix_bank #( axi_rm_fetch_byte_count = matrix_bank_req_q.columns * 4; bytes_per_row = matrix_bank_req_q.columns * 4; - bytes_per_row_padded = { - bytes_per_row[$clog2(MAX_DIMENSION*4)-1:6], 6'b0 - } + (|bytes_per_row[5:0] ? 'd64 : '0); // round up to nearest multiple of 64 + bytes_per_row_padded = {bytes_per_row[$clog2(MAX_DIMENSION*4)-1:6], 6'b0} + + (|bytes_per_row[5:0] ? 'd64 : '0); // round up to nearest multiple of 64 axi_rm_fetch_start_address = matrix_bank_req_q.start_address + rows_fetched * bytes_per_row_padded; end @@ -374,9 +373,9 @@ module matrix_bank #( end // Round up in features to the nearest multiple of 16 - assign required_pulses = { - matrix_bank_req_q.columns[$clog2(MAX_DIMENSION)-1:4], 4'd0 - } + (|matrix_bank_req_q.columns[3:0] ? 'd16 : '0); + assign required_pulses = {matrix_bank_req_q.columns[$clog2( + MAX_DIMENSION + )-1:4], 4'd0} + (|matrix_bank_req_q.columns[3:0] ? 'd16 : '0); always_ff @(posedge core_clk or negedge resetn) begin if (!resetn) begin From f12354c19c5fd4ab3b83ba3df19b13980151b012 Mon Sep 17 00:00:00 2001 From: cx922 Date: Thu, 13 Mar 2025 15:45:07 +0000 Subject: [PATCH 6/6] remove some useless settings --- .../fixed_linear_layer/test/fixed_linear_tb.py | 10 ---------- 1 file changed, 10 deletions(-) diff --git a/src/mase_components/linear_layers/fixed_linear_layer/test/fixed_linear_tb.py b/src/mase_components/linear_layers/fixed_linear_layer/test/fixed_linear_tb.py index a0f6fcaa7..1ce114ea0 100644 --- a/src/mase_components/linear_layers/fixed_linear_layer/test/fixed_linear_tb.py +++ b/src/mase_components/linear_layers/fixed_linear_layer/test/fixed_linear_tb.py @@ -243,16 +243,6 @@ def test_fixed_linear_smoke(): trace=True, module_param_list=[ get_fixed_linear_config(), - # noticed here if change WEIGHT_PRE_TRANSPOSED also need to change the DIM_SIZE to match ACTIVATION - get_fixed_linear_config( - { - "WEIGHTS_PRE_TRANSPOSED": 0, - "WEIGHT_TENSOR_SIZE_DIM_0": 32, - "WEIGHT_TENSOR_SIZE_DIM_1": 16, - "WEIGHT_PARALLELISM_DIM_0": 4, - "WEIGHT_PARALLELISM_DIM_1": 2, - }, - ), ], )