diff --git a/src/chop/nn/quantized/functional/linear.py b/src/chop/nn/quantized/functional/linear.py
index 2c9f4caec..48f46ab9e 100644
--- a/src/chop/nn/quantized/functional/linear.py
+++ b/src/chop/nn/quantized/functional/linear.py
@@ -44,7 +44,7 @@ def linearInteger(
         config["data_out_frac_width"],
     )
 
-    floor = config.get("floor", False)
+    floor = config.get("floor", True)
     base_quantizer = integer_floor_quantizer if floor else integer_quantizer
     w_quantizer = partial(base_quantizer, width=w_width, frac_width=w_frac_width)
     x_quantizer = partial(base_quantizer, width=x_width, frac_width=x_frac_width)
diff --git a/src/chop/nn/quantizers/quantizers_for_hw.py b/src/chop/nn/quantizers/quantizers_for_hw.py
index d5ca3d8cf..4a90e2a38 100644
--- a/src/chop/nn/quantizers/quantizers_for_hw.py
+++ b/src/chop/nn/quantizers/quantizers_for_hw.py
@@ -3,15 +3,15 @@ import torch.nn.functional as F
 from torch import Tensor
 
-# from .quantizers import integer_quantizer
+from .integer import integer_quantizer, integer_floor_quantizer
 from .utils import block, my_clamp, my_round, unblock, my_floor
 
 
-def integer_quantizer_for_hw(x: Tensor, width: int, frac_width: int):
+def integer_quantizer_for_hw(x: Tensor, width: int, frac_width: int, floor=False):
     thresh = 2 ** (width - 1)
     scale = 2**frac_width
-
-    fixed_point_value = my_clamp(my_round(x.mul(scale)), -thresh, thresh - 1)
+    base_quantizer = integer_floor_quantizer if floor else integer_quantizer
+    fixed_point_value = base_quantizer(x, width, frac_width) * scale
     fixed_point_value = fixed_point_value.to(torch.int)
     fixed_point_value = fixed_point_value % (2**width)
     return fixed_point_value
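Taken together, the two hunks above make floor rounding available end to end: software defaults to it, and the hardware-facing quantizer can reproduce it. The following is a minimal self-contained sketch of what integer_quantizer_for_hw now computes; the real implementation delegates to integer_quantizer / integer_floor_quantizer, so inlining their semantics with plain torch ops here is an assumption.

import torch
from torch import Tensor


def integer_quantizer_for_hw_sketch(
    x: Tensor, width: int, frac_width: int, floor: bool = False
) -> Tensor:
    """Map a float tensor to unsigned two's-complement words."""
    thresh = 2 ** (width - 1)
    scale = 2**frac_width
    # Fixed-point quantization: floor or round-to-nearest, then saturate.
    raw = x * scale
    raw = torch.floor(raw) if floor else torch.round(raw)
    raw = torch.clamp(raw, -thresh, thresh - 1)
    # Wrap negatives into [0, 2**width) so each value is a plain binary word.
    return raw.to(torch.int) % (2**width)


# width=4, frac_width=2: -0.25 * 4 = -1.0 floors to -1 and wraps to 0b1111 = 15
print(integer_quantizer_for_hw_sketch(torch.tensor([-0.25, 0.3]), 4, 2, floor=True))
# expected: tensor([15,  1], dtype=torch.int32)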
diff --git a/src/chop/passes/graph/transforms/verilog/emit_bram.py b/src/chop/passes/graph/transforms/verilog/emit_bram.py
index a6a000c79..6060776fb 100644
--- a/src/chop/passes/graph/transforms/verilog/emit_bram.py
+++ b/src/chop/passes/graph/transforms/verilog/emit_bram.py
@@ -7,7 +7,7 @@ import torch
 
 from chop.passes.graph.utils import vf, v2p, get_module_by_name, init_project
-from chop.nn.quantizers import integer_quantizer_for_hw, integer_floor_quantizer_for_hw
+from chop.nn.quantizers import integer_quantizer_for_hw
 
 logger = logging.getLogger(__name__)
 from pathlib import Path
@@ -49,7 +49,7 @@ def emit_parameters_in_mem_internal(node, param_name, file_name, data_name):
             f"{_cap(verilog_param_name)}_PARALLELISM_DIM_1"
         ]
     )
-    out_depth = int((total_size + out_size - 1) / out_size)
+    out_depth = int(total_size / out_size)
     out_width = int(
         node.meta["mase"].parameters["common"]["args"][verilog_param_name]["precision"][
             0
        ]
     )
@@ -84,7 +84,7 @@ def emit_parameters_in_mem_internal(node, param_name, file_name, data_name):
   logic [DWIDTH-1:0] q0_t1;
 
   initial begin
-    $readmemh("{data_name}", ram);
+    $readmemb("{data_name}", ram);
   end
 
   assign q0 = q0_t1;
@@ -119,14 +119,14 @@ def emit_parameters_in_mem_internal(node, param_name, file_name, data_name):
 
 `timescale 1ns / 1ps
 module {node_param_name}_source #(
-    parameter {_cap(verilog_param_name)}_TENSOR_SIZE_DIM_0 = 32,
-    parameter {_cap(verilog_param_name)}_TENSOR_SIZE_DIM_1 = 1,
-    parameter {_cap(verilog_param_name)}_PRECISION_0 = 16,
-    parameter {_cap(verilog_param_name)}_PRECISION_1 = 3,
-
-    parameter {_cap(verilog_param_name)}_PARALLELISM_DIM_0 = 1,
-    parameter {_cap(verilog_param_name)}_PARALLELISM_DIM_1 = 1,
-    parameter OUT_DEPTH = (({_cap(verilog_param_name)}_TENSOR_SIZE_DIM_0 + {_cap(verilog_param_name)}_PARALLELISM_DIM_0 - 1) / {_cap(verilog_param_name)}_PARALLELISM_DIM_0) * (({_cap(verilog_param_name)}_TENSOR_SIZE_DIM_1 + {_cap(verilog_param_name)}_PARALLELISM_DIM_1 - 1) / {_cap(verilog_param_name)}_PARALLELISM_DIM_1)
+    parameter {_cap(verilog_param_name)}_TENSOR_SIZE_DIM_0 = -1,
+    parameter {_cap(verilog_param_name)}_TENSOR_SIZE_DIM_1 = -1,
+    parameter {_cap(verilog_param_name)}_PRECISION_0 = -1,
+    parameter {_cap(verilog_param_name)}_PRECISION_1 = -1,
+
+    parameter {_cap(verilog_param_name)}_PARALLELISM_DIM_0 = -1,
+    parameter {_cap(verilog_param_name)}_PARALLELISM_DIM_1 = -1,
+    parameter OUT_DEPTH = ({_cap(verilog_param_name)}_TENSOR_SIZE_DIM_0 / {_cap(verilog_param_name)}_PARALLELISM_DIM_0) * ({_cap(verilog_param_name)}_TENSOR_SIZE_DIM_1 / {_cap(verilog_param_name)}_PARALLELISM_DIM_1)
 ) (
     input clk,
     input rst,
@@ -138,7 +138,6 @@ def emit_parameters_in_mem_internal(node, param_name, file_name, data_name):
   // 1-bit wider so IN_DEPTH also fits.
   localparam COUNTER_WIDTH = $clog2(OUT_DEPTH);
   logic [COUNTER_WIDTH:0] counter;
-
   always_ff @(posedge clk)
     if (rst) counter <= 0;
     else begin
@@ -147,7 +146,6 @@ def emit_parameters_in_mem_internal(node, param_name, file_name, data_name):
       else counter <= counter + 1;
     end
   end
-
   logic [1:0] clear;
   always_ff @(posedge clk)
     if (rst) clear <= 0;
@@ -205,30 +203,39 @@ def emit_parameters_in_dat_internal(node, param_name, file_name):
             f"{_cap(verilog_param_name)}_PARALLELISM_DIM_1"
         ]
     )
-    out_depth = int((total_size + out_size - 1) / out_size)
+    out_depth = int(total_size / out_size)
 
     data_buff = ""
     param_data = node.meta["mase"].module.get_parameter(param_name).data
+    param_meta = node.meta["mase"].parameters["hardware"]["verilog_param"]
 
     if node.meta["mase"].parameters["hardware"]["interface"][verilog_param_name][
        "transpose"
    ]:
+        raise NotImplementedError("Only non-transposed weights are supported for linear layers")
+    else:
+        assert (
+            param_meta[f"{_cap(verilog_param_name)}_TENSOR_SIZE_DIM_1"]
+            % param_meta[f"{_cap(verilog_param_name)}_PARALLELISM_DIM_1"]
+            == 0
+        ) and (
+            param_meta[f"{_cap(verilog_param_name)}_TENSOR_SIZE_DIM_0"]
+            % param_meta[f"{_cap(verilog_param_name)}_PARALLELISM_DIM_0"]
+            == 0
+        ), "The tensor size must be divisible by the parallelism parameter."
         param_data = torch.reshape(
             param_data,
             (
-                node.meta["mase"].parameters["hardware"]["verilog_param"][
-                    "DATA_OUT_0_SIZE"
-                ],
-                node.meta["mase"].parameters["hardware"]["verilog_param"][
-                    "DATA_IN_0_DEPTH"
-                ],
-                node.meta["mase"].parameters["hardware"]["verilog_param"][
-                    "DATA_IN_0_SIZE"
-                ],
+                -1,
+                param_meta[f"{_cap(verilog_param_name)}_TENSOR_SIZE_DIM_1"]
+                // param_meta[f"{_cap(verilog_param_name)}_PARALLELISM_DIM_1"],
+                param_meta[f"{_cap(verilog_param_name)}_PARALLELISM_DIM_1"],
+                param_meta[f"{_cap(verilog_param_name)}_TENSOR_SIZE_DIM_0"]
+                // param_meta[f"{_cap(verilog_param_name)}_PARALLELISM_DIM_0"],
+                param_meta[f"{_cap(verilog_param_name)}_PARALLELISM_DIM_0"],
            ),
        )
-        param_data = torch.transpose(param_data, 0, 1)
+        param_data = param_data.permute(0, 1, 3, 2, 4)
         param_data = torch.flatten(param_data).tolist()
-
     if (
         node.meta["mase"].parameters["common"]["args"][verilog_param_name]["type"]
         == "fixed"
@@ -240,31 +247,21 @@ def emit_parameters_in_dat_internal(node, param_name, file_name):
             "precision"
         ][1]
 
-        if node.meta["mase"].module.config.get("floor", False):
-            base_quantizer = integer_floor_quantizer_for_hw
-        else:
-            base_quantizer = integer_quantizer_for_hw
-
         scale = 2**frac_width
         thresh = 2**width
         for i in range(0, out_depth):
             line_buff = ""
             for j in range(0, out_size):
-                if i * out_size + out_size - 1 - j >= len(param_data):
-                    value = 0
-                else:
-                    value = param_data[i * out_size + out_size - 1 - j]
-
-                # TODO: please clear this up later
-                value = base_quantizer(torch.tensor(value), width, frac_width).item()
+                value = param_data[i * out_size + j]
+                value = integer_quantizer_for_hw(
+                    torch.tensor(value), width, frac_width, floor=True
+                ).item()
                 value = str(bin(value))
                 value_bits = value[value.find("0b") + 2 :]
                 value_bits = "0" * (width - len(value_bits)) + value_bits
                 assert len(value_bits) == width
-                value_bits = hex(int(value_bits, 2))
-                value_bits = value_bits[value_bits.find("0x") + 2 :]
-                value_bits = "0" * (width // 4 - len(value_bits)) + value_bits
-                line_buff = value_bits + line_buff
+                line_buff += value_bits
+                hex_buff = hex(int(line_buff, 2))
 
             data_buff += line_buff + "\n"
     else:
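In the .dat emission above, each output line now carries one out_size-wide beat written as fixed-width binary, which is why the template switched from $readmemh to $readmemb. A standalone sketch of the per-line packing; the values are illustrative only, and to_word is a hypothetical stand-in for integer_quantizer_for_hw(..., floor=True):

import torch


def to_word(value: float, width: int, frac_width: int) -> int:
    """Floor-quantize one value and wrap it into an unsigned width-bit word."""
    q = int(torch.floor(torch.tensor(value) * 2**frac_width).item())
    q = max(-(2 ** (width - 1)), min(q, 2 ** (width - 1) - 1))  # saturate
    return q % (2**width)


row = [-0.25, 0.3, 1.0]  # one out_size-wide beat of parameter values
width, frac_width = 4, 2
line_buff = "".join(format(to_word(v, width, frac_width), f"0{width}b") for v in row)
print(line_buff)  # "111100010100" -> words 15, 1, 4; one $readmemb line per beat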
diff --git a/src/chop/passes/graph/transforms/verilog/emit_tb.py b/src/chop/passes/graph/transforms/verilog/emit_tb.py
index ad72954f8..83857a1b4 100644
--- a/src/chop/passes/graph/transforms/verilog/emit_tb.py
+++ b/src/chop/passes/graph/transforms/verilog/emit_tb.py
@@ -12,8 +12,6 @@
 from pathlib import Path
 
-torch.manual_seed(0)
-
 import cocotb
 from mase_cocotb.testbench import Testbench
 from mase_cocotb.interfaces.streaming import StreamDriver, StreamMonitor
@@ -147,6 +145,7 @@ def load_drivers(self, in_tensors):
                     self.get_parameter(f"{_cap(arg)}_PARALLELISM_DIM_1"),
                     self.get_parameter(f"{_cap(arg)}_PARALLELISM_DIM_0"),
                 ],
+                floor=True,
             )
 
         else:
@@ -177,6 +176,7 @@ def load_monitors(self, expectation):
                 self.get_parameter(f"DATA_OUT_0_PARALLELISM_DIM_1"),
                 self.get_parameter(f"DATA_OUT_0_PARALLELISM_DIM_0"),
             ],
+            floor=True,
         )
 
         # Set expectation for each monitor
diff --git a/src/mase_cocotb/utils.py b/src/mase_cocotb/utils.py
index 7271389f2..d62d37694 100644
--- a/src/mase_cocotb/utils.py
+++ b/src/mase_cocotb/utils.py
@@ -101,9 +101,11 @@ def product_dict(**kwargs):
         yield dict(zip(keys, instance))
 
 
-def fixed_preprocess_tensor(tensor: Tensor, q_config: dict, parallelism: list) -> list:
+def fixed_preprocess_tensor(
+    tensor: Tensor, q_config: dict, parallelism: list, floor=False
+) -> list:
     """Preprocess a tensor before driving it into the DUT.
-    1. Quantize to requested fixed-point precision using floor rounding.
+    1. Quantize to requested fixed-point precision.
     2. Convert to integer format to be compatible with Cocotb drivers.
     3. Split into blocks according to parallelism in each dimension.
 
@@ -125,11 +127,11 @@ def fixed_preprocess_tensor(tensor: Tensor, q_config: dict, parallelism: list) -
         tensor = tensor.view((-1, tensor.shape[-1]))
 
     # Quantize
-    quantizer = partial(integer_floor_quantizer, **q_config)
+    base_quantizer = integer_floor_quantizer if floor else integer_quantizer
+    quantizer = partial(base_quantizer, **q_config)
     q_tensor = quantizer(tensor)
-
     # Convert to integer format
     q_tensor = (q_tensor * 2 ** q_config["frac_width"]).int()
 
     # Split into chunks according to parallelism in each dimension
     # parallelism[0]: along rows, parallelism[1]: along columns
@@ -174,3 +178,23 @@ def fixed_cast(val, in_width, in_frac_width, out_width, out_frac_width):
             val = val
     # val = int(val % (1 << out_width))
     return val  # << out_frac_width  # treat data as data
+
+
+async def check_signal(dut, log, signal_list):
+    # TODO: support count start
+    # TODO: support signals whose valid and ready use different names
+    def handshake_signal_check(
+        dut, log, signal_base, valid=None, ready=None, count_start: dict = {}
+    ):
+        data_valid = getattr(dut, f"{signal_base}_valid") if valid is None else valid
+        data_ready = getattr(dut, f"{signal_base}_ready") if ready is None else ready
+        data = getattr(dut, signal_base)
+        svalue = [i.signed_integer for i in data.value]
+        if data_valid.value & data_ready.value:
+            count_start[signal_base] = count_start.get(signal_base, 0) + 1
+            log.debug(f"handshake {count_start[signal_base]} {signal_base} = {svalue}")
+
+    while True:
+        await RisingEdge(dut.clk)
+        for signal in signal_list:
+            handshake_signal_check(dut, log, signal)
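Typical use of the extended helper from a cocotb testbench, with the new floor flag matching the hardware's floor-rounding datapath. A sketch: the tensor and q_config values are illustrative, and it assumes the helper returns one flat integer list per parallel beat, as the stream drivers expect.

import torch
from mase_cocotb.utils import fixed_preprocess_tensor

x = torch.randn(4, 4)  # illustrative 4x4 activation
blocks = fixed_preprocess_tensor(
    tensor=x,
    q_config={"width": 8, "frac_width": 4},
    parallelism=[2, 2],  # [rows, columns] per beat
    floor=True,  # match the hardware's floor-rounding datapath
)
# a 4x4 tensor in 2x2 blocks should yield 4 beats of 4 integers each
print(len(blocks), len(blocks[0]))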
diff --git a/src/mase_components/cast/rtl/fixed_round.sv b/src/mase_components/cast/rtl/fixed_round.sv
index a49b43001..49d6a5395 100644
--- a/src/mase_components/cast/rtl/fixed_round.sv
+++ b/src/mase_components/cast/rtl/fixed_round.sv
@@ -24,12 +24,12 @@ module fixed_round #(
  logic carry_in, input_sign;
  assign input_sign = data_in[IN_WIDTH-1];
  assign input_data = (input_sign) ? ~(data_in[IN_WIDTH-2:0] - 1) : data_in[IN_WIDTH-2:0];
-  /* verilator lint_off SELRANGE */
+  logic [IN_WIDTH + OUT_FRAC_WIDTH - 1:0] lsb_check;
+  assign lsb_check = {input_data, {(OUT_FRAC_WIDTH) {1'b0}}};
  always_comb begin
-    lsb_below[2] = (IN_FRAC_WIDTH >= OUT_FRAC_WIDTH) ? input_data[IN_FRAC_WIDTH-OUT_FRAC_WIDTH] : 0;
-    lsb_below[1] = (IN_FRAC_WIDTH-1 >= OUT_FRAC_WIDTH) ? input_data[IN_FRAC_WIDTH-OUT_FRAC_WIDTH-1] : 0;
-    // lsb_below[0] = (IN_FRAC_WIDTH-2 >= OUT_FRAC_WIDTH) ? |(input_data[IN_FRAC_WIDTH-OUT_FRAC_WIDTH-2:0]): 0;
-    lsb_below[0] = '0;  // to do: fix
+    lsb_below[2] = (IN_FRAC_WIDTH >= OUT_FRAC_WIDTH) ? lsb_check[IN_FRAC_WIDTH] : 0;
+    lsb_below[1] = (IN_FRAC_WIDTH - 1 >= OUT_FRAC_WIDTH) ? lsb_check[IN_FRAC_WIDTH-1] : 0;
+    lsb_below[0] = (IN_FRAC_WIDTH - 2 >= OUT_FRAC_WIDTH) ? |(lsb_check[IN_FRAC_WIDTH-2:0]) : 0;
  end
  always_comb begin
    if ((IN_FRAC_WIDTH - OUT_FRAC_WIDTH) >= 0)
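The idea behind lsb_check is that zero-padding input_data by OUT_FRAC_WIDTH bits keeps every tap index non-negative, so the sticky-bit OR no longer needs the SELRANGE waiver. A Python reference model of the three taps, written from the RTL above (a sketch of the intent, not generated from the hardware):

def lsb_taps(input_data: int, in_frac_width: int, out_frac_width: int):
    """Model lsb_below[2:0] of fixed_round.sv on the unsigned magnitude bits."""
    # Mirrors: assign lsb_check = {input_data, {OUT_FRAC_WIDTH{1'b0}}};
    lsb_check = input_data << out_frac_width
    # lsb_below[2]: LSB of the part that survives truncation.
    keep = (lsb_check >> in_frac_width) & 1 if in_frac_width >= out_frac_width else 0
    # lsb_below[1]: the first discarded bit (the "round" bit).
    rnd = (lsb_check >> (in_frac_width - 1)) & 1 if in_frac_width - 1 >= out_frac_width else 0
    # lsb_below[0]: OR-reduction of the remaining discarded bits (the "sticky" bit).
    sticky = int((lsb_check & ((1 << (in_frac_width - 1)) - 1)) != 0) if in_frac_width - 2 >= out_frac_width else 0
    return keep, rnd, sticky


# e.g. magnitude 1011 with IN_FRAC_WIDTH=3, OUT_FRAC_WIDTH=1: keep=0, round=1, sticky=1
assert lsb_taps(0b1011, 3, 1) == (0, 1, 1)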
diff --git a/src/mase_components/linear_layers/fixed_linear_layer/rtl/fixed_linear.sv b/src/mase_components/linear_layers/fixed_linear_layer/rtl/fixed_linear.sv
index 1e0edce53..8f8de0c55 100644
--- a/src/mase_components/linear_layers/fixed_linear_layer/rtl/fixed_linear.sv
+++ b/src/mase_components/linear_layers/fixed_linear_layer/rtl/fixed_linear.sv
@@ -19,6 +19,7 @@ module fixed_linear #(
    /* verilator lint_off UNUSEDPARAM */
    parameter HAS_BIAS = 1,
    parameter WEIGHTS_PRE_TRANSPOSED = 0,
+    parameter FIFO = 1,
 
    parameter DATA_IN_0_PRECISION_0 = 16,
    parameter DATA_IN_0_PRECISION_1 = 3,
@@ -113,6 +114,9 @@ module fixed_linear #(
  logic add_bias_in_valid;
  logic add_bias_in_ready;
 
+  logic [DATA_OUT_0_PRECISION_0 - 1:0] rounding_out [DATA_OUT_0_PARALLELISM_DIM_0*DATA_OUT_0_PARALLELISM_DIM_1-1:0];
+  logic rounding_out_valid;
+  logic rounding_out_ready;
 
  // * Instances
  // * ---------------------------------------------------------------------------------------------------
@@ -223,9 +227,9 @@ module fixed_linear #(
        .data_in_valid(add_bias_in_valid),
        .data_in_ready(add_bias_in_ready),
 
-        .data_out(data_out_0),
-        .data_out_valid(data_out_0_valid),
-        .data_out_ready(data_out_0_ready)
+        .data_out(rounding_out),
+        .data_out_valid(rounding_out_valid),
+        .data_out_ready(rounding_out_ready)
    );
  end
@@ -245,7 +249,7 @@ module fixed_linear #(
 
    // * Add bias
    if (HAS_BIAS == 1) begin
-      fixed_cast #(
+      fixed_rounding #(
          .IN_SIZE (BIAS_PARALLELISM_DIM_0 * BIAS_PARALLELISM_DIM_1),
          .IN_WIDTH (BIAS_PRECISION_0),
          .IN_FRAC_WIDTH (BIAS_PRECISION_1),
@@ -275,10 +279,45 @@ module fixed_linear #(
        .OUT_FRAC_WIDTH(DATA_OUT_0_PRECISION_1)
    ) output_cast (
        .data_in (data_in_formatted := matmul_out),
-        .data_out(data_out_0)
+        .data_out(rounding_out)
    );
-    assign data_out_0_valid = matmul_out_valid;
-    assign matmul_out_ready = data_out_0_ready;
+    assign rounding_out_valid = matmul_out_valid;
+    assign matmul_out_ready = rounding_out_ready;
  end
 
+  if (FIFO == 1) begin
+    localparam FIFO_DEPTH = DATA_OUT_0_TENSOR_SIZE_DIM_0 / DATA_OUT_0_PARALLELISM_DIM_0;
+
+    fifo_for_autogen #(
+        .DATA_IN_0_PRECISION_0(DATA_OUT_0_PRECISION_0),
+        .DATA_IN_0_PRECISION_1(DATA_OUT_0_PRECISION_1),
+        .DATA_IN_0_TENSOR_SIZE_DIM_0(DATA_OUT_0_TENSOR_SIZE_DIM_0),
+        .DATA_IN_0_PARALLELISM_DIM_0(DATA_OUT_0_PARALLELISM_DIM_0),
+        .DATA_IN_0_TENSOR_SIZE_DIM_1(DATA_OUT_0_TENSOR_SIZE_DIM_1),
+        .DATA_IN_0_PARALLELISM_DIM_1(DATA_OUT_0_PARALLELISM_DIM_1),
+        .DEPTH(FIFO_DEPTH),
+        .DATA_OUT_0_PRECISION_0(DATA_OUT_0_PRECISION_0),
+        .DATA_OUT_0_PRECISION_1(DATA_OUT_0_PRECISION_1),
+        .DATA_OUT_0_TENSOR_SIZE_DIM_0(DATA_OUT_0_TENSOR_SIZE_DIM_0),
+        .DATA_OUT_0_PARALLELISM_DIM_0(DATA_OUT_0_PARALLELISM_DIM_0),
+        .DATA_OUT_0_TENSOR_SIZE_DIM_1(DATA_OUT_0_TENSOR_SIZE_DIM_1),
+        .DATA_OUT_0_PARALLELISM_DIM_1(DATA_OUT_0_PARALLELISM_DIM_1)
+    ) fifo_1_inst (
+        .clk(clk),
+        .rst(rst),
+
+        .data_in_0(rounding_out),
+        .data_in_0_valid(rounding_out_valid),
+        .data_in_0_ready(rounding_out_ready),
+        .data_out_0(data_out_0),
+        .data_out_0_valid(data_out_0_valid),
+        .data_out_0_ready(data_out_0_ready)
+    );
+  end else begin
+    always_comb begin
+      data_out_0 = rounding_out;
+      data_out_0_valid = rounding_out_valid;
+      rounding_out_ready = data_out_0_ready;
+    end
+  end
 endmodule
diff --git a/src/mase_components/linear_layers/fixed_linear_layer/test/fixed_linear_tb.py b/src/mase_components/linear_layers/fixed_linear_layer/test/fixed_linear_tb.py
index c7683cbe3..1ce114ea0 100644
--- a/src/mase_components/linear_layers/fixed_linear_layer/test/fixed_linear_tb.py
+++ b/src/mase_components/linear_layers/fixed_linear_layer/test/fixed_linear_tb.py
@@ -109,80 +109,80 @@ def preprocess_tensor(self, tensor, config, parallelism):
                blocks.append(dim_1_split[i][j].flatten().tolist())
        return blocks
 
-    async def run_test(self, us):
+    async def run_test(self, batches=1, us=100):
        await self.reset()
        self.log.info(f"Reset finished")
        self.data_out_0_monitor.ready.value = 1
+        for _ in range(batches):
+            inputs = self.generate_inputs()
+            exp_out = self.model(inputs)
+
+            # * Load the inputs driver
+            self.log.info(f"Processing inputs: {inputs}")
+            inputs = self.preprocess_tensor(
+                tensor=inputs,
+                config={
+                    "width": self.get_parameter("DATA_IN_0_PRECISION_0"),
+                    "frac_width": self.get_parameter("DATA_IN_0_PRECISION_1"),
+                },
+                parallelism=[
+                    self.get_parameter("DATA_IN_0_PARALLELISM_DIM_1"),
+                    self.get_parameter("DATA_IN_0_PARALLELISM_DIM_0"),
+                ],
+            )
+            self.data_in_0_driver.load_driver(inputs)
 
-        inputs = self.generate_inputs()
-        exp_out = self.model(inputs)
-
-        # * Load the inputs driver
-        self.log.info(f"Processing inputs: {inputs}")
-        inputs = self.preprocess_tensor(
-            tensor=inputs,
-            config={
-                "width": self.get_parameter("DATA_IN_0_PRECISION_0"),
-                "frac_width": self.get_parameter("DATA_IN_0_PRECISION_1"),
-            },
-            parallelism=[
-                self.get_parameter("DATA_IN_0_PARALLELISM_DIM_1"),
-                self.get_parameter("DATA_IN_0_PARALLELISM_DIM_0"),
-            ],
-        )
-        self.data_in_0_driver.load_driver(inputs)
-
-        # * Load the weights driver
-        if self.get_parameter("WEIGHTS_PRE_TRANSPOSED") == 1:
-            weights = self.model.weight.transpose(0, 1)
-        else:
-            weights = self.model.weight
-
-        self.log.info(f"Processing weights: {weights}")
-        weights = self.preprocess_tensor(
-            tensor=weights,
-            config={
-                "width": self.get_parameter("WEIGHT_PRECISION_0"),
-                "frac_width": self.get_parameter("WEIGHT_PRECISION_1"),
-            },
-            parallelism=[
-                self.get_parameter("WEIGHT_PARALLELISM_DIM_1"),
-                self.get_parameter("WEIGHT_PARALLELISM_DIM_0"),
-            ],
-        )
-        self.weight_driver.load_driver(weights)
+            # * Load the weights driver
+            if self.get_parameter("WEIGHTS_PRE_TRANSPOSED") == 1:
+                weights = self.model.weight.transpose(0, 1)
+            else:
+                weights = self.model.weight
 
-        # * Load the bias driver
-        if self.get_parameter("HAS_BIAS") == 1:
-            bias = self.model.bias
-            self.log.info(f"Processing bias: {bias}")
-            bias = self.preprocess_tensor(
-                tensor=bias,
+            self.log.info(f"Processing weights: {weights}")
+            weights = self.preprocess_tensor(
+                tensor=weights,
                config={
-                    "width": self.get_parameter("BIAS_PRECISION_0"),
-                    "frac_width": self.get_parameter("BIAS_PRECISION_1"),
+                    "width": self.get_parameter("WEIGHT_PRECISION_0"),
+                    "frac_width": self.get_parameter("WEIGHT_PRECISION_1"),
                },
                parallelism=[
-                    self.get_parameter("BIAS_PARALLELISM_DIM_1"),
-                    self.get_parameter("BIAS_PARALLELISM_DIM_0"),
+                    self.get_parameter("WEIGHT_PARALLELISM_DIM_1"),
+                    self.get_parameter("WEIGHT_PARALLELISM_DIM_0"),
                ],
            )
-            self.bias_driver.load_driver(bias)
-
-        # * Load the output monitor
-        self.log.info(f"Processing outputs: {exp_out}")
-        outs = self.preprocess_tensor(
-            tensor=exp_out,
-            config={
-                "width": self.get_parameter("DATA_OUT_0_PRECISION_0"),
-                "frac_width": self.get_parameter("DATA_OUT_0_PRECISION_1"),
-            },
-            parallelism=[
-                self.get_parameter("DATA_OUT_0_PARALLELISM_DIM_1"),
-                self.get_parameter("DATA_OUT_0_PARALLELISM_DIM_0"),
-            ],
-        )
-        self.data_out_0_monitor.load_monitor(outs)
+            self.weight_driver.load_driver(weights)
+
+            # * Load the bias driver
+            if self.get_parameter("HAS_BIAS") == 1:
+                bias = self.model.bias
+                self.log.info(f"Processing bias: {bias}")
+                bias = self.preprocess_tensor(
+                    tensor=bias,
+                    config={
+                        "width": self.get_parameter("BIAS_PRECISION_0"),
+                        "frac_width": self.get_parameter("BIAS_PRECISION_1"),
+                    },
+                    parallelism=[
+                        self.get_parameter("BIAS_PARALLELISM_DIM_1"),
+                        self.get_parameter("BIAS_PARALLELISM_DIM_0"),
+                    ],
+                )
+                self.bias_driver.load_driver(bias)
+
+            # * Load the output monitor
+            self.log.info(f"Processing outputs: {exp_out}")
+            outs = self.preprocess_tensor(
+                tensor=exp_out,
+                config={
+                    "width": self.get_parameter("DATA_OUT_0_PRECISION_0"),
+                    "frac_width": self.get_parameter("DATA_OUT_0_PRECISION_1"),
+                },
+                parallelism=[
+                    self.get_parameter("DATA_OUT_0_PARALLELISM_DIM_1"),
+                    self.get_parameter("DATA_OUT_0_PARALLELISM_DIM_0"),
+                ],
+            )
+            self.data_out_0_monitor.load_monitor(outs)
 
        await Timer(us, units="us")
        assert self.data_out_0_monitor.exp_queue.empty()
@@ -191,9 +191,20 @@ async def run_test(self, us):
 @cocotb.test()
 async def cocotb_test(dut):
    tb = LinearTB(dut)
-    await tb.run_test(us=100)
+    await tb.run_test(batches=10, us=100)
 
 
+async def check_signal(dut, log):
+    num = {"data_out_0": 0, "data_in_0": 0}
+    while True:
+        await RisingEdge(dut.clk)
+
+
+# verified cases
+# weight pre transposed = 0
+# weight pre transposed = 1
+# has bias = 0
+# has bias = 1
 def get_fixed_linear_config(kwargs={}):
    # if pretranspose
    # weight1 = in0
@@ -201,22 +212,22 @@
    # weight0 = in0
    config = {
        "HAS_BIAS": 1,
-        "WEIGHTS_PRE_TRANSPOSED": 1,
+        "WEIGHTS_PRE_TRANSPOSED": 0,
        "DATA_IN_0_TENSOR_SIZE_DIM_0": 32,
        "DATA_IN_0_TENSOR_SIZE_DIM_1": 16,
-        "DATA_IN_0_PARALLELISM_DIM_0": 4,
+        "DATA_IN_0_PARALLELISM_DIM_0": 8,
        "DATA_IN_0_PARALLELISM_DIM_1": 4,
-        "WEIGHT_TENSOR_SIZE_DIM_0": 16,
-        "WEIGHT_TENSOR_SIZE_DIM_1": 32,
-        "WEIGHT_PARALLELISM_DIM_0": 2,
+        "WEIGHT_TENSOR_SIZE_DIM_0": 32,
+        "WEIGHT_TENSOR_SIZE_DIM_1": 16,
+        "WEIGHT_PARALLELISM_DIM_0": 8,
        "WEIGHT_PARALLELISM_DIM_1": 4,
        "DATA_IN_0_PRECISION_0": 8,
        "DATA_IN_0_PRECISION_1": 4,
-        "WEIGHT_PRECISION_0": 8,
-        "WEIGHT_PRECISION_1": 4,
-        "BIAS_PRECISION_0": 8,
-        "BIAS_PRECISION_1": 4,
-        "DATA_OUT_0_PRECISION_0": 10,
+        "WEIGHT_PRECISION_0": 10,
+        "WEIGHT_PRECISION_1": 3,
+        "BIAS_PRECISION_0": 5,
+        "BIAS_PRECISION_1": 2,
+        "DATA_OUT_0_PRECISION_0": 8,
        "DATA_OUT_0_PRECISION_1": 4,
    }
    config.update(kwargs)
@@ -232,58 +243,48 @@ def test_fixed_linear_smoke():
        trace=True,
        module_param_list=[
            get_fixed_linear_config(),
-            # note: changing WEIGHTS_PRE_TRANSPOSED also requires changing the DIM_SIZE to match the activation
-            get_fixed_linear_config(
-                {
-                    "WEIGHTS_PRE_TRANSPOSED": 0,
-                    "WEIGHT_TENSOR_SIZE_DIM_0": 32,
-                    "WEIGHT_TENSOR_SIZE_DIM_1": 16,
-                    "WEIGHT_PARALLELISM_DIM_0": 4,
-                    "WEIGHT_PARALLELISM_DIM_1": 2,
-                },
-            ),
-        ],
-    )
-
-
-@pytest.mark.dev
-def test_fixed_linear_regression():
-    """
-    More extensive tests to check realistic parameter sizes.
-    """
-    mase_runner(
-        trace=True,
-        module_param_list=[
-            get_fixed_linear_config(
-                {
-                    "DATA_IN_0_TENSOR_SIZE_DIM_0": 768,
-                    "DATA_IN_0_PARALLELISM_DIM_0": 32,
-                    "WEIGHT_TENSOR_SIZE_DIM_0": 768,
-                    "WEIGHT_TENSOR_SIZE_DIM_1": 768,
-                    "WEIGHT_PARALLELISM_DIM_0": 32,
-                    "WEIGHT_PARALLELISM_DIM_1": 32,
-                    "BIAS_TENSOR_SIZE_DIM_0": 768,
-                    "BIAS_PARALLELISM_DIM_0": 32,
-                }
-            ),
-            get_fixed_linear_config(
-                {
-                    "HAS_BIAS": 1,
-                    "WEIGHTS_PRE_TRANSPOSED": 0,
-                    "DATA_IN_0_TENSOR_SIZE_DIM_0": 768,
-                    "DATA_IN_0_PARALLELISM_DIM_0": 32,
-                    "WEIGHT_TENSOR_SIZE_DIM_0": 768,
-                    "WEIGHT_TENSOR_SIZE_DIM_1": 768,
-                    "WEIGHT_PARALLELISM_DIM_0": 32,
-                    "WEIGHT_PARALLELISM_DIM_1": 32,
-                    "BIAS_TENSOR_SIZE_DIM_0": 768,
-                    "BIAS_PARALLELISM_DIM_0": 32,
-                }
-            ),
        ],
    )
 
 
+# @pytest.mark.dev
+# def test_fixed_linear_regression():
+#     """
+#     More extensive tests to check realistic parameter sizes.
+#     """
+#     mase_runner(
+#         trace=True,
+#         module_param_list=[
+#             get_fixed_linear_config(
+#                 {
+#                     "DATA_IN_0_TENSOR_SIZE_DIM_0": 768,
+#                     "DATA_IN_0_PARALLELISM_DIM_0": 32,
+#                     "WEIGHT_TENSOR_SIZE_DIM_0": 768,
+#                     "WEIGHT_TENSOR_SIZE_DIM_1": 768,
+#                     "WEIGHT_PARALLELISM_DIM_0": 32,
+#                     "WEIGHT_PARALLELISM_DIM_1": 32,
+#                     "BIAS_TENSOR_SIZE_DIM_0": 768,
+#                     "BIAS_PARALLELISM_DIM_0": 32,
+#                 }
+#             ),
+#             get_fixed_linear_config(
+#                 {
+#                     "HAS_BIAS": 1,
+#                     "WEIGHTS_PRE_TRANSPOSED": 0,
+#                     "DATA_IN_0_TENSOR_SIZE_DIM_0": 768,
+#                     "DATA_IN_0_PARALLELISM_DIM_0": 32,
+#                     "WEIGHT_TENSOR_SIZE_DIM_0": 768,
+#                     "WEIGHT_TENSOR_SIZE_DIM_1": 768,
+#                     "WEIGHT_PARALLELISM_DIM_0": 32,
+#                     "WEIGHT_PARALLELISM_DIM_1": 32,
+#                     "BIAS_TENSOR_SIZE_DIM_0": 768,
+#                     "BIAS_PARALLELISM_DIM_0": 32,
+#                 }
+#             ),
+#         ],
+#     )
+
+torch.manual_seed(3)
 
 if __name__ == "__main__":
    test_fixed_linear_smoke()
    # test_fixed_linear_regression()
diff --git a/src/mase_components/linear_layers/matmul/rtl/matmul.sv b/src/mase_components/linear_layers/matmul/rtl/matmul.sv
index e8443426a..d7509d02c 100644
--- a/src/mase_components/linear_layers/matmul/rtl/matmul.sv
+++ b/src/mase_components/linear_layers/matmul/rtl/matmul.sv
@@ -134,6 +134,9 @@ module matmul #(
  // Matrix unflatten output
  logic [B_WIDTH-1:0] b_buffer_out_data[B_COMPUTE_DIM0*B_COMPUTE_DIM1-1:0];
 
+  logic [SM_OUT_WIDTH-1:0] buffered_sm_out_data[C_COMPUTE_DIM0*C_COMPUTE_DIM1];
+  logic buffered_sm_out_valid, buffered_sm_out_ready;
+
  logic [SM_OUT_WIDTH-1:0] sm_out_data[C_COMPUTE_DIM0*C_COMPUTE_DIM1];
  logic sm_out_valid, sm_out_ready;
 
@@ -275,9 +278,23 @@ module matmul #(
      .y_data  (b_buffer_out_data),
      .y_valid (b_buffer_out_valid),
      .y_ready (b_buffer_out_ready),
-      .out_data (sm_out_data),
-      .out_valid(sm_out_valid),
-      .out_ready(sm_out_ready)
+      .out_data (buffered_sm_out_data),
+      .out_valid(buffered_sm_out_valid),
+      .out_ready(buffered_sm_out_ready)
+  );
+  // cut the long ready path
+  unpacked_skid_buffer #(
+      .DATA_WIDTH(SM_OUT_WIDTH),
+      .IN_NUM    (C_COMPUTE_DIM0 * C_COMPUTE_DIM1)
+  ) sm_out_reg_slice (
+      .clk(clk),
+      .rst(rst),
+      .data_in(buffered_sm_out_data),
+      .data_in_valid(buffered_sm_out_valid),
+      .data_in_ready(buffered_sm_out_ready),
+      .data_out(sm_out_data),
+      .data_out_valid(sm_out_valid),
+      .data_out_ready(sm_out_ready)
  );
 
  // Direct the result of the simple matmul to the correct matrix_accumulator
diff --git a/src/mase_components/memory/rtl/blk_mem_gen_0.sv b/src/mase_components/memory/rtl/blk_mem_gen_0.sv
index 55ca60c9c..20dad70f7 100644
--- a/src/mase_components/memory/rtl/blk_mem_gen_0.sv
+++ b/src/mase_components/memory/rtl/blk_mem_gen_0.sv
@@ -3,21 +3,13 @@ module blk_mem_gen_0 #(
    parameter DATA_WIDTH = 8,
    parameter MEM_SIZE = 1
 ) (
-    clka,
-    ena,
-    wea,
-    addra,
-    dina,
-    douta
-)
-/* synthesis syn_black_box black_box_pad_pin="ena,wea[0:0],addra[9:0],dina[7:0],douta[7:0]" */
-/* synthesis syn_force_seq_prim="clka" */;
-  input logic clka /* synthesis syn_isclock = 1 */;
-  input logic ena;
-  input logic [0:0] wea;
-  input logic [$clog2(MEM_SIZE):0] addra;
-  input logic [DATA_WIDTH - 1:0] dina;
-  output logic [DATA_WIDTH - 1:0] douta;
+    input logic clka,
+    input logic ena,
+    input logic wea,
+    input logic [$clog2(MEM_SIZE):0] addra,
+    input logic [DATA_WIDTH - 1:0] dina,
+    output logic [DATA_WIDTH - 1:0] douta
+);
 
  logic [DATA_WIDTH - 1:0] ram[0:MEM_SIZE-1];
  logic [DATA_WIDTH - 1:0] douta_t1;
diff --git a/src/mase_components/memory/rtl/fifo_for_autogen.sv b/src/mase_components/memory/rtl/fifo_for_autogen.sv
new file mode 100644
index 000000000..6db2b000f
--- /dev/null
+++ b/src/mase_components/memory/rtl/fifo_for_autogen.sv
@@ -0,0 +1,42 @@
+`timescale 1 ns / 1 ps
+/* verilator lint_off PINMISSING */
+module fifo_for_autogen #(
+    parameter DATA_IN_0_PRECISION_0 = 16,
+    parameter DATA_IN_0_PRECISION_1 = 3,
+    parameter DATA_IN_0_TENSOR_SIZE_DIM_0 = 20,
+    parameter DATA_IN_0_TENSOR_SIZE_DIM_1 = 20,
+    parameter DATA_IN_0_PARALLELISM_DIM_0 = 4,
+    parameter DATA_IN_0_PARALLELISM_DIM_1 = 4,
+    parameter DATA_OUT_0_PRECISION_0 = DATA_IN_0_PRECISION_0,
+    parameter DATA_OUT_0_PRECISION_1 = DATA_IN_0_PRECISION_1,
+    parameter DATA_OUT_0_TENSOR_SIZE_DIM_0 = DATA_IN_0_TENSOR_SIZE_DIM_0,
+    parameter DATA_OUT_0_TENSOR_SIZE_DIM_1 = DATA_IN_0_TENSOR_SIZE_DIM_1,
+    parameter DATA_OUT_0_PARALLELISM_DIM_0 = DATA_IN_0_PARALLELISM_DIM_0,
+    parameter DATA_OUT_0_PARALLELISM_DIM_1 = DATA_IN_0_PARALLELISM_DIM_1,
+    parameter DEPTH = DATA_IN_0_TENSOR_SIZE_DIM_0 * DATA_IN_0_TENSOR_SIZE_DIM_1 / (DATA_IN_0_PARALLELISM_DIM_0 * DATA_IN_0_PARALLELISM_DIM_1)
+) (
+    input clk,
+    input rst,
+
+    input logic [DATA_IN_0_PRECISION_0-1:0] data_in_0 [DATA_IN_0_PARALLELISM_DIM_0*DATA_IN_0_PARALLELISM_DIM_1-1:0],
+    input logic data_in_0_valid,
+    output logic data_in_0_ready,
+
+    output logic [DATA_OUT_0_PRECISION_0-1:0] data_out_0 [DATA_OUT_0_PARALLELISM_DIM_0*DATA_OUT_0_PARALLELISM_DIM_1-1:0],
+    output logic data_out_0_valid,
+    input logic data_out_0_ready
+);
+  unpacked_fifo #(
+      .DEPTH(DEPTH),
+      .DATA_WIDTH(DATA_IN_0_PRECISION_0),
+      .IN_NUM(DATA_IN_0_PARALLELISM_DIM_0 * DATA_IN_0_PARALLELISM_DIM_1)
+  ) ff_inst (
+      .clk(clk),
+      .rst(rst),
+      .data_in(data_in_0),
+      .data_in_valid(data_in_0_valid),
+      .data_in_ready(data_in_0_ready),
+      .data_out(data_out_0),
+      .data_out_valid(data_out_0_valid),
+      .data_out_ready(data_out_0_ready)
+  );
+endmodule
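Note that the default DEPTH of fifo_for_autogen buffers a full tensor (all beats over both dims), while the fixed_linear instantiation above overrides it to a single dim-0 row of beats. The default in Python form (a sketch; 20/20/4/4 are just the module's own parameter defaults):

def default_fifo_depth(ts0: int, ts1: int, p0: int, p1: int) -> int:
    """DEPTH default in fifo_for_autogen.sv: total elements / elements per beat."""
    assert (ts0 * ts1) % (p0 * p1) == 0, "beats must divide evenly"
    return (ts0 * ts1) // (p0 * p1)


# module defaults: a 20x20 tensor moved 4x4 elements per beat buffers 25 beats
assert default_fifo_depth(20, 20, 4, 4) == 25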
diff --git a/src/mase_components/memory/rtl/input_buffer.sv b/src/mase_components/memory/rtl/input_buffer.sv
index 66c1c4c3c..fc6311810 100644
--- a/src/mase_components/memory/rtl/input_buffer.sv
+++ b/src/mase_components/memory/rtl/input_buffer.sv
@@ -101,9 +101,9 @@ module input_buffer #(
  end
  assign reg_in_valid = (mode == STRAIGHT) ? data_in_valid : (!(delay2_bos || delay1_bos));
  assign data_in_ready = (mode == STRAIGHT) ? reg_in_ready : 0;
-  unpacked_skid_buffer #(
+  unpacked_register_slice #(
      .DATA_WIDTH(DATA_WIDTH),
-      .IN_NUM(IN_NUM)
+      .IN_SIZE(IN_NUM)
  ) reg_inst (
      .data_in(reg_in),
      .data_in_valid(reg_in_valid),
diff --git a/src/mase_components/memory/rtl/matrix_bank.sv b/src/mase_components/memory/rtl/matrix_bank.sv
index 81d2abc22..707bcd221 100644
--- a/src/mase_components/memory/rtl/matrix_bank.sv
+++ b/src/mase_components/memory/rtl/matrix_bank.sv
@@ -5,8 +5,63 @@
 Features to implement:
 clear all counters and flags and transition to REQ_FETCH
 */
+package matrix_bank_pkg;
 
-import matrix_bank_pkg::*;
+  parameter AXI_ADDRESS_WIDTH = 32;
+  parameter MAX_DIMENSION = 1024;
+  parameter MAX_FEATURE_COUNT = 16;
+
+
+  typedef struct packed {
+    logic [AXI_ADDRESS_WIDTH-1:0] start_address;
+    logic [$clog2(MAX_DIMENSION):0] columns;
+    logic [$clog2(MAX_DIMENSION):0] rows;
+  } REQ_t;
+
+  typedef struct packed {logic partial;} RESP_t;
+
+  typedef struct packed {
+    // Check request payloads match NSB payloads
+    logic [$clog2(MAX_FEATURE_COUNT):0] columns;
+    logic [$clog2(MAX_FEATURE_COUNT):0] rows;
+  } ROW_CHANNEL_REQ_t;
+
+  typedef struct packed {
+    logic [MAX_FEATURE_COUNT-1:0][31:0] data;
+    logic [MAX_FEATURE_COUNT-1:0] valid_mask;
+    logic done;
+  } ROW_CHANNEL_RESP_t;
+
+endpackage
+
+// package matrix_bank_pkg;
+
+parameter AXI_ADDRESS_WIDTH = 32;
+parameter MAX_DIMENSION = 1024;
+parameter MAX_FEATURE_COUNT = 32;
+
+typedef struct packed {
+  logic [AXI_ADDRESS_WIDTH-1:0] start_address;
+  logic [$clog2(MAX_DIMENSION):0] columns;
+  logic [$clog2(MAX_DIMENSION):0] rows;
+} REQ_t;
+
+typedef struct packed {logic partial;} RESP_t;
+
+typedef struct packed {
+  // Check request payloads match NSB payloads
+  logic [$clog2(MAX_FEATURE_COUNT):0] columns;
+  logic [$clog2(MAX_FEATURE_COUNT):0] rows;
+} ROW_CHANNEL_REQ_t;
+
+typedef struct packed {
+  logic [MAX_FEATURE_COUNT-1:0][31:0] data;
+  logic [MAX_FEATURE_COUNT-1:0] valid_mask;
+  logic done;
+} ROW_CHANNEL_RESP_t;
+
+// endpackage
+// import matrix_bank_pkg::*;
 
 module matrix_bank #(
    parameter PRECISION = 0,  // 0 = FP32, 1 = FP16
diff --git a/src/mase_components/memory/rtl/matrix_bank_pkg.sv b/src/mase_components/memory/rtl/matrix_bank_pkg.sv
index 791d48871..e56bac906 100644
--- a/src/mase_components/memory/rtl/matrix_bank_pkg.sv
+++ b/src/mase_components/memory/rtl/matrix_bank_pkg.sv
@@ -2,6 +2,7 @@ package matrix_bank_pkg;
 
  parameter AXI_ADDRESS_WIDTH = 32;
  parameter MAX_DIMENSION = 1024;
+  parameter MAX_FEATURE_COUNT = 32;
 
  typedef struct packed {
    logic [AXI_ADDRESS_WIDTH-1:0] start_address;
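The [$clog2(N):0] struct fields are deliberately one bit wider than an index needs, so the count N itself is representable; this is the same trick as the "1-bit wider so IN_DEPTH also fits" counter in the BRAM source template earlier in this patch. A quick sanity check of the arithmetic (a sketch, not tied to any tool):

from math import ceil, log2


def clog2(n: int) -> int:
    return ceil(log2(n))


MAX_DIMENSION = 1024
# logic [$clog2(MAX_DIMENSION):0] rows  ->  clog2(1024) + 1 = 11 bits,
# wide enough for the count 1024 itself, not just the indices 0..1023.
width = clog2(MAX_DIMENSION) + 1
assert width == 11 and MAX_DIMENSION < 2**width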