diff --git a/README.md b/README.md
index c20afc4..f405ac5 100644
--- a/README.md
+++ b/README.md
@@ -331,6 +331,12 @@ Below is a sample of the execution traces, showing on each cycle the execution o
**For anyone trying to run the simulation or play with this repo, please feel free to DM me on [twitter](https://twitter.com/majmudaradam) if you run into any issues - I want you to get this running!**
+## Visualization
+
+The whole GPU is also modeled in [*Digital*](https://github.com/hneemann/Digital), where you can experiment with individual components by manually setting their inputs and observing their outputs. To open the top-level `visualization/gpu.dig` circuit, make sure you have [*Digital*](https://github.com/hneemann/Digital) and `icarus-verilog` (Icarus Verilog) installed.
+
+
+
# Advanced Functionality
For the sake of simplicity, there were many additional features implemented in modern GPUs that heavily improve performance & functionality that tiny-gpu omits. We'll discuss some of those most critical features in this section.
diff --git a/docs/images/gpu-digital.png b/docs/images/gpu-digital.png
new file mode 100644
index 0000000..01ede8a
Binary files /dev/null and b/docs/images/gpu-digital.png differ
diff --git a/visualization/core.dig b/visualization/core.dig
new file mode 100644
index 0000000..4193ac5
--- /dev/null
+++ b/visualization/core.dig
@@ -0,0 +1,941 @@
+
+
+ 2
+
+
+ backgroundColor
+
+ 204
+ 204
+ 255
+ 255
+
+
+
+ romContent
+
+
+
+
+
+ Width
+ 25
+
+
+
+
+ ExternalFile
+
+
+ applicationType
+ IVERILOG
+
+
+ Label
+ decoder
+
+
+ externalInputs
+ clk,reset,core_state:3,instruction:16
+
+
+ iverilogOptions
+ -I ./src/
+
+
+ externalOutputs
+ decoded_rd_address:4,decoded_rs_address:4,decoded_rt_address:4,decoded_nzp:3,decoded_immediate:8,decoded_reg_write_enable,decoded_mem_read_enable,decoded_mem_write_enable,decoded_nzp_write_enable,decoded_reg_input_mux:2,decoded_alu_arithmetic_mux:2,decoded_alu_output_mux,decoded_pc_mux,decoded_ret
+
+
+ Width
+ 16
+
+
+ CodeFile
+ ./src/decoder.sv
+
+
+
+
+
+ ExternalFile
+
+
+ applicationType
+ IVERILOG
+
+
+ Label
+ fetcher
+
+
+ externalInputs
+ clk,reset,core_state:3,current_pc:8,mem_read_ready,mem_read_data:16
+
+
+ iverilogOptions
+ -I ./src/
+
+
+ externalOutputs
+ mem_read_valid,mem_read_address:8,fetcher_state:3,instruction:16
+
+
+ Width
+ 16
+
+
+ CodeFile
+ ./src/fetcher.sv
+
+
+
+
+
+ ExternalFile
+
+
+ applicationType
+ IVERILOG
+
+
+ Label
+ scheduler
+
+
+ externalInputs
+ clk,reset,start,decoded_mem_read_enable,decoded_mem_write_enable,decoded_ret,fetcher_state:3,lsu_state:8,next_pc:32
+
+
+ iverilogOptions
+ -I ./src/
+
+
+ externalOutputs
+ done,core_state:3,current_pc:8
+
+
+ Width
+ 16
+
+
+ CodeFile
+ ./src/scheduler.sv
+
+
+
+
+
+ Out
+
+
+ Label
+ done
+
+
+
+
+
+ Out
+
+
+ Label
+ program\_mem\_read\_valid
+
+
+
+
+
+ Out
+
+
+ Label
+ program\_mem\_read\_address
+
+
+ Bits
+ 8
+
+
+
+
+
+ Tunnel
+
+
+ rotation
+
+
+
+ NetName
+ clk
+
+
+
+
+
+ Tunnel
+
+
+ rotation
+
+
+
+ NetName
+ clk
+
+
+
+
+
+ Tunnel
+
+
+ rotation
+
+
+
+ NetName
+ reset
+
+
+
+
+
+ Tunnel
+
+
+ rotation
+
+
+
+ NetName
+ reset
+
+
+
+
+
+ Tunnel
+
+
+ rotation
+
+
+
+ NetName
+ reset
+
+
+
+
+
+ Tunnel
+
+
+ rotation
+
+
+
+ NetName
+ clk
+
+
+
+
+
+ Tunnel
+
+
+ NetName
+ cs
+
+
+
+
+
+ Tunnel
+
+
+ rotation
+
+
+
+ NetName
+ cs
+
+
+
+
+
+ Tunnel
+
+
+ rotation
+
+
+
+ NetName
+ cs
+
+
+
+
+
+ Clock
+
+
+ Label
+ clk
+
+
+
+
+
+ In
+
+
+ Label
+ start
+
+
+
+
+
+ Tunnel
+
+
+ NetName
+ clk
+
+
+
+
+
+ In
+
+
+ Label
+ reset
+
+
+
+
+
+ In
+
+
+ Label
+ block\_id
+
+
+ Bits
+ 8
+
+
+
+
+
+ threads.dig
+
+
+
+
+ Tunnel
+
+
+ rotation
+
+
+
+ NetName
+ reset
+
+
+
+
+
+ Tunnel
+
+
+ rotation
+
+
+
+ NetName
+ clk
+
+
+
+
+
+ Tunnel
+
+
+ rotation
+
+
+
+ NetName
+ cs
+
+
+
+
+
+ Out
+
+
+ Label
+ data\_mem\_read\_valid
+
+
+ Bits
+ 4
+
+
+
+
+
+ Out
+
+
+ Label
+ data\_mem\_read\_address
+
+
+ Bits
+ 32
+
+
+
+
+
+ Out
+
+
+ Label
+ data\_mem\_write\_valid
+
+
+ Bits
+ 4
+
+
+
+
+
+ Out
+
+
+ Label
+ data\_mem\_write\_address
+
+
+ Bits
+ 32
+
+
+
+
+
+ Out
+
+
+ Label
+ data\_mem\_write\_data
+
+
+ Bits
+ 32
+
+
+
+
+
+ In
+
+
+ Label
+ thread\_count
+
+
+ Bits
+ 3
+
+
+
+
+
+ Tunnel
+
+
+ NetName
+ reset
+
+
+
+
+
+ In
+
+
+ rotation
+
+
+
+ Label
+ program\_mem\_read\_ready
+
+
+
+
+
+ In
+
+
+ rotation
+
+
+
+ Label
+ program\_mem\_read\_data
+
+
+ Bits
+ 16
+
+
+
+
+
+ In
+
+
+ Label
+ data\_mem\_read\_ready
+
+
+ Bits
+ 4
+
+
+
+
+
+ In
+
+
+ Label
+ data\_mem\_read\_data
+
+
+ Bits
+ 32
+
+
+
+
+
+ In
+
+
+ Label
+ data\_mem\_write\_ready
+
+
+ Bits
+ 4
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
\ No newline at end of file
diff --git a/visualization/gpu.dig b/visualization/gpu.dig
new file mode 100644
index 0000000..cae667b
--- /dev/null
+++ b/visualization/gpu.dig
@@ -0,0 +1,1490 @@
+
+
+ 2
+
+
+
+ core.dig
+
+
+ shapeType
+ DIL
+
+
+ Label
+ core0
+
+
+
+
+
+ ExternalFile
+
+
+ applicationType
+ IVERILOG
+
+
+ Label
+ dcr
+
+
+ externalInputs
+ clk,reset,device_control_write_enable,device_control_data:8
+
+
+ externalOutputs
+ thread_count:8
+
+
+ Width
+ 13
+
+
+ iverilogOptions
+ -I ./src/
+
+
+ CodeFile
+ ./src/dcr.sv
+
+
+
+
+
+ ExternalFile
+
+
+ applicationType
+ IVERILOG
+
+
+ Label
+ controllerData
+
+
+ externalInputs
+ clk,reset,consumer_write_data:64,consumer_write_address:64,consumer_write_valid:8,consumer_read_address:64,consumer_read_valid:8,mem_read_ready:4,mem_read_data:32,mem_write_ready:4
+
+
+ iverilogOptions
+ -I ./src/
+
+
+ externalOutputs
+ consumer_read_ready:8,consumer_read_data:64,consumer_write_ready:8,mem_read_valid:4,mem_read_address:32,mem_write_valid:4,mem_write_address:32,mem_write_data:32
+
+
+ Width
+ 20
+
+
+ CodeFile
+ ./src/controller-data.sv
+
+
+
+
+
+ ExternalFile
+
+
+ applicationType
+ IVERILOG
+
+
+ Label
+ dispatch
+
+
+ externalInputs
+ clk,reset,start,thread_count:8,core_done:2
+
+
+ iverilogOptions
+ -I ./src/
+
+
+ externalOutputs
+ core_start:2,core_reset:2,core_block_id:16,core_thread_count:6,done
+
+
+ Width
+ 15
+
+
+ CodeFile
+ ./src/dispatch.sv
+
+
+
+
+
+ ExternalFile
+
+
+ applicationType
+ IVERILOG
+
+
+ Label
+ controllerProg
+
+
+ externalInputs
+ clk,reset,consumer_read_address:16,consumer_read_valid:2,mem_read_ready,mem_read_data:16
+
+
+ iverilogOptions
+ -I ./src/
+
+
+ externalOutputs
+ consumer_read_ready:2,consumer_read_data:32,mem_read_valid,mem_read_address:8
+
+
+ Width
+ 20
+
+
+ CodeFile
+ ./src/controller-program.sv
+
+
+
+
+
+ Clock
+
+
+ Label
+ clk
+
+
+
+
+
+ In
+
+
+ Label
+ reset
+
+
+
+
+
+ Tunnel
+
+
+ NetName
+ clk
+
+
+
+
+
+ Tunnel
+
+
+ NetName
+ reset
+
+
+
+
+
+ Tunnel
+
+
+ rotation
+
+
+
+ NetName
+ clk
+
+
+
+
+
+ Tunnel
+
+
+ rotation
+
+
+
+ NetName
+ clk
+
+
+
+
+
+ Tunnel
+
+
+ rotation
+
+
+
+ NetName
+ clk
+
+
+
+
+
+ Tunnel
+
+
+ rotation
+
+
+
+ NetName
+ clk
+
+
+
+
+
+ Tunnel
+
+
+ rotation
+
+
+
+ NetName
+ clk
+
+
+
+
+
+ Tunnel
+
+
+ rotation
+
+
+
+ NetName
+ reset
+
+
+
+
+
+ Tunnel
+
+
+ rotation
+
+
+
+ NetName
+ reset
+
+
+
+
+
+ Tunnel
+
+
+ rotation
+
+
+
+ NetName
+ reset
+
+
+
+
+
+ Tunnel
+
+
+ rotation
+
+
+
+ NetName
+ reset
+
+
+
+
+
+ In
+
+
+ Label
+ device\_control\_write\_enable
+
+
+
+
+
+ In
+
+
+ Label
+ device\_control\_data
+
+
+ Bits
+ 8
+
+
+
+
+
+ Splitter
+
+
+ Input Splitting
+ 2
+
+
+ Output Splitting
+ 1,1
+
+
+
+
+
+ Splitter
+
+
+ Input Splitting
+ 2
+
+
+ Output Splitting
+ 1,1
+
+
+
+
+
+ Tunnel
+
+
+ NetName
+ done0
+
+
+
+
+
+ Tunnel
+
+
+ NetName
+ done1
+
+
+
+
+
+ Splitter
+
+
+ Input Splitting
+ 1,1
+
+
+ Output Splitting
+ 2
+
+
+
+
+
+ Tunnel
+
+
+ rotation
+
+
+
+ NetName
+ done0
+
+
+
+
+
+ Tunnel
+
+
+ rotation
+
+
+
+ NetName
+ done1
+
+
+
+
+
+ Splitter
+
+
+ Input Splitting
+ 16
+
+
+ Output Splitting
+ 8,8
+
+
+
+
+
+ core.dig
+
+
+ shapeType
+ DIL
+
+
+ Label
+ core1
+
+
+
+
+
+ Tunnel
+
+
+ rotation
+
+
+
+ NetName
+ clk
+
+
+
+
+
+ Splitter
+
+
+ Input Splitting
+ 6
+
+
+ Output Splitting
+ 3,3
+
+
+
+
+
+ Out
+
+
+ rotation
+
+
+
+ Label
+ done
+
+
+
+
+
+ In
+
+
+ Label
+ start
+
+
+
+
+
+ Splitter
+
+
+
+
+ Splitter
+
+
+ Input Splitting
+ 32,32
+
+
+ Output Splitting
+ 64
+
+
+
+
+
+ Splitter
+
+
+
+
+ Splitter
+
+
+ Input Splitting
+ 32,32
+
+
+ Output Splitting
+ 64
+
+
+
+
+
+ Splitter
+
+
+ Input Splitting
+ 32,32
+
+
+ Output Splitting
+ 64
+
+
+
+
+
+ In
+
+
+ rotation
+
+
+
+ Label
+ data\_mem\_read\_ready
+
+
+ Bits
+ 4
+
+
+
+
+
+ In
+
+
+ rotation
+
+
+
+ Label
+ data\_mem\_read\_data
+
+
+ Bits
+ 32
+
+
+
+
+
+ In
+
+
+ rotation
+
+
+
+ Label
+ data\_mem\_write\_ready
+
+
+ Bits
+ 4
+
+
+
+
+
+ Splitter
+
+
+ Input Splitting
+ 1,1
+
+
+ Output Splitting
+ 2
+
+
+
+
+
+ Splitter
+
+
+ Input Splitting
+ 8,8
+
+
+ Output Splitting
+ 16
+
+
+
+
+
+ Splitter
+
+
+ Input Splitting
+ 2
+
+
+ Output Splitting
+ 1,1
+
+
+
+
+
+ Splitter
+
+
+ Input Splitting
+ 32
+
+
+ Output Splitting
+ 16,16
+
+
+
+
+
+ Out
+
+
+ Label
+ program\_mem\_read\_valid
+
+
+
+
+
+ Out
+
+
+ Label
+ program\_mem\_read\_address
+
+
+ Bits
+ 8
+
+
+
+
+
+ In
+
+
+ rotation
+
+
+
+ Label
+ program\_mem\_read\_ready
+
+
+
+
+
+ In
+
+
+ rotation
+
+
+
+ Label
+ program\_mem\_read\_data
+
+
+ Bits
+ 16
+
+
+
+
+
+ Splitter
+
+
+ Input Splitting
+ 8
+
+
+ Output Splitting
+ 4,4
+
+
+
+
+
+ Splitter
+
+
+ Input Splitting
+ 64
+
+
+ Output Splitting
+ 32,32
+
+
+
+
+
+ Splitter
+
+
+ mirror
+ true
+
+
+ rotation
+
+
+
+ Input Splitting
+ 8
+
+
+ Output Splitting
+ 4,4
+
+
+
+
+
+ Out
+
+
+ Label
+ data\_mem\_read\_valid
+
+
+ Bits
+ 4
+
+
+
+
+
+ Out
+
+
+ Label
+ data\_mem\_read\_address
+
+
+ Bits
+ 32
+
+
+
+
+
+ Out
+
+
+ Label
+ data\_mem\_write\_valid
+
+
+ Bits
+ 4
+
+
+
+
+
+ Out
+
+
+ Label
+ data\_mem\_write\_address
+
+
+ Bits
+ 32
+
+
+
+
+
+ Out
+
+
+ Label
+ data\_mem\_write\_data
+
+
+ Bits
+ 32
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
\ No newline at end of file
diff --git a/visualization/lsu.dig b/visualization/lsu.dig
new file mode 100644
index 0000000..a50f4fe
--- /dev/null
+++ b/visualization/lsu.dig
@@ -0,0 +1,363 @@
+
+
+ 2
+
+
+
+ ExternalFile
+
+
+ applicationType
+ IVERILOG
+
+
+ Label
+ lsu
+
+
+ externalInputs
+ clk,reset,enable,core_state:3,decoded_mem_read_enable,decoded_mem_write_enable,rs:8,rt:8,mem_read_ready,mem_read_data:8,mem_write_ready
+
+
+ iverilogOptions
+ -I ./src/
+
+
+ externalOutputs
+ mem_read_valid,mem_read_address:8,mem_write_valid,mem_write_address:8,mem_write_data:8,lsu_state:2,lsu_out:8
+
+
+ Width
+ 20
+
+
+ CodeFile
+ ./src/lsu.sv
+
+
+
+
+
+ Out
+
+
+ small
+ true
+
+
+
+
+
+ Out
+
+
+ small
+ true
+
+
+ Bits
+ 8
+
+
+
+
+
+ Out
+
+
+ small
+ true
+
+
+
+
+
+ Out
+
+
+ small
+ true
+
+
+ Bits
+ 8
+
+
+
+
+
+ Out
+
+
+ small
+ true
+
+
+ Bits
+ 8
+
+
+
+
+
+ Out
+
+
+ small
+ true
+
+
+ Bits
+ 2
+
+
+ intFormat
+ bin
+
+
+
+
+
+ Out
+
+
+ small
+ true
+
+
+ Bits
+ 8
+
+
+
+
+
+ Clock
+
+
+ small
+ true
+
+
+
+
+
+ In
+
+
+ small
+ true
+
+
+ Bits
+ 3
+
+
+ intFormat
+ bin
+
+
+
+
+
+ In
+
+
+ small
+ true
+
+
+
+
+
+ In
+
+
+ small
+ true
+
+
+ Bits
+ 8
+
+
+
+
+
+ In
+
+
+ small
+ true
+
+
+
+
+
+ In
+
+
+ small
+ true
+
+
+ Bits
+ 8
+
+
+
+
+
+ In
+
+
+ small
+ true
+
+
+ Bits
+ 8
+
+
+
+
+
+ Const
+
+
+ Value
+ 0
+
+
+
+
+
+ Const
+
+
+
+
+ In
+
+
+ small
+ true
+
+
+
+
+
+ In
+
+
+ small
+ true
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
\ No newline at end of file
diff --git a/visualization/pc.dig b/visualization/pc.dig
new file mode 100644
index 0000000..2d21aef
--- /dev/null
+++ b/visualization/pc.dig
@@ -0,0 +1,340 @@
+
+
+ 2
+
+
+
+ Clock
+
+
+ Label
+ clk
+
+
+
+
+
+ Tunnel
+
+
+ NetName
+ clk
+
+
+
+
+
+ In
+
+
+ Label
+ reset
+
+
+
+
+
+ Tunnel
+
+
+ NetName
+ reset
+
+
+
+
+
+ In
+
+
+ Label
+ cs
+
+
+ Bits
+ 3
+
+
+
+
+
+ Tunnel
+
+
+ NetName
+ cs
+
+
+
+
+
+ In
+
+
+ Label
+ en
+
+
+
+
+
+ Tunnel
+
+
+ NetName
+ en
+
+
+
+
+
+ ExternalFile
+
+
+ applicationType
+ IVERILOG
+
+
+ Label
+ pc
+
+
+ externalInputs
+ clk,reset,enable,core_state:3,decoded_nzp:3,decoded_immediate:8,decoded_nzp_write_enable,decoded_pc_mux,alu_out:8,current_pc:8
+
+
+ iverilogOptions
+ -I ./src/
+
+
+ externalOutputs
+ next_pc:8
+
+
+ Width
+ 13
+
+
+ CodeFile
+ ./src/pc.sv
+
+
+
+
+
+ Tunnel
+
+
+ rotation
+
+
+
+ NetName
+ reset
+
+
+
+
+
+ Tunnel
+
+
+ rotation
+
+
+
+ NetName
+ clk
+
+
+
+
+
+ Tunnel
+
+
+ rotation
+
+
+
+ NetName
+ cs
+
+
+
+
+
+ Tunnel
+
+
+ rotation
+
+
+
+ NetName
+ en
+
+
+
+
+
+ In
+
+
+ Bits
+ 3
+
+
+
+
+
+ In
+
+
+
+
+ In
+
+
+
+
+ In
+
+
+ Bits
+ 8
+
+
+
+
+
+ Out
+
+
+ Label
+ next\_pc
+
+
+ Bits
+ 8
+
+
+
+
+
+ In
+
+
+ Bits
+ 8
+
+
+
+
+
+ In
+
+
+ Bits
+ 8
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
\ No newline at end of file
diff --git a/visualization/registers.dig b/visualization/registers.dig
new file mode 100644
index 0000000..efc2313
--- /dev/null
+++ b/visualization/registers.dig
@@ -0,0 +1,425 @@
+
+
+ 2
+
+
+
+ ExternalFile
+
+
+ applicationType
+ IVERILOG
+
+
+ Label
+ registers
+
+
+ externalInputs
+ clk,reset,enable,block_id:8,core_state:3,decoded_rd_address:4,decoded_rs_address:4,decoded_rt_address:4,decoded_reg_write_enable,decoded_reg_input_mux:2,decoded_immediate:8,alu_out:8,lsu_out:8
+
+
+ iverilogOptions
+ -I ./src/
+
+
+ externalOutputs
+ rs:8,rt:8
+
+
+ Width
+ 12
+
+
+ CodeFile
+ ./src/registers.sv
+
+
+
+
+
+ In
+
+
+ Label
+ reset
+
+
+
+
+
+ Clock
+
+
+ Label
+ clk
+
+
+
+
+
+ Tunnel
+
+
+ NetName
+ clk
+
+
+
+
+
+ Tunnel
+
+
+ rotation
+
+
+
+ NetName
+ clk
+
+
+
+
+
+ Tunnel
+
+
+ NetName
+ reset
+
+
+
+
+
+ Tunnel
+
+
+ rotation
+
+
+
+ NetName
+ reset
+
+
+
+
+
+ In
+
+
+ Label
+ en
+
+
+
+
+
+ Tunnel
+
+
+ NetName
+ en
+
+
+
+
+
+ Tunnel
+
+
+ rotation
+
+
+
+ NetName
+ en
+
+
+
+
+
+ In
+
+
+ Bits
+ 8
+
+
+
+
+
+ In
+
+
+ Bits
+ 3
+
+
+
+
+
+ In
+
+
+ Bits
+ 4
+
+
+
+
+
+ In
+
+
+ Bits
+ 4
+
+
+
+
+
+ In
+
+
+ Bits
+ 4
+
+
+
+
+
+ In
+
+
+
+
+ In
+
+
+ Bits
+ 2
+
+
+
+
+
+ In
+
+
+ Bits
+ 8
+
+
+ rotation
+
+
+
+
+
+
+ In
+
+
+ rotation
+
+
+
+ Bits
+ 8
+
+
+
+
+
+ In
+
+
+ rotation
+
+
+
+ Bits
+ 8
+
+
+
+
+
+ Out
+
+
+ Bits
+ 8
+
+
+
+
+
+ Out
+
+
+ Bits
+ 8
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
\ No newline at end of file
diff --git a/visualization/splitter40-10.dig b/visualization/splitter40-10.dig
new file mode 100644
index 0000000..ff71102
--- /dev/null
+++ b/visualization/splitter40-10.dig
@@ -0,0 +1,236 @@
+
+
+ 2
+
+
+ backgroundColor
+
+ 204
+ 204
+ 204
+ 255
+
+
+
+ romContent
+
+
+
+
+
+
+
+ Splitter
+
+
+ Input Splitting
+ 40
+
+
+ Output Splitting
+ 1,1,1,1,8,8,8,8,1,1,1,1
+
+
+
+
+
+ In
+
+
+ Label
+ in40
+
+
+ Bits
+ 40
+
+
+
+
+
+ Multiplexer
+
+
+ Selector Bits
+ 2
+
+
+ flipSelPos
+ true
+
+
+
+
+
+ Multiplexer
+
+
+ Selector Bits
+ 2
+
+
+ Bits
+ 8
+
+
+ flipSelPos
+ true
+
+
+
+
+
+ Multiplexer
+
+
+ Selector Bits
+ 2
+
+
+ flipSelPos
+ true
+
+
+
+
+
+ In
+
+
+ Label
+ id
+
+
+ Bits
+ 2
+
+
+
+
+
+ Out
+
+
+ Label
+ rr
+
+
+
+
+
+ Out
+
+
+ Label
+ rd
+
+
+ Bits
+ 8
+
+
+
+
+
+ Out
+
+
+ Label
+ wr
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
\ No newline at end of file
diff --git a/visualization/src/README.md b/visualization/src/README.md
new file mode 100644
index 0000000..1ee9a8c
--- /dev/null
+++ b/visualization/src/README.md
@@ -0,0 +1 @@
+These SystemVerilog files are functionally identical to the original sources; they exist only for compatibility with `Digital`.
\ No newline at end of file
diff --git a/visualization/src/alu.sv b/visualization/src/alu.sv
new file mode 100644
index 0000000..e94ad12
--- /dev/null
+++ b/visualization/src/alu.sv
@@ -0,0 +1,52 @@
+module alu (
+    // Per-thread ALU with a registered 8-bit result. The result only changes on a
+    // clock edge where enable is high and core_state is 3'b101 (EXECUTE).
+    input wire clk,
+    input wire reset, // Synchronous reset: clears alu_out to 0
+    input wire enable, // If the current block has fewer threads than the block size, some ALUs will be inactive
+
+    input [2:0] core_state,
+
+    input [1:0] decoded_alu_arithmetic_mux, // Arithmetic operation select (ADD/SUB/MUL/DIV below)
+    input decoded_alu_output_mux, // 1: output comparison flags, 0: output arithmetic result
+
+    input [7:0] rs,
+    input [7:0] rt,
+    output wire [7:0] alu_out
+);
+    // Encodings of decoded_alu_arithmetic_mux
+    localparam ADD = 2'b00,
+               SUB = 2'b01,
+               MUL = 2'b10,
+               DIV = 2'b11;
+
+    reg [7:0] alu_out_reg;
+    assign alu_out = alu_out_reg;
+
+    always @(posedge clk) begin
+        if (reset) begin
+            alu_out_reg <= 8'b0;
+        end else if (enable) begin
+            // Calculate alu_out when core_state = EXECUTE
+            if (core_state == 3'b101) begin
+                if (decoded_alu_output_mux == 1) begin
+                    // Set values to compare with NZP register in alu_out[2:0]:
+                    // bit 2 = rs > rt, bit 1 = rs == rt, bit 0 = rs < rt.
+                    // rs and rt are unsigned, so they are compared directly; an unsigned
+                    // difference tested with "< 0" is never true and "> 0" is true for any rs != rt.
+                    alu_out_reg <= {5'b0, (rs > rt), (rs == rt), (rs < rt)};
+                end else begin
+                    // Execute the specified arithmetic instruction (result keeps the low 8 bits)
+                    case (decoded_alu_arithmetic_mux)
+                        ADD: alu_out_reg <= rs + rt;
+                        SUB: alu_out_reg <= rs - rt;
+                        MUL: alu_out_reg <= rs * rt;
+                        // NOTE(review): rt == 0 is not guarded; the quotient is then unknown (x)
+                        // in simulation - confirm this is acceptable for the kernels being run.
+                        DIV: alu_out_reg <= rs / rt;
+                    endcase
+                end
+            end
+        end
+    end
+endmodule
diff --git a/visualization/src/controller-data.sv b/visualization/src/controller-data.sv
new file mode 100644
index 0000000..4cd016d
--- /dev/null
+++ b/visualization/src/controller-data.sv
@@ -0,0 +1,123 @@
+module controllerData (
+ input wire clk,
+ input wire reset, // Synchronous: clears every output and all per-channel bookkeeping
+ // Arbiter that lets 8 consumer request ports share 4 memory channels; each channel runs its own state machine.
+ // Consumer Interface (8 consumers: Fetchers / LSUs)
+ input [7:0] consumer_read_valid,
+ input [63:0] consumer_read_address, // 8 x 8-bit addresses
+ output reg [7:0] consumer_read_ready,
+ output reg [63:0] consumer_read_data, // 8 x 8-bit data
+ input [7:0] consumer_write_valid,
+ input [63:0] consumer_write_address, // 8 x 8-bit addresses
+ input [63:0] consumer_write_data, // 8 x 8-bit data
+ output reg [7:0] consumer_write_ready,
+
+ // Memory Interface (4 channels: Data / Program)
+ output reg [3:0] mem_read_valid,
+ output reg [31:0] mem_read_address, // 4 x 8-bit addresses
+ input [3:0] mem_read_ready,
+ input [31:0] mem_read_data, // 4 x 8-bit data
+ output reg [3:0] mem_write_valid,
+ output reg [31:0] mem_write_address, // 4 x 8-bit addresses
+ output reg [31:0] mem_write_data, // 4 x 8-bit data
+ input [3:0] mem_write_ready
+);
+ localparam IDLE = 3'b000, // Channel free, scanning for a request
+ READ_WAITING = 3'b010, // Read issued to memory, waiting for mem_read_ready
+ WRITE_WAITING = 3'b011, // Write issued to memory, waiting for mem_write_ready
+ READ_RELAYING = 3'b100, // Read data handed to consumer, waiting for it to drop its valid
+ WRITE_RELAYING = 3'b101; // Write acknowledged to consumer, waiting for it to drop its valid
+
+ // Keep track of state for each channel and which jobs each channel is handling
+ reg [11:0] controller_state; // 4 x 3-bit states
+ reg [11:0] current_consumer; // 4 x 3-bit consumer IDs
+ reg [7:0] channel_serving_consumer; // Which consumers are being served
+
+ integer i; // Channel index (0-3)
+ integer j; // Consumer index (0-7)
+
+ always @(posedge clk) begin
+ if (reset) begin
+ mem_read_valid <= 4'b0;
+ mem_read_address <= 32'b0;
+
+ mem_write_valid <= 4'b0;
+ mem_write_address <= 32'b0;
+ mem_write_data <= 32'b0;
+
+ consumer_read_ready <= 8'b0;
+ consumer_read_data <= 64'b0;
+ consumer_write_ready <= 8'b0;
+
+ current_consumer <= 12'b0;
+ controller_state <= 12'b0;
+
+ channel_serving_consumer = 8'b0; // Blocking assignment, like every other write to this register
+ end else begin
+ // For each channel, we handle processing concurrently. channel_serving_consumer is written with blocking assignments, so a consumer claimed by channel i is already seen as taken by the channels evaluated after it on the same clock edge.
+ for (i = 0; i < 4; i = i + 1) begin
+ case (controller_state[i*3 +: 3])
+ IDLE: begin
+ // While this channel is idle, cycle through consumers looking for one with a pending request
+ for (j = 0; j < 8; j = j + 1) begin
+ if (consumer_read_valid[j] && !channel_serving_consumer[j]) begin // A read is checked before a write from the same consumer
+ channel_serving_consumer[j] = 1;
+ current_consumer[i*3 +: 3] <= j[2:0];
+
+ mem_read_valid[i] <= 1;
+ mem_read_address[i*8 +: 8] <= consumer_read_address[j*8 +: 8];
+ controller_state[i*3 +: 3] <= READ_WAITING;
+
+ // Once we find a pending request, pick it up with this channel and stop looking for requests
+ j = 8;
+ end else if (consumer_write_valid[j] && !channel_serving_consumer[j]) begin
+ channel_serving_consumer[j] = 1;
+ current_consumer[i*3 +: 3] <= j[2:0];
+
+ mem_write_valid[i] <= 1;
+ mem_write_address[i*8 +: 8] <= consumer_write_address[j*8 +: 8];
+ mem_write_data[i*8 +: 8] <= consumer_write_data[j*8 +: 8];
+ controller_state[i*3 +: 3] <= WRITE_WAITING;
+
+ // Once we find a pending request, pick it up with this channel and stop looking for requests
+ j = 8;
+ end
+ end
+ end
+ READ_WAITING: begin
+ // Wait for response from memory for pending read request
+ if (mem_read_ready[i]) begin
+ mem_read_valid[i] <= 0;
+ consumer_read_ready[current_consumer[i*3 +: 3]] <= 1;
+ consumer_read_data[current_consumer[i*3 +: 3]*8 +: 8] <= mem_read_data[i*8 +: 8];
+ controller_state[i*3 +: 3] <= READ_RELAYING;
+ end
+ end
+ WRITE_WAITING: begin
+ // Wait for response from memory for pending write request
+ if (mem_write_ready[i]) begin
+ mem_write_valid[i] <= 0;
+ consumer_write_ready[current_consumer[i*3 +: 3]] <= 1;
+ controller_state[i*3 +: 3] <= WRITE_RELAYING;
+ end
+ end
+ // Wait until consumer acknowledges it received response, then reset
+ READ_RELAYING: begin
+ if (!consumer_read_valid[current_consumer[i*3 +: 3]]) begin
+ channel_serving_consumer[current_consumer[i*3 +: 3]] = 0;
+ consumer_read_ready[current_consumer[i*3 +: 3]] <= 0;
+ controller_state[i*3 +: 3] <= IDLE;
+ end
+ end
+ WRITE_RELAYING: begin
+ if (!consumer_write_valid[current_consumer[i*3 +: 3]]) begin
+ channel_serving_consumer[current_consumer[i*3 +: 3]] = 0;
+ consumer_write_ready[current_consumer[i*3 +: 3]] <= 0;
+ controller_state[i*3 +: 3] <= IDLE;
+ end
+ end
+ endcase // NOTE(review): no default item - encodings 3'b001, 3'b110 and 3'b111 match no branch; reset and every transition above only ever load the five named states
+ end
+ end
+ end
+endmodule
diff --git a/visualization/src/controller-program.sv b/visualization/src/controller-program.sv
new file mode 100644
index 0000000..1f024f5
--- /dev/null
+++ b/visualization/src/controller-program.sv
@@ -0,0 +1,81 @@
+module controllerProg (
+ input wire clk,
+ input wire reset, // Synchronous: clears outputs and returns the state machine to IDLE
+ // Single-channel, read-only arbiter: relays one read at a time from one of 2 consumers to program memory.
+ // Consumer Interface (2 consumers)
+ input wire [1:0] consumer_read_valid,
+ input wire [15:0] consumer_read_address, // 2 x 8-bit: {addr1, addr0}
+ output reg [1:0] consumer_read_ready,
+ output reg [31:0] consumer_read_data, // 2 x 16-bit: {data1, data0}
+
+ // Program Memory Interface (1 channel, read-only)
+ output reg mem_read_valid,
+ output reg [7:0] mem_read_address,
+ input wire mem_read_ready,
+ input wire [15:0] mem_read_data
+);
+ localparam IDLE = 2'b00, // No request in flight, scanning consumers
+ READ_WAITING = 2'b01, // Read issued, waiting for mem_read_ready
+ READ_RELAYING = 2'b10; // Data handed to consumer, waiting for it to drop its valid
+
+ reg [1:0] controller_state;
+ reg current_consumer; // Index (0 or 1) of the consumer currently being served
+ reg [1:0] serving_consumer; // Bit j is set while consumer j's request is in flight
+
+ integer j; // Consumer scan index
+
+ always @(posedge clk) begin
+ if (reset) begin
+ mem_read_valid <= 1'b0;
+ mem_read_address <= 8'b0;
+
+ consumer_read_ready <= 2'b0;
+ consumer_read_data <= 32'b0;
+
+ controller_state <= IDLE;
+ current_consumer <= 1'b0;
+ serving_consumer = 2'b0; // Blocking assignment, like every other write to this register
+ end else begin
+ case (controller_state)
+ IDLE: begin
+ // Cycle through consumers looking for a pending request (lowest index first)
+ for (j = 0; j < 2; j = j + 1) begin
+ if (consumer_read_valid[j] && !serving_consumer[j]) begin
+ serving_consumer[j] = 1'b1;
+ current_consumer <= j[0];
+
+ mem_read_valid <= 1'b1;
+ mem_read_address <= consumer_read_address[j*8 +: 8];
+ controller_state <= READ_WAITING;
+
+ // Stop scanning
+ j = 2;
+ end
+ end
+ end
+
+ READ_WAITING: begin
+ if (mem_read_ready) begin // Memory responded: drop the request and hand the word to the consumer
+ mem_read_valid <= 1'b0;
+ consumer_read_ready[current_consumer] <= 1'b1;
+ consumer_read_data[current_consumer*16 +: 16] <= mem_read_data;
+ controller_state <= READ_RELAYING;
+ end
+ end
+
+ READ_RELAYING: begin
+ // Wait until consumer acknowledges it received response, then reset
+ if (!consumer_read_valid[current_consumer]) begin
+ serving_consumer[current_consumer] = 1'b0;
+ consumer_read_ready[current_consumer] <= 1'b0;
+ controller_state <= IDLE;
+ end
+ end
+
+ default: begin // 2'b11 is not a named state; fall back to IDLE
+ controller_state <= IDLE;
+ end
+ endcase
+ end
+ end
+endmodule
diff --git a/visualization/src/core.sv b/visualization/src/core.sv
new file mode 100644
index 0000000..1918cee
--- /dev/null
+++ b/visualization/src/core.sv
@@ -0,0 +1,216 @@
+`default_nettype none
+`timescale 1ns/1ns
+
+// COMPUTE CORE
+// > Handles processing 1 block at a time
+// > The core also has its own scheduler to manage control flow
+// > Each core contains 1 fetcher & decoder, and register files, ALUs, LSUs, PC for each thread
+module core #(
+ parameter DATA_MEM_ADDR_BITS = 8, // Width of each per-thread data memory address
+ parameter DATA_MEM_DATA_BITS = 8, // Width of each per-thread data memory word
+ parameter PROGRAM_MEM_ADDR_BITS = 8, // Width of the program memory address
+ parameter PROGRAM_MEM_DATA_BITS = 16, // Width of a program memory word
+ parameter THREADS_PER_BLOCK = 4 // Number of ALU/LSU/register/PC sets generated below
+) (
+ input wire clk,
+ input wire reset,
+
+ // Kernel Execution
+ input wire start,
+ output wire done,
+
+ // Block Metadata
+ input wire [7:0] block_id,
+ input wire [$clog2(THREADS_PER_BLOCK):0] thread_count, // Thread i's units are enabled only while i < thread_count
+
+ // Program Memory
+ output reg program_mem_read_valid,
+ output reg [PROGRAM_MEM_ADDR_BITS-1:0] program_mem_read_address,
+ input reg program_mem_read_ready,
+ input reg [PROGRAM_MEM_DATA_BITS-1:0] program_mem_read_data,
+
+ // Data Memory (element i of each port is wired to thread i's LSU)
+ output reg [THREADS_PER_BLOCK-1:0] data_mem_read_valid,
+ output reg [DATA_MEM_ADDR_BITS-1:0] data_mem_read_address [THREADS_PER_BLOCK-1:0],
+ input reg [THREADS_PER_BLOCK-1:0] data_mem_read_ready,
+ input reg [DATA_MEM_DATA_BITS-1:0] data_mem_read_data [THREADS_PER_BLOCK-1:0],
+ output reg [THREADS_PER_BLOCK-1:0] data_mem_write_valid,
+ output reg [DATA_MEM_ADDR_BITS-1:0] data_mem_write_address [THREADS_PER_BLOCK-1:0],
+ output reg [DATA_MEM_DATA_BITS-1:0] data_mem_write_data [THREADS_PER_BLOCK-1:0],
+ input reg [THREADS_PER_BLOCK-1:0] data_mem_write_ready
+);
+ // State
+ reg [2:0] core_state;
+ reg [2:0] fetcher_state;
+ reg [15:0] instruction;
+
+ // Intermediate Signals (arrays hold one entry per thread)
+ reg [7:0] current_pc;
+ wire [7:0] next_pc[THREADS_PER_BLOCK-1:0];
+ reg [7:0] rs[THREADS_PER_BLOCK-1:0];
+ reg [7:0] rt[THREADS_PER_BLOCK-1:0];
+ reg [1:0] lsu_state[THREADS_PER_BLOCK-1:0];
+ reg [7:0] lsu_out[THREADS_PER_BLOCK-1:0];
+ wire [7:0] alu_out[THREADS_PER_BLOCK-1:0];
+
+ // Decoded Instruction Signals
+ reg [3:0] decoded_rd_address;
+ reg [3:0] decoded_rs_address;
+ reg [3:0] decoded_rt_address;
+ reg [2:0] decoded_nzp;
+ reg [7:0] decoded_immediate;
+
+ // Decoded Control Signals
+ reg decoded_reg_write_enable; // Enable writing to a register
+ reg decoded_mem_read_enable; // Enable reading from memory
+ reg decoded_mem_write_enable; // Enable writing to memory
+ reg decoded_nzp_write_enable; // Enable writing to NZP register
+ reg [1:0] decoded_reg_input_mux; // Select input to register
+ reg [1:0] decoded_alu_arithmetic_mux; // Select arithmetic operation
+ reg decoded_alu_output_mux; // Select operation in ALU
+ reg decoded_pc_mux; // Select source of next PC
+ reg decoded_ret;
+
+ // Fetcher (one per core; drives the program memory port and produces `instruction`)
+ fetcher #(
+ .PROGRAM_MEM_ADDR_BITS(PROGRAM_MEM_ADDR_BITS),
+ .PROGRAM_MEM_DATA_BITS(PROGRAM_MEM_DATA_BITS)
+ ) fetcher_instance (
+ .clk(clk),
+ .reset(reset),
+ .core_state(core_state),
+ .current_pc(current_pc),
+ .mem_read_valid(program_mem_read_valid),
+ .mem_read_address(program_mem_read_address),
+ .mem_read_ready(program_mem_read_ready),
+ .mem_read_data(program_mem_read_data),
+ .fetcher_state(fetcher_state),
+ .instruction(instruction)
+ );
+
+ // Decoder (one per core; its decoded_* outputs are shared by every thread's units)
+ decoder decoder_instance (
+ .clk(clk),
+ .reset(reset),
+ .core_state(core_state),
+ .instruction(instruction),
+ .decoded_rd_address(decoded_rd_address),
+ .decoded_rs_address(decoded_rs_address),
+ .decoded_rt_address(decoded_rt_address),
+ .decoded_nzp(decoded_nzp),
+ .decoded_immediate(decoded_immediate),
+ .decoded_reg_write_enable(decoded_reg_write_enable),
+ .decoded_mem_read_enable(decoded_mem_read_enable),
+ .decoded_mem_write_enable(decoded_mem_write_enable),
+ .decoded_nzp_write_enable(decoded_nzp_write_enable),
+ .decoded_reg_input_mux(decoded_reg_input_mux),
+ .decoded_alu_arithmetic_mux(decoded_alu_arithmetic_mux),
+ .decoded_alu_output_mux(decoded_alu_output_mux),
+ .decoded_pc_mux(decoded_pc_mux),
+ .decoded_ret(decoded_ret)
+ );
+ // NOTE(review): lsu_state and next_pc are passed below as four separate ports (indices 0-3), so this wiring appears to assume THREADS_PER_BLOCK == 4 - confirm against scheduler.sv.
+ // Scheduler
+ scheduler scheduler_instance (
+ .clk(clk),
+ .reset(reset),
+ .start(start),
+ .fetcher_state(fetcher_state),
+ .core_state(core_state),
+ .decoded_mem_read_enable(decoded_mem_read_enable),
+ .decoded_mem_write_enable(decoded_mem_write_enable),
+ .decoded_ret(decoded_ret),
+ .lsu_state0(lsu_state[0]),
+ .lsu_state1(lsu_state[1]),
+ .lsu_state2(lsu_state[2]),
+ .lsu_state3(lsu_state[3]),
+ .current_pc(current_pc),
+ .next_pc0(next_pc[0]),
+ .next_pc1(next_pc[1]),
+ .next_pc2(next_pc[2]),
+ .next_pc3(next_pc[3]),
+ .done(done)
+ );
+
+ // Dedicated ALU, LSU, registers, & PC unit for each thread this core has capacity for
+ genvar i;
+ generate
+ for (i = 0; i < THREADS_PER_BLOCK; i = i + 1) begin : threads
+ // ALU
+ alu alu_instance (
+ .clk(clk),
+ .reset(reset),
+ .enable(i < thread_count),
+ .core_state(core_state),
+ .decoded_alu_arithmetic_mux(decoded_alu_arithmetic_mux),
+ .decoded_alu_output_mux(decoded_alu_output_mux),
+ .rs(rs[i]),
+ .rt(rt[i]),
+ .alu_out(alu_out[i])
+ );
+
+ // LSU
+ lsu lsu_instance (
+ .clk(clk),
+ .reset(reset),
+ .enable(i < thread_count),
+ .core_state(core_state),
+ .decoded_mem_read_enable(decoded_mem_read_enable),
+ .decoded_mem_write_enable(decoded_mem_write_enable),
+ .mem_read_valid(data_mem_read_valid[i]),
+ .mem_read_address(data_mem_read_address[i]),
+ .mem_read_ready(data_mem_read_ready[i]),
+ .mem_read_data(data_mem_read_data[i]),
+ .mem_write_valid(data_mem_write_valid[i]),
+ .mem_write_address(data_mem_write_address[i]),
+ .mem_write_data(data_mem_write_data[i]),
+ .mem_write_ready(data_mem_write_ready[i]),
+ .rs(rs[i]),
+ .rt(rt[i]),
+ .lsu_state(lsu_state[i]),
+ .lsu_out(lsu_out[i])
+ );
+
+ // Register File
+ registers #(
+ .THREADS_PER_BLOCK(THREADS_PER_BLOCK),
+ .THREAD_ID(i),
+ .DATA_BITS(DATA_MEM_DATA_BITS)
+ ) register_instance (
+ .clk(clk),
+ .reset(reset),
+ .enable(i < thread_count),
+ .block_id(block_id),
+ .core_state(core_state),
+ .decoded_reg_write_enable(decoded_reg_write_enable),
+ .decoded_reg_input_mux(decoded_reg_input_mux),
+ .decoded_rd_address(decoded_rd_address),
+ .decoded_rs_address(decoded_rs_address),
+ .decoded_rt_address(decoded_rt_address),
+ .decoded_immediate(decoded_immediate),
+ .alu_out(alu_out[i]),
+ .lsu_out(lsu_out[i]),
+ .rs(rs[i]),
+ .rt(rt[i])
+ );
+
+ // Program Counter
+ pc #(
+ .DATA_MEM_DATA_BITS(DATA_MEM_DATA_BITS),
+ .PROGRAM_MEM_ADDR_BITS(PROGRAM_MEM_ADDR_BITS)
+ ) pc_instance (
+ .clk(clk),
+ .reset(reset),
+ .enable(i < thread_count),
+ .core_state(core_state),
+ .decoded_nzp(decoded_nzp),
+ .decoded_immediate(decoded_immediate),
+ .decoded_nzp_write_enable(decoded_nzp_write_enable),
+ .decoded_pc_mux(decoded_pc_mux),
+ .alu_out(alu_out[i]),
+ .current_pc(current_pc),
+ .next_pc(next_pc[i])
+ );
+ end
+ endgenerate
+endmodule
diff --git a/visualization/src/dcr.sv b/visualization/src/dcr.sv
new file mode 100644
index 0000000..c12df85
--- /dev/null
+++ b/visualization/src/dcr.sv
@@ -0,0 +1,22 @@
+module dcr (
+    input wire clk,
+    input wire reset,
+
+    input wire device_control_write_enable,
+    input wire [7:0] device_control_data,
+    output wire [7:0] thread_count
+);
+    // Device control register; its full contents are exposed as thread_count
+    reg [7:0] device_control_register;
+    assign thread_count = device_control_register;
+
+    always @(posedge clk) begin
+        if (reset) begin
+            // Synchronous reset clears the register
+            device_control_register <= 8'b0;
+        end else if (device_control_write_enable) begin
+            // Capture new control data only while write enable is asserted
+            device_control_register <= device_control_data;
+        end
+    end
+endmodule
\ No newline at end of file
diff --git a/visualization/src/decoder.sv b/visualization/src/decoder.sv
new file mode 100644
index 0000000..a11f3d4
--- /dev/null
+++ b/visualization/src/decoder.sv
@@ -0,0 +1,128 @@
+module decoder (
+    input wire clk,
+    input wire reset,
+
+    input [2:0] core_state,
+    input [15:0] instruction,
+
+    // Instruction Signals
+    output reg [3:0] decoded_rd_address,
+    output reg [3:0] decoded_rs_address,
+    output reg [3:0] decoded_rt_address,
+    output reg [2:0] decoded_nzp,
+    output reg [7:0] decoded_immediate,
+
+    // Control Signals
+    output reg decoded_reg_write_enable,           // Enable writing to a register
+    output reg decoded_mem_read_enable,            // Enable reading from memory
+    output reg decoded_mem_write_enable,           // Enable writing to memory
+    output reg decoded_nzp_write_enable,           // Enable writing to NZP register
+    output reg [1:0] decoded_reg_input_mux,        // Select input to register
+    output reg [1:0] decoded_alu_arithmetic_mux,   // Select arithmetic operation
+    output reg decoded_alu_output_mux,             // Select operation in ALU
+    output reg decoded_pc_mux,                     // Select source of next PC
+
+    // Return (finished executing thread)
+    output reg decoded_ret
+);
+    // Opcode encodings, carried in instruction[15:12]
+    localparam NOP   = 4'b0000,
+               BRnzp = 4'b0001,
+               CMP   = 4'b0010,
+               ADD   = 4'b0011,
+               SUB   = 4'b0100,
+               MUL   = 4'b0101,
+               DIV   = 4'b0110,
+               LDR   = 4'b0111,
+               STR   = 4'b1000,
+               CONST = 4'b1001,
+               RET   = 4'b1111;
+
+    // Value of core_state during which a new instruction is decoded
+    localparam DECODE = 3'b010;
+
+    wire [3:0] opcode = instruction[15:12];
+
+    always @(posedge clk) begin
+        if (reset) begin
+            // Synchronous reset: clear all control signals and instruction fields
+            decoded_reg_write_enable <= 0;
+            decoded_mem_read_enable <= 0;
+            decoded_mem_write_enable <= 0;
+            decoded_nzp_write_enable <= 0;
+            decoded_reg_input_mux <= 0;
+            decoded_alu_arithmetic_mux <= 0;
+            decoded_alu_output_mux <= 0;
+            decoded_pc_mux <= 0;
+            decoded_ret <= 0;
+            decoded_rd_address <= 0;
+            decoded_rs_address <= 0;
+            decoded_rt_address <= 0;
+            decoded_immediate <= 0;
+            decoded_nzp <= 0;
+        end else if (core_state == DECODE) begin
+            // Instruction fields are sliced out on every decode, whatever the opcode.
+            // The immediate overlaps rs/rt (bits 7:0) and nzp overlaps rd (bits 11:9).
+            decoded_rd_address <= instruction[11:8];
+            decoded_rs_address <= instruction[7:4];
+            decoded_rt_address <= instruction[3:0];
+            decoded_immediate <= instruction[7:0];
+            decoded_nzp <= instruction[11:9];
+
+            // Every control signal defaults to 0 on each decode; the opcode arm below
+            // overrides the ones it needs (the later nonblocking assignment wins).
+            decoded_reg_write_enable <= 0;
+            decoded_mem_read_enable <= 0;
+            decoded_mem_write_enable <= 0;
+            decoded_nzp_write_enable <= 0;
+            decoded_reg_input_mux <= 0;
+            decoded_alu_arithmetic_mux <= 0;
+            decoded_alu_output_mux <= 0;
+            decoded_pc_mux <= 0;
+            decoded_ret <= 0;
+
+            // Opcodes without an arm leave all control signals at 0
+            case (opcode)
+                NOP: begin
+                    // no-op
+                end
+                BRnzp: decoded_pc_mux <= 1;
+                CMP: begin
+                    decoded_alu_output_mux <= 1;
+                    decoded_nzp_write_enable <= 1;
+                end
+                ADD: begin
+                    decoded_reg_write_enable <= 1;
+                    decoded_reg_input_mux <= 2'b00;
+                    decoded_alu_arithmetic_mux <= 2'b00;
+                end
+                SUB: begin
+                    decoded_reg_write_enable <= 1;
+                    decoded_reg_input_mux <= 2'b00;
+                    decoded_alu_arithmetic_mux <= 2'b01;
+                end
+                MUL: begin
+                    decoded_reg_write_enable <= 1;
+                    decoded_reg_input_mux <= 2'b00;
+                    decoded_alu_arithmetic_mux <= 2'b10;
+                end
+                DIV: begin
+                    decoded_reg_write_enable <= 1;
+                    decoded_reg_input_mux <= 2'b00;
+                    decoded_alu_arithmetic_mux <= 2'b11;
+                end
+                LDR: begin
+                    decoded_reg_write_enable <= 1;
+                    decoded_reg_input_mux <= 2'b01;
+                    decoded_mem_read_enable <= 1;
+                end
+                STR: decoded_mem_write_enable <= 1;
+                CONST: begin
+                    decoded_reg_write_enable <= 1;
+                    decoded_reg_input_mux <= 2'b10;
+                end
+                RET: decoded_ret <= 1;
+            endcase
+        end
+    end
+endmodule
diff --git a/visualization/src/dispatch.sv b/visualization/src/dispatch.sv
new file mode 100644
index 0000000..e47a975
--- /dev/null
+++ b/visualization/src/dispatch.sv
@@ -0,0 +1,82 @@
+module dispatch #(
+    parameter NUM_CORES = 2,         // Number of cores that blocks are dispatched to
+    parameter THREADS_PER_BLOCK = 4  // Maximum number of threads in one block
+) (
+    input wire clk,
+    input wire reset,
+    input wire start,
+    // Kernel Metadata
+    input wire [7:0] thread_count,
+    // Core States: one bit / one slice per core, core 0 in the least-significant position
+    input [NUM_CORES-1:0] core_done,
+    output reg [NUM_CORES-1:0] core_start,
+    output reg [NUM_CORES-1:0] core_reset,
+    output reg [NUM_CORES*8-1:0] core_block_id, // NUM_CORES x 8-bit: {..., core1_id, core0_id}
+    output reg [NUM_CORES*($clog2(THREADS_PER_BLOCK)+1)-1:0] core_thread_count, // NUM_CORES x COUNT_BITS: {..., core1_count, core0_count}
+    // Kernel Execution
+    output reg done
+);
+    // Width of one core's slice of core_thread_count (must hold 0..THREADS_PER_BLOCK)
+    localparam COUNT_BITS = $clog2(THREADS_PER_BLOCK) + 1;
+
+    // Total number of blocks: thread_count / THREADS_PER_BLOCK, rounded up
+    wire [7:0] total_blocks;
+    assign total_blocks = (thread_count + THREADS_PER_BLOCK - 1) / THREADS_PER_BLOCK;
+
+    // Keep track of how many blocks have been processed
+    reg [7:0] blocks_dispatched; // How many blocks have been sent to cores?
+    reg [7:0] blocks_done;       // How many blocks have finished processing?
+    reg start_execution;         // EDA: Unimportant hack used because of EDA tooling
+    integer i;
+
+    always @(posedge clk) begin
+        if (reset) begin
+            done <= 0;
+            blocks_dispatched = 0;
+            blocks_done = 0;
+            start_execution <= 0;
+            core_start <= {NUM_CORES{1'b0}};
+            core_reset <= {NUM_CORES{1'b1}};
+            core_block_id <= 0;
+            core_thread_count <= 0;
+        end else if (start) begin
+            // EDA: Indirect way to get @(posedge start) without driving from 2 different clocks
+            if (!start_execution) begin
+                start_execution <= 1;
+                for (i = 0; i < NUM_CORES; i = i + 1) begin
+                    core_reset[i] <= 1;
+                end
+            end
+            // If the last block has finished processing, mark this kernel as done executing
+            if (blocks_done == total_blocks) begin
+                done <= 1;
+            end
+
+            for (i = 0; i < NUM_CORES; i = i + 1) begin
+                if (core_reset[i]) begin
+                    core_reset[i] <= 0;
+                    // If this core was just reset, check if there are more blocks to be dispatched
+                    if (blocks_dispatched < total_blocks) begin
+                        core_start[i] <= 1;
+                        core_block_id[i*8 +: 8] <= blocks_dispatched;
+                        // The last block only receives the threads that are left over
+                        core_thread_count[i*COUNT_BITS +: COUNT_BITS] <= (blocks_dispatched == total_blocks - 1)
+                            ? thread_count - (blocks_dispatched * THREADS_PER_BLOCK)
+                            : THREADS_PER_BLOCK;
+                        // Blocking assignment: the next loop iteration reads the incremented count,
+                        blocks_dispatched = blocks_dispatched + 1; // so cores started on the same clock get different block ids
+                    end
+                end
+            end
+
+            for (i = 0; i < NUM_CORES; i = i + 1) begin
+                if (core_start[i] && core_done[i]) begin
+                    // If a core just finished executing its current block, reset it
+                    core_reset[i] <= 1;
+                    core_start[i] <= 0;
+                    blocks_done = blocks_done + 1;
+                end
+            end
+        end
+    end
+endmodule
\ No newline at end of file
diff --git a/visualization/src/fetcher.sv b/visualization/src/fetcher.sv
new file mode 100644
index 0000000..db1cfef
--- /dev/null
+++ b/visualization/src/fetcher.sv
@@ -0,0 +1,59 @@
+module fetcher #(
+    parameter PROGRAM_MEM_ADDR_BITS = 8,  // Width of current_pc and mem_read_address
+    parameter PROGRAM_MEM_DATA_BITS = 16  // Width of mem_read_data and instruction
+) (
+    input wire clk,
+    input wire reset,
+
+    // Execution State
+    input [2:0] core_state,
+    input [PROGRAM_MEM_ADDR_BITS-1:0] current_pc,
+
+    // Program Memory (valid/ready read handshake)
+    output reg mem_read_valid,
+    output reg [PROGRAM_MEM_ADDR_BITS-1:0] mem_read_address,
+    input mem_read_ready,
+    input [PROGRAM_MEM_DATA_BITS-1:0] mem_read_data,
+
+    // Fetcher Output
+    output reg [2:0] fetcher_state,
+    output reg [PROGRAM_MEM_DATA_BITS-1:0] instruction
+);
+    localparam IDLE = 3'b000,      // Waiting for core_state = FETCH
+        FETCHING = 3'b001,         // Read request issued, waiting for mem_read_ready
+        FETCHED = 3'b010;          // Instruction latched, waiting for core_state = DECODE
+
+    always @(posedge clk) begin
+        if (reset) begin
+            fetcher_state <= IDLE;
+            mem_read_valid <= 0;
+            mem_read_address <= 0;
+            instruction <= {PROGRAM_MEM_DATA_BITS{1'b0}};
+        end else begin
+            case (fetcher_state)
+                IDLE: begin
+                    // Start fetching when core_state = FETCH
+                    if (core_state == 3'b001) begin
+                        fetcher_state <= FETCHING;
+                        mem_read_valid <= 1;
+                        mem_read_address <= current_pc;
+                    end
+                end
+                FETCHING: begin
+                    // Wait for response from program memory
+                    if (mem_read_ready) begin
+                        fetcher_state <= FETCHED;
+                        instruction <= mem_read_data; // Store the instruction when received
+                        mem_read_valid <= 0;
+                    end
+                end
+                FETCHED: begin
+                    // Return to IDLE when core_state = DECODE; instruction keeps its value
+                    if (core_state == 3'b010) begin
+                        fetcher_state <= IDLE;
+                    end
+                end
+            endcase
+        end
+    end
+endmodule
diff --git a/visualization/src/gpu.sv b/visualization/src/gpu.sv
new file mode 100644
index 0000000..652abc2
--- /dev/null
+++ b/visualization/src/gpu.sv
@@ -0,0 +1,217 @@
+`default_nettype none
+`timescale 1ns/1ns
+
+// GPU
+// > Built to use an external async memory with multi-channel read/write
+// > Assumes that the program is loaded into program memory, data into data memory, and threads into
+// the device control register before the start signal is triggered
+// > Has memory controllers to interface between external memory and its multiple cores
+// > Configurable number of cores and thread capacity per core
+module gpu #(
+ parameter DATA_MEM_ADDR_BITS = 8, // Number of bits in data memory address (256 rows)
+ parameter DATA_MEM_DATA_BITS = 8, // Number of bits in data memory value (8 bit data)
+ parameter DATA_MEM_NUM_CHANNELS = 4, // Number of concurrent channels for sending requests to data memory
+ parameter PROGRAM_MEM_ADDR_BITS = 8, // Number of bits in program memory address (256 rows)
+ parameter PROGRAM_MEM_DATA_BITS = 16, // Number of bits in program memory value (16 bit instruction)
+ parameter PROGRAM_MEM_NUM_CHANNELS = 1, // Number of concurrent channels for sending requests to program memory
+ parameter NUM_CORES = 2, // Number of cores to include in this GPU
+ parameter THREADS_PER_BLOCK = 4 // Number of threads to handle per block (determines the compute resources of each core)
+) (
+ input wire clk,
+ input wire reset,
+
+ // Kernel Execution
+ input wire start,
+ output wire done,
+
+ // Device Control Register
+ input wire device_control_write_enable,
+ input wire [7:0] device_control_data,
+
+ // Program Memory (one valid/address/ready/data set per channel)
+ output wire [PROGRAM_MEM_NUM_CHANNELS-1:0] program_mem_read_valid,
+ output wire [PROGRAM_MEM_ADDR_BITS-1:0] program_mem_read_address [PROGRAM_MEM_NUM_CHANNELS-1:0],
+ input wire [PROGRAM_MEM_NUM_CHANNELS-1:0] program_mem_read_ready,
+ input wire [PROGRAM_MEM_DATA_BITS-1:0] program_mem_read_data [PROGRAM_MEM_NUM_CHANNELS-1:0],
+
+ // Data Memory (one read set and one write set per channel)
+ output wire [DATA_MEM_NUM_CHANNELS-1:0] data_mem_read_valid,
+ output wire [DATA_MEM_ADDR_BITS-1:0] data_mem_read_address [DATA_MEM_NUM_CHANNELS-1:0],
+ input wire [DATA_MEM_NUM_CHANNELS-1:0] data_mem_read_ready,
+ input wire [DATA_MEM_DATA_BITS-1:0] data_mem_read_data [DATA_MEM_NUM_CHANNELS-1:0],
+ output wire [DATA_MEM_NUM_CHANNELS-1:0] data_mem_write_valid,
+ output wire [DATA_MEM_ADDR_BITS-1:0] data_mem_write_address [DATA_MEM_NUM_CHANNELS-1:0],
+ output wire [DATA_MEM_DATA_BITS-1:0] data_mem_write_data [DATA_MEM_NUM_CHANNELS-1:0],
+ input wire [DATA_MEM_NUM_CHANNELS-1:0] data_mem_write_ready
+);
+ // Control: thread count output by the device control register, input to the dispatcher
+ wire [7:0] thread_count;
+
+ // Compute Core State (one entry per core, connecting the dispatcher to the cores)
+ reg [NUM_CORES-1:0] core_start;
+ reg [NUM_CORES-1:0] core_reset;
+ reg [NUM_CORES-1:0] core_done;
+ reg [7:0] core_block_id [NUM_CORES-1:0];
+ reg [$clog2(THREADS_PER_BLOCK):0] core_thread_count [NUM_CORES-1:0];
+
+ // LSU <> Data Memory Controller Channels
+ localparam NUM_LSUS = NUM_CORES * THREADS_PER_BLOCK; // One LSU per thread slot of every core
+ reg [NUM_LSUS-1:0] lsu_read_valid;
+ reg [DATA_MEM_ADDR_BITS-1:0] lsu_read_address [NUM_LSUS-1:0];
+ reg [NUM_LSUS-1:0] lsu_read_ready;
+ reg [DATA_MEM_DATA_BITS-1:0] lsu_read_data [NUM_LSUS-1:0];
+ reg [NUM_LSUS-1:0] lsu_write_valid;
+ reg [DATA_MEM_ADDR_BITS-1:0] lsu_write_address [NUM_LSUS-1:0];
+ reg [DATA_MEM_DATA_BITS-1:0] lsu_write_data [NUM_LSUS-1:0];
+ reg [NUM_LSUS-1:0] lsu_write_ready;
+
+ // Fetcher <> Program Memory Controller Channels
+ localparam NUM_FETCHERS = NUM_CORES; // One fetcher channel per core
+ reg [NUM_FETCHERS-1:0] fetcher_read_valid;
+ reg [PROGRAM_MEM_ADDR_BITS-1:0] fetcher_read_address [NUM_FETCHERS-1:0];
+ reg [NUM_FETCHERS-1:0] fetcher_read_ready;
+ reg [PROGRAM_MEM_DATA_BITS-1:0] fetcher_read_data [NUM_FETCHERS-1:0];
+
+ // Device Control Register
+ dcr dcr_instance (
+ .clk(clk),
+ .reset(reset),
+
+ .device_control_write_enable(device_control_write_enable),
+ .device_control_data(device_control_data),
+ .thread_count(thread_count)
+ );
+
+ // Data Memory Controller: connects NUM_LSUS consumers to DATA_MEM_NUM_CHANNELS memory channels
+ controller #(
+ .ADDR_BITS(DATA_MEM_ADDR_BITS),
+ .DATA_BITS(DATA_MEM_DATA_BITS),
+ .NUM_CONSUMERS(NUM_LSUS),
+ .NUM_CHANNELS(DATA_MEM_NUM_CHANNELS)
+ ) data_memory_controller (
+ .clk(clk),
+ .reset(reset),
+
+ .consumer_read_valid(lsu_read_valid),
+ .consumer_read_address(lsu_read_address),
+ .consumer_read_ready(lsu_read_ready),
+ .consumer_read_data(lsu_read_data),
+ .consumer_write_valid(lsu_write_valid),
+ .consumer_write_address(lsu_write_address),
+ .consumer_write_data(lsu_write_data),
+ .consumer_write_ready(lsu_write_ready),
+
+ .mem_read_valid(data_mem_read_valid),
+ .mem_read_address(data_mem_read_address),
+ .mem_read_ready(data_mem_read_ready),
+ .mem_read_data(data_mem_read_data),
+ .mem_write_valid(data_mem_write_valid),
+ .mem_write_address(data_mem_write_address),
+ .mem_write_data(data_mem_write_data),
+ .mem_write_ready(data_mem_write_ready)
+ );
+
+ // Program Memory Controller: WRITE_ENABLE is 0 and only the read ports are connected
+ controller #(
+ .ADDR_BITS(PROGRAM_MEM_ADDR_BITS),
+ .DATA_BITS(PROGRAM_MEM_DATA_BITS),
+ .NUM_CONSUMERS(NUM_FETCHERS),
+ .NUM_CHANNELS(PROGRAM_MEM_NUM_CHANNELS),
+ .WRITE_ENABLE(0)
+ ) program_memory_controller (
+ .clk(clk),
+ .reset(reset),
+
+ .consumer_read_valid(fetcher_read_valid),
+ .consumer_read_address(fetcher_read_address),
+ .consumer_read_ready(fetcher_read_ready),
+ .consumer_read_data(fetcher_read_data),
+
+ .mem_read_valid(program_mem_read_valid),
+ .mem_read_address(program_mem_read_address),
+ .mem_read_ready(program_mem_read_ready),
+ .mem_read_data(program_mem_read_data)
+ );
+
+ // Dispatcher
+ dispatch #(
+ .NUM_CORES(NUM_CORES),
+ .THREADS_PER_BLOCK(THREADS_PER_BLOCK)
+ ) dispatch_instance (
+ .clk(clk),
+ .reset(reset),
+ .start(start),
+ .thread_count(thread_count),
+ .core_done(core_done),
+ .core_start(core_start),
+ .core_reset(core_reset),
+ .core_block_id(core_block_id), // NOTE(review): unpacked array here, but dispatch.sv declares this port as one flat packed vector - confirm
+ .core_thread_count(core_thread_count), // NOTE(review): same packed-vs-unpacked question as core_block_id
+ .done(done)
+ );
+
+ // Compute Cores
+ genvar i;
+ generate
+ for (i = 0; i < NUM_CORES; i = i + 1) begin : cores
+ // EDA: We create separate signals here to pass to cores because of a requirement
+ // by the OpenLane EDA flow (uses Verilog 2005) that prevents slicing the top-level signals
+ reg [THREADS_PER_BLOCK-1:0] core_lsu_read_valid;
+ reg [DATA_MEM_ADDR_BITS-1:0] core_lsu_read_address [THREADS_PER_BLOCK-1:0];
+ reg [THREADS_PER_BLOCK-1:0] core_lsu_read_ready;
+ reg [DATA_MEM_DATA_BITS-1:0] core_lsu_read_data [THREADS_PER_BLOCK-1:0];
+ reg [THREADS_PER_BLOCK-1:0] core_lsu_write_valid;
+ reg [DATA_MEM_ADDR_BITS-1:0] core_lsu_write_address [THREADS_PER_BLOCK-1:0];
+ reg [DATA_MEM_DATA_BITS-1:0] core_lsu_write_data [THREADS_PER_BLOCK-1:0];
+ reg [THREADS_PER_BLOCK-1:0] core_lsu_write_ready;
+
+ // Pass through signals between LSUs and data memory controller.
+ // Each signal is copied on posedge clk, so it reaches the other side one clock edge later.
+ genvar j;
+ for (j = 0; j < THREADS_PER_BLOCK; j = j + 1) begin
+ localparam lsu_index = i * THREADS_PER_BLOCK + j; // Position of core i, thread j in the flat LSU arrays
+ always @(posedge clk) begin
+ lsu_read_valid[lsu_index] <= core_lsu_read_valid[j];
+ lsu_read_address[lsu_index] <= core_lsu_read_address[j];
+
+ lsu_write_valid[lsu_index] <= core_lsu_write_valid[j];
+ lsu_write_address[lsu_index] <= core_lsu_write_address[j];
+ lsu_write_data[lsu_index] <= core_lsu_write_data[j];
+
+ core_lsu_read_ready[j] <= lsu_read_ready[lsu_index];
+ core_lsu_read_data[j] <= lsu_read_data[lsu_index];
+ core_lsu_write_ready[j] <= lsu_write_ready[lsu_index];
+ end
+ end
+
+ // Compute Core: reset, start, block id and thread count come from the dispatcher signals above
+ core #(
+ .DATA_MEM_ADDR_BITS(DATA_MEM_ADDR_BITS),
+ .DATA_MEM_DATA_BITS(DATA_MEM_DATA_BITS),
+ .PROGRAM_MEM_ADDR_BITS(PROGRAM_MEM_ADDR_BITS),
+ .PROGRAM_MEM_DATA_BITS(PROGRAM_MEM_DATA_BITS),
+ .THREADS_PER_BLOCK(THREADS_PER_BLOCK)
+ ) core_instance (
+ .clk(clk),
+ .reset(core_reset[i]),
+ .start(core_start[i]),
+ .done(core_done[i]),
+ .block_id(core_block_id[i]),
+ .thread_count(core_thread_count[i]),
+
+ .program_mem_read_valid(fetcher_read_valid[i]),
+ .program_mem_read_address(fetcher_read_address[i]),
+ .program_mem_read_ready(fetcher_read_ready[i]),
+ .program_mem_read_data(fetcher_read_data[i]),
+
+ .data_mem_read_valid(core_lsu_read_valid),
+ .data_mem_read_address(core_lsu_read_address),
+ .data_mem_read_ready(core_lsu_read_ready),
+ .data_mem_read_data(core_lsu_read_data),
+ .data_mem_write_valid(core_lsu_write_valid),
+ .data_mem_write_address(core_lsu_write_address),
+ .data_mem_write_data(core_lsu_write_data),
+ .data_mem_write_ready(core_lsu_write_ready)
+ );
+ end
+ endgenerate
+endmodule
diff --git a/visualization/src/lsu.sv b/visualization/src/lsu.sv
new file mode 100644
index 0000000..7249d36
--- /dev/null
+++ b/visualization/src/lsu.sv
@@ -0,0 +1,104 @@
+module lsu (
+ input wire clk,
+ input wire reset,
+ input wire enable, // If current block has fewer threads than the block size, some LSUs will be inactive
+
+ // State
+ input [2:0] core_state,
+
+ // Memory Control Signals
+ input decoded_mem_read_enable,
+ input decoded_mem_write_enable,
+
+ // Registers: rs is the memory address for both reads and writes, rt is the data to write
+ input [7:0] rs,
+ input [7:0] rt,
+
+ // Data Memory (valid/ready handshake, separate read and write sides)
+ output reg mem_read_valid,
+ output reg [7:0] mem_read_address,
+ input mem_read_ready,
+ input [7:0] mem_read_data,
+ output reg mem_write_valid,
+ output reg [7:0] mem_write_address,
+ output reg [7:0] mem_write_data,
+ input mem_write_ready,
+
+ // LSU Outputs
+ output reg [1:0] lsu_state,
+ output reg [7:0] lsu_out
+);
+ localparam IDLE = 2'b00, REQUESTING = 2'b01, WAITING = 2'b10, DONE = 2'b11;
+
+ always @(posedge clk) begin
+ if (reset) begin
+ lsu_state <= IDLE;
+ lsu_out <= 0;
+ mem_read_valid <= 0;
+ mem_read_address <= 0;
+ mem_write_valid <= 0;
+ mem_write_address <= 0;
+ mem_write_data <= 0;
+ end else if (enable) begin
+ // If memory read enable is triggered (LDR instruction)
+ if (decoded_mem_read_enable) begin
+ case (lsu_state)
+ IDLE: begin
+ // Only start a read when core_state = REQUEST
+ if (core_state == 3'b011) begin
+ lsu_state <= REQUESTING;
+ end
+ end
+ REQUESTING: begin
+ mem_read_valid <= 1; // Raise the read request; the address comes from rs
+ mem_read_address <= rs;
+ lsu_state <= WAITING;
+ end
+ WAITING: begin
+ if (mem_read_ready == 1) begin
+ mem_read_valid <= 0; // Drop the request and latch the returned data
+ lsu_out <= mem_read_data;
+ lsu_state <= DONE;
+ end
+ end
+ DONE: begin
+ // Reset when core_state = UPDATE
+ if (core_state == 3'b110) begin
+ lsu_state <= IDLE;
+ end
+ end
+ endcase
+ end
+
+ // If memory write enable is triggered (STR instruction)
+ if (decoded_mem_write_enable) begin
+ case (lsu_state)
+ IDLE: begin
+ // Only start a write when core_state = REQUEST
+ if (core_state == 3'b011) begin
+ lsu_state <= REQUESTING;
+ end
+ end
+ REQUESTING: begin
+ mem_write_valid <= 1; // Raise the write request: address from rs, data from rt
+ mem_write_address <= rs;
+ mem_write_data <= rt;
+ lsu_state <= WAITING;
+ end
+ WAITING: begin
+ if (mem_write_ready) begin
+ mem_write_valid <= 0; // Drop the request once the write is acknowledged
+ lsu_state <= DONE;
+ end
+ end
+ DONE: begin
+ // Reset when core_state = UPDATE
+ if (core_state == 3'b110) begin
+ lsu_state <= IDLE;
+ end
+ end
+ endcase
+ end
+ end
+ end
+endmodule
diff --git a/visualization/src/pc.sv b/visualization/src/pc.sv
new file mode 100644
index 0000000..73f4c34
--- /dev/null
+++ b/visualization/src/pc.sv
@@ -0,0 +1,57 @@
+module pc #(
+    parameter DATA_MEM_DATA_BITS = 8,    // Width of alu_out
+    parameter PROGRAM_MEM_ADDR_BITS = 8  // Width of current_pc / next_pc
+) (
+    input wire clk,
+    input wire reset,
+    input wire enable, // If current block has fewer threads than the block size, some PCs will be inactive
+    // State
+    input [2:0] core_state,
+
+    // Control Signals
+    input [2:0] decoded_nzp,
+    input [7:0] decoded_immediate,
+    input decoded_nzp_write_enable,
+    input decoded_pc_mux,
+
+    // ALU Output - alu_out[2:0] is latched into the NZP register on CMP
+    input [DATA_MEM_DATA_BITS-1:0] alu_out,
+
+    // Current & Next PCs
+    input [PROGRAM_MEM_ADDR_BITS-1:0] current_pc,
+    output reg [PROGRAM_MEM_ADDR_BITS-1:0] next_pc
+);
+    // Condition bits stored by the most recent CMP and tested by BRnzp
+    reg [2:0] nzp;
+
+    always @(posedge clk) begin
+        if (reset) begin
+            nzp <= 3'b0;
+            next_pc <= 0;
+        end else if (enable) begin
+            // Update PC when core_state = EXECUTE
+            if (core_state == 3'b101) begin
+                if (decoded_pc_mux == 1) begin
+                    if (((nzp & decoded_nzp) != 3'b0)) begin
+                        // On BRnzp instruction, branch to immediate if NZP case matches previous CMP
+                        next_pc <= decoded_immediate;
+                    end else begin
+                        // Otherwise, just update to PC + 1 (next line)
+                        next_pc <= current_pc + 1;
+                    end
+                end else begin
+                    // By default update to PC + 1 (next line)
+                    next_pc <= current_pc + 1;
+                end
+            end
+
+            // Store NZP when core_state = UPDATE
+            if (core_state == 3'b110) begin
+                // Write to NZP register on CMP instruction
+                if (decoded_nzp_write_enable) begin
+                    nzp <= alu_out[2:0];
+                end
+            end
+        end
+    end
+endmodule
diff --git a/visualization/src/registers.sv b/visualization/src/registers.sv
new file mode 100644
index 0000000..95009e7
--- /dev/null
+++ b/visualization/src/registers.sv
@@ -0,0 +1,92 @@
+module registers #(
+    parameter THREADS_PER_BLOCK = 4, // Value loaded into %blockDim (R14) on reset
+    parameter THREAD_ID = 0,         // Value loaded into %threadIdx (R15) on reset; override per thread
+    parameter DATA_BITS = 8          // Width of the alu_out / lsu_out inputs
+) (
+    input wire clk,
+    input wire reset,
+    input wire enable, // If current block has fewer threads than the block size, some registers will be inactive
+
+    // Kernel Execution
+    input wire [7:0] block_id,
+    // State
+    input wire [2:0] core_state,
+    // Instruction Signals
+    input wire [3:0] decoded_rd_address,
+    input wire [3:0] decoded_rs_address,
+    input wire [3:0] decoded_rt_address,
+    // Control Signals
+    input wire decoded_reg_write_enable,
+    input wire [1:0] decoded_reg_input_mux,
+    input wire [7:0] decoded_immediate,
+    // Thread Unit Outputs
+    input wire [DATA_BITS-1:0] alu_out,
+    input wire [DATA_BITS-1:0] lsu_out,
+
+    // Registers
+    output reg [7:0] rs,
+    output reg [7:0] rt
+);
+    localparam ARITHMETIC = 2'b00,
+        MEMORY = 2'b01,
+        CONSTANT = 2'b10;
+
+    // 16 registers per thread (13 free registers and 3 read-only registers)
+    reg [7:0] registers[15:0];
+
+    always @(posedge clk) begin
+        if (reset) begin
+            // Empty rs, rt
+            rs <= 0;
+            rt <= 0;
+            // Initialize all free registers
+            registers[0] <= 8'b0;
+            registers[1] <= 8'b0;
+            registers[2] <= 8'b0;
+            registers[3] <= 8'b0;
+            registers[4] <= 8'b0;
+            registers[5] <= 8'b0;
+            registers[6] <= 8'b0;
+            registers[7] <= 8'b0;
+            registers[8] <= 8'b0;
+            registers[9] <= 8'b0;
+            registers[10] <= 8'b0;
+            registers[11] <= 8'b0;
+            registers[12] <= 8'b0;
+            // Initialize read-only registers
+            registers[13] <= 8'b0;              // %blockIdx
+            registers[14] <= THREADS_PER_BLOCK; // %blockDim
+            registers[15] <= THREAD_ID;         // %threadIdx
+        end else if (enable) begin
+            // [Bad Solution] Shouldn't need to set this every cycle
+            registers[13] <= block_id; // Update the block_id when a new block is issued from dispatcher
+
+            // Fill rs/rt when core_state = REQUEST
+            if (core_state == 3'b011) begin
+                rs <= registers[decoded_rs_address];
+                rt <= registers[decoded_rt_address];
+            end
+
+            // Store rd when core_state = UPDATE
+            if (core_state == 3'b110) begin
+                // Only allow writing to R0 - R12
+                if (decoded_reg_write_enable && decoded_rd_address < 13) begin
+                    case (decoded_reg_input_mux)
+                        ARITHMETIC: begin
+                            // ADD, SUB, MUL, DIV
+                            registers[decoded_rd_address] <= alu_out;
+                        end
+                        MEMORY: begin
+                            // LDR
+                            registers[decoded_rd_address] <= lsu_out;
+                        end
+                        CONSTANT: begin
+                            // CONST
+                            registers[decoded_rd_address] <= decoded_immediate;
+                        end
+                    endcase
+                end
+            end
+        end
+    end
+endmodule
diff --git a/visualization/src/scheduler.sv b/visualization/src/scheduler.sv
new file mode 100644
index 0000000..d2eab70
--- /dev/null
+++ b/visualization/src/scheduler.sv
@@ -0,0 +1,102 @@
+module scheduler (
+    input wire clk,
+    input wire reset,
+    input wire start,
+
+    // Control Signals (only decoded_ret is read inside this module)
+    input wire decoded_mem_read_enable,
+    input wire decoded_mem_write_enable,
+    input wire decoded_ret,
+
+    // Memory Access State
+    input wire [2:0] fetcher_state,
+    input wire [7:0] lsu_state,
+
+    // Current & Next PC
+    output reg [7:0] current_pc,
+    input wire [31:0] next_pc,
+
+    // Execution State
+    output reg [2:0] core_state,
+    output reg done
+);
+    localparam IDLE    = 3'b000, // Waiting to start
+               FETCH   = 3'b001, // Fetch instructions from program memory
+               DECODE  = 3'b010, // Decode instructions into control signals
+               REQUEST = 3'b011, // Request data from registers or memory
+               WAIT    = 3'b100, // Wait for response from memory if necessary
+               EXECUTE = 3'b101, // Execute ALU and PC calculations
+               UPDATE  = 3'b110, // Update registers, NZP, and PC
+               DONE    = 3'b111; // Done executing this block
+
+    // Encodings of the fetcher and LSU states that are tested below
+    localparam FETCHER_FETCHED = 3'b010;
+    localparam LSU_REQUESTING = 2'b01, LSU_WAITING = 2'b10;
+    // An LSU counts as busy while its state is REQUESTING or WAITING
+    function lsu_busy;
+        input [1:0] state;
+        begin
+            lsu_busy = (state == LSU_REQUESTING) || (state == LSU_WAITING);
+        end
+    endfunction
+
+    // lsu_state carries four 2-bit LSU states; the core stays in WAIT while any is busy
+    wire any_lsu_waiting;
+    assign any_lsu_waiting = lsu_busy(lsu_state[1:0]) || lsu_busy(lsu_state[3:2]) ||
+                             lsu_busy(lsu_state[5:4]) || lsu_busy(lsu_state[7:6]);
+
+    always @(posedge clk) begin
+        if (reset) begin
+            current_pc <= 0;
+            core_state <= IDLE;
+            done <= 0;
+        end else begin
+            case (core_state)
+                IDLE: begin
+                    // Reached after reset; leave only once start is asserted
+                    if (start) begin
+                        core_state <= FETCH;
+                    end
+                end
+                FETCH: begin
+                    // Stay here until the fetcher reports FETCHED
+                    if (fetcher_state == FETCHER_FETCHED) begin
+                        core_state <= DECODE;
+                    end
+                end
+                DECODE: begin
+                    // Always a single cycle
+                    core_state <= REQUEST;
+                end
+                REQUEST: begin
+                    // Always a single cycle
+                    core_state <= WAIT;
+                end
+                WAIT: begin
+                    // Advance only when no LSU is REQUESTING or WAITING
+                    if (!any_lsu_waiting) begin
+                        core_state <= EXECUTE;
+                    end
+                end
+                EXECUTE: begin
+                    // Always a single cycle
+                    core_state <= UPDATE;
+                end
+                UPDATE: begin
+                    if (decoded_ret) begin
+                        // RET: this block is finished; done stays high until reset
+                        done <= 1;
+                        core_state <= DONE;
+                    end else begin
+                        // TODO: Branch divergence. For now assume all next_pc converge
+                        // NOTE(review): only the top byte of next_pc is used; pc.sv holds next_pc while its
+                        // enable is low, so confirm this byte comes from a thread active in partial blocks
+                        current_pc <= next_pc[31:24];
+                        core_state <= FETCH;
+                    end
+                end
+                DONE: ; // Terminal state: nothing changes until reset
+            endcase
+        end
+    end
+endmodule
diff --git a/visualization/src/splitter144.sv b/visualization/src/splitter144.sv
new file mode 100644
index 0000000..8493eb2
--- /dev/null
+++ b/visualization/src/splitter144.sv
@@ -0,0 +1,79 @@
+module splitter144 (
+    input wire [35:0] in0,
+    input wire [35:0] in1,
+    input wire [35:0] in2,
+    input wire [35:0] in3,
+
+    output wire [3:0] out0,
+    output wire [31:0] out1,
+    output wire [3:0] out2,
+    output wire [31:0] out3,
+    output wire [31:0] out4,
+    output wire [7:0] out5,
+    output wire [31:0] out6
+);
+    // Purely combinational regrouping of four 36-bit inputs into seven outputs.
+    // Each input is treated as seven fields; every output collects one field from
+    // all four inputs. For input bit b, the output nibble is {in3[b], in2[b], in1[b], in0[b]},
+    // and nibbles are placed in ascending bit order starting at the output's bit 0.
+    //
+    //   field | bits of each inN | output | output width
+    //     0   | [0]              | out0   |  4
+    //     1   | [8:1]            | out1   | 32
+    //     2   | [9]              | out2   |  4
+    //     3   | [17:10]          | out3   | 32
+    //     4   | [25:18]          | out4   | 32
+    //     5   | [27:26]          | out5   |  8
+    //     6   | [35:28]          | out6   | 32
+
+    // Field 0: input bit 0
+    assign out0 = {in3[0], in2[0], in1[0], in0[0]};
+
+    // Field 1: input bits 1..8
+    assign out1[3:0]   = {in3[1], in2[1], in1[1], in0[1]};
+    assign out1[7:4]   = {in3[2], in2[2], in1[2], in0[2]};
+    assign out1[11:8]  = {in3[3], in2[3], in1[3], in0[3]};
+    assign out1[15:12] = {in3[4], in2[4], in1[4], in0[4]};
+    assign out1[19:16] = {in3[5], in2[5], in1[5], in0[5]};
+    assign out1[23:20] = {in3[6], in2[6], in1[6], in0[6]};
+    assign out1[27:24] = {in3[7], in2[7], in1[7], in0[7]};
+    assign out1[31:28] = {in3[8], in2[8], in1[8], in0[8]};
+
+    // Field 2: input bit 9
+    assign out2 = {in3[9], in2[9], in1[9], in0[9]};
+
+    // Field 3: input bits 10..17
+    assign out3[3:0]   = {in3[10], in2[10], in1[10], in0[10]};
+    assign out3[7:4]   = {in3[11], in2[11], in1[11], in0[11]};
+    assign out3[11:8]  = {in3[12], in2[12], in1[12], in0[12]};
+    assign out3[15:12] = {in3[13], in2[13], in1[13], in0[13]};
+    assign out3[19:16] = {in3[14], in2[14], in1[14], in0[14]};
+    assign out3[23:20] = {in3[15], in2[15], in1[15], in0[15]};
+    assign out3[27:24] = {in3[16], in2[16], in1[16], in0[16]};
+    assign out3[31:28] = {in3[17], in2[17], in1[17], in0[17]};
+
+    // Field 4: input bits 18..25
+    assign out4[3:0]   = {in3[18], in2[18], in1[18], in0[18]};
+    assign out4[7:4]   = {in3[19], in2[19], in1[19], in0[19]};
+    assign out4[11:8]  = {in3[20], in2[20], in1[20], in0[20]};
+    assign out4[15:12] = {in3[21], in2[21], in1[21], in0[21]};
+    assign out4[19:16] = {in3[22], in2[22], in1[22], in0[22]};
+    assign out4[23:20] = {in3[23], in2[23], in1[23], in0[23]};
+    assign out4[27:24] = {in3[24], in2[24], in1[24], in0[24]};
+    assign out4[31:28] = {in3[25], in2[25], in1[25], in0[25]};
+
+    // Field 5: input bits 26..27
+    assign out5[3:0] = {in3[26], in2[26], in1[26], in0[26]};
+    assign out5[7:4] = {in3[27], in2[27], in1[27], in0[27]};
+
+    // Field 6: input bits 28..35
+    assign out6[3:0]   = {in3[28], in2[28], in1[28], in0[28]};
+    assign out6[7:4]   = {in3[29], in2[29], in1[29], in0[29]};
+    assign out6[11:8]  = {in3[30], in2[30], in1[30], in0[30]};
+    assign out6[15:12] = {in3[31], in2[31], in1[31], in0[31]};
+    assign out6[19:16] = {in3[32], in2[32], in1[32], in0[32]};
+    assign out6[23:20] = {in3[33], in2[33], in1[33], in0[33]};
+    assign out6[27:24] = {in3[34], in2[34], in1[34], in0[34]};
+    assign out6[31:28] = {in3[35], in2[35], in1[35], in0[35]};
+endmodule
+
diff --git a/visualization/thread.dig b/visualization/thread.dig
new file mode 100644
index 0000000..4d79eda
--- /dev/null
+++ b/visualization/thread.dig
@@ -0,0 +1,1214 @@
+
+
+ 2
+
+
+ backgroundColor
+
+ 204
+ 204
+ 255
+ 255
+
+
+
+ romContent
+
+
+
+
+
+ view1
+
+
+ 0.38742048900000015
+ 0.0
+ 0.0
+ 0.38742048900000015
+ 313.20857437599994
+ 208.51280642699982
+
+
+
+
+ Width
+ 22
+
+
+ view4
+
+
+ 0.38742048900000015
+ 0.0
+ 0.0
+ 0.38742048900000015
+ 313.20857437599994
+ 208.51280642699982
+
+
+
+
+
+
+ ExternalFile
+
+
+ applicationType
+ IVERILOG
+
+
+ Label
+ alu
+
+
+ externalInputs
+ clk,reset,enable,core_state:3,decoded_alu_arithmetic_mux:2,decoded_alu_output_mux,rs:8,rt:8
+
+
+ iverilogOptions
+ -I ./src/
+
+
+ externalOutputs
+ alu_out:8
+
+
+ Width
+ 14
+
+
+ CodeFile
+ ./src/alu.sv
+
+
+
+
+
+ ExternalFile
+
+
+ applicationType
+ IVERILOG
+
+
+ Label
+ lsu
+
+
+ externalInputs
+ clk,reset,enable,core_state:3,decoded_mem_read_enable,decoded_mem_write_enable,rs:8,rt:8,mem_read_ready,mem_read_data:8,mem_write_ready
+
+
+ iverilogOptions
+ -I ./src/
+
+
+ externalOutputs
+ mem_read_valid,mem_read_address:8,mem_write_valid,mem_write_address:8,mem_write_data:8,lsu_state:2,lsu_out:8
+
+
+ Width
+ 20
+
+
+ CodeFile
+ ./src/lsu.sv
+
+
+
+
+
+ ExternalFile
+
+
+ applicationType
+ IVERILOG
+
+
+ Label
+ registers
+
+
+ externalInputs
+ clk,reset,enable,block_id:8,core_state:3,decoded_rd_address:4,decoded_rs_address:4,decoded_rt_address:4,decoded_reg_write_enable,decoded_reg_input_mux:2,decoded_immediate:8,alu_out:8,lsu_out:8
+
+
+ iverilogOptions
+ -I ./src/
+
+
+ externalOutputs
+ rs:8,rt:8
+
+
+ Width
+ 12
+
+
+ CodeFile
+ ./src/registers.sv
+
+
+
+
+
+ ExternalFile
+
+
+ applicationType
+ IVERILOG
+
+
+ Label
+ pc
+
+
+ externalInputs
+ clk,reset,enable,core_state:3,decoded_nzp:3,decoded_immediate:8,decoded_nzp_write_enable,decoded_pc_mux,alu_out:8,current_pc:8
+
+
+ iverilogOptions
+ -I ./src/
+
+
+ externalOutputs
+ next_pc:8
+
+
+ Width
+ 13
+
+
+ CodeFile
+ ./src/pc.sv
+
+
+
+
+
+ Tunnel
+
+
+ rotation
+
+
+
+ NetName
+ reset
+
+
+
+
+
+ Tunnel
+
+
+ rotation
+
+
+
+ NetName
+ reset
+
+
+
+
+
+ Tunnel
+
+
+ rotation
+
+
+
+ NetName
+ reset
+
+
+
+
+
+ Tunnel
+
+
+ rotation
+
+
+
+ NetName
+ reset
+
+
+
+
+
+ Tunnel
+
+
+ rotation
+
+
+
+ NetName
+ clk
+
+
+
+
+
+ Tunnel
+
+
+ rotation
+
+
+
+ NetName
+ clk
+
+
+
+
+
+ Tunnel
+
+
+ rotation
+
+
+
+ NetName
+ clk
+
+
+
+
+
+ Tunnel
+
+
+ rotation
+
+
+
+ NetName
+ clk
+
+
+
+
+
+ Clock
+
+
+ Label
+ clk
+
+
+
+
+
+ Tunnel
+
+
+ NetName
+ clk
+
+
+
+
+
+ Tunnel
+
+
+ rotation
+
+
+
+ NetName
+ cs
+
+
+
+
+
+ Tunnel
+
+
+ rotation
+
+
+
+ NetName
+ cs
+
+
+
+
+
+ Tunnel
+
+
+ rotation
+
+
+
+ NetName
+ cs
+
+
+
+
+
+ Tunnel
+
+
+ rotation
+
+
+
+ NetName
+ cs
+
+
+
+
+
+ In
+
+
+ Label
+ reset
+
+
+
+
+
+ Tunnel
+
+
+ NetName
+ reset
+
+
+
+
+
+ In
+
+
+ Label
+ cs
+
+
+ Bits
+ 3
+
+
+
+
+
+ Tunnel
+
+
+ NetName
+ cs
+
+
+
+
+
+ Tunnel
+
+
+ rotation
+
+
+
+ NetName
+ en
+
+
+
+
+
+ Tunnel
+
+
+ rotation
+
+
+
+ NetName
+ en
+
+
+
+
+
+ Tunnel
+
+
+ rotation
+
+
+
+ NetName
+ en
+
+
+
+
+
+ Tunnel
+
+
+ rotation
+
+
+
+ NetName
+ en
+
+
+
+
+
+ In
+
+
+ Label
+ en
+
+
+
+
+
+ In
+
+
+ Label
+ decoded\_rd\_address
+
+
+ Bits
+ 4
+
+
+
+
+
+ In
+
+
+ Label
+ decoded\_rs\_address
+
+
+ Bits
+ 4
+
+
+
+
+
+ In
+
+
+ Label
+ decoded\_rt\_address
+
+
+ Bits
+ 4
+
+
+
+
+
+ In
+
+
+ Label
+ decoded\_nzp
+
+
+ Bits
+ 3
+
+
+
+
+
+ Tunnel
+
+
+ NetName
+ en
+
+
+
+
+
+ In
+
+
+ Label
+ decoded\_immediate
+
+
+ Bits
+ 8
+
+
+
+
+
+ In
+
+
+ Label
+ decoded\_reg\_write\_enable
+
+
+
+
+
+ In
+
+
+ Label
+ decoded\_mem\_read\_enable
+
+
+
+
+
+ Out
+
+
+ Label
+ mem\_read\_valid
+
+
+
+
+
+ Out
+
+
+ Label
+ mem\_read\_address
+
+
+ Bits
+ 8
+
+
+
+
+
+ Out
+
+
+ Label
+ mem\_write\_valid
+
+
+
+
+
+ Out
+
+
+ Label
+ mem\_write\_address
+
+
+ Bits
+ 8
+
+
+
+
+
+ Out
+
+
+ Label
+ mem\_write\_data
+
+
+ Bits
+ 8
+
+
+
+
+
+ Out
+
+
+ Label
+ lsu\_state
+
+
+ Bits
+ 2
+
+
+
+
+
+ In
+
+
+ Label
+ decoded\_mem\_write\_enable
+
+
+
+
+
+ In
+
+
+ Label
+ decoded\_nzp\_write\_enable
+
+
+
+
+
+ In
+
+
+ Label
+ decoded\_reg\_input\_mux
+
+
+ Bits
+ 2
+
+
+
+
+
+ In
+
+
+ Label
+ decoded\_alu\_arithmetic\_mux
+
+
+ Bits
+ 2
+
+
+
+
+
+ In
+
+
+ Label
+ decoded\_alu\_output\_mux
+
+
+
+
+
+ In
+
+
+ Label
+ decoded\_pc\_mux
+
+
+
+
+
+ In
+
+
+ Label
+ block\_id
+
+
+ Bits
+ 8
+
+
+
+
+
+ In
+
+
+ Label
+ current\_pc
+
+
+ Bits
+ 8
+
+
+
+
+
+ In
+
+
+ Label
+ mem\_read\_ready
+
+
+
+
+
+ In
+
+
+ Label
+ mem\_read\_data
+
+
+ Bits
+ 8
+
+
+
+
+
+ In
+
+
+ Label
+ mem\_write\_ready
+
+
+
+
+
+ Out
+
+
+ Label
+ next\_pc
+
+
+ Bits
+ 8
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
\ No newline at end of file
diff --git a/visualization/threads.dig b/visualization/threads.dig
new file mode 100644
index 0000000..de871de
--- /dev/null
+++ b/visualization/threads.dig
@@ -0,0 +1,1800 @@
+
+
+ 2
+
+
+ backgroundColor
+
+ 255
+ 204
+ 204
+ 255
+
+
+
+ romContent
+
+
+
+
+
+ Width
+ 21
+
+
+
+
+ thread.dig
+
+
+ Label
+ thread0
+
+
+
+
+
+ Tunnel
+
+
+ rotation
+
+
+
+ NetName
+ clk
+
+
+
+
+
+ Tunnel
+
+
+ rotation
+
+
+
+ NetName
+ reset
+
+
+
+
+
+ Tunnel
+
+
+ rotation
+
+
+
+ NetName
+ cs
+
+
+
+
+
+ thread.dig
+
+
+ Label
+ thread2
+
+
+
+
+
+ Tunnel
+
+
+ rotation
+
+
+
+ NetName
+ clk
+
+
+
+
+
+ Tunnel
+
+
+ rotation
+
+
+
+ NetName
+ reset
+
+
+
+
+
+ Tunnel
+
+
+ rotation
+
+
+
+ NetName
+ cs
+
+
+
+
+
+ Clock
+
+
+ Label
+ clk
+
+
+
+
+
+ In
+
+
+ Label
+ reset
+
+
+
+
+
+ In
+
+
+ Label
+ cs
+
+
+ Bits
+ 3
+
+
+
+
+
+ In
+
+
+ Label
+ decoded\_rd\_address
+
+
+ Bits
+ 4
+
+
+
+
+
+ In
+
+
+ Label
+ decoded\_rs\_address
+
+
+ Bits
+ 4
+
+
+
+
+
+ In
+
+
+ Label
+ decoded\_rt\_address
+
+
+ Bits
+ 4
+
+
+
+
+
+ In
+
+
+ Label
+ decoded\_nzp
+
+
+ Bits
+ 3
+
+
+
+
+
+ Tunnel
+
+
+ NetName
+ clk
+
+
+
+
+
+ In
+
+
+ Label
+ decoded\_immediate
+
+
+ Bits
+ 8
+
+
+
+
+
+ Tunnel
+
+
+ NetName
+ reset
+
+
+
+
+
+ In
+
+
+ Label
+ decoded\_reg\_write\_enable
+
+
+
+
+
+ Tunnel
+
+
+ NetName
+ cs
+
+
+
+
+
+ In
+
+
+ Label
+ decoded\_mem\_read\_enable
+
+
+
+
+
+ In
+
+
+ Label
+ decoded\_mem\_write\_enable
+
+
+
+
+
+ In
+
+
+ Label
+ decoded\_nzp\_write\_enable
+
+
+
+
+
+ In
+
+
+ Label
+ decoded\_reg\_input\_mux
+
+
+ Bits
+ 2
+
+
+
+
+
+ In
+
+
+ Label
+ decoded\_alu\_arithmetic\_mux
+
+
+ Bits
+ 2
+
+
+
+
+
+ In
+
+
+ Label
+ decoded\_alu\_output\_mux
+
+
+
+
+
+ In
+
+
+ Label
+ decoded\_pc\_mux
+
+
+
+
+
+ In
+
+
+ Label
+ current\_pc
+
+
+ Bits
+ 8
+
+
+
+
+
+ In
+
+
+ Label
+ block\_id
+
+
+ Bits
+ 8
+
+
+
+
+
+ In
+
+
+ Label
+ data\_mem\_read\_ready
+
+
+ Bits
+ 4
+
+
+
+
+
+ In
+
+
+ Label
+ data\_mem\_read\_data
+
+
+ Bits
+ 32
+
+
+
+
+
+ In
+
+
+ Label
+ data\_mem\_write\_ready
+
+
+ Bits
+ 4
+
+
+
+
+
+ Splitter
+
+
+ splitterSpreading
+ 2
+
+
+ Input Splitting
+ 4,4,4,3,8,1,1,1,1,2,2,1,1,8,8
+
+
+ Output Splitting
+ 49
+
+
+
+
+
+ Splitter
+
+
+ Input Splitting
+ 49
+
+
+ Output Splitting
+ 4,4,4,3,8,1,1,1,1,2,2,1,1,8,8
+
+
+
+
+
+ Splitter
+
+
+ Input Splitting
+ 49
+
+
+ Output Splitting
+ 4,4,4,3,8,1,1,1,1,2,2,1,1,8,8
+
+
+
+
+
+ thread.dig
+
+
+ Label
+ thread1
+
+
+
+
+
+ Tunnel
+
+
+ rotation
+
+
+
+ NetName
+ clk
+
+
+
+
+
+ Tunnel
+
+
+ rotation
+
+
+
+ NetName
+ reset
+
+
+
+
+
+ Tunnel
+
+
+ rotation
+
+
+
+ NetName
+ cs
+
+
+
+
+
+ thread.dig
+
+
+ Label
+ thread3
+
+
+
+
+
+ Tunnel
+
+
+ rotation
+
+
+
+ NetName
+ clk
+
+
+
+
+
+ Tunnel
+
+
+ rotation
+
+
+
+ NetName
+ reset
+
+
+
+
+
+ Tunnel
+
+
+ rotation
+
+
+
+ NetName
+ cs
+
+
+
+
+
+ Splitter
+
+
+ Input Splitting
+ 49
+
+
+ Output Splitting
+ 4,4,4,3,8,1,1,1,1,2,2,1,1,8,8
+
+
+
+
+
+ Splitter
+
+
+ Input Splitting
+ 49
+
+
+ Output Splitting
+ 4,4,4,3,8,1,1,1,1,2,2,1,1,8,8
+
+
+
+
+
+ In
+
+
+ Label
+ thread\_count
+
+
+ Bits
+ 3
+
+
+
+
+
+ Comparator
+
+
+ Bits
+ 3
+
+
+
+
+
+ Const
+
+
+ Value
+ 0
+
+
+ Bits
+ 2
+
+
+
+
+
+ Const
+
+
+ Bits
+ 2
+
+
+
+
+
+ Const
+
+
+ Value
+ 2
+
+
+ Bits
+ 2
+
+
+
+
+
+ Const
+
+
+ Value
+ 3
+
+
+ Bits
+ 2
+
+
+
+
+
+ Out
+
+
+ Label
+ data\_mem\_read\_valid
+
+
+ Bits
+ 4
+
+
+
+
+
+ Out
+
+
+ Label
+ data\_mem\_read\_address
+
+
+ Bits
+ 32
+
+
+
+
+
+ Splitter
+
+
+ splitterSpreading
+ 2
+
+
+ Input Splitting
+ 4,32,4
+
+
+ Output Splitting
+ 40
+
+
+
+
+
+ splitter40-10.dig
+
+
+
+
+ splitter40-10.dig
+
+
+
+
+ splitter40-10.dig
+
+
+
+
+ splitter40-10.dig
+
+
+
+
+ Splitter
+
+
+ Input Splitting
+ 1,8,1,8,8,2,8
+
+
+ Output Splitting
+ 36
+
+
+
+
+
+ Splitter
+
+
+ Input Splitting
+ 1,8,1,8,8,2,8
+
+
+ Output Splitting
+ 36
+
+
+
+
+
+ Splitter
+
+
+ Input Splitting
+ 1,8,1,8,8,2,8
+
+
+ Output Splitting
+ 36
+
+
+
+
+
+ Splitter
+
+
+ Input Splitting
+ 1,8,1,8,8,2,8
+
+
+ Output Splitting
+ 36
+
+
+
+
+
+ Out
+
+
+ Label
+ data\_mem\_write\_valid
+
+
+ Bits
+ 4
+
+
+
+
+
+ Out
+
+
+ Label
+ data\_mem\_write\_address
+
+
+ Bits
+ 32
+
+
+
+
+
+ Out
+
+
+ Label
+ data\_mem\_write\_data
+
+
+ Bits
+ 32
+
+
+
+
+
+ Out
+
+
+ Label
+ lsu\_state
+
+
+ Bits
+ 8
+
+
+
+
+
+ Out
+
+
+ Label
+ next\_pc
+
+
+ Bits
+ 32
+
+
+
+
+
+ ExternalFile
+
+
+ applicationType
+ IVERILOG
+
+
+ Label
+ splitter144
+
+
+ externalInputs
+ in0:36,in1:36,in2:36,in3:36
+
+
+ iverilogOptions
+ -I ./src/
+
+
+ externalOutputs
+ out0:4,out1:32,out2:4,out3:32,out4:32,out5:8,out6:32
+
+
+ Width
+ 4
+
+
+ CodeFile
+ ./src/splitter144.sv
+
+
+
+
+
+ Const
+
+
+ Value
+ 0
+
+
+ Bits
+ 3
+
+
+
+
+
+ Comparator
+
+
+ Bits
+ 3
+
+
+
+
+
+ Const
+
+
+ Bits
+ 3
+
+
+
+
+
+ Comparator
+
+
+ Bits
+ 3
+
+
+
+
+
+ Const
+
+
+ Value
+ 2
+
+
+ Bits
+ 3
+
+
+
+
+
+ Comparator
+
+
+ Bits
+ 3
+
+
+
+
+
+ Const
+
+
+ Value
+ 3
+
+
+ Bits
+ 3
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
\ No newline at end of file