5 vuotta sitten · a67e2bb430
--- a/.gitignore
+++ b/.gitignore
@@ -11,6 +11,7 @@
 
				 !*.sv
			
 
				 !*.py
			
 
				 !sim_*.do
			
 
				+!wave_*_tb.do
			
 
				 !*.qip
			
 
				 
			
 
				 # Making sure nothing from there will be picked up
			
--- a/Makefile
+++ b/Makefile
@@ -31,11 +31,13 @@ QUARTUS_MACROS = --set VERILOG_MACRO="SYNTHESIS=1"
 
				 VSIM_ARGS = -L altera_ver -L lpm_ver -L sgate_ver -L altera_mf_ver -L altera_lnsim_ver -L cycloneive_ver -voptargs="+acc"
			
 
				 
			
 
				 
			
 
				-### Optional parameters
			
 
				-tb_file ?=
			
 
				-tb_dir = $(dirname "${testbench_file}")
			
 
				-tb_mod ?=
			
 
				-do_file ?=
			
 
				+### VERILOG SOURCE FILES
			
 
				+## Collects every top-level file matching src/*.sv and, for each
			
 
				+## subdirectory src/{MOD_NAME}/ that contains .sv files, its main file
			
 
				+## src/{MOD_NAME}/{MOD_NAME}.sv and/or src/{MOD_NAME}/include.sv (when present).
			
 
				+### Expanded once (:=) using built-in functions only, so no shell is spawned.
			
 
				+VERILOG_SRC := $(foreach SRC,$(sort $(dir $(wildcard ./src/*/*.sv))),$(wildcard $(SRC)$(notdir $(patsubst %/,%,$(SRC))).sv $(SRC)include.sv)) $(wildcard ./src/*.sv)
			
 
				+$(info VERILOG_SRC=$(VERILOG_SRC))
			
 
				 SIM_DIR ?= ./simulation/modelsim
			
 
				 
			
 
				 ### ================================================================
			
@@ -57,8 +59,8 @@ modelsim_cli:
 
				 %.gui: ${SIM_DIR}/%.do
			
 
				 	${MODELSIM_BIN} -gui -do "$<"
			
 
				 
			
 
				-testbench:
			
 
				-	${MODELSIM_BIN} -c -do "vlog -sv +incdir+${tb_dir} {${tb_file}}; vsim -t 1ps ${VSIM_ARGS} ${tb_mod}; run -all"
			
 
				+%_tb:
			
 
				+	${MODELSIM_BIN} -gui -do "$(foreach SRC,$(VERILOG_SRC),vlog -sv {${SRC}};) vsim -t 1ps ${VSIM_ARGS} ${@}; if { [file exists ${SIM_DIR}/wave_${@}.do ] == 1} { do ${SIM_DIR}/wave_${@}.do }"
			
 
				 
			
 
				 sim_fpa_mod.do:
			
 
				 	cd ./simulation/modelsim && ${MODELSIM_BIN} -gui -do $@
			
--- a/readme.md
+++ b/readme.md
@@ -14,12 +14,19 @@ make modelsim
 
				 
			
 
				 ### Running testbench
			
 
				 
			
 
				-This will run test testbench in console without opening modelsim GUI
			
 
				+Run a testbench directly in the ModelSim GUI:
			
 
				 ```bash
			
 
				-make tb_file=${file} tb_mod=${module} testbench
			
 
				-# Example
			
 
				-make tb_file=./src/root.sv tb_mod=root_tb testbench
			
 
				+make my_module_tb
			
 
				 ```
			
 
				+This includes all modules from src/*.sv, plus every subdirectory that contains a __main__ SystemVerilog file with the same name as the 
			
 
				+subdirectory, or an include.sv file.
			
 
				+
			
 
				+Any other SystemVerilog files in a subdirectory can be included using `` `include "FILE.sv" `` in the subdirectory's main file. 
			
 
				+
			
 
				+This command also loads the saved wave setup from **simulation/modelsim/wave_${my_module_tb}.do**, if that file exists. 
			
 
				+
			
 
				+
			
 
				+### Other testbench methods
			
 
				 Running testbench with defined simulation tcl script.
			
 
				 Scripts have to be located in **simulation/modelsim/sim_\*.do**
			
 
				 ```bash
			
@@ -27,4 +34,6 @@ Scripts has be located in **simulation/modelsim/sim_\*.do**
 
				 make sim_root_tb.gui
			
 
				 # Without GUI
			
 
				 make sim_root_tb.cli
			
 
				-```
			
 
				+```
			
 
				+
			
 
				+
			
--- a/simulation/modelsim/sim_neural.do
+++ b/simulation/modelsim/sim_neural.do
--- a/simulation/modelsim/wave_floating32_tb.do
+++ b/simulation/modelsim/wave_floating32_tb.do
@@ -0,0 +1,24 @@
 
				+onerror {resume}
			
 
				+quietly WaveActivateNextPane {} 0
			
 
				+add wave -noupdate -label INPUT_A -radix float32 /floating32_tb/input_a
			
 
				+add wave -noupdate -label INPUT_B -radix float32 /floating32_tb/input_b
			
 
				+add wave -noupdate -label RESULT_ADD -radix float32 /floating32_tb/result_add
			
 
				+add wave -noupdate -label RESULT_MULT -radix float32 /floating32_tb/result_mult
			
 
				+TreeUpdate [SetDefaultTree]
			
 
				+WaveRestoreCursors {{Cursor 1} {0 ps} 0}
			
 
				+quietly wave cursor active 0
			
 
				+configure wave -namecolwidth 150
			
 
				+configure wave -valuecolwidth 100
			
 
				+configure wave -justifyvalue left
			
 
				+configure wave -signalnamewidth 0
			
 
				+configure wave -snapdistance 10
			
 
				+configure wave -datasetprefix 0
			
 
				+configure wave -rowmargin 4
			
 
				+configure wave -childrowmargin 2
			
 
				+configure wave -gridoffset 0
			
 
				+configure wave -gridperiod 1
			
 
				+configure wave -griddelta 40
			
 
				+configure wave -timeline 0
			
 
				+configure wave -timelineunits ns
			
 
				+update
			
 
				+WaveRestoreZoom {0 ps} {47 ps}
			
--- a/src/FPA_module_test.sv
+++ b/src/FPA_module_test.sv
@@ -1,204 +1,156 @@
 
				-module floating_add #(parameter N=16, M=4)(input_1, input_2, sum, diff, clk, reset);
			
 
				+module floating_add #(parameter N=16, M=4)(input_1, input_2, sum, diff);
			
 
				 	input logic [N-1:0] input_1, input_2;
			
 
				-	input logic clk, reset;
			
 
				 	output logic [N-1:0] sum;
			
 
				 	output logic [M:0] diff;
			
 
				 
			
 
				-//	logic flag_a;
			
 
				-//	logic flag_b;
			
 
				-//	logic [M:0] abs;
			
 
				-//	logic [N-3-M:0] res;
			
 
				-	logic [N-1:0] D0 [7:0];
			
 
				-	logic [N-1:0] Q0 [7:0];
			
 
				-	logic [N-1:0] Q1 [7:0];
			
 
				-	logic [N-1:0] Q2 [7:0];
			
 
				+	logic flag_a;
			
 
				+	logic flag_b;
			
 
				+	logic [M:0] abs;
			
 
				+	logic [N-3-M:0] res;
			
 
				 	
			
 
				 	// sign_x = x[N-1]
			
 
				 	// exponent_x = x[N-2:N-2-M]
			
 
				 	// mantissa_x = x[N-3-M:0]
			
 
				 	
			
 
				-	// Pipeline stage 0
			
 
				-	always_comb
			
 
				-		begin
			
 
				-			D0[0] = input_1;
			
 
				-			D0[1] = input_2;
			
 
				-			D0[2] = 0 // sum
			
 
				-			D0[3] = 0 // diff
			
 
				-			D0[4] = 0 // flag_a
			
 
				-			Do[5] = 0 // flag_b
			
 
				-			D0[6] = 0 // abs
			
 
				-			D0[7] = 0 // res
			
 
				-		end
			
 
				-		
			
 
				-		pipe pipe_0(.clk(clk), .reset(reset), .D(D0), .Q(Q0));
			
 
				-	
			
 
				 	always_comb
			
 
				 		begin
			
 
				-			if (Q0[0][N-2:N-2-M] > Q0[1][N-2:N-2-M]) // If input 1 has the bigger exponent 
			
 
				+			if (input_1[N-2:N-2-M] > input_2[N-2:N-2-M]) // If input 1 has the bigger exponent 
			
 
				 				begin
			
 
				 					// Flags input a as larger and calculates the absolute difference
			
 
				-					Q0]4] = 1;
			
 
				-					Q0[5] = 0;
			
 
				-					Q0[6] = Q0[0][N-2:N-2-M] - Q0[1][N-2:N-2-M];
			
 
				+					flag_a = 1;
			
 
				+					flag_b = 0;
			
 
				+					abs = input_1[N-2:N-2-M] - input_2[N-2:N-2-M];
			
 
				 					// ASsigning overall sign of the output
			
 
				-					Q0[2][N-1] = Q0[0][N-1];
			
 
				+					sum[N-1] = input_1[N-1];
			
 
				 					// Sets output to have the same exponent
			
 
				-					Q0[2][N-2:N-2-M] = Q0[0][N-2:N-2-M];
			
 
				+					sum[N-2:N-2-M] = input_1[N-2:N-2-M];
			
 
				 				end
			
 
				-			else if (Q0[1][N-2:N-2-M] > Q0[0][N-2:N-2-M]) // If input 2 has the bigger exponent
			
 
				+			else if (input_2[N-2:N-2-M] > input_1[N-2:N-2-M]) // If input 2 has the bigger exponent
			
 
				 				begin
			
 
				 					// Similarly flags input b as larger and calculates the absolute difference
			
 
				-					Q0[4] = 0;
			
 
				-					Q0[5] = 1;
			
 
				-					Q0[6] = Q0[1][N-2:N-2-M] - Q0[0][N-2:N-2-M];
			
 
				+					flag_a = 0;
			
 
				+					flag_b = 1;
			
 
				+					abs = input_2[N-2:N-2-M] - input_1[N-2:N-2-M];
			
 
				 					// ASsigning overall sign of the output
			
 
				-					Q0[2][N-1] = Q0[1][N-1];
			
 
				+					sum[N-1] = input_2[N-1];
			
 
				 					// Sets ouput to have the same exponent
			
 
				-					Q0[2][N-2:N-2-M] = Q0[1][N-2:N-2-M];
			
 
				+					sum[N-2:N-2-M] = input_2[N-2:N-2-M];
			
 
				 				end
			
 
				 			else 
			
 
				 				begin
			
 
				 					// THe condition that both inputs have the same exponent
			
 
				-					Q0[4] = 1;
			
 
				-					Q0[5] = 1;
			
 
				-					Q0[6] = 0;
			
 
				+					flag_a = 1;
			
 
				+					flag_b = 1;
			
 
				+					abs = 0;
			
 
				 					// ASsigning overall sign of the output based on size of the mantissa
			
 
				-					if (Q0[0][N-3-M:0] >= Q0[1][N-3-M:0]) sum[N-1] = Q0[0][N-1];
			
 
				-					else Q0[2][N-1] = Q0[1][N-1];
			
 
				-					Q0[2][N-2:N-2-M] = Q0[0][N-2:N-2-M];
			
 
				+					if (input_1[N-3-M:0] >= input_2[N-3-M:0]) sum[N-1] = input_1[N-1];
			
 
				+					else sum[N-1] = input_2[N-1];
			
 
				+					sum[N-2:N-2-M] = input_1[N-2:N-2-M];
			
 
				 				end
			
 
				-			Q0[3] = Q0[6];
			
 
				+			diff = abs;
			
 
				 		end
			
 
				 		
			
 
				-		//Pipeline stage 1
			
 
				-		pipe pipe_1(.clk(clk), .reset(reset), .D(Q0), .Q(Q1));
			
 
				-		
			
 
				 	always_comb
			
 
				 		begin
			
 
				 			// Condition for overflow is that it sets the output to the larger input
			
 
				-			if (Q1[6] > N-M-2) // Because size of mantissa is 10 bits and shifting by 10 would give 0
			
 
				+			if (abs > 9) // Because size of mantissa is 10 bits and shifting by 10 would give 0
			
 
				 				begin
			
 
				-					if (Q1[4] & ~Q1[5]) Q1[2] = Q1[0]; // input 1 is larger and is translated to output
			
 
				-					else if (~Q1[4] & Q1[5]) Q1[2] = Q1[1]; // input 2 is larger and is translated to output
			
 
				+					if (flag_a & ~flag_b) sum = input_1; // input 1 is larger and is translated to output
			
 
				+					else if (~flag_a & flag_b) sum = input_2; // input 2 is larger and is translated to output
			
 
				 					else // exponents are the same
			
 
				 						begin
			
 
				-							if (Q1[0][N-3-M:0] >= Q1[1][N-3-M:0]) Q1[2] = Q1[0];// input 1 has the bigger mantissa
			
 
				-							else Q1[2] = Q1[1]; // input 2 has the bigger mantissa
			
 
				+							if (input_1[N-3-M:0] >= input_2[N-3-M:0]) sum = input_1;// input 1 has the bigger mantissa
			
 
				+							else sum = input_2; // input 2 has the bigger mantissa
			
 
				 						end
			
 
				 				end
			
 
				 			else
			
 
				 				begin
			
 
				 					// Shifts the smaller input's mantissa to the right based on abs
			
 
				-					if (Q1[4] & ~Q1[5])// If input 1 has the larger exponent
			
 
				+					if (flag_a & ~flag_b)// If input 1 has the larger exponent
			
 
				 						begin
			
 
				 							// If the signs of both inputs are the same you add, otherwise you subtract
			
 
				-							if (Q1[0][N-1] == Q1[1][N-1])
			
 
				+							if (input_1[N-1] == input_2[N-1])
			
 
				 								begin
			
 
				-									Q1[7] = Q1[0][N-3-M:0] + (Q1[1][N-3-M:0] >> Q1[6]-1); // Sum the mantissa
			
 
				-									Q1[2][N-3-M:0] = Q1[7];
			
 
				+									res = input_1[N-3-M:0] + (input_2[N-3-M:0] >> abs-1); // Sum the mantissa
			
 
				+									sum[N-3-M:0] = res;
			
 
				 								end
			
 
				 							else
			
 
				 								begin
			
 
				-									Q1[7] = Q1[0][N-3-M:0] - (Q1[1][N-3-M:0] >> Q1[6]-1); // Subtract the mantissas
			
 
				-									Q1[2][N-3-M:0] = Q1[7];
			
 
				+									res = input_1[N-3-M:0] - (input_2[N-3-M:0] >> abs-1); // Subtract the mantissas
			
 
				+									sum[N-3-M:0] = res;
			
 
				 								end
			
 
				 						end
			
 
				-					else if (~Q1[4] & Q1[5])
			
 
				+					else if (~flag_a & flag_b)
			
 
				 						begin
			
 
				 							// If the signs of both inputs are the same you add, otherwise you subtract
			
 
				-							if (Q1[0][N-1] == Q1[1][N-1])
			
 
				+							if (input_1[N-1] == input_2[N-1])
			
 
				 								begin
			
 
				-									Q1[7] = (Q1[0][N-3-M:0] >> Q1[6]-1) + Q1[1][N-3-M:0]; // Sum the mantissa
			
 
				-									Q1[2][N-3-M:0] = Q1[7];
			
 
				+									res = (input_1[N-3-M:0] >> abs-1) + input_2[N-3-M:0]; // Sum the mantissa
			
 
				+									sum[N-3-M:0] = res;
			
 
				 								end
			
 
				 							else
			
 
				 								begin
			
 
				-									Q1[7] = Q1[1][N-3-M:0] - (Q1[0][N-3-M:0] >> Q1[6]-1); // Subtract the mantissas
			
 
				-									Q1[2][N-3-M:0] = Q1[7];
			
 
				+									res = input_2[N-3-M:0] - (input_1[N-3-M:0] >> abs-1); // Subtract the mantissas
			
 
				+									sum[N-3-M:0] = res;
			
 
				 								end
			
 
				 						end
			
 
				 					else
			
 
				 						begin 
			
 
				-							if (Q1[0][N-1] == Q1[1][N-1]) // If exponents and signs equal
			
 
				+							if (input_1[N-1] == input_2[N-1]) // If exponents and signs equal
			
 
				 								begin
			
 
				-									Q1[7] = Q1[0][N-3-M:0] + Q1[1][N-3-M:0]; // Sum the mantissa
			
 
				-									Q1[2][N-3-M:0] = Q1[7];
			
 
				+									res = input_1[N-3-M:0] + input_2[N-3-M:0]; // Sum the mantissa
			
 
				+									sum[N-3-M:0] = res;
			
 
				 								end
			
 
				 							else // In this case it will be a subtraction
			
 
				 								begin
			
 
				-									if (Q1[0][N-3-M:0] > Q1[1][N-3-M:0]) // Which has the larger mantissa 
			
 
				+									if (input_1[N-3-M:0] > input_2[N-3-M:0]) // Which has the larger mantissa 
			
 
				 										begin
			
 
				-											Q1[7] = Q1[0][N-3-M:0] - Q1[1][N-3-M:0]; // Subtract the mantissa
			
 
				-											Q1[2][N-3-M:0] = Q1[7];
			
 
				+											res = input_1[N-3-M:0] - input_2[N-3-M:0]; // Subtract the mantissa
			
 
				+											sum[N-3-M:0] = res;
			
 
				 										end
			
 
				-									else if (Q1[0][N-3-M:0] < Q1[1][N-3-M:0])
			
 
				+									else if (input_1[N-3-M:0] < input_2[N-3-M:0])
			
 
				 										begin
			
 
				-											Q1[7] = Q1[1][N-3-M:0] - Q1[0][N-3-M:0]; // Subtract the mantissa
			
 
				-											Q1[2][N-3-M:0] = Q1[7];
			
 
				+											res = input_2[N-3-M:0] - input_1[N-3-M:0]; // Subtract the mantissa
			
 
				+											sum[N-3-M:0] = res;
			
 
				 										end
			
 
				-									else Q1[7] = 0; // Both the exponent and the mantissa are equal so subtraction leads to 0
			
 
				-									Q1[2][N-3-M:0] = Q1[7];
			
 
				+									else res = 0; // Both the exponent and the mantissa are equal so subtraction leads to 0
			
 
				+									sum[N-3-M:0] = res;
			
 
				 								end
			
 
				 						end
			
 
				 				end
			
 
				 		end
			
 
				-		
			
 
				-		//Final pipeline stage 2
			
 
				-		pipe pipe2(.clk(clk), .reset(reset), .D(Q1), .Q(Q2));
			
 
				-		assign sum = Q2[2];
			
 
				-		assign diff = Q2[3];
			
 
				 endmodule : floating_add
			
 
				 
			
 
				 
			
 
				 
			
 
				-module floating_product #(parameter N=16, M=4)(input_1, input_2, product, clk, reset);
			
 
				+module floating_product #(parameter N=16, M=4)(input_1, input_2, product);
			
 
				 	input logic [N-1:0] input_1, input_2;
			
 
				-	input logic clk, reset;
			
 
				 	output logic [N-1:0] product;
			
 
				 
			
 
				 	// sign_x = x[N-1]
			
 
				 	// exponent_x = x[N-2:N-2-M]
			
 
				 	// mantissa_x = x[N-3-M:0]
			
 
				 
			
 
				-//	logic [N-2:N-2-M] sum;
			
 
				-//	logic [2*(N-3-M):0] mult;
			
 
				-	logic [2*(N-3-M):0] D0 [4:0];
			
 
				-	logic [2*(N-3-M):0] Q0 [4:0];
			
 
				-	logic [2*(N-3-M):0] Q1 [4:0];
			
 
				-	logic [2*(N-3-M):0] Q2 [4:0];
			
 
				-	
			
 
				-	// First pipeline stage 0
			
 
				-	assign D0[0] = input_1;
			
 
				-	assign D0[1] = input_2;
			
 
				-	assign D0[2] = 0; // product
			
 
				-	assign D0[3] = 0; // sum
			
 
				-	assign D0[4] = 0; // mult
			
 
				-	pipe pipe0 #(N = 32, K = 4)(.clk(clk), .reset(reset), .D(D0), .Q(Q0));
			
 
				+	logic [N-2:N-2-M] sum;
			
 
				+	logic [2*(N-3-M):0] mult;
			
 
				 
			
 
				 	// We have assigned an {M+1} bit exponent so we must have a 2^{M} offset
			
 
				-	assign Q0[3] = Q0[0][N-2:N-2-M] + Q0[1][N-2:N-2-M];
			
 
				-	assign Q0[2][N-2:N-2-M] = Q0[3] - (1'b1 << M) + 2;
			
 
				-	
			
 
				-	// Second pipeline stage 1
			
 
				-	pipe pipe1 #(N = 32, K = 4)(.clk(clk), .reset(reset), .D(Q0), .Q(Q1));
			
 
				+	assign sum = input_1[N-2:N-2-M] + input_2[N-2:N-2-M];
			
 
				+	assign product[N-2:N-2-M] = sum - (1'b1 << M) + 2;
			
 
				 
			
 
				 	always_comb
			
 
				 		begin
			
 
				 				// Setting the mantissa of the output
			
 
				-				Q1[4] = Q1[0][N-3-M:0] * Q1[1][N-3-M:0];
			
 
				-				if (Q1[4][N-3-M]) Q1[2][N-3-M:0] = Q1[4][2*(N-3-M):2*(N-3-M)-9];
			
 
				-				else Q1[2][N-3-M:0] = Q1[4][2*(N-3-M):2*(N-3-M)-9] << 1;
			
 
				-				Q1[2][N-1] = Q1[0][N-1] ^ Q1[1][N-1];
			
 
				+				mult = input_1[N-3-M:0] * input_2[N-3-M:0];
			
 
				+				if (mult[N-3-M]) product[N-3-M:0] = mult[2*(N-3-M):2*(N-3-M)-9];
			
 
				+				else product[N-3-M:0] = mult[2*(N-3-M):2*(N-3-M)-9] << 1;
			
 
				+				product[N-1] = input_1[N-1] ^ input_2[N-1];
			
 
				 		end
			
 
				-		
			
 
				-		// Final Pipeline Stage 2
			
 
				-		pipe pipe2 #(N = 32, K = 4)(.clk(clk), .reset(reset), .D(Q1), .Q(Q2));
			
 
				-		assign product = Q2[2][N-1:0];
			
 
				 endmodule : floating_product
			
 
				 
			
 
				 
			
 
				 
			
 
				-module pipe #(parameter N = 16, K = 7)pipe(clk, reset, D, Q);
			
 
				+module pipe #(parameter N=16)(clk, reset, Q, D);
			
 
				 	input logic clk, reset;
			
 
				 	input logic [N-1:0] D [K:0];
			
 
				 	output reg [N-1:0] Q [K:0];
			
--- a/src/fpu32/adder.v
+++ b/src/fpu32/adder.v
@@ -2,6 +2,22 @@
 
				 //Copyright (C) Jonathan P Dawson 2013
			
 
				 //2013-12-12
			
 
				 
			
 
				+typedef enum logic [3:0] {
			
 
				+  add_unpack,
			
 
				+  add_special,
			
 
				+  add_align,
			
 
				+  add_0,
			
 
				+  add_1,
			
 
				+  add_norm_0,
			
 
				+  add_norm_1,
			
 
				+  add_round,
			
 
				+  add_pack,
			
 
				+  add_output,
			
 
				+  add_input
			
 
				+
			
 
				+} adder_state;
			
 
				+
			
 
				+
			
 
				 module adder(
			
 
				         clk,
			
 
				         rst,
			
@@ -31,21 +47,7 @@ module adder(
 
				   reg       [31:0] s_output_z;
			
 
				   reg       s_input_ack;
			
 
				 
			
 
				-  reg       [3:0] state;
			
 
				-	
			
 
				-  parameter get_a         = 4'd0,
			
 
				-            get_b         = 4'd1,
			
 
				-            unpack        = 4'd2,
			
 
				-            special_cases = 4'd3,
			
 
				-            align         = 4'd4,
			
 
				-            add_0         = 4'd5,
			
 
				-            add_1         = 4'd6,
			
 
				-            normalise_1   = 4'd7,
			
 
				-            normalise_2   = 4'd8,
			
 
				-            round         = 4'd9,
			
 
				-            pack          = 4'd10,
			
 
				-            put_z         = 4'd11,
			
 
				-            get_input     = 4'd12;
			
 
				+  adder_state state;
			
 
				 
			
 
				   reg       [31:0] a, b, z;
			
 
				   reg       [26:0] a_m, b_m;
			
@@ -78,20 +80,20 @@ module adder(
 
				 //        end
			
 
				 //      end
			
 
				 
			
 
				-      get_input:
			
 
				+      add_input:
			
 
				       begin
			
 
				         s_input_ack <= 1;
			
 
				         if (s_input_ack && input_stb) begin
			
 
				           a <= input_a;
			
 
				           b <= input_b;
			
 
				           s_input_ack <= 0;
			
 
				-          state <= unpack;
			
 
				+          state <= add_unpack;
			
 
				         end
			
 
				       end
			
 
				 
			
 
				 
			
 
				 
			
 
				-      unpack:
			
 
				+      add_unpack:
			
 
				       begin
			
 
				         a_m <= {a[22 : 0], 3'd0};
			
 
				         b_m <= {b[22 : 0], 3'd0};
			
@@ -99,18 +101,18 @@ module adder(
 
				         b_e <= b[30 : 23] - 127;
			
 
				         a_s <= a[31];
			
 
				         b_s <= b[31];
			
 
				-        state <= special_cases;
			
 
				+        state <= add_special;
			
 
				       end
			
 
				 
			
 
				-      special_cases:
			
 
				+      add_special:
			
 
				       begin
			
 
				         //if a is NaN return a
			
 
				         if (a_e == 128 && a_m != 0) begin
			
 
				           z <= {a_s, 8'hff, a[22], a[21:0]};
			
 
				-          state <= put_z;
			
 
				+          state <= add_output;
			
 
				         end else if (b_e == 128 && b_m != 0) begin
			
 
				           z <= {b_s, 8'hff, b[22:0]};
			
 
				-          state <= put_z;
			
 
				+          state <= add_output;
			
 
				         //if a is inf return inf
			
 
				         end else if (a_e == 128 && a_m == 0) begin
			
 
				           z[31] <= a_s;
			
@@ -123,31 +125,31 @@ module adder(
 
				               z[22] <= 1;
			
 
				               z[21:0] <= 0;
			
 
				           end
			
 
				-          state <= put_z;
			
 
				+          state <= add_output;
			
 
				         //if b is inf return inf
			
 
				         end else if (b_e == 128 && b_m == 0) begin
			
 
				           z[31] <= b_s;
			
 
				           z[30:23] <= 255;
			
 
				           z[22:0] <= 0;
			
 
				-          state <= put_z;
			
 
				+          state <= add_output;
			
 
				         //if a and b are both zero return zero (sign is a_s & b_s)
			
 
				         end else if ((($signed(a_e) == -127) && (a_m == 0)) && (($signed(b_e) == -127) && (b_m == 0))) begin
			
 
				           z[31] <= a_s & b_s;
			
 
				           z[30:23] <= b_e[7:0] + 127;
			
 
				           z[22:0] <= b_m[26:3];
			
 
				-          state <= put_z;
			
 
				+          state <= add_output;
			
 
				         //if a is zero return b
			
 
				         end else if (($signed(a_e) == -127) && (a_m == 0)) begin
			
 
				           z[31] <= b_s;
			
 
				           z[30:23] <= b_e[7:0] + 127;
			
 
				           z[22:0] <= b_m[26:3];
			
 
				-          state <= put_z;
			
 
				+          state <= add_output;
			
 
				         //if b is zero return a
			
 
				         end else if (($signed(b_e) == -127) && (b_m == 0)) begin
			
 
				           z[31] <= a_s;
			
 
				           z[30:23] <= a_e[7:0] + 127;
			
 
				           z[22:0] <= a_m[26:3];
			
 
				-          state <= put_z;
			
 
				+          state <= add_output;
			
 
				         end else begin
			
 
				           //Denormalised Number
			
 
				           if ($signed(a_e) == -127) begin
			
@@ -161,11 +163,11 @@ module adder(
 
				           end else begin
			
 
				             b_m[26] <= 1;
			
 
				           end
			
 
				-          state <= align;
			
 
				+          state <= add_align;
			
 
				         end
			
 
				       end
			
 
				 
			
 
				-      align:
			
 
				+      add_align:
			
 
				       begin
			
 
				         if ($signed(a_e) > $signed(b_e)) begin
			
 
				           b_e <= b_e + 1;
			
@@ -212,10 +214,10 @@ module adder(
 
				           round_bit <= sum[1];
			
 
				           sticky <= sum[0];
			
 
				         end
			
 
				-        state <= normalise_1;
			
 
				+        state <= add_norm_0;
			
 
				       end
			
 
				 
			
 
				-      normalise_1:
			
 
				+      add_norm_0:
			
 
				       begin
			
 
				         if (z_m[23] == 0 && $signed(z_e) > -126) begin
			
 
				           z_e <= z_e - 1;
			
@@ -224,11 +226,11 @@ module adder(
 
				           guard <= round_bit;
			
 
				           round_bit <= 0;
			
 
				         end else begin
			
 
				-          state <= normalise_2;
			
 
				+          state <= add_norm_1;
			
 
				         end
			
 
				       end
			
 
				 
			
 
				-      normalise_2:
			
 
				+      add_norm_1:
			
 
				       begin
			
 
				         if ($signed(z_e) < -126) begin
			
 
				           z_e <= z_e + 1;
			
@@ -237,11 +239,11 @@ module adder(
 
				           round_bit <= guard;
			
 
				           sticky <= sticky | round_bit;
			
 
				         end else begin
			
 
				-          state <= round;
			
 
				+          state <= add_round;
			
 
				         end
			
 
				       end
			
 
				 
			
 
				-      round:
			
 
				+      add_round:
			
 
				       begin
			
 
				         if (guard && (round_bit | sticky | z_m[0])) begin
			
 
				           z_m <= z_m + 1;
			
@@ -249,10 +251,10 @@ module adder(
 
				             z_e <=z_e + 1;
			
 
				           end
			
 
				         end
			
 
				-        state <= pack;
			
 
				+        state <= add_pack;
			
 
				       end
			
 
				 
			
 
				-      pack:
			
 
				+      add_pack:
			
 
				       begin
			
 
				         z[22 : 0] <= z_m[22:0];
			
 
				         z[30 : 23] <= z_e[7:0] + 127;
			
@@ -269,23 +271,23 @@ module adder(
 
				           z[30 : 23] <= 255;
			
 
				           z[31] <= z_s;
			
 
				         end
			
 
				-        state <= put_z;
			
 
				+        state <= add_output;
			
 
				       end
			
 
				 
			
 
				-      put_z:
			
 
				+      add_output:
			
 
				       begin
			
 
				         s_output_z_stb <= 1;
			
 
				         s_output_z <= z;
			
 
				         if (s_output_z_stb && output_z_ack) begin
			
 
				           s_output_z_stb <= 0;
			
 
				-          state <= get_input;
			
 
				+          state <= add_input;
			
 
				         end
			
 
				       end
			
 
				 
			
 
				     endcase
			
 
				 
			
 
				     if (rst == 1) begin
			
 
				-      state <= get_input;
			
 
				+      state <= add_input;
			
 
				       s_input_ack <= 0;
			
 
				       s_output_z_stb <= 0;
			
 
				     end
			
--- a/src/fpu32/fpu32.sv
+++ b/src/fpu32/fpu32.sv
@@ -1,4 +1,4 @@
 
				-`include "adder.v"
			
 
				+`include "adder.sv"
			
 
				 `include "mult.v"
			
 
				 
			
 
				 // synopsys translate_off
			
--- a/src/fpu32/mult.v
+++ b/src/fpu32/mult.v
@@ -2,6 +2,23 @@
 
				 //Copyright (C) Jonathan P Dawson 2013
			
 
				 //2013-12-12
			
 
				 
			
 
				+typedef enum logic [3:0] {
			
 
				+  mul_unpack,
			
 
				+  mul_special,
			
 
				+  mul_norm_a,
			
 
				+  mul_norm_b,
			
 
				+  mul_0,
			
 
				+  mul_1,
			
 
				+  mul_norm_1,
			
 
				+  mul_norm_2,
			
 
				+  mul_round,
			
 
				+  mul_pack,
			
 
				+  mul_output,
			
 
				+  mul_input
			
 
				+
			
 
				+} mult_state;
			
 
				+
			
 
				+
			
 
				 module multiplier(
			
 
				         input_a,
			
 
				         input_b,
			
@@ -31,21 +48,7 @@ module multiplier(
 
				   reg       [31:0] s_output_z;
			
 
				   reg       s_input_ack;
			
 
				 
			
 
				-  reg       [3:0] state;
			
 
				-  parameter get_a         = 4'd0,
			
 
				-            get_b         = 4'd1,
			
 
				-            unpack        = 4'd2,
			
 
				-            special_cases = 4'd3,
			
 
				-            normalise_a   = 4'd4,
			
 
				-            normalise_b   = 4'd5,
			
 
				-            multiply_0    = 4'd6,
			
 
				-            multiply_1    = 4'd7,
			
 
				-            normalise_1   = 4'd8,
			
 
				-            normalise_2   = 4'd9,
			
 
				-            round         = 4'd10,
			
 
				-            pack          = 4'd11,
			
 
				-            put_z         = 4'd12,
			
 
				-            get_input     = 4'd13;
			
 
				+  mult_state state;
			
 
				 
			
 
				   reg       [31:0] a, b, z;
			
 
				   reg       [23:0] a_m, b_m, z_m;
			
@@ -79,18 +82,18 @@ module multiplier(
 
				 //        end
			
 
				 //      end
			
 
				 
			
 
				-      get_input:
			
 
				+      mul_input:
			
 
				       begin
			
 
				         s_input_ack <= 1;
			
 
				         if (s_input_ack && input_stb) begin
			
 
				           a <= input_a;
			
 
				           b <= input_b;
			
 
				           s_input_ack <= 0;
			
 
				-          state <= unpack;
			
 
				+          state <= mul_unpack;
			
 
				         end
			
 
				       end
			
 
				 
			
 
				-      unpack:
			
 
				+      mul_unpack:
			
 
				       begin
			
 
				         a_m <= a[22 : 0];
			
 
				         b_m <= b[22 : 0];
			
@@ -98,10 +101,10 @@ module multiplier(
 
				         b_e <= b[30 : 23] - 127;
			
 
				         a_s <= a[31];
			
 
				         b_s <= b[31];
			
 
				-        state <= special_cases;
			
 
				+        state <= mul_special;
			
 
				       end
			
 
				 
			
 
				-      special_cases:
			
 
				+      mul_special:
			
 
				       begin
			
 
				         //if a is NaN or b is NaN return NaN
			
 
				         if ((a_e == 128 && a_m != 0) || (b_e == 128 && b_m != 0)) begin
			
@@ -109,7 +112,7 @@ module multiplier(
 
				           z[30:23] <= 255;
			
 
				           z[22] <= 1;
			
 
				           z[21:0] <= 0;
			
 
				-          state <= put_z;
			
 
				+          state <= mul_output;
			
 
				         //if a is inf return inf
			
 
				         end else if (a_e == 128) begin
			
 
				           z[31] <= a_s ^ b_s;
			
@@ -122,7 +125,7 @@ module multiplier(
 
				             z[22] <= 1;
			
 
				             z[21:0] <= 0;
			
 
				           end
			
 
				-          state <= put_z;
			
 
				+          state <= mul_output;
			
 
				         //if b is inf return inf
			
 
				         end else if (b_e == 128) begin
			
 
				           z[31] <= a_s ^ b_s;
			
@@ -135,19 +138,19 @@ module multiplier(
 
				             z[22] <= 1;
			
 
				             z[21:0] <= 0;
			
 
				           end
			
 
				-          state <= put_z;
			
 
				+          state <= mul_output;
			
 
				         //if a is zero return zero
			
 
				         end else if (($signed(a_e) == -127) && (a_m == 0)) begin
			
 
				           z[31] <= a_s ^ b_s;
			
 
				           z[30:23] <= 0;
			
 
				           z[22:0] <= 0;
			
 
				-          state <= put_z;
			
 
				+          state <= mul_output;
			
 
				         //if b is zero return zero
			
 
				         end else if (($signed(b_e) == -127) && (b_m == 0)) begin
			
 
				           z[31] <= a_s ^ b_s;
			
 
				           z[30:23] <= 0;
			
 
				           z[22:0] <= 0;
			
 
				-          state <= put_z;
			
 
				+          state <= mul_output;
			
 
				         end else begin
			
 
				           //Denormalised Number
			
 
				           if ($signed(a_e) == -127) begin
			
@@ -161,48 +164,48 @@ module multiplier(
 
				           end else begin
			
 
				             b_m[23] <= 1;
			
 
				           end
			
 
				-          state <= normalise_a;
			
 
				+          state <= mul_norm_a;
			
 
				         end
			
 
				       end
			
 
				 
			
 
				-      normalise_a:
			
 
				+      mul_norm_a:
			
 
				       begin
			
 
				         if (a_m[23]) begin
			
 
				-          state <= normalise_b;
			
 
				+          state <= mul_norm_b;
			
 
				         end else begin
			
 
				           a_m <= a_m << 1;
			
 
				           a_e <= a_e - 1;
			
 
				         end
			
 
				       end
			
 
				 
			
 
				-      normalise_b:
			
 
				+      mul_norm_b:
			
 
				       begin
			
 
				         if (b_m[23]) begin
			
 
				-          state <= multiply_0;
			
 
				+          state <= mul_0;
			
 
				         end else begin
			
 
				           b_m <= b_m << 1;
			
 
				           b_e <= b_e - 1;
			
 
				         end
			
 
				       end
			
 
				 
			
 
				-      multiply_0:
			
 
				+      mul_0:
			
 
				       begin
			
 
				         z_s <= a_s ^ b_s;
			
 
				         z_e <= a_e + b_e + 1;
			
 
				         product <= a_m * b_m * 4;
			
 
				-        state <= multiply_1;
			
 
				+        state <= mul_1;
			
 
				       end
			
 
				 
			
 
				-      multiply_1:
			
 
				+      mul_1:
			
 
				       begin
			
 
				         z_m <= product[49:26];
			
 
				         guard <= product[25];
			
 
				         round_bit <= product[24];
			
 
				         sticky <= (product[23:0] != 0);
			
 
				-        state <= normalise_1;
			
 
				+        state <= mul_norm_1;
			
 
				       end
			
 
				 
			
 
				-      normalise_1:
			
 
				+      mul_norm_1:
			
 
				       begin
			
 
				         if (z_m[23] == 0) begin
			
 
				           z_e <= z_e - 1;
			
@@ -211,11 +214,11 @@ module multiplier(
 
				           guard <= round_bit;
			
 
				           round_bit <= 0;
			
 
				         end else begin
			
 
				-          state <= normalise_2;
			
 
				+          state <= mul_norm_2;
			
 
				         end
			
 
				       end
			
 
				 
			
 
				-      normalise_2:
			
 
				+      mul_norm_2:
			
 
				       begin
			
 
				         if ($signed(z_e) < -126) begin
			
 
				           z_e <= z_e + 1;
			
@@ -224,11 +227,11 @@ module multiplier(
 
				           round_bit <= guard;
			
 
				           sticky <= sticky | round_bit;
			
 
				         end else begin
			
 
				-          state <= round;
			
 
				+          state <= mul_round;
			
 
				         end
			
 
				       end
			
 
				 
			
 
				-      round:
			
 
				+      mul_round:
			
 
				       begin
			
 
				         if (guard && (round_bit | sticky | z_m[0])) begin
			
 
				           z_m <= z_m + 1;
			
@@ -236,10 +239,10 @@ module multiplier(
 
				             z_e <=z_e + 1;
			
 
				           end
			
 
				         end
			
 
				-        state <= pack;
			
 
				+        state <= mul_pack;
			
 
				       end
			
 
				 
			
 
				-      pack:
			
 
				+      mul_pack:
			
 
				       begin
			
 
				         z[22 : 0] <= z_m[22:0];
			
 
				         z[30 : 23] <= z_e[7:0] + 127;
			
@@ -253,23 +256,23 @@ module multiplier(
 
				           z[30 : 23] <= 255;
			
 
				           z[31] <= z_s;
			
 
				         end
			
 
				-        state <= put_z;
			
 
				+        state <= mul_output;
			
 
				       end
			
 
				 
			
 
				-      put_z:
			
 
				+      mul_output:
			
 
				       begin
			
 
				         s_output_z_stb <= 1;
			
 
				         s_output_z <= z;
			
 
				         if (s_output_z_stb && output_z_ack) begin
			
 
				           s_output_z_stb <= 0;
			
 
				-          state <= get_input;
			
 
				+          state <= mul_input;
			
 
				         end
			
 
				       end
			
 
				 
			
 
				     endcase
			
 
				 
			
 
				     if (rst == 1) begin
			
 
				-      state <= get_input;
			
 
				+      state <= mul_input;
			
 
				       s_input_ack <= 0;
			
 
				       s_output_z_stb <= 0;
			
 
				     end
			
--- a/src/neural/comp.sv
+++ b/src/neural/comp.sv
@@ -1,90 +1,41 @@
 
				-`include "../blocks/abus.sv"
			
 
				-`include "../fpu32/fpu32.sv"
			
 
				-
			
 
				 /*
			
 
				-          ____
			
 
				-   x0 -->|ADD0|--> y0
			
 
				-   x1 -->|    |
			
 
				- bus0 <->|    |<-- ack0
			
 
				- bus1 <->|____|--> stb0 
			
 
				-          ____ 
			
 
				-   x2 -->|ADD1|--> y1
			
 
				-   x3 -->|    |
			
 
				- bus2 <->|    |<-- ack1
			
 
				- bus3 <->|____|--> stb1
			
 
				+              _____
			
 
				+     x[0] ==>|  A  |
			
 
				+ x_stb[0] -->|  D  |
			
 
				+ x_ack[0] <--|  D  |==> y
			
 
				+             |  E  |--> y_stb
			
 
				+     x[1] ==>|  R  |<-- y_ack
			
 
				+ x_stb[1] -->|     |
			
 
				+ x_ack[1] <--|_____|
			
 
				 
			
 
				 */
			
 
				-
			
 
				-module adder4to2#(parameter N=32)(x, clk, rst, y, left, right);
			
 
				+module cadder#(parameter N=32)(clk, rst, x, x_ack, x_stb, y, y_ack, y_stb);
			
 
				     input logic clk;
			
 
				     input logic rst;
			
 
				-    input wire [N-1:0] x [3:0];
			
 
				-    output logic [N-1:0] y [1:0];
			
 
				-    abus_io left[3:0];
			
 
				-    abus_io right[1:0];
			
 
				+    input wire [N-1:0] x [1:0];
			
 
				+    output logic [N-1:0] y;
			
 
				+    output x_ack[1:0];
			
 
				+    input x_stb[1:0];
			
 
				+    input y_ack;
			
 
				+    output y_stb;
			
 
				 
			
 
				-    wire out_stb [1:0];
			
 
				-    assign right.stb = out_stb[0] & out_stb[1];
			
 
				+    wire left_ack, left_stb;
			
 
				+    assign x_ack[0] = left_ack;
			
 
				+    assign x_ack[1] = left_ack;
			
 
				+    assign left_stb = x_stb[0] & x_stb[1];
			
 
				 
			
 
				     adder add0 (
			
 
				         .clk(clk),
			
 
				         .rst(rst),
			
 
				         .input_a(x[0]),
			
 
				         .input_b(x[1]),
			
 
				-        .input_stb(left0.stb),
			
 
				-        .input_ack(left0.ack),
			
 
				-        .output_z(y[0]),
			
 
				-        .output_z_ack(right.ack),
			
 
				-        .output_z_stb(out_stb[0])
			
 
				+        .input_stb(left_stb),
			
 
				+        .input_ack(left_ack),
			
 
				+        .output_z(y),
			
 
				+        .output_z_ack(y_ack),
			
 
				+        .output_z_stb(y_stb)
			
 
				     );
			
 
				-
			
 
				-    adder add1 (
			
 
				-        .clk(clk),
			
 
				-        .rst(rst),
			
 
				-        .input_a(x[2]),
			
 
				-        .input_b(x[3]),
			
 
				-        .input_stb(left1.stb),
			
 
				-        .input_ack(left1.ack),
			
 
				-        .output_z(y[1]),
			
 
				-        .output_z_ack(right.ack),
			
 
				-        .output_z_stb(out_stb[1])
			
 
				-    );
			
 
				-
			
 
				-endmodule : adder4to2
			
 
				-
			
 
				-
			
 
				-module adder4to2_tb();
			
 
				-    logic clk, rst;
			
 
				-    
			
 
				-    logic [31:0] x [3:0];
			
 
				-    logic [31:0] y [1:0];
			
 
				-    abus_io inputBus();
			
 
				-    abus_io outputBus();
			
 
				-    
			
 
				-    adder4to2 adder_casc(.clk(clk), .rst(rst), .x(x), .y(y), .left(inputBus.right), .right(outputBus.left));    
			
 
				-    initial forever #5 clk = ~clk;
			
 
				-    initial begin
			
 
				-        $display("Testing adder4to2");
			
 
				-        clk = 0;
			
 
				-        rst = 1;
			
 
				-        inputBus.stb = 0;
			
 
				-        outputBus.ack = 0;
			
 
				-        #20
			
 
				-        rst = 0;
			
 
				-        x = {'h41388000, 'h407c0000, 'h42480000, 'h42460000};
			
 
				-        inputBus.stb = 1;
			
 
				-        wait(inputBus.ack == 1);
			
 
				-        #15 inputBus.stb = 0;
			
 
				-        
			
 
				-        wait(outputBus.stb == 1);
			
 
				-        outputBus.ack = 1;
			
 
				-        assert(y[0] == 'h42c70000);
			
 
				-        assert(y[1] == 'h41778000);
			
 
				-        wait(outputBus.stb == 0);
			
 
				-        outputBus.ack = 0;
			
 
				-    end
			
 
				-    
			
 
				-endmodule : adder4to2_tb
			
 
				+endmodule : cadder
			
 
				 
			
 
				 /*
			
 
				   K layers of cascade adder
			
@@ -108,11 +59,6 @@ IN | K3 |  K2  |  K1  | OUT
 
				 [inputs]
			
 
				 x size: 2**K
			
 
				 left io size: 2**K
			
 
				-
			
 
				-[internal]
			
 
				-layer connecting wires: 2**K - 2
			
 
				-number of io buses: 2**(K-1) - 1
			
 
				-adder4to2 modules: 2**(K-2)
			
 
				 */
			
 
				 
			
 
				 module adder_casc#(parameter K,N=32)(clk, rst, x, y, left, right);
			
@@ -123,51 +69,60 @@ module adder_casc#(parameter K,N=32)(clk, rst, x, y, left, right);
 
				     
			
 
				     abus_io right;
			
 
				     abus_io left[2**K-1:0];
			
 
				-    
			
 
				+
			
 
				     wire [N-1:0] layer_w [2**K-3:0];
			
 
				-    abus_io bus_w[2**(K-1)-2:0]();
			
 
				-    
			
 
				+    wire ack_w [2**K-3:0];
			
 
				+    wire stb_w [2**K-3:0];
			
 
				+
			
 
				     genvar i,j;
			
 
				     generate
			
 
				-        for(i=0; i<K; i++) begin : generate_layers    
			
 
				+        for(i=0; i<K; i++) begin : generate_layers
			
 
				             // First layers
			
 
				             if(i == 0) begin
			
 
				-                for(j=0; j<2**(K-2); j++) begin : generate_casc0
			
 
				-                    adder4to2 a(
			
 
				+                for(j=0; j<2**(K-1); j++) begin : generate_casc0
			
 
				+                    cadder a(
			
 
				                       .clk(clk),
			
 
				                       .rst(rst),
			
 
				-                      .x(x[j*4+:4]),
			
 
				+                      .x(x[j*2+:2]),
			
 
				                       .y(layer_w[j]),
			
 
				-                      .left0(left[j*2].right),
			
 
				-                      .left1(left[j*2+1].right),
			
 
				-                      .right(bus_w[j].left)
			
 
				+                      .x_ack({left[j*2].ack, left[j*2+1].ack}),
			
 
				+                      .x_stb({left[j*2].stb, left[j*2+1].stb}),
			
 
				+                      .y_ack(ack_w[j]),
			
 
				+                      .y_stb(stb_w[j])
			
 
				                     );
			
 
				                 end
			
 
				             end
			
 
				             // Last layer
			
 
				             else if((K-i) <= 1) begin
			
 
				-                adder c(
			
 
				+                localparam s0 = 2**K-4;
			
 
				+                localparam s1 = 2**K-3;
			
 
				+                cadder c(
			
 
				                     .clk(clk),
			
 
				                     .rst(rst),
			
 
				-                    .input_a(layer_w[i-1][0]),
			
 
				-                    .input_b(layer_w[i-1][1]),
			
 
				-                    .input_stb(bus_w[i-1].stb),
			
 
				-                    .input_ack(bus_w[i-1].ack),
			
 
				-                    .output_z(y),
			
 
				-                    .output_z_ack(right.ack),
			
 
				-                    .output_z_stb(right.stb)
			
 
				-                    );
			
 
				+                    .x(layer_w[s0+:2]),
			
 
				+                    .y(y),
			
 
				+                    .x_ack({ack_w[s0], ack_w[s1]}),
			
 
				+                    .x_stb({stb_w[s0], stb_w[s1]}),
			
 
				+                    .y_ack(right.ack),
			
 
				+                    .y_stb(right.stb)
			
 
				+                );
			
 
				             end
			
 
				             // Middle layers
			
 
				             else begin
			
 
				-                for(j=0; j<2**(K-i-2); j++) begin : generate_casc1
			
 
				-                    adder4to2 b(
			
 
				-                      .clk(clk),
			
 
				-                      .rst(rst),
			
 
				-                      .x(layer_w[i-1][j*4+:4]),
			
 
				-                      .y(layer_w[i][j*2+:2]),
			
 
				-                      .left(bus_w[i-1][j].right),
			
 
				-                      .right(bus_w[i][j].left)
			
 
				+                for(j=0; j<2**(K-i-1); j++) begin : generate_casc1
			
 
				+                    localparam int s = 2**(K-1) - 2**(K-i) + j; // integer index of this adder's input pair: adders in layers 1..i-1 plus j (same value as the former real-valued $floor formula, but legal as an array index)
			
 
				+                    localparam ix = s*2;
			
 
				+                    localparam iy = s+2**(K-1);
			
 
				+
			
 
				+                    cadder b(
			
 
				+                        .clk(clk),
			
 
				+                        .rst(rst),
			
 
				+                        .x(layer_w[ix+:2]),
			
 
				+                        .y(layer_w[iy]),
			
 
				+                        .x_ack(ack_w[ix+:2]),
			
 
				+                        .x_stb(stb_w[ix+:2]),
			
 
				+                        .y_ack(ack_w[iy]),
			
 
				+                        .y_stb(stb_w[iy])
			
 
				                     );
			
 
				                 end
			
 
				             end
			
@@ -179,24 +134,25 @@ endmodule : adder_casc
 
				 module adder_casc_tb();
			
 
				     logic clk, rst;
			
 
				     
			
 
				-    localparam K=3;
			
 
				-    logic [31:0] x [7:0];
			
 
				+    localparam K=4;
			
 
				+    logic [31:0] x [2**K-1:0];
			
 
				     logic [31:0] y;
			
 
				+    logic ack [2**K-1:0];
			
 
				+    logic stb [2**K-1:0];
			
 
				+
			
 
				     abus_io input_ios[2**K-1:0]();
			
 
				     abus_io output_io();
			
 
				     
			
 
				-    virtual abus_io input_vios[2**K-1:0];
			
 
				     genvar k;
			
 
				     generate
			
 
				-        for(k=0; k<2**K; k++) begin : map_generator
			
 
				-            initial begin : map_physical2virtual
			
 
				-                input_vios[k] = input_ios[k];
			
 
				-            end : map_physical2virtual
			
 
				+        for(k=0; k<2**K; k++) begin : io_mapper
			
 
				+            assign input_ios[k].stb = stb[k];
			
 
				+            assign ack[k] = input_ios[k].ack;
			
 
				         end
			
 
				     endgenerate
			
 
				     
			
 
				     
			
 
				-    adder_casc#(.K(K)) adder_casc0(.clk(clk), .rst(rst), .x(x), .y(y), .left(input_ios), .right(output_io.left));    
			
 
				+    adder_casc#(.K(K)) adder_casc0(.clk(clk), .rst(rst), .x(x), .y(y), .left(input_ios), .right(output_io.left));
			
 
				     initial forever #5 clk = ~clk;
			
 
				     initial begin
			
 
				         
			
@@ -204,26 +160,28 @@ module adder_casc_tb();
 
				         clk = 0;
			
 
				         rst = 1;
			
 
				         
			
 
				-        foreach(input_vios[i]) input_vios[i].stb = 0;
			
 
				+        foreach(stb[i]) stb[i] = 0;
			
 
				         output_io.ack = 0;
			
 
				         #20
			
 
				         rst = 0;
			
 
				-        x = {'h43800000, 'h43000000, 'h42800000, 'h42000000, 'h41800000, 'h41000000, 'h40800000, 'h40000000};
			
 
				+        // Initialise x[i] with the IEEE-754 single-precision encoding of 2**(i+1)
			
 
				+        foreach(x[i]) x[i] = ('h400 + (i*8)) << 20;
			
 
				+        foreach(stb[i]) stb[i] = 1;
			
 
				+
			
 
				         fork
			
 
				-            foreach(input_vios[i]) begin
			
 
				+            foreach(ack[i]) begin
			
 
				                 fork
			
 
				-                    input_vios[i].stb = 1;
			
 
				-                    wait(input_vios[i].ack == 1);
			
 
				-                    #10
			
 
				-                    input_vios[i].stb = 0;
			
 
				+                    wait(ack[i] == 1);
			
 
				+                    #20
			
 
				+                    stb[i] = 0;
			
 
				                 join
			
 
				             end
			
 
				         join
			
 
				-        #20 
			
 
				+        #20
			
 
				         
			
 
				         wait(output_io.stb == 1);
			
 
				         output_io.ack = 1;
			
 
				-        assert(y[0] == 'h43ff0000);
			
 
				+        assert(y == 'h47ffff00); // compare the whole 32-bit result (131070.0), not just bit 0
			
 
				         wait(output_io.stb == 0);
			
 
				         output_io.ack = 0;
			
 
				     end
			
--- a/src/neural/neuron.sv
+++ b/src/neural/neuron.sv
@@ -2,30 +2,142 @@
 
				 `timescale 1 ps / 1 ps
			
 
				 // synopsys translate_on
			
 
				 
			
 
				+/*
			
 
				+             ______     _________
			
 
				+     w[i] =>| MULT |==>|         |
			
 
				+     x[i] =>|______|   |         |
			
 
				+             ______    |         |
			
 
				+   w[i+1] =>| MULT |==>| CASCADE |      _____
			
 
				+   x[i+1] =>|______|   |         | b =>| ADD |
			
 
				+                .      |  ADDER  |====>|_____|==> y
			
 
				+                .      |         |
			
 
				+             ______    |         |
			
 
				+   w[M-1] =>| MULT |==>|         |
			
 
				+   x[M-1] =>|______|   |_________|
			
 
				 
			
 
				-module neuron#(parameter M, N=32)(x, y, w, b, stb, ack, clk, rst);
			
 
				+*/
			
 
				+
			
 
				+module neuron#(parameter K, N=32)(x, y, w, b, left, right, clk, rst);
			
 
				+    localparam M = 2**K;
			
 
				     input wire [N-1:0] x [M-1:0];
			
 
				-    input wire [N-1:0] w;
			
 
				-    input wire [N-1:0] b [M-1:0];
			
 
				-    output logic stb;
			
 
				-    input logic ack;
			
 
				-    input logic clk;
			
 
				-    input logic rst;
			
 
				+    input wire [N-1:0] w [M-1:0];
			
 
				+    input wire [N-1:0] b;
			
 
				     output logic [N-1:0] y;
			
 
				 
			
 
				-    multiplier mult_array[M-1:0](
			
 
				+    input wire clk;
			
 
				+    input wire rst;
			
 
				+
			
 
				+    abus_io left[M-1:0];
			
 
				+    abus_io inner_io0[M-1:0]();
			
 
				+    abus_io inner_io1();
			
 
				+    abus_io right;
			
 
				+
			
 
				+    wire [N-1:0] inner_w [M-1:0];
			
 
				+    wire [N-1:0] casc_w;
			
 
				+
			
 
				+    genvar i;
			
 
				+    generate
			
 
				+        for(i=0;i<M;i++) begin: gen_mult_layer
			
 
				+            multiplier mult(
			
 
				+                .clk(clk),
			
 
				+                .rst(rst),
			
 
				+                .input_a(x[i]),
			
 
				+                .input_b(w[i]),
			
 
				+                .input_stb(left[i].stb),
			
 
				+                .input_ack(left[i].ack),
			
 
				+                .output_z(inner_w[i]),
			
 
				+                .output_z_ack(inner_io0[i].ack),
			
 
				+                .output_z_stb(inner_io0[i].stb)
			
 
				+            );
			
 
				+        end
			
 
				+    endgenerate
			
 
				+
			
 
				+    adder_casc#(.K(K), .N(N)) adder0(
			
 
				         .clk(clk),
			
 
				         .rst(rst),
			
 
				-        .input_a(input_a),
			
 
				-        .input_b(input_b),
			
 
				-        .input_stb(mult_input_stb),
			
 
				-        .input_ack(mult_input_ack),
			
 
				-        .output_z(result_mult),
			
 
				-        .output_z_ack(mult_output_z_ack),
			
 
				-        .output_z_stb(mult_output_z_stb),
			
 
				+        .x(inner_w),
			
 
				+        .y(casc_w),
			
 
				+        .left(inner_io0),
			
 
				+        .right(inner_io1)
			
 
				+    );
			
 
				+
			
 
				+    adder adder1 (
			
 
				+        .clk(clk),
			
 
				+        .rst(rst),
			
 
				+        .input_a(b),
			
 
				+        .input_b(casc_w),
			
 
				+        .input_stb(inner_io1.stb),
			
 
				+        .input_ack(inner_io1.ack),
			
 
				+        .output_z(y),
			
 
				+        .output_z_ack(right.ack),
			
 
				+        .output_z_stb(right.stb)
			
 
				     );
			
 
				 
			
 
				 endmodule : neuron
			
 
				 
			
 
				 
			
 
				+module neuron_tb;
			
 
				+    logic clk, rst;
			
 
				+
			
 
				+    logic [31:0] x [7:0];
			
 
				+    logic [31:0] w [7:0];
			
 
				+    logic [31:0] b;
			
 
				+    logic [31:0] y;
			
 
				+
			
 
				+    logic ack [7:0];
			
 
				+    logic stb [7:0];
			
 
				+
			
 
				+    abus_io left[7:0]();
			
 
				+    abus_io right();
			
 
				+
			
 
				+    neuron#(.K(3)) neu0(
			
 
				+        .clk(clk),
			
 
				+        .rst(rst),
			
 
				+        .x(x),
			
 
				+        .y(y),
			
 
				+        .w(w),
			
 
				+        .b(b),
			
 
				+        .left(left),
			
 
				+        .right(right)
			
 
				+    );
			
 
				+
			
 
				+    genvar k;
			
 
				+    generate
			
 
				+        for(k=0; k<8; k++) begin : io_mapper
			
 
				+            assign left[k].stb = stb[k];
			
 
				+            assign ack[k] = left[k].ack;
			
 
				+        end
			
 
				+    endgenerate
			
 
				+
			
 
				+    initial forever #5 clk = ~clk;
			
 
				+    initial begin
			
 
				+        clk = 0;
			
 
				+        rst = 1;
			
 
				+        foreach(stb[i]) stb[i] = 0;
			
 
				+        right.ack = 0;
			
 
				+        b = 'h3f000000;
			
 
				+        w = {
			
 
				+            'h3fa00000, 'h3fa00000, 'h3fa00000, 'h3fa00000,
			
 
				+            'h3fa00000, 'h3fa00000, 'h3fa00000, 'h3fa00000
			
 
				+        };
			
 
				+        x = {
			
 
				+            // 'h3fa00000, 'h3fa00000, 'h3fa00000, 'h3fa00000,
			
 
				+            // 'h3fa00000, 'h3fa00000, 'h3fa00000, 'h3fa00000
			
 
				+            'h417a0000, 'h40fa0000, 'h41fa0000, 'h427a0000,
			
 
				+            'h407a0000, 'h40780000, 'h40440000, 'h40cc0000
			
 
				+        };
			
 
				+        #10;
			
 
				+        rst = 0;
			
 
				+        foreach(stb[i]) stb[i] = 1;
			
 
				+        #20;
			
 
				+        foreach(stb[i]) stb[i] = 0;
			
 
				+        wait(right.stb == 1);
			
 
				+        right.ack = 1;
			
 
				+        #10
			
 
				+        wait(right.stb == 0);
			
 
				+        right.ack = 0;
			
 
				+    end
			
 
				+
			
 
				+
			
 
				 
			
 
				+endmodule : neuron_tb