Min 4 gadi atpakaļ
vecāks
revīzija
a64c5d0f1e

+ 5 - 0
src/fpu16/fpu16.sv

@@ -62,6 +62,11 @@ module fpu16_tb;
 		expected_mult = 0;
 
 		// test some common values
+		input_a = 'hbec64dc6;
+		input_b = 'h3ecc3194;
+		# 30;
+		expected_add = 'h3c3c79c0;
+
 		input_a = 'h41ac3000;
 		input_b = 'h431f9000;
 		# 30;

+ 13 - 0
src/fpu32/fpu32.sv

@@ -52,6 +52,7 @@ module fpu32_tb();
         static int num_err = 0;
         static int num_tests = $size(test_mem) * 2;
 
+
         clk = 0;
         reset = 1;
         adder_input_stb = 0;
@@ -63,6 +64,18 @@ module fpu32_tb();
         #20;
         reset = 0;
 
+        input_a = 'hbec64dc6;
+        input_b = 'h3ecc3194;
+
+        adder_input_stb = 1;
+        wait(adder_input_ack == 1);
+        #15;
+        adder_input_stb = 0;
+        mult_input_stb = 0;
+        # 100;
+        // expected_add = 'h3c3c79c0;
+
+
 
         for (int i=0; i < $size(test_mem); i++) begin
             input_a = test_mem[i][0];

+ 6 - 2
src/fpu32p/add32.sv

@@ -83,17 +83,19 @@ module adder_32(
                         begin
                             mantissaa0 <= a[22:0];
                             mantissab0 <= b[22:0];
+                            outsign0 <= a[31];
                         end
                     else
                         begin
                             mantissaa0 <= b[22:0];
                             mantissab0 <= a[22:0];
+                            outsign0 <= b[31];
                         end
                     outexponent0 <= a[30:23];
-                    outsign0 <= a[31];
                 end
         end
 
+    // Stage 1
     always_ff @(posedge clk)
         begin
             mantissab1 <= mantissab0;
@@ -106,6 +108,7 @@ module adder_32(
             count1 <= trailingzerodetector(mantissab0);
         end
 
+    // Stage 2
     always_ff @(posedge clk)
         begin
             stickybit2 <= (d1 > count1) ? 1 : 0;
@@ -117,9 +120,9 @@ module adder_32(
             eop2 <= eop1;
         end
 
+    // Stage 3
     assign mantissabnew2 = (eop2) ? ~(mantissabshift2 + stickybit2) : mantissabshift2 + stickybit2;
     assign {carrya2, outmantissaa2} = mantissaanew2 + mantissabnew2 + eop2;
-    assign {carrya2, outmantissaa2} = mantissaanew2 + mantissabnew2 + eop2;
 
     wire cond2;
     reg ext3, cond3;
@@ -139,6 +142,7 @@ module adder_32(
                 shift3 <= equal2 | (carrya2 & (~eop2));
         end
 
+    // Stage 4
     always_ff @(posedge clk)
         begin
             outsign4 <= outsign3;

+ 92 - 0
src/fpu32p/add32a.sv

@@ -0,0 +1,92 @@
+
+module adder_32a (clk, rst, a, b, y);
+    input clk, rst;     // rst is part of the interface, but no register in this module uses it
+    input [31:0] a, b;  // single-precision operands
+    output [31:0] y;    // a + b, available three clock edges after a and b are applied
+    // Pipeline: align -> add/subtract -> normalise (all registered), then combinational rounding.
+    // Alignment
+    wire exchange = (b[30:0] > a[30:0]);  // compare magnitudes so fp_large holds the bigger one
+    wire [31:0] fp_large = exchange? b : a;
+    wire [31:0] fp_small = exchange? a : b;
+    // Hidden bit is 1 unless the exponent field is all zeros.
+    wire [23:0] large_frac24 = {|fp_large[30:23], fp_large[22:0]};
+    wire [23:0] small_frac24 = {|fp_small[30:23], fp_small[22:0]};
+
+    wire sign = exchange ? b[31] : a[31];  // result takes the sign of the larger magnitude
+    wire op_sub = fp_large[31] ^ fp_small[31];  // differing signs: subtract the magnitudes
+
+    wire [7:0] exp_diff = fp_large[30:23] - fp_small[30:23];
+    wire small_den_only = (fp_large[30:23] != 0) & (fp_small[30:23] == 0);
+    wire [7:0] shift_amount = small_den_only? exp_diff - 8'h1 : exp_diff;
+    wire [49:0] small_frac50 = (shift_amount >= 26) ? {26'h0,small_frac24} : {small_frac24,26'h0} >> shift_amount;
+    wire [26:0] small_frac27 = {small_frac50[49:24],|small_frac50[23:0]};  // 26 bits + sticky OR
+    // Pipeline registers: suffix 1 = after alignment, suffix 2 = after add/subtract.
+    reg sign1, op_sub1, sign2;
+    reg [7:0] exp1, exp2;
+    reg [23:0] large_frac1;
+    reg [26:0] small_frac1;
+    reg [27:0] frac2;
+    always_ff @(posedge clk) begin
+        sign1 <= sign;
+        exp1 <= fp_large[30:23];
+        op_sub1 <= op_sub;
+        large_frac1 <= large_frac24;
+        small_frac1 <= small_frac27;
+    end
+
+    // Calculation
+    wire [27:0] aligned_large_frac = {1'b0,large_frac1,3'b000};
+    wire [27:0] aligned_small_frac = {1'b0,small_frac1};
+    wire [27:0] cal_frac = op_sub1 ?
+        aligned_large_frac - aligned_small_frac :
+        aligned_large_frac + aligned_small_frac;
+
+    always_ff @(posedge clk) begin
+        sign2 <= sign1;
+        exp2 <= exp1;
+        frac2 <= cal_frac;
+    end
+    // Normalisation: zeros counts the leading zeros of frac2[26:0]; f0 is frac2 with them shifted out.
+    wire [26:0] f4,f3,f2,f1,f0;
+    wire [4:0] zeros;
+    assign zeros[4] = ~|frac2[26:11]; // 16-bit 0
+    assign f4 = zeros[4]? {frac2[10:0],16'b0} : frac2[26:0];
+    assign zeros[3] = ~|f4[26:19]; // 8-bit 0
+    assign f3 = zeros[3]? {f4[18:0], 8'b0} : f4;
+    assign zeros[2] = ~|f3[26:23]; // 4-bit 0
+    assign f2 = zeros[2]? {f3[22:0], 4'b0} : f3;
+    assign zeros[1] = ~|f2[26:25]; // 2-bit 0
+    assign f1 = zeros[1]? {f2[24:0], 2'b0} : f2;
+    assign zeros[0] = ~f1[26]; // 1-bit 0
+    assign f0 = zeros[0]? {f1[25:0], 1'b0} : f1;
+
+    reg [26:0] frac3;
+    reg [7:0] exp3;
+    reg sign3;
+
+    always_ff @(posedge clk) begin
+        sign3 <= sign2;
+        if (frac2[27]) begin  // carry out of the addition: shift right by one, bump the exponent
+            frac3 <= frac2[27:1]; // 1x.xxxxxxxxxxxxxxxxxxxxxxx xxx
+            exp3 <= exp2 + 8'h1; // 1.xxxxxxxxxxxxxxxxxxxxxxx xxx
+        end else begin
+            if ((exp2 > zeros) && (f0[26])) begin // a normalized number
+                exp3 <= exp2 - zeros;
+                frac3 <= f0; // 01.xxxxxxxxxxxxxxxxxxxxxxx xxx
+            end else begin // is a denormalized number or 0
+                exp3 <= 0;
+                if (exp2 != 0) // (e - 127) = ((e - 1) - 126)
+                    frac3 <= frac2[26:0] << (exp2 - 8'h1);
+                else frac3 <= frac2[26:0];
+            end
+        end
+    end
+
+    wire frac_plus_1 = // for rounding
+    frac3[2] & (frac3[1] | frac3[0]) | frac3[2] & ~frac3[1] & ~frac3[0] & frac3[3];
+
+    wire [24:0] frac_round = {1'b0,frac3[26:3]} + frac_plus_1;
+    wire [7:0] exponent = frac_round[24]? exp3 + 8'h1 : exp3;  // rounding carried into bit 24
+    assign y = {sign3,exponent,frac_round[22:0]};  // no NaN/infinity/overflow special-casing here
+
+endmodule : adder_32a

+ 268 - 0
src/fpu32p/add32c.sv

@@ -0,0 +1,268 @@
+
+module adder_32c (a,b,sub,rm,s,clk,clrn,e); // pipelined fp adder: align | calculate | normalize
+    input clk, clrn; // clock and reset (clrn clears the pipeline registers when low)
+    input [31:0] a, b; // fp a and b
+    input [1:0] rm; // round mode (carried down the pipeline and decoded in fadd_norm)
+    input sub; // 1: sub; 0: add
+    input e; // enable (pipeline registers hold their value while e is low)
+    output [31:0] s; // fp output (combinational from the second register stage)
+    wire [26:0] a_small_frac; // a_*: outputs of the alignment stage
+    wire [23:0] a_large_frac;
+    wire [22:0] a_inf_nan_frac;
+    wire [7:0] a_exp;
+    wire a_is_nan,a_is_inf;
+    wire a_sign;
+    wire a_op_sub;
+    // exe1: alignment stage
+    fadd_align alignment (a,b,sub,a_is_nan,a_is_inf,a_inf_nan_frac,a_sign,
+        a_exp,a_op_sub,a_large_frac,a_small_frac);
+    wire [26:0] c_small_frac; // c_*: alignment results after the first register stage
+    wire [23:0] c_large_frac;
+    wire [22:0] c_inf_nan_frac;
+    wire [7:0] c_exp;
+    wire [1:0] c_rm;
+    wire c_is_nan,c_is_inf;
+    wire c_sign;
+    wire c_op_sub;
+    // pipelined registers
+    reg_align_cal reg_ac (rm,a_is_nan,a_is_inf,a_inf_nan_frac,a_sign,a_exp,
+        a_op_sub,a_large_frac,a_small_frac,clk,clrn,e,
+        c_rm,c_is_nan,c_is_inf,c_inf_nan_frac,c_sign,
+        c_exp,c_op_sub,c_large_frac,c_small_frac);
+    wire [27:0] c_frac; // 28-bit sum/difference produced by the calculation stage
+    // exe2: calculation stage
+    fadd_cal calculation(c_op_sub,c_large_frac,c_small_frac,c_frac);
+    wire [27:0] n_frac; // n_*: calculation results after the second register stage
+    wire [22:0] n_inf_nan_frac;
+    wire [7:0] n_exp;
+    wire [1:0] n_rm;
+    wire n_is_nan,n_is_inf;
+    wire n_sign;
+    // pipelined registers
+    reg_cal_norm reg_cn (c_rm,c_is_nan,c_is_inf,c_inf_nan_frac,c_sign,c_exp,
+        c_frac,clk,clrn,e,n_rm,n_is_nan,n_is_inf,
+        n_inf_nan_frac,n_sign,n_exp,n_frac);
+    // exe3: normalization stage
+    fadd_norm normalization (n_rm,n_is_nan,n_is_inf,n_inf_nan_frac,n_sign,
+        n_exp,n_frac,s);
+endmodule
+
+//==============================================
+// the alignment stage.
+module fadd_align (a,b,sub,s_is_nan,s_is_inf,inf_nan_frac,sign,temp_exp,
+    op_sub,large_frac24,small_frac27); // alignment stage: order operands, align the smaller one
+    input [31:0] a,b; // single-precision operands
+    input sub; // 1: compute a - b, 0: compute a + b
+    output [26:0] small_frac27; // aligned smaller significand, bit 0 is the sticky bit
+    output [23:0] large_frac24; // larger significand including the hidden bit
+    output [22:0] inf_nan_frac; // fraction to report for a NaN result, zero otherwise
+    output [7:0] temp_exp; // exponent field of the larger operand
+    output s_is_nan; // result is NaN
+    output s_is_inf; // at least one operand is infinity
+    output sign; // sign of the result
+    output op_sub; // 1 when the magnitudes must be subtracted
+    wire exchange = (b[30:0] > a[30:0]); // |b| > |a|: swap so fp_large has the larger magnitude
+    wire [31:0] fp_large = exchange? b : a;
+    wire [31:0] fp_small = exchange? a : b;
+    wire fp_large_hidden_bit = |fp_large[30:23]; // 0 only when the exponent field is zero
+    wire fp_small_hidden_bit = |fp_small[30:23];
+    wire [23:0] large_frac24 = {fp_large_hidden_bit,fp_large[22:0]};
+    wire [23:0] small_frac24 = {fp_small_hidden_bit,fp_small[22:0]};
+
+    assign temp_exp = fp_large[30:23];
+    assign sign = exchange? sub ^ b[31] : a[31]; // when b dominates, a subtraction flips its sign
+    assign op_sub = sub ^ fp_large[31] ^ fp_small[31]; // effective operation on the magnitudes
+
+    wire fp_large_expo_is_ff = &fp_large[30:23]; // exp == 0xff
+    wire fp_small_expo_is_ff = &fp_small[30:23];
+    wire fp_large_frac_is_00 = ~|fp_large[22:0]; // frac == 0x0
+    wire fp_small_frac_is_00 = ~|fp_small[22:0];
+    wire fp_large_is_inf=fp_large_expo_is_ff & fp_large_frac_is_00;
+    wire fp_small_is_inf=fp_small_expo_is_ff & fp_small_frac_is_00;
+    wire fp_large_is_nan=fp_large_expo_is_ff & ~fp_large_frac_is_00;
+    wire fp_small_is_nan=fp_small_expo_is_ff & ~fp_small_frac_is_00;
+    assign s_is_inf = fp_large_is_inf | fp_small_is_inf;
+    wire s_is_nan = fp_large_is_nan | fp_small_is_nan | // NaN in, or inf - inf
+        ((sub ^ fp_small[31] ^ fp_large[31]) &
+            fp_large_is_inf & fp_small_is_inf);
+    wire [22:0] nan_frac = (a[21:0] > b[21:0])? // larger low-22-bit payload, top fraction bit forced to 1
+        {1'b1,a[21:0]} : {1'b1,b[21:0]};
+    assign inf_nan_frac = s_is_nan? nan_frac : 23'h0;
+    wire [7:0] exp_diff = fp_large[30:23] - fp_small[30:23];
+    wire small_den_only = (fp_large[30:23] != 0) & // only the smaller operand has a zero exponent field
+        (fp_small[30:23] == 0);
+    wire [7:0] shift_amount = small_den_only? exp_diff - 8'h1 : exp_diff;
+    wire [49:0] small_frac50 = (shift_amount >= 26)? // shifts of 26 or more leave only sticky information
+        {26'h0,small_frac24} :
+        {small_frac24,26'h0} >> shift_amount;
+    assign small_frac27 = {small_frac50[49:24],|small_frac50[23:0]}; // 26 bits + OR of the rest
+endmodule
+
+//======================================================================
+// pipeline registers in between the alignment and calculation stages
+module reg_align_cal (a_rm,a_is_nan,a_is_inf,a_inf_nan_frac,a_sign,a_exp,
+    a_op_sub,a_large_frac,a_small_frac,clk,clrn,e,c_rm,
+    c_is_nan,c_is_inf,c_inf_nan_frac,c_sign,c_exp,
+    c_op_sub,c_large_frac,c_small_frac); // pipeline regs
+    input [26:0] a_small_frac; // a_*: values to capture
+    input [23:0] a_large_frac;
+    input [22:0] a_inf_nan_frac;
+    input [7:0] a_exp;
+    input [1:0] a_rm;
+    input a_is_nan, a_is_inf, a_sign, a_op_sub;
+    input e; // e: enable
+    input clk, clrn; // clock and reset
+    output reg [26:0] c_small_frac; // c_*: registered copies of the matching a_* inputs
+    output reg [23:0] c_large_frac;
+    output reg [22:0] c_inf_nan_frac;
+    output reg [7:0] c_exp;
+    output reg [1:0] c_rm;
+    output reg c_is_nan,c_is_inf,c_sign,c_op_sub;
+    // Asynchronous clear on clrn low; otherwise capture on the rising clock edge when e is high.
+    always @ (posedge clk or negedge clrn) begin
+        if (!clrn) begin
+            c_rm <= 0;
+            c_is_nan <= 0;
+            c_is_inf <= 0;
+            c_inf_nan_frac <= 0;
+            c_sign <= 0;
+            c_exp <= 0;
+            c_op_sub <= 0;
+            c_large_frac <= 0;
+            c_small_frac <= 0;
+        end else if (e) begin // with e low every register keeps its previous value
+            c_rm <= a_rm;
+            c_is_nan <= a_is_nan;
+            c_is_inf <= a_is_inf;
+            c_inf_nan_frac <= a_inf_nan_frac;
+            c_sign <= a_sign;
+            c_exp <= a_exp;
+            c_op_sub <= a_op_sub;
+            c_large_frac <= a_large_frac;
+            c_small_frac <= a_small_frac;
+        end
+    end
+endmodule
+
+// ===========================================================================
+// the calculation stage.
+module fadd_cal (op_sub,large_frac24,small_frac27, cal_frac); // add/subtract stage
+    input op_sub;               // 1: subtract the smaller significand, 0: add it
+    input [23:0] large_frac24;  // larger significand
+    input [26:0] small_frac27;  // aligned smaller significand
+    output [27:0] cal_frac;     // 28-bit result, bit 27 is the carry out of an addition
+    wire [27:0] big_operand = {1'b0,large_frac24,3'b000};  // pad to the 28-bit grid
+    wire [27:0] little_operand = {1'b0,small_frac27};
+    wire [27:0] frac_sum = big_operand + little_operand;
+    wire [27:0] frac_diff = big_operand - little_operand;
+    assign cal_frac = op_sub ? frac_diff : frac_sum;
+endmodule
+
+module reg_cal_norm (c_rm,c_is_nan,c_is_inf,c_inf_nan_frac,c_sign,c_exp,
+    c_frac,clk,clrn,e,n_rm,n_is_nan,n_is_inf,
+    n_inf_nan_frac,n_sign,n_exp,n_frac); // pipeline regs
+    input [27:0] c_frac; // c_*: values to capture
+    input [22:0] c_inf_nan_frac;
+    input [7:0] c_exp;
+    input [1:0] c_rm;
+    input c_is_nan, c_is_inf, c_sign;
+    input e; // e: enable
+    input clk, clrn; // clock and reset
+    output reg [27:0] n_frac; // n_*: registered copies of the matching c_* inputs
+    output reg [22:0] n_inf_nan_frac;
+    output reg [7:0] n_exp;
+    output reg [1:0] n_rm;
+    output reg n_is_nan,n_is_inf,n_sign;
+    always @ (posedge clk or negedge clrn) begin // asynchronous clear when clrn goes low
+        if (!clrn) begin
+            n_rm <= 0;
+            n_is_nan <= 0;
+            n_is_inf <= 0;
+            n_inf_nan_frac <= 0;
+            n_sign <= 0;
+            n_exp <= 0;
+            n_frac <= 0;
+        end else if (e) begin // with e low every register keeps its previous value
+            n_rm <= c_rm;
+            n_is_nan <= c_is_nan;
+            n_is_inf <= c_is_inf;
+            n_inf_nan_frac <= c_inf_nan_frac;
+            n_sign <= c_sign;
+            n_exp <= c_exp;
+            n_frac <= c_frac;
+        end
+    end
+endmodule
+
+//=================================================================
+// normalization
+
+module fadd_norm (rm,is_nan,is_inf,inf_nan_frac,sign,temp_exp,cal_frac,s); // normalise, round, pack
+    input [27:0] cal_frac; // sum/difference of the aligned significands
+    input [22:0] inf_nan_frac; // currently unused, see the note at the assignment of s
+    input [7:0] temp_exp; // exponent field of the larger operand
+    input [1:0] rm; // round mode, decoded in frac_plus_1 below
+    input is_nan,is_inf; // currently unused, see the note at the assignment of s
+    input sign;
+    output [31:0] s; // packed result {sign, exponent, fraction}
+    wire [26:0] f4,f3,f2,f1,f0; // cal_frac[26:0] after each conditional left shift
+    wire [4:0] zeros; // each bit records whether the shift of that weight was applied
+    assign zeros[4] = ~|cal_frac[26:11]; // 16-bit 0
+    assign f4 = zeros[4]? {cal_frac[10:0],16'b0} : cal_frac[26:0];
+    assign zeros[3] = ~|f4[26:19]; // 8-bit 0
+    assign f3 = zeros[3]? {f4[18:0], 8'b0} : f4;
+    assign zeros[2] = ~|f3[26:23]; // 4-bit 0
+    assign f2 = zeros[2]? {f3[22:0], 4'b0} : f3;
+    assign zeros[1] = ~|f2[26:25]; // 2-bit 0
+    assign f1 = zeros[1]? {f2[24:0], 2'b0} : f2;
+    assign zeros[0] = ~f1[26]; // 1-bit 0
+    assign f0 = zeros[0]? {f1[25:0], 1'b0} : f1;
+    reg [26:0] frac0; // normalised significand: 24 result bits plus 3 rounding bits
+    reg [7:0] exp0; // exponent before rounding
+    always @ * begin
+        if (cal_frac[27]) begin // carry out: shift right by one and bump the exponent
+            frac0 = cal_frac[27:1]; // 1x.xxxxxxxxxxxxxxxxxxxxxxx xxx
+            exp0 = temp_exp + 8'h1; // 1.xxxxxxxxxxxxxxxxxxxxxxx xxx
+        end else begin
+            if ((temp_exp > zeros) && (f0[26])) begin // a normalized number
+                exp0 = temp_exp - zeros;
+                frac0 = f0; // 01.xxxxxxxxxxxxxxxxxxxxxxx xxx
+            end else begin // is a denormalized number or 0
+                exp0 = 0;
+                if (temp_exp != 0) // (e - 127) = ((e - 1) - 126)
+                    frac0 = cal_frac[26:0] << (temp_exp - 8'h1);
+                else frac0 = cal_frac[26:0];
+            end
+        end
+    end
+    wire frac_plus_1 = // for rounding; rm == 2'b11 matches no term, so it never increments
+    ~rm[1] & ~rm[0] & frac0[2] & (frac0[1] | frac0[0]) | // rm 00: above half -> up
+        ~rm[1] & ~rm[0] & frac0[2] & ~frac0[1] & ~frac0[0] & frac0[3] | // rm 00: exact half, odd lsb -> up
+            ~rm[1] & rm[0] & (frac0[2] | frac0[1] | frac0[0]) & sign | // rm 01: inexact and negative
+                rm[1] & ~rm[0] & (frac0[2] | frac0[1] | frac0[0]) & ~sign; // rm 10: inexact and positive
+    wire [24:0] frac_round = {1'b0,frac0[26:3]} + frac_plus_1;
+    wire [7:0] exponent = frac_round[24]? exp0 + 8'h1 : exp0; // rounding carried into bit 24
+    wire overflow = &exp0 | &exponent; // only consumed by the commented-out final_result below
+    assign s = {sign,exponent,frac_round[22:0]}; // NOTE(review): is_nan/is_inf/inf_nan_frac/overflow are ignored here
+    // assign s = final_result(overflow, rm, sign, is_nan, is_inf, exponent,
+    //     frac_round[22:0], inf_nan_frac);
+    // function [31:0] final_result;
+    //     input overflow;
+    //     input [1:0] rm;
+    //     input sign, is_nan, is_inf;
+    //     input [7:0] exponent;
+    //     input [22:0] fraction, inf_nan_frac;
+    //     casex ({overflow, rm, sign, is_nan, is_inf})
+    //         6'b1_00_x_0_x : final_result = {sign,8'hff,23'h000000}; // inf
+    //         6'b1_01_0_0_x : final_result = {sign,8'hfe,23'h7fffff}; // max
+    //         6'b1_01_1_0_x : final_result = {sign,8'hff,23'h000000}; // inf
+    //         6'b1_10_0_0_x : final_result = {sign,8'hff,23'h000000}; // inf
+    //         6'b1_10_1_0_x : final_result = {sign,8'hfe,23'h7fffff}; // max
+    //         6'b1_11_x_0_x : final_result = {sign,8'hfe,23'h7fffff}; // max
+    //         6'b0_xx_x_0_0 : final_result = {sign,exponent,fraction}; // nor
+    //         6'bx_xx_x_1_x : final_result = {1'b1,8'hff,inf_nan_frac}; // nan
+    //         6'bx_xx_x_0_1 : final_result = {sign,8'hff,inf_nan_frac}; // inf
+    //         default : final_result = {sign,8'h00,23'h000000}; // 0
+    //     endcase
+    // endfunction : final_result
+endmodule

+ 22 - 7
src/fpu32p/fpu32p.sv

@@ -1,6 +1,10 @@
 `include "mult32.v"
-`include "add32.sv"
+// `include "add32.sv"
+// `include "add32b.sv"
+`include "add32c.sv"
 
+`define PP_FP_MULT 12
+`define PP_FP_ADDER 1
 
 module fpu32p_tb;
 
@@ -15,11 +19,22 @@ module fpu32p_tb;
     assign {w_exp_sign, w_exp_exp, w_exp_man} = expected_add;
     assign {w_res_sign, w_res_exp, w_res_man} = result_add;
 
-    adder_32 adder1(
+    // adder_32 adder1(
+    //     .clk(clk),
+    //     .a(input_a),
+    //     .b(input_b),
+    //     .out(result_add)
+    // );
+    // pipelined_fadder adder1(result_add, input_a, input_b, clk);
+    adder_32c adder0(
         .clk(clk),
+        .clrn(~reset),
         .a(input_a),
         .b(input_b),
-        .out(result_add)
+        .sub(1'b0),
+        .rm(2'b00),
+        .s(result_add),
+        .e(1'b1)
     );
 
     mult_32 multiplier1(
@@ -33,8 +48,8 @@ module fpu32p_tb;
     );
 
     initial forever #5 clk = ~clk;
-    localparam PIPELINES_ADD = 4;
-    localparam PIPELINES_MUL = 12;
+    localparam PIPELINES_ADD = `PP_FP_ADDER;
+    localparam PIPELINES_MUL = `PP_FP_MULT;
 
     reg [31:0] test_mem [29:0][3:0];
 
@@ -50,8 +65,8 @@ module fpu32p_tb;
         #5;
         reset = 0;
 
-        input_a = 'hbec64dc6;
-        input_b = 'h3ecc3194;
+        input_b = 'hbec64dc6;
+        input_a = 'h3ecc3194;
 
         expected_add = 'h3c3c79c0;
         expected_mult = 0;

+ 51 - 0
src/neural/adder_casc_tb.sv

@@ -0,0 +1,51 @@
+module adder_casc_tb();
+    logic clk, rst;
+    // Testbench feeding the same operand array to adder_casc (abus_io ports) and adder_casc_p.
+    localparam K=2;
+    logic [31:0] x [2**K-1:0];  // 2**K operand words shared by both instances
+    logic [31:0] y0, y1;  // y0: adder_casc output, y1: adder_casc_p output (y1 is never checked)
+    logic ack [2**K-1:0];
+    logic stb [2**K-1:0];
+
+    abus_io input_ios[2**K-1:0]();
+    abus_io output_io();
+
+    genvar k;
+    generate
+        for(k=0; k<2**K; k++) begin : io_mapper  // bridge the plain stb/ack arrays onto the interface array
+            assign input_ios[k].stb = stb[k];
+            assign ack[k] = input_ios[k].ack;
+        end
+    endgenerate
+
+
+    adder_casc#(.K(K)) adder_casc0(.clk(clk), .rst(rst), .x(x), .y(y0), .left(input_ios), .right(output_io.left));
+
+    adder_casc_p#(.K(K)) adder_casc1(.clk(clk), .rst(rst), .x(x), .y(y1));
+
+    initial forever #5 clk = ~clk;  // clock toggles every 5 time units
+    initial begin
+
+        $display("Testing adder_casc");
+        clk = 0;
+        rst = 1;
+
+        foreach(stb[i]) stb[i] = 0;
+        output_io.ack = 0;
+        // Initialise with floating point 2**i
+        // foreach(x[i]) x[i] = ('h400 + (i*8)) << 20;
+        x = {'h41ea6000, 'h42ea6000, 'h411ba000, 'h413cc000};
+
+        #10;
+        rst = 0;  // release reset, then pulse every input strobe for 20 time units
+        foreach(stb[i]) stb[i] = 1;
+        #20;
+        foreach(stb[i]) stb[i] = 0;
+        wait(output_io.stb == 1);  // block until adder_casc0 raises its output strobe
+        output_io.ack = 1;
+        assert(y0[0] == 'h47ffff00);  // NOTE(review): y0[0] is a single bit compared with a 32-bit literal; confirm whether y0 was intended
+        wait(output_io.stb == 0);
+        output_io.ack = 0;
+    end
+
+endmodule : adder_casc_tb

+ 1 - 52
src/neural/comp.sv

@@ -172,7 +172,7 @@ module adder_casc_p #(parameter K, N=32)(clk, rst, x, y);
                 // Middle layer
             else begin
                 for(j=0; j<2**(K-i-1); j++) begin : gen_mid_layer
-                    localparam s = $floor((2.0**(K-1.0) * (2.0**(i-1)-1.0)/2.0**(i-1))+j);
+                    localparam s = $floor((2.0**(K-1.0) * \c(2.0**(i-1)-1.0)/2.0**(i-1))+j);
                     localparam ix = s*2;
                     localparam ix1 = s*2+1;
                     localparam iy = s+2**(K-1);
@@ -189,57 +189,6 @@ module adder_casc_p #(parameter K, N=32)(clk, rst, x, y);
     endgenerate
 endmodule : adder_casc_p
 
-module adder_casc_tb();
-    logic clk, rst;
-    
-    localparam K=2;
-    logic [31:0] x [2**K-1:0];
-    logic [31:0] y0, y1;
-    logic ack [2**K-1:0];
-    logic stb [2**K-1:0];
-
-    abus_io input_ios[2**K-1:0]();
-    abus_io output_io();
-    
-    genvar k;
-    generate
-        for(k=0; k<2**K; k++) begin : io_mapper
-            assign input_ios[k].stb = stb[k];
-            assign ack[k] = input_ios[k].ack;
-        end
-    endgenerate
-    
-    
-    adder_casc#(.K(K)) adder_casc0(.clk(clk), .rst(rst), .x(x), .y(y0), .left(input_ios), .right(output_io.left));
-
-    adder_casc_p#(.K(K)) adder_casc1(.clk(clk), .rst(rst), .x(x), .y(y1));
-
-    initial forever #5 clk = ~clk;
-    initial begin
-        
-        $display("Testing adder_casc");
-        clk = 0;
-        rst = 1;
-        
-        foreach(stb[i]) stb[i] = 0;
-        output_io.ack = 0;
-        // Initialise with floating point 2**i
-        // foreach(x[i]) x[i] = ('h400 + (i*8)) << 20;
-        x = {'h41ea6000, 'h42ea6000, 'h411ba000, 'h413cc000};
-
-        #10;
-        rst = 0;
-        foreach(stb[i]) stb[i] = 1;
-        #20;
-        foreach(stb[i]) stb[i] = 0;
-        wait(output_io.stb == 1);
-        output_io.ack = 1;
-        assert(y0[0] == 'h47ffff00);
-        wait(output_io.stb == 0);
-        output_io.ack = 0;
-    end
-    
-endmodule : adder_casc_tb
 
 
 

+ 110 - 47
src/neural/layer.sv

@@ -85,83 +85,143 @@ ONE HOT -> ... [ ] ... [ ] ..
 
 */
 
-module neuron_network_tb;
-    reg clk, rst;
-    reg [31:0] x [3:0];
-    reg [31:0] y [1:0];
+module neural_network_encoder(clk, rst, x, y);
+    input clk, rst;
+    input [31:0] x;
+    output [31:0] y;
 
-    abus_io left[3:0]();
-    abus_io right[1:0]();
+    reg [31:0] layer1_s [7:0];
+    reg [31:0] layer2_s [7:0];
+    reg [31:0] layer3_s [1:0];
+    reg [31:0] y [1:0];
 
     reg [31:0] layer1_w [0:7][3:0];
     reg [31:0] layer1_b [0:7];
-    reg [31:0] layer1_o [7:0];
-    abus_io layer1_io [7:0]();
-
     reg [31:0] layer2_w [0:7][7:0];
     reg [31:0] layer2_b [7:0];
-    reg [31:0] layer2_o [7:0];
-    abus_io layer2_io [7:0]();
-
     reg [31:0] layer3_w [0:1][7:0];
     reg [31:0] layer3_b [1:0];
-    reg [31:0] layer3_o [1:0];
-    abus_io layer3_io [1:0]();
-
-    logic y_stb;
-    assign y_stb = right[0].stb & right[1].stb;
 
-    neuron_layer#(.C(2), .K(3)) layer1(
+    neuron_layer_p#(.C(2), .K(3)) layer_s1(
         .clk(clk),
         .rst(rst),
         .x(x),
-        .y(layer1_o),
+        .y(layer1_s),
         .w(layer1_w),
-        .b(layer1_b),
-        .left(left),
-        .right(layer1_io)
+        .b(layer1_b)
     );
 
-    neuron_layer#(.C(3), .K(3)) layer2(
+    neuron_layer_p#(.C(3), .K(3)) layer_s2(
         .clk(clk),
         .rst(rst),
-        .x(layer1_o),
-        .y(layer2_o),
+        .x(layer1_s),
+        .y(layer2_s),
         .w(layer2_w),
-        .b(layer2_b),
-        .left(layer1_io),
-        .right(layer2_io)
+        .b(layer2_b)
     );
 
-    neuron_layer#(.C(3), .K(1)) layer3(
+    neuron_layer_p#(.C(3), .K(1)) layer_s3(
         .clk(clk),
         .rst(rst),
-        .x(layer2_o),
-        .y(layer3_o),
+        .x(layer2_s),
+        .y(layer3_s),
         .w(layer3_w),
-        .b(layer3_b),
-        .left(layer2_io),
-        .right(layer3_io)
+        .b(layer3_b)
     );
 
-    hard_sigmoid sigmoid0(
+    hard_sigmoid_p sigmoid_s0(
         .clk(clk),
         .rst(rst),
-        .x(layer3_o[0]),
-        .y(y[0]),
-        .left(layer3_io[0]),
-        .right(right[0])
+        .x(layer3_s[0]),
+        .y(ys[0])
     );
 
-    hard_sigmoid sigmoid1(
+    hard_sigmoid_p sigmoid_s1(
         .clk(clk),
         .rst(rst),
-        .x(layer3_o[1]),
-        .y(y[1]),
-        .left(layer3_io[1]),
-        .right(right[1])
+        .x(layer3_s[1]),
+        .y(ys[1])
     );
 
+endmodule : neural_network_encoder
+
+module neuron_network_tb;
+    reg clk, rst;
+    reg [31:0] x [3:0];
+    reg [31:0] y [1:0];
+
+    // abus_io left[3:0]();
+    // abus_io right[1:0]();
+    //
+    // reg [31:0] layer1_w [0:7][3:0];
+    // reg [31:0] layer1_b [0:7];
+    // reg [31:0] layer1_o [7:0];
+    // abus_io layer1_io [7:0]();
+    //
+    // reg [31:0] layer2_w [0:7][7:0];
+    // reg [31:0] layer2_b [7:0];
+    // reg [31:0] layer2_o [7:0];
+    // abus_io layer2_io [7:0]();
+    //
+    // reg [31:0] layer3_w [0:1][7:0];
+    // reg [31:0] layer3_b [1:0];
+    // reg [31:0] layer3_o [1:0];
+    // abus_io layer3_io [1:0]();
+    //
+    // logic y_stb;
+    // assign y_stb = right[0].stb & right[1].stb;
+    //
+    // neuron_layer#(.C(2), .K(3)) layer1(
+    //     .clk(clk),
+    //     .rst(rst),
+    //     .x(x),
+    //     .y(layer1_o),
+    //     .w(layer1_w),
+    //     .b(layer1_b),
+    //     .left(left),
+    //     .right(layer1_io)
+    // );
+    //
+    // neuron_layer#(.C(3), .K(3)) layer2(
+    //     .clk(clk),
+    //     .rst(rst),
+    //     .x(layer1_o),
+    //     .y(layer2_o),
+    //     .w(layer2_w),
+    //     .b(layer2_b),
+    //     .left(layer1_io),
+    //     .right(layer2_io)
+    // );
+    //
+    // neuron_layer#(.C(3), .K(1)) layer3(
+    //     .clk(clk),
+    //     .rst(rst),
+    //     .x(layer2_o),
+    //     .y(layer3_o),
+    //     .w(layer3_w),
+    //     .b(layer3_b),
+    //     .left(layer2_io),
+    //     .right(layer3_io)
+    // );
+    //
+    // hard_sigmoid sigmoid0(
+    //     .clk(clk),
+    //     .rst(rst),
+    //     .x(layer3_o[0]),
+    //     .y(y[0]),
+    //     .left(layer3_io[0]),
+    //     .right(right[0])
+    // );
+    //
+    // hard_sigmoid sigmoid1(
+    //     .clk(clk),
+    //     .rst(rst),
+    //     .x(layer3_o[1]),
+    //     .y(y[1]),
+    //     .left(layer3_io[1]),
+    //     .right(right[1])
+    // );
+
     /* ******************
     Pipelined network
     ********************/
@@ -175,7 +235,8 @@ module neuron_network_tb;
         .rst(rst),
         .x(x),
         .y(layer1_s),
-        .w(layer1_w), .b(layer1_b)
+        .w(layer1_w),
+        .b(layer1_b)
     );
 
     neuron_layer_p#(.C(3), .K(3)) layer_s2(
@@ -183,7 +244,8 @@ module neuron_network_tb;
         .rst(rst),
         .x(layer1_s),
         .y(layer2_s),
-        .w(layer2_w), .b(layer2_b)
+        .w(layer2_w),
+        .b(layer2_b)
     );
 
     neuron_layer_p#(.C(3), .K(1)) layer_s3(
@@ -191,7 +253,8 @@ module neuron_network_tb;
         .rst(rst),
         .x(layer2_s),
         .y(layer3_s),
-        .w(layer3_w), .b(layer3_b)
+        .w(layer3_w),
+        .b(layer3_b)
     );
 
     hard_sigmoid_p sigmoid_s0(

+ 8 - 4
src/neural/neural.sv

@@ -16,11 +16,15 @@ module neural_adder(clk, rst, x0, x1, y);
     //     .clk(clk),
     //     .reset(rst)
     // );
-    adder_32 adder0(
+    adder_32c adder0(
         .clk(clk),
+        .clrn(~rst),
         .a(x0),
         .b(x1),
-        .out(y)
+        .sub(1'b0),
+        .rm(2'b11),
+        .s(y),
+        .e(1'b1)
     );
 
 endmodule : neural_adder
@@ -51,11 +55,11 @@ endmodule : neural_mult
 module neural_comp_gt(x0, x1, y);
     input [31:0] x0, x1;
     output y;
-    fpu32_gt gt0(x0, x1, y0);
+    fpu32_gt gt0(x0, x1, y);
 endmodule : neural_comp_gt
 
 module neural_comp_lt(x0, x1, y);
     input [31:0] x0, x1;
     output y;
-    fpu32_lt lt0(x0, x1, y0);
+    fpu32_lt lt0(x0, x1, y);
 endmodule : neural_comp_lt

+ 77 - 16
src/neural/sigmoid.sv

@@ -1,4 +1,10 @@
 `include "../fpu32/compare.sv"
+`include "../fpu32p/fpu32p.sv"
+
+// synopsys translate_off
+`timescale 1 ps / 1 ps
+// synopsys translate_on
+
 
 typedef enum logic [2:0] {
     hs_input,
@@ -123,11 +129,35 @@ module hard_sigmoid #(parameter N=32)(clk, rst, x, y, left, right);
 
 endmodule : hard_sigmoid
 
+module pipeline_casc #(parameter STAGES=5, N=32) (clk, rst, x, y);
+    input clk, rst;          // rst clears every stage on the next rising clock edge
+    input [N-1:0] x;         // word entering the delay line
+    output logic [N-1:0] y;  // x as it was STAGES rising clock edges earlier
+    reg [N-1:0] delay_line [STAGES-1:0];
+
+    // The output is the last element of the delay line.
+    assign y = delay_line[STAGES-1];
+
+    // Element 0 samples x and every other element samples its predecessor.
+    // Non-blocking assignments make all elements move in the same clock edge.
+    always_ff @(posedge clk) begin
+        for (int i=0; i<STAGES; i++) begin
+            if (rst)
+                delay_line[i] <= 0;
+            else if (i == 0)
+                delay_line[i] <= x;
+            else
+                delay_line[i] <= delay_line[i-1];
+        end
+    end
+
+endmodule : pipeline_casc
+
 module hard_sigmoid_p #(parameter N=32)(clk, rst, x, y);
     input clk, rst;
     input [N-1:0] x;
     output logic [N-1:0] y;
-    logic [N-1:0] value, comp_result;
+    logic [N-1:0] comp_result, value;
 
     logic gt_neg0, gt_neg1;
     logic lt_pos0, lt_pos1;
@@ -147,37 +177,68 @@ module hard_sigmoid_p #(parameter N=32)(clk, rst, x, y);
         .clk(clk),
         .rst(rst),
         .x0('h40200000),
-        .x1(value),
+        .x1(x),
         .y(join_value)
     );
 
+    pipeline_casc #(.STAGES(`PP_FP_MULT + `PP_FP_ADDER + 1), .N(N)) delay0(
+        .clk(clk),
+        .rst(rst),
+        .x(x),
+        .y(value)
+    );
+
     neural_comp_gt gt0(value, 'hc0200000, gt_neg0); // more then -2.5
     neural_comp_lt lt0(value, 'h40200000, lt_pos0); // less then +2.5
 
-    always_ff @(posedge clk) begin
-        gt_neg1 <= gt_neg0;
-        lt_pos1 <= lt_pos0;
-        if(~gt_neg1) begin
-            y <= 0;
-        end
-        // if in between -2.5 and 2.5
-        else if(gt_neg1 & lt_pos1) begin
-            y <= comp_result;
-        end
-        // if more than 2.5 ouput 1
-        else begin
-            y <= 'h3f800000;
-        end
+    always_comb begin
+        if(~gt_neg1 | rst)
+            y = 0;
+        else if(gt_neg1 & lt_pos1)  // if in between -2.5 and 2.5
+            y = comp_result;
+        else  // if more than 2.5 ouput 1
+            y = 'h3f800000;
+    end
 
+    always_ff @(posedge clk) begin
         if(rst) begin
             gt_neg1 <= 0;
             lt_pos1 <= 0;
+        end else begin
+            gt_neg1 <= gt_neg0;
+            lt_pos1 <= lt_pos0;
         end
     end
 
 endmodule : hard_sigmoid_p
 
 
+module hard_sigmoid_p_tb;
+    reg rst, clk;
+    reg [31:0] x;
+    wire [31:0] y;
+    // Stimulus-only bench: streams words from a hex file into hard_sigmoid_p; y is never checked here.
+    hard_sigmoid_p sigmoid0(clk, rst, x, y);
+
+    reg [31:0] test_mem [5000:0];  // one 32-bit input word per entry
+    initial $readmemh("scripts/sigmoid_test.hex", test_mem);
+
+    initial forever #5 clk = ~clk;  // clock toggles every 5 time units
+    initial begin
+        clk = 0;
+        rst = 1;
+        # 15;  // hold reset across the first rising clock edge (at time 5)
+        rst = 0;
+        for (int i=0; i < $size(test_mem); i++) begin
+            x = test_mem[i];  // apply one new word per clock period
+            #10;
+        end
+        $finish();
+    end
+
+endmodule : hard_sigmoid_p_tb
+
+
 module hard_sigmoid_tb;
     reg rst, clk;
     reg [31:0] x;

+ 4 - 0
src/root.sv

@@ -2,6 +2,8 @@
 `timescale 1 ps / 1 ps
 // synopsys translate_on
 
+`include "neural/neural.sv"
+
 module root(
     input  clk,
     input  [1:0] keys,
@@ -21,6 +23,8 @@ module root(
         .locked(pll_lock)
     );
 
+    neural_network_encoder nn0(mclk, reset, 32'd0);
+
 endmodule : root