Ver código fonte

Merge branch 'master' of https://gogs.infcof.com/4ycp/altera_devel into Oliver_FPA

Oliver Jaison 4 anos atrás
pai
commit
4f7fcbca22

+ 38 - 14
scripts/fpu_test_gen.py

@@ -1,6 +1,6 @@
 import numpy as np
 import os
-import struct
+import sys
 
 
 def reverse_endian(data):
@@ -10,34 +10,58 @@ def reverse_endian(data):
     return bytes(result)
 
 
-def generate_fp_vector(cases, filename, dtype=np.float16, big_endian=False):
-    dsize = 0
+def dtype_size(dtype):
     if dtype == np.float16:
-        dsize = 2
+        return 2
     elif dtype == np.float32:
-        dsize = 4
+        return 4
     else:
         raise ValueError(f"Unknown dtype {dtype}")
 
+def generate_numbers(cases, dtype=np.float16):
+    dsize = dtype_size(dtype)
     x = np.frombuffer(os.urandom(cases * dsize), dtype=dtype)
     y = np.frombuffer(os.urandom(cases * dsize), dtype=dtype)
+    return x, y, dsize
+
+
+def generate_fp_vector(cases, filename, dtype=np.float16, big_endian=False, comp_file=None):
+    x, y, dsize = generate_numbers(cases, dtype)
     np.seterr(all='ignore')
     sum = x + y
     mul = x * y
-    x = x.tobytes()
-    y = y.tobytes()
-    sum = sum.tobytes()
-    mul = mul.tobytes()
     with open(filename, 'w') as f:
         for i in range(cases):
             t = lambda v: reverse_endian(v) if big_endian else v
             f.write(' '.join([
-                t(x[i * dsize:i * dsize + dsize]).hex(),
-                t(y[i * dsize:i * dsize + dsize]).hex(),
-                t(sum[i * dsize:i * dsize + dsize]).hex(),
-                t(mul[i * dsize:i * dsize + dsize]).hex(),
+                t(x.tobytes()[i * dsize:i * dsize + dsize]).hex(),
+                t(y.tobytes()[i * dsize:i * dsize + dsize]).hex(),
+                t(sum.tobytes()[i * dsize:i * dsize + dsize]).hex(),
+                t(mul.tobytes()[i * dsize:i * dsize + dsize]).hex(),
             ]) + '\n')
+    if comp_file is not None:
+        gt = x > y
+        lt = x < y
+        ge = x >= y
+        le = x <= y
+        eq = x == y
+        with open(comp_file, 'w') as f:
+            for i in range(cases):
+                f.write(''.join([
+                    '1' if gt[i] else '0',
+                    '1' if lt[i] else '0',
+                    '1' if ge[i] else '0',
+                    '1' if le[i] else '0',
+                    '1' if eq[i] else '0'
+                ]) + f'  // {x[i]:10.3e} {y[i]:10.3e}\n')
 
 
 if __name__ == '__main__':
-    generate_fp_vector(30, 'fp16_test.hex', dtype=np.float16, big_endian=True)
+    generate = 50
+    if len(sys.argv) == 2 and sys.argv[1].isdigit():
+        generate = int(sys.argv[1])
+    else:
+        print(f"Usage: {sys.argv[0]} [number of generated tests]")
+
+    generate_fp_vector(generate, 'fp16_test.hex', dtype=np.float16, big_endian=True)
+    generate_fp_vector(generate, 'fp32_test.hex', dtype=np.float32, big_endian=True, comp_file='fp32_test_comp.hex')

+ 45 - 0
scripts/neuron_net_test.py

@@ -0,0 +1,45 @@
+import numpy as np
+from matplotlib import pyplot as plt
+from fpu_test_gen import reverse_endian, dtype_size
+
+WEIGTHS = [
+    [
+        [0.7095146, -0.41895103, -0.08075078, -0.5218736],
+        [-0.4351325, 1.0214638, 0.14494987, -0.78134096],
+        [0.38553882, -1.0607314, 0.01327306, -0.28972188],
+        [0.84955347, 0.32464203, 0.8879888, 0.00756884],
+        [-0.8693629, 0.8418823, 0.60206324, -0.78290594],
+        [0.1586302, 0.01737848, 0.75329006, -0.57819647],
+        [-0.16126093, 0.5317601, 0.34316933, -0.7074082],
+        [0.09219088, -0.624525, -0.61903083, -0.87057704]
+    ],
+    [[0.36770403, -0.78046024, 0.3979908, 0.5494289, -0.13859335, 0.40053025, 0.08249452, -0.32528356],
+     [-0.17659009, 0.13901198, -0.45248222, -0.7894139, -0.81092286, -0.521815, 0.30632392, -0.3143816],
+     [-0.04314173, 0.14361085, 0.6259473, 0.3571782, -0.38011226, 0.01378736, 0.05794358, 0.09667788],
+     [-0.46864474, 0.36618456, -0.45595396, -0.39789405, 0.73964316, -0.30294785, 0.2482118, -0.2127953],
+     [-0.37941265, 0.45330787, -0.12066315, 0.5636705, 0.68990386, 0.6543718, 0.86367106, -0.5707757],
+     [-0.78606385, 0.24032554, -0.4472755, -0.24661142, -0.2698564, -0.8365823, -0.13674814, -0.39799848],
+     [0.11138931, 0.48950365, 0.12998834, 0.4947537, 0.516593, 0.82281274, 0.04789656, 0.30206403],
+     [0.23097174, 0.30290592, -0.596446, -0.40108407, 0.12246455, -0.47260976, -0.55030185, 0.44481543]
+     ]
+    [
+        [0.5724262, 0.5853241, 0.3748752, -0.892384, -1.0270239, 0.2170913, -0.07271451, 0.14661156],
+        [0.30391088, -0.92324615, 0.8088594, -1.0522624, 0.07374455, -0.550893, 0.8194236, -0.62796086]
+    ]
+]
+BIAS = [
+    # L1
+    [0.01425434, -0.06219335, -0.0201127, -0.04791382, -0.04360008, -0.05311861, -0.01731363, -0.00014839],
+    # L2
+    [0.03480967, 0.06208326, -0.01576317, -0.00037753, -0.03940378, 0.05157978, -0.02775403, 0.04540931],
+    # L3
+    [0.03787775, -0.03655371],
+]
+
+
+def generate_nn_values(dtype=np.float32):
+    dsize = dtype_size(dtype)
+
+
+if __name__ == '__main__':
+    generate_nn_values()

+ 24 - 0
scripts/number_conv.py

@@ -0,0 +1,24 @@
+import numpy as np
+from fpu_test_gen import reverse_endian, dtype_size
+import sys
+
+
+def float2verilog(lines, dtype=np.float32):
+    dsize = dtype_size(dtype)
+    print("")
+    for i, line in enumerate(lines):
+        arr = line.replace('[', '').replace(']', '').split()
+        nums = np.array([float(f.strip(',')) for f in arr if f], dtype=dtype)
+        b = nums.tobytes()
+        print(f'[{i}] = {{' +
+              ', '.join(["'h" + reverse_endian(b[i*dsize:i*dsize+dsize]).hex() for i in range(len(arr))]) +
+              '};')
+
+
+if __name__ == '__main__':
+    print("Press Ctrl-D to proceed")
+    while True:
+        lines = sys.stdin.readlines()
+        if len(lines) == 0:
+            break
+        float2verilog(lines)

+ 43 - 0
scripts/sigmoid_test.py

@@ -0,0 +1,43 @@
+import numpy as np
+from matplotlib import pyplot as plt
+from fpu_test_gen import reverse_endian, dtype_size
+
+
+def generate(fname, samples, dtype=np.float32):
+    dsize = dtype_size(dtype)
+    numbers = np.linspace(-4, 4, samples, dtype=dtype)
+    data = numbers.tobytes()
+    with open(fname, 'w') as f:
+        for i in range(samples):
+            f.write(f"{reverse_endian(data[i*dsize:i*dsize+dsize]).hex()}  // {numbers[i]:0.6f}\n")
+
+
+def view_result(fname, dtype=np.float32):
+    x_bytes = b''
+    y_bytes = b''
+    timing = []
+    with open(fname, 'r') as f:
+        for line in f.readlines():
+            parts = line.split()
+            x_bytes += reverse_endian(bytes.fromhex(parts[0]))
+            y_bytes += reverse_endian(bytes.fromhex(parts[1]))
+            timing.append(int(parts[2]))
+    x = np.frombuffer(x_bytes, dtype=dtype)
+    y = np.frombuffer(y_bytes, dtype=dtype)
+    t = np.array(timing) / 10
+
+    fig, ax = plt.subplots()
+    plt.title('Digital circuit sigmoid function test')
+    ax2 = ax.twinx()
+    ax.plot(x, y, '.', markersize=0.95)
+    ax2.plot(x, t, '.', markersize=0.95, color='m')
+    ax.set_xlabel('Function input')
+    ax.set_ylabel('Function output')
+    ax2.set_ylabel('Timing in cycles', color='m')
+    plt.grid()
+    plt.show()
+
+
+if __name__ == '__main__':
+    # generate('sigmoid_test.hex', 5001)
+    view_result('sigmoid_result.hex')

Diferenças do arquivo suprimidas por serem muito extensas
+ 42 - 0
simulation/modelsim/wave_adder_casc_tb.do


+ 32 - 0
simulation/modelsim/wave_fpu32_compare_tb.do

@@ -0,0 +1,32 @@
+onerror {resume}
+quietly WaveActivateNextPane {} 0
+add wave -noupdate -radix float32 /fpu32_compare_tb/x0
+add wave -noupdate -radix float32 /fpu32_compare_tb/x1
+add wave -noupdate /fpu32_compare_tb/y_gt
+add wave -noupdate /fpu32_compare_tb/exp_gt
+add wave -noupdate /fpu32_compare_tb/y_lt
+add wave -noupdate /fpu32_compare_tb/exp_lt
+add wave -noupdate /fpu32_compare_tb/y_ge
+add wave -noupdate /fpu32_compare_tb/exp_ge
+add wave -noupdate /fpu32_compare_tb/y_le
+add wave -noupdate /fpu32_compare_tb/exp_le
+add wave -noupdate /fpu32_compare_tb/y_eq
+add wave -noupdate /fpu32_compare_tb/exp_eq
+TreeUpdate [SetDefaultTree]
+WaveRestoreCursors {{Cursor 1} {47 ps} 0}
+quietly wave cursor active 1
+configure wave -namecolwidth 294
+configure wave -valuecolwidth 88
+configure wave -justifyvalue left
+configure wave -signalnamewidth 0
+configure wave -snapdistance 10
+configure wave -datasetprefix 0
+configure wave -rowmargin 4
+configure wave -childrowmargin 2
+configure wave -gridoffset 0
+configure wave -gridperiod 1
+configure wave -griddelta 40
+configure wave -timeline 0
+configure wave -timelineunits ns
+update
+WaveRestoreZoom {0 ps} {28 ps}

Diferenças do arquivo suprimidas por serem muito extensas
+ 80 - 0
simulation/modelsim/wave_neuron_tb.do


+ 113 - 0
src/fpu32/compare.sv

@@ -0,0 +1,113 @@
+
+
+// Check if x0 > x1
+module fpu32_gt(x0, x1, y);
+    input logic [31:0] x0;
+    input logic [31:0] x1;
+    output logic y;
+
+    wire s_x0, s_x1;
+    wire [7:0] e_x0, e_x1;
+    wire [22:0] m_x0, m_x1;
+
+    assign s_x0 = x0[31];
+    assign e_x0 = x0[30:23];
+    assign m_x0 = x0[22:0];
+
+    assign s_x1 = x1[31];
+    assign e_x1 = x1[30:23];
+    assign m_x1 = x1[22:0];
+
+    wire comp;
+    assign comp = s_x0 ^ ((e_x0 > e_x1) | (e_x0 == e_x1 & m_x0 > m_x1));
+    assign y = (~s_x0 & s_x1) | (~(s_x0 ^ s_x1) & comp);
+endmodule : fpu32_gt
+
+// Check if x0 >= x1
+module fpu32_ge(x0, x1, y);
+    input logic [31:0] x0;
+    input logic [31:0] x1;
+    output logic y;
+
+    wire y0;
+    fpu32_gt gt0(x0, x1, y0);
+    assign y = y0 | (x0 == x1);
+endmodule : fpu32_ge
+
+// Check if x0 < x1
+module fpu32_lt(x0, x1, y);
+    input logic [31:0] x0;
+    input logic [31:0] x1;
+    output logic y;
+
+    wire y0;
+    fpu32_ge ge0(x0, x1, y0);
+    assign y = ~y0;
+endmodule : fpu32_lt
+
+// Check if x0 <= x1
+module fpu32_le(x0, x1, y);
+    input logic [31:0] x0;
+    input logic [31:0] x1;
+    output logic y;
+
+    wire y0;
+    fpu32_gt gt0(x0, x1, y0);
+    assign y = ~y0;
+endmodule : fpu32_le
+
+
+module fpu32_compare_tb();
+    logic [31:0] x0;
+    logic [31:0] x1;
+
+    reg [31:0] test_mem [9999:0][3:0];
+    initial $readmemh("scripts/fp32_test.hex", test_mem);
+    reg [4:0] test_mem_cmp [9999:0];
+    initial $readmemb("scripts/fp32_test_comp.hex", test_mem_cmp);
+
+    wire y_gt, y_lt, y_ge, y_le, y_eq;
+    logic exp_gt, exp_lt, exp_ge, exp_le, exp_eq;
+    fpu32_gt gt0(x0, x1, y_gt);
+    fpu32_lt lt0(x0, x1, y_lt);
+    fpu32_ge ge0(x0, x1, y_ge);
+    fpu32_le le0(x0, x1, y_le);
+    assign y_eq = x0 == x1;
+
+    static int num_err = 0;
+    static int num_tests = $size(test_mem) * 5;
+
+    task test_val;
+        input int i;
+        input val, exp;
+        input string name;
+        if(val != exp) begin
+            if(num_err < 20) begin
+                $display("FAIL %d at %s: 0x%H, 0x%H => %b, expected %b", i, name, x0, x1, val, exp);
+            end
+            num_err++;
+        end
+    endtask : test_val
+
+    initial begin
+        for (int i=0; i < $size(test_mem); i++) begin
+            x0 = test_mem[i][0];
+            x1 = test_mem[i][1];
+            exp_gt = test_mem_cmp[i][4];
+            exp_lt = test_mem_cmp[i][3];
+            exp_ge = test_mem_cmp[i][2];
+            exp_le = test_mem_cmp[i][1];
+            exp_eq = test_mem_cmp[i][0];
+            #1;
+            test_val(i, y_gt, exp_gt, "GT");
+            test_val(i, y_lt, exp_lt, "LT");
+            test_val(i, y_ge, exp_ge, "GE");
+            test_val(i, y_le, exp_le, "LE");
+            test_val(i, y_eq, exp_eq, "EQ");
+        end
+
+        $display("Passed %d of %d tests", num_tests-num_err, num_tests);
+        $finish();
+    end
+
+endmodule : fpu32_compare_tb

+ 1 - 0
src/fpu32/fpu32.sv

@@ -1,5 +1,6 @@
 `include "adder.sv"
 `include "mult.v"
+`include "compare.sv"
 
 // synopsys translate_off
 `timescale 1 ps / 1 ps

+ 10 - 18
src/neural/comp.sv

@@ -20,8 +20,8 @@ module cadder#(parameter N=32)(clk, rst, x, x_ack, x_stb, y, y_ack, y_stb);
     output y_stb;
 
     wire left_ack, left_stb;
-    assign x_ack[0] = left_ack;
-    assign x_ack[1] = left_ack;
+    assign x_ack[0] = left_stb & left_ack;
+    assign x_ack[1] = left_stb & left_ack;
     assign left_stb = x_stb[0] & x_stb[1];
 
     adder add0 (
@@ -134,7 +134,7 @@ endmodule : adder_casc
 module adder_casc_tb();
     logic clk, rst;
     
-    localparam K=4;
+    localparam K=2;
     logic [31:0] x [2**K-1:0];
     logic [31:0] y;
     logic ack [2**K-1:0];
@@ -162,23 +162,15 @@ module adder_casc_tb();
         
         foreach(stb[i]) stb[i] = 0;
         output_io.ack = 0;
-        #20
-        rst = 0;
         // Initialise with floating point 2**i
-        foreach(x[i]) x[i] = ('h400 + (i*8)) << 20;
-        foreach(stb[i]) stb[i] = 1;
+        // foreach(x[i]) x[i] = ('h400 + (i*8)) << 20;
+        x = {'h41ea6000, 'h42ea6000, 'h411ba000, 'h413cc000};
 
-        fork
-            foreach(ack[i]) begin
-                fork
-                    wait(ack[i] == 1);
-                    #20
-                    stb[i] = 0;
-                join
-            end
-        join
-        #20
-        
+        #10;
+        rst = 0;
+        foreach(stb[i]) stb[i] = 1;
+        #20;
+        foreach(stb[i]) stb[i] = 0;
         wait(output_io.stb == 1);
         output_io.ack = 1;
         assert(y[0] == 'h47ffff00);

+ 192 - 0
src/neural/layer.sv

@@ -0,0 +1,192 @@
+// synopsys translate_off
+`timescale 1 ps / 1 ps
+// synopsys translate_on
+
+module neuron_layer#(parameter C, K, N=32)(clk, rst, x, y, w, b, left, right);
+    localparam NEURONS = 2**K;
+    localparam CONNS = 2**C;
+
+    input wire clk, rst;
+    input wire [N-1:0] x [CONNS-1:0];
+    input wire [N-1:0] w [NEURONS-1:0][CONNS-1:0];
+    input wire [N-1:0] b [NEURONS-1:0];
+    output wire [N-1:0] y [NEURONS-1:0];
+    abus_io left [CONNS-1:0];
+    abus_io right [NEURONS-1:0];
+
+    wire [NEURONS-1:0] ack_t [CONNS-1:0];
+    reg [CONNS-1:0] ack [NEURONS-1:0];
+    reg [CONNS-1:0] stb;
+
+    genvar i, j;
+    generate
+        for(i=0; i<CONNS; i++) begin
+            assign stb[i] = left[i].stb;
+            assign left[i].ack = &ack_t[i];
+        end
+    endgenerate
+
+    generate
+        for(i=0; i<NEURONS; i++) begin
+            for(j=0; j<CONNS; j++) begin
+                assign ack_t[j][i] = ack[i][j];
+            end
+            neuron#(.K(C), .N(N)) n(
+                .clk(clk),
+                .rst(rst),
+                .x(x),
+                .y(y[i]),
+                .w(w[i]),
+                .b(b[i]),
+                .ack(ack[i]),
+                .stb(stb),
+                .right(right[i])
+            );
+        end
+    endgenerate
+
+endmodule : neuron_layer
+
+/*
+Testbench for an 8x8x2 neural network as shown below:
+
+                8       8
+             . [ ] ... [ ] .
+ONE HOT -> ... [ ] ... [ ] ..   2
+ONE HOT -> ... [ ] ... [ ] ... [ ] -> [ Hard Sigmoid ] -> Q
+ONE HOT -> ... [ ] ... [ ] ... [ ] -> [ Hard Sigmoid ] -> I
+ONE HOT -> ... [ ] ... [ ] ..
+             . [ ] ... [ ] .
+
+*/
+
+module neuron_network_tb;
+    reg clk, rst;
+    reg [31:0] x [3:0];
+    reg [31:0] y [1:0];
+
+    abus_io left[3:0]();
+    abus_io right[1:0]();
+
+    reg [31:0] layer1_w [7:0][3:0];
+    reg [31:0] layer1_b [7:0];
+    reg [31:0] layer1_o [7:0];
+    abus_io layer1_io [7:0]();
+
+    reg [31:0] layer2_w [7:0][7:0];
+    reg [31:0] layer2_b [7:0];
+    reg [31:0] layer2_o [7:0];
+    abus_io layer2_io [7:0]();
+
+    reg [31:0] layer3_w [1:0][7:0];
+    reg [31:0] layer3_b [1:0];
+    reg [31:0] layer3_o [1:0];
+    abus_io layer3_io [1:0]();
+
+    neuron_layer#(.C(2), .K(3)) layer1(
+        .clk(clk),
+        .rst(rst),
+        .x(x),
+        .y(layer1_o),
+        .w(layer1_w),
+        .b(layer1_b),
+        .left(left),
+        .right(layer1_io)
+    );
+
+    neuron_layer#(.C(3), .K(3)) layer2(
+        .clk(clk),
+        .rst(rst),
+        .x(layer1_o),
+        .y(layer2_o),
+        .w(layer2_w),
+        .b(layer2_b),
+        .left(layer1_io),
+        .right(layer2_io)
+    );
+
+    neuron_layer#(.C(3), .K(1)) layer3(
+        .clk(clk),
+        .rst(rst),
+        .x(layer2_o),
+        .y(layer3_o),
+        .w(layer3_w),
+        .b(layer3_b),
+        .left(layer2_io),
+        .right(layer3_io)
+    );
+
+    hard_sigmoid sigmoid0(
+        .clk(clk),
+        .rst(rst),
+        .x(layer3_o[0]),
+        .y(y[0]),
+        .left(layer3_io[0]),
+        .right(right[0])
+    );
+
+    hard_sigmoid sigmoid1(
+        .clk(clk),
+        .rst(rst),
+        .x(layer3_o[1]),
+        .y(y[1]),
+        .left(layer3_io[1]),
+        .right(right[1])
+    );
+
+    initial forever #5 clk = ~clk;
+
+    initial begin
+        clk = 0;
+        rst = 1;
+
+        left[0].stb = 0;
+        left[1].stb = 0;
+        left[2].stb = 0;
+        left[3].stb = 0;
+        right[0].ack = 0;
+        right[1].ack = 0;
+
+        layer1_w[0] = {'h3f35a2c0, 'hbed680c0, 'hbda560aa, 'hbf059982};
+        layer1_w[1] = {'hbedec9b0, 'h3f82bf53, 'h3e146dbd, 'hbf4805f6};
+        layer1_w[2] = {'h3ec56558, 'hbf87c60c, 'h3c597740, 'hbe94566d};
+        layer1_w[3] = {'h3f597c56, 'h3ea6377b, 'h3f63533c, 'h3bf80408};
+        layer1_w[4] = {'hbf5e8e91, 'h3f578599, 'h3f1a20d1, 'hbf486c86};
+        layer1_w[5] = {'h3e226ff5, 'h3c8e5d50, 'h3f40d79e, 'hbf1404af};
+        layer1_w[6] = {'hbe252196, 'h3f08216e, 'h3eafb3e4, 'hbf3518b4};
+        layer1_w[7] = {'h3dbcce92, 'hbf1fe0df, 'hbf1e78ce, 'hbf5ede23};
+
+        layer2_w[0] = {'h3ebc43b4, 'hbf47cc3e, 'h3ecbc573, 'h3f0ca75f, 'hbe0deb6a, 'h3ecd124d, 'h3da8f2e3, 'hbea68b91};
+        layer2_w[1] = {'hbe34d408, 'h3e0e5928, 'hbee7abc0, 'hbf4a1708, 'hbf4f98a4, 'hbf0595ab, 'h3e9cd67d, 'hbea0f6a0};
+        layer2_w[2] = {'hbd30b562, 'h3e130eb9, 'h3f203e15, 'h3eb6e010, 'hbec29e13, 'h3c61e461, 'h3d6d563f, 'h3dc5ff0d};
+        layer2_w[3] = {'hbeeff234, 'h3ebb7c8b, 'hbee972cc, 'hbecbb8c5, 'h3f3d5941, 'hbe9b1bfb, 'h3e7e2b3c, 'hbe59e703};
+        layer2_w[4] = {'hbec24260, 'h3ee817f8, 'hbdf71e3e, 'h3f104cb6, 'h3f309d8a, 'h3f2784e9, 'h3f5d198c, 'hbf121e5b};
+        layer2_w[5] = {'hbf493b7b, 'h3e7617e6, 'hbee5014b, 'hbe7c87b4, 'hbe8a2a9e, 'hbf562a42, 'hbe0c07b4, 'hbecbc675};
+        layer2_w[6] = {'h3de42014, 'h3efaa039, 'h3e051baa, 'h3efd505b, 'h3f043f70, 'h3f52a3db, 'h3d442f2f, 'h3e9aa823};
+        layer2_w[7] = {'h3e6c83db, 'h3e9b167c, 'hbf18b0af, 'hbecd5ae4, 'h3dfaceb2, 'hbef1f9e8, 'hbf0ce095, 'h3ee3bed9};
+
+        layer3_w[0] = {'h3f128a86, 'h3f15d7cd, 'h3ebfefa4, 'hbf647347, 'hbf837585, 'h3e5e4d2f, 'hbd94eb58, 'h3e162157};
+        layer3_w[1] = {'h3e9b9a35, 'hbf6c59dc, 'h3f4f1169, 'hbf86b089, 'h3d970762, 'hbf0d0753, 'h3f51c5bf, 'hbf20c20b};
+
+        layer1_b = {'h3c698b09, 'hbd7ebe74, 'hbca4c364, 'hbd444148, 'hbd3295ff, 'hbd5992e6, 'hbc8dd550, 'hb91b9923};
+        layer2_b = {'h3d0e9496, 'h3d7e4b04, 'hbc8121c3, 'hb9c5ef38, 'hbd2165dc, 'h3d534552, 'hbce35c6b, 'h3d39ff1d};
+        layer3_b = {'h3d1b25b3, 'hbd15b958};
+
+        #15;
+        rst = 0;
+        x = {0, 0, 0 , 'h3f800000};
+        left[0].stb = 1;
+        left[1].stb = 1;
+        left[2].stb = 1;
+        left[3].stb = 1;
+
+        #15;
+        left[0].stb = 0;
+        left[1].stb = 0;
+        left[2].stb = 0;
+        left[3].stb = 0;
+
+    end
+
+
+endmodule : neuron_network_tb

+ 4 - 0
src/neural/neural.sv

@@ -0,0 +1,4 @@
+`include "comp.sv"
+`include "sigmoid.sv"
+`include "neuron.sv"
+`include "layer.sv"

+ 8 - 5
src/neural/neuron.sv

@@ -17,7 +17,7 @@
 
 */
 
-module neuron#(parameter K, N=32)(x, y, w, b, left, right, clk, rst);
+module neuron#(parameter K, N=32)(x, y, w, b, ack, stb, right, clk, rst);
     localparam M = 2**K;
     input wire [N-1:0] x [M-1:0];
     input wire [N-1:0] w [M-1:0];
@@ -27,7 +27,9 @@ module neuron#(parameter K, N=32)(x, y, w, b, left, right, clk, rst);
     input wire clk;
     input wire rst;
 
-    abus_io left[M-1:0];
+    output wire [M-1:0] ack;
+    input wire [M-1:0] stb;
+
     abus_io inner_io0[M-1:0]();
     abus_io inner_io1();
     abus_io right;
@@ -43,8 +45,8 @@ module neuron#(parameter K, N=32)(x, y, w, b, left, right, clk, rst);
                 .rst(rst),
                 .input_a(x[i]),
                 .input_b(w[i]),
-                .input_stb(left[i].stb),
-                .input_ack(left[i].ack),
+                .input_stb(stb[i]),
+                .input_ack(ack[i]),
                 .output_z(inner_w[i]),
                 .output_z_ack(inner_io0[i].ack),
                 .output_z_stb(inner_io0[i].stb)
@@ -97,7 +99,8 @@ module neuron_tb;
         .y(y),
         .w(w),
         .b(b),
-        .left(left),
+        .ack(),
+        .stb(),
         .right(right)
     );
 

+ 171 - 0
src/neural/sigmoid.sv

@@ -0,0 +1,171 @@
+`include "../fpu32/compare.sv"
+
+typedef enum logic [2:0] {
+    hs_input,
+    hs_compare_0,
+    hs_compare_1,
+    hs_compute_0,
+    hs_compute_1,
+    hs_output
+} hs_stage;
+
+/*
+    Function:
+    y = (x + 2.5) * 0.2
+    minimum = 0
+    maximum = 1
+*/
+module hard_sigmoid #(parameter N=32)(clk, rst, x, y, left, right);
+    input clk, rst;
+    input [N-1:0] x;
+    output logic [N-1:0] y;
+    abus_io left, right;
+
+    logic [N-1:0] value, comp_result;
+    hs_stage stage;
+
+    logic gt_neg;
+    logic lt_pos;
+    logic compute;  // Flag to tell if mult and add computation is needed
+
+    wire join_ack, join_stb;
+    wire [N-1:0] join_value;
+    logic in_stb, out_ack;
+    wire in_ack, out_stb;
+
+    // Multiply by 0.2
+    multiplier mult0(
+        .clk(clk),
+        .rst(rst),
+        .input_a('h3e4ccccd),
+        .input_b(join_value),
+        .input_stb(join_stb),
+        .input_ack(join_ack),
+        .output_z(comp_result),
+        .output_z_ack(out_ack),
+        .output_z_stb(out_stb)
+    );
+
+    // Add +2.5
+    adder add0(
+        .clk(clk),
+        .rst(rst),
+        .input_a('h40200000),
+        .input_b(value),
+        .input_stb(in_stb),
+        .input_ack(in_ack),
+        .output_z(join_value),
+        .output_z_ack(join_ack),
+        .output_z_stb(join_stb)
+    );
+
+    fpu32_gt gt0(value, 'hc0200000, gt_neg); // more than -2.5
+    fpu32_lt lt0(value, 'h40200000, lt_pos); // less than +2.5
+
+    always_ff @(posedge clk) begin
+        case (stage)
+            hs_input: begin
+                left.ack <= 1;
+                if (left.ack && left.stb) begin
+                    value <= x;
+                    left.ack <= 0;
+                    stage <= hs_compare_1;
+                end
+            end
+            hs_compare_1: begin
+                // if less than -2.5 output 0
+                if(~gt_neg) begin
+                    y <= 0;
+                    stage <= hs_output;
+                end else
+                // if in between -2.5 and 2.5
+                if(gt_neg & lt_pos) begin
+                    in_stb <= 1;
+                    stage <= hs_compute_0;
+                end else
+                // if more than 2.5 output 1
+                begin
+                    y <= 'h3f800000;
+                    stage <= hs_output;
+                end
+            end
+            hs_compute_0: begin
+                if (in_ack) begin
+                    in_stb <= 0;
+                    stage <= hs_compute_1;
+                end
+            end
+            hs_compute_1: begin
+                out_ack <= 1;
+                if (out_ack && out_stb) begin
+                    y <= comp_result;
+                    out_ack <= 0;
+                    stage <= hs_output;
+                end
+            end
+            hs_output: begin
+                right.stb <= 1;
+                if (right.stb && right.ack) begin
+                    right.stb <= 0;
+                    stage <= hs_input;
+                end
+            end
+        endcase
+
+        if (rst == 1) begin
+            stage <= hs_input;
+            left.ack <= 0;
+            right.stb <= 0;
+            y <= 0;
+        end
+    end
+
+
+endmodule : hard_sigmoid
+
+
+module hard_sigmoid_tb;
+    reg rst, clk;
+    reg [31:0] x;
+    wire [31:0] y;
+    abus_io left();
+    abus_io right();
+
+    hard_sigmoid sigmoid0(clk, rst, x, y, left, right);
+
+    reg [31:0] test_mem [5000:0];
+    initial $readmemh("scripts/sigmoid_test.hex", test_mem);
+
+    initial forever #5 clk = ~clk;
+    initial begin
+        int fd, start, delta;
+        fd = $fopen("scripts/sigmoid_result.hex", "w");
+        if(!fd) $display("Failed to open file! %0d", fd);
+
+        clk = 0;
+        rst = 1;
+        left.stb = 0;
+        right.ack = 0;
+        # 10;
+        rst = 0;
+        for (int i=0; i < $size(test_mem); i++) begin
+            x = test_mem[i];
+            left.stb = 1;
+            wait(left.ack == 1);
+            start = $time;
+            #15;
+            left.stb = 0;
+            wait(right.stb == 1);
+            right.ack = 1;
+            delta = $time - start;
+            #15;
+            right.ack = 0;
+            $fdisplay(fd, "%H %H %d", x, y, delta);
+        end
+        $fclose(fd);
+        $finish();
+    end
+
+
+endmodule : hard_sigmoid_tb
+

+ 28 - 0
wave_hard_sigmoid_tb.do

@@ -0,0 +1,28 @@
+onerror {resume}
+quietly WaveActivateNextPane {} 0
+add wave -noupdate /hard_sigmoid_tb/rst
+add wave -noupdate /hard_sigmoid_tb/clk
+add wave -noupdate -radix float32 /hard_sigmoid_tb/x
+add wave -noupdate -radix float32 /hard_sigmoid_tb/y
+add wave -noupdate /hard_sigmoid_tb/left/stb
+add wave -noupdate /hard_sigmoid_tb/left/ack
+add wave -noupdate /hard_sigmoid_tb/right/stb
+add wave -noupdate /hard_sigmoid_tb/right/ack
+TreeUpdate [SetDefaultTree]
+WaveRestoreCursors {{Cursor 1} {46937 ps} 0}
+quietly wave cursor active 1
+configure wave -namecolwidth 294
+configure wave -valuecolwidth 88
+configure wave -justifyvalue left
+configure wave -signalnamewidth 0
+configure wave -snapdistance 10
+configure wave -datasetprefix 0
+configure wave -rowmargin 4
+configure wave -childrowmargin 2
+configure wave -gridoffset 0
+configure wave -gridperiod 1
+configure wave -griddelta 40
+configure wave -timeline 0
+configure wave -timelineunits ns
+update
+WaveRestoreZoom {1087134 ps} {1087351 ps}