Skip to content

Commit

Permalink
Merge dev
Browse files Browse the repository at this point in the history
  • Loading branch information
auphelia committed Oct 24, 2024
2 parents 1d7636b + ac2fa5a commit 50045ff
Show file tree
Hide file tree
Showing 43 changed files with 1,351 additions and 291 deletions.
3 changes: 3 additions & 0 deletions .readthedocs.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -44,3 +44,6 @@ sphinx:
python:
install:
- requirements: docs/requirements.txt

formats:
- pdf
2 changes: 1 addition & 1 deletion docs/finn/faq.rst
Original file line number Diff line number Diff line change
Expand Up @@ -81,7 +81,7 @@ Which data layout do FINN-generated accelerators use? Big-endian? Little-endian?
If you need to do this manually, first examine how the `FINN PYNQ Python drivers <https://github.com/Xilinx/finn-examples/blob/main/finn_examples/driver.py#L379>`_ do this – notice how the input data is
first reshaped to create the “folded input shape” that reflects the word size of the first layer based on how much it
was parallelized, then data packing is applied to obtain a raw byte array (with some reversals going on) that can be
fed directly to the hardware. Another example of this is the `npy_to_rtlsim_input <https://github.com/Xilinx/finn-base/blob/dev/src/finn/util/data_packing.py#L289>`_ function, which converts npy arrays to lists of Python arbitrary-precision integers that we feed into pyverilator for rtl simulation.
fed directly to the hardware. Another example of this is the `npy_to_rtlsim_input <https://github.com/Xilinx/finn/blob/dev/src/finn/util/data_packing.py#L284>`_ function, which converts npy arrays to lists of Python arbitrary-precision integers that we feed into pyverilator for rtl simulation.

Why does FIFO sizing take so long for my network? Is something wrong?
The automatic FIFO sizing in FINN can take quite long. It unfortunately doesn’t really parallelize on multiple cores since
Expand Down
4 changes: 2 additions & 2 deletions fetch-repos.sh
Original file line number Diff line number Diff line change
Expand Up @@ -27,9 +27,9 @@
# OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.

QONNX_COMMIT="fd61cfeebbdaba351abf7e9d54cd785d7776fa4f"
QONNX_COMMIT="2281a777d84aa5cbd7469085c2e534fb4a03ccf9"
FINN_EXP_COMMIT="0724be21111a21f0d81a072fccc1c446e053f851"
BREVITAS_COMMIT="84f42259ec869eb151af4cb8a8b23ad925f493db"
BREVITAS_COMMIT="d4834bd2a0fad3c1fbc0ff7e1346d5dcb3797ea4"
PYVERILATOR_COMMIT="ce0a08c20cb8c1d1e84181d6f392390f846adbd1"
CNPY_COMMIT="4e8810b1a8637695171ed346ce68f6984e585ef4"
HLSLIB_COMMIT="16e5847a5e3ef76cffe84c8fad2f010d593457d3"
Expand Down
10 changes: 6 additions & 4 deletions finn-rtllib/mvu/mvu_4sx4u.sv
Original file line number Diff line number Diff line change
Expand Up @@ -34,6 +34,8 @@
module mvu_4sx4u #(
int unsigned PE,
int unsigned SIMD,
int unsigned WEIGHT_WIDTH,
int unsigned ACTIVATION_WIDTH,
int unsigned ACCU_WIDTH,

int unsigned VERSION = 1, // Version 1 (DSP48E1) *must* commit to NARROW_WEIGHTS
Expand All @@ -49,8 +51,8 @@ module mvu_4sx4u #(
// Input
input logic last,
input logic zero, // ignore current inputs and force this partial product to zero
input logic signed [PE-1:0][SIMD-1:0][3:0] w, // signed weights
input logic [SIMD-1:0][3:0] a, // unsigned activations (override by SIGNED_ACTIVATIONS)
input logic signed [PE-1:0][SIMD-1:0][WEIGHT_WIDTH -1:0] w, // signed weights
input logic [SIMD-1:0][ACTIVATION_WIDTH-1:0] a, // unsigned activations (override by SIGNED_ACTIVATIONS)

// Output
output logic vld,
Expand Down Expand Up @@ -141,14 +143,14 @@ module mvu_4sx4u #(
for(genvar s = 0; s < SIMD; s++) begin : genSIMD

// Input Lane Assembly
uwire [17:0] bb = { {(14){SIGNED_ACTIVATIONS && a[s][3]}}, a[s] };
uwire [17:0] bb = { {(18-ACTIVATION_WIDTH){SIGNED_ACTIVATIONS && a[s][ACTIVATION_WIDTH-1]}}, a[s] };
logic [29:0] aa;
logic [26:0] dd;
logic [ 1:0] xx[3:1];
if(1) begin : blkVectorize
uwire signed [3:0] ww[PE_END - PE_BEG];
for(genvar pe = 0; pe < PE_END - PE_BEG; pe++) begin
assign ww[pe] = w[PE_BEG + pe][s];
assign ww[pe] = $signed(w[PE_BEG + pe][s]);
if(pe > 0) begin
if(BEHAVIORAL) assign xx[pe + PE_REM] = zero? 0 : ww[pe] * a[s];
`ifndef VERILATOR
Expand Down
16 changes: 10 additions & 6 deletions finn-rtllib/mvu/mvu_8sx8u_dsp48.sv
Original file line number Diff line number Diff line change
Expand Up @@ -34,9 +34,9 @@
module mvu_8sx8u_dsp48 #(
int unsigned PE,
int unsigned SIMD,
int unsigned ACCU_WIDTH,
int unsigned ACTIVATION_WIDTH,
int unsigned WEIGHT_WIDTH,
int unsigned ACTIVATION_WIDTH,
int unsigned ACCU_WIDTH,

int unsigned VERSION = 1,
bit SIGNED_ACTIVATIONS = 0,
Expand Down Expand Up @@ -72,6 +72,10 @@ module mvu_8sx8u_dsp48 #(
return res;
endfunction : init_leave_loads

// Bit width used for an adder-tree node that sums `n` unsigned operands of
// `w` bits each.
//  - w <= 16: tight width of the largest possible sum n*(2**w - 1).
//  - w  > 16: upper bound w + $clog2(n), which avoids evaluating 2**w.
//    NOTE(review): the threshold presumably guards against overflow of the
//    32-bit intermediate product - confirm.
function int unsigned sum_width(input int unsigned n, input int unsigned w);
	if(w > 16)  return  w + $clog2(n);
	return  $clog2(1 + n*(2**w - 1));
endfunction : sum_width

// Pipeline for last indicator flag
logic [1:5] L = '0;
always_ff @(posedge clk) begin
Expand Down Expand Up @@ -445,7 +449,7 @@ module mvu_8sx8u_dsp48 #(
// Stage #4: Cross-SIMD Reduction

// Count leaves reachable from each node
localparam leave_load_t LEAVE_LOAD = SIMD > 1 ? init_leave_loads() : '{ default: 0}; // SIMD=1 requires no adder tree, so zero-ing out, otherwise init_leave_loads ends up in infinite loop
localparam leave_load_t LEAVE_LOAD = SIMD > 1 ? init_leave_loads() : '{ default: 0 }; // SIMD=1 requires no adder tree, so zero-ing out, otherwise init_leave_loads ends up in infinite loop

// Range of Cross-lane Contribution Tracked in Hi4
/*
Expand All @@ -462,7 +466,7 @@ module mvu_8sx8u_dsp48 #(
* signed value is determined by its lower bound to be at least:
* 1 + $clog2(2^(w-1)+SIMD)
*/
localparam int unsigned HI_WIDTH = 1 + $clog2(2**(ACCU_WIDTH-D[1]-1)+SIMD);
localparam int unsigned HI_WIDTH = 1 + ($clog2(SIMD) < ACCU_WIDTH-D[1]? ACCU_WIDTH-D[1] : $clog2(2**(ACCU_WIDTH-D[1]-1)+SIMD));

uwire signed [ACCU_WIDTH -1:0] up4;
uwire signed [HI_WIDTH -1:0] hi4;
Expand Down Expand Up @@ -504,12 +508,12 @@ module mvu_8sx8u_dsp48 #(
// Conclusive low part accumulation
if(i >= PE_REM) begin : blkLo
// Adder Tree across all SIMD low contributions (all unsigned arithmetic)
localparam int unsigned ROOT_WIDTH = $clog2(1 + SIMD*(2**LO_WIDTH-1));
localparam int unsigned ROOT_WIDTH = sum_width(SIMD, LO_WIDTH);
uwire [2*SIMD-2:0][ROOT_WIDTH-1:0] tree;
for(genvar s = 0; s < SIMD; s++) assign tree[SIMD-1+s] = p3[s][D[i]+:LO_WIDTH];
for(genvar n = 0; n < SIMD-1; n++) begin
// Sum truncated to actual maximum bit width at this node
localparam int unsigned NODE_WIDTH = $clog2(1 + LEAVE_LOAD[n]*(2**LO_WIDTH-1));
localparam int unsigned NODE_WIDTH = sum_width(LEAVE_LOAD[n], LO_WIDTH);
uwire [NODE_WIDTH-1:0] s = tree[2*n+1] + tree[2*n+2];
assign tree[n] = s;
end
Expand Down
7 changes: 4 additions & 3 deletions finn-rtllib/mvu/mvu_vvu_8sx9_dsp58.sv
Original file line number Diff line number Diff line change
Expand Up @@ -35,9 +35,10 @@ module mvu_vvu_8sx9_dsp58 #(
bit IS_MVU,
int unsigned PE,
int unsigned SIMD,
int unsigned ACTIVATION_WIDTH,
int unsigned WEIGHT_WIDTH,
int unsigned ACCU_WIDTH,
int unsigned WEIGHT_WIDTH,
int unsigned ACTIVATION_WIDTH,
int unsigned ACCU_WIDTH,

bit SIGNED_ACTIVATIONS = 0,
int unsigned SEGMENTLEN = 0, // Default to 0 (which implies a single segment)
bit FORCE_BEHAVIORAL = 0,
Expand Down
23 changes: 16 additions & 7 deletions finn-rtllib/mvu/mvu_vvu_axi.sv
Original file line number Diff line number Diff line change
Expand Up @@ -300,17 +300,22 @@ module mvu_vvu_axi #(

case(COMPUTE_CORE)
"mvu_vvu_8sx9_dsp58":
mvu_vvu_8sx9_dsp58 #(.IS_MVU(IS_MVU), .PE(PE), .SIMD(DSP_SIMD), .ACTIVATION_WIDTH(ACTIVATION_WIDTH), .WEIGHT_WIDTH(WEIGHT_WIDTH),
.ACCU_WIDTH(ACCU_WIDTH), .SIGNED_ACTIVATIONS(SIGNED_ACTIVATIONS), .SEGMENTLEN(SEGMENTLEN),
.FORCE_BEHAVIORAL(FORCE_BEHAVIORAL)) core (
mvu_vvu_8sx9_dsp58 #(
.IS_MVU(IS_MVU),
.PE(PE), .SIMD(DSP_SIMD),
.WEIGHT_WIDTH(WEIGHT_WIDTH), .ACTIVATION_WIDTH(ACTIVATION_WIDTH), .ACCU_WIDTH(ACCU_WIDTH),
.SIGNED_ACTIVATIONS(SIGNED_ACTIVATIONS), .SEGMENTLEN(SEGMENTLEN),
.FORCE_BEHAVIORAL(FORCE_BEHAVIORAL)
) core (
.clk(dsp_clk), .rst, .en(dsp_en),
.last(dsp_last), .zero(dsp_zero), .w(dsp_w), .a(dsp_a),
.vld(dsp_vld), .p(dsp_p)
);
"mvu_4sx4u_dsp48e1":
mvu_4sx4u #(
.PE(PE), .SIMD(DSP_SIMD),
.ACCU_WIDTH(ACCU_WIDTH), .SIGNED_ACTIVATIONS(SIGNED_ACTIVATIONS), .NARROW_WEIGHTS(NARROW_WEIGHTS),
.WEIGHT_WIDTH(WEIGHT_WIDTH), .ACTIVATION_WIDTH(ACTIVATION_WIDTH), .ACCU_WIDTH(ACCU_WIDTH),
.SIGNED_ACTIVATIONS(SIGNED_ACTIVATIONS), .NARROW_WEIGHTS(NARROW_WEIGHTS),
.VERSION(1), .FORCE_BEHAVIORAL(FORCE_BEHAVIORAL)
) core (
.clk(dsp_clk), .rst, .en(dsp_en),
Expand All @@ -320,16 +325,20 @@ module mvu_vvu_axi #(
"mvu_4sx4u_dsp48e2":
mvu_4sx4u #(
.PE(PE), .SIMD(DSP_SIMD),
.ACCU_WIDTH(ACCU_WIDTH), .SIGNED_ACTIVATIONS(SIGNED_ACTIVATIONS), .NARROW_WEIGHTS(NARROW_WEIGHTS),
.WEIGHT_WIDTH(WEIGHT_WIDTH), .ACTIVATION_WIDTH(ACTIVATION_WIDTH), .ACCU_WIDTH(ACCU_WIDTH),
.SIGNED_ACTIVATIONS(SIGNED_ACTIVATIONS), .NARROW_WEIGHTS(NARROW_WEIGHTS),
.VERSION(2), .FORCE_BEHAVIORAL(FORCE_BEHAVIORAL)
) core (
.clk(dsp_clk), .rst, .en(dsp_en),
.last(dsp_last), .zero(dsp_zero), .w(dsp_w), .a(dsp_a),
.vld(dsp_vld), .p(dsp_p)
);
"mvu_8sx8u_dsp48":
mvu_8sx8u_dsp48 #(.PE(PE), .SIMD(DSP_SIMD), .ACCU_WIDTH(ACCU_WIDTH), .ACTIVATION_WIDTH(ACTIVATION_WIDTH), .WEIGHT_WIDTH(WEIGHT_WIDTH),
.SIGNED_ACTIVATIONS(SIGNED_ACTIVATIONS), .FORCE_BEHAVIORAL(FORCE_BEHAVIORAL)) core (
mvu_8sx8u_dsp48 #(
.PE(PE), .SIMD(DSP_SIMD),
.WEIGHT_WIDTH(WEIGHT_WIDTH), .ACTIVATION_WIDTH(ACTIVATION_WIDTH), .ACCU_WIDTH(ACCU_WIDTH),
.SIGNED_ACTIVATIONS(SIGNED_ACTIVATIONS), .FORCE_BEHAVIORAL(FORCE_BEHAVIORAL)
) core (
.clk(dsp_clk), .rst, .en(dsp_en),
.last(dsp_last), .zero(dsp_zero), .w(dsp_w), .a(dsp_a),
.vld(dsp_vld), .p(dsp_p)
Expand Down
165 changes: 165 additions & 0 deletions finn-rtllib/mvu/tb/mvu_3sx3u_tb.sv
Original file line number Diff line number Diff line change
@@ -0,0 +1,165 @@
/**
 * Self-checking testbench that drives `mvu_vvu_axi` in MVU mode with the
 * "mvu_4sx4u_dsp48e2" compute core and 3-bit weights and activations.
 *
 * Three concurrent processes cooperate:
 *  - an endless weight feed that cycles through a random MH x MW matrix,
 *  - an input feed that sends ROUNDS random activation vectors and queues
 *    the expected accumulator results computed behaviorally,
 *  - an output checker that compares every DUT output word against the
 *    queued reference and ends the simulation once the output goes idle.
 */
module mvu_3sx3u_tb;

	// Number of input vectors pushed through the DUT
	localparam int unsigned  ROUNDS = 157;

	// Matrix geometry: MH accumulated results per input vector of MW elements,
	// processed PE rows and SIMD columns at a time.
	localparam int unsigned  MH = 32;
	localparam int unsigned  MW = 60;
	localparam int unsigned  PE = 1;
	localparam int unsigned  SIMD = 1;

	// Operand and accumulator bit widths
	localparam int unsigned  ACTIVATION_WIDTH = 3;
	localparam int unsigned  WEIGHT_WIDTH = 3;
	localparam int unsigned  ACCU_WIDTH = 16;


	//-----------------------------------------------------------------------
	// Global Control

	// Free-running clock with a 10ns period
	logic  clk = 1;
	always #5ns clk = !clk;

	// Reset held active for the first 16 rising clock edges
	logic  rst = 1;
	initial begin
		repeat(16) @(posedge clk);
		rst <= 0;
	end

	//-----------------------------------------------------------------------
	// DUT
	logic [PE-1:0][SIMD-1:0][WEIGHT_WIDTH-1:0]  s_axis_weights_tdata;
	logic  s_axis_weights_tvalid;
	uwire  s_axis_weights_tready;

	logic [SIMD-1:0][ACTIVATION_WIDTH-1:0]  s_axis_input_tdata;
	logic  s_axis_input_tvalid;
	uwire  s_axis_input_tready;

	uwire [PE-1:0][ACCU_WIDTH-1:0]  m_axis_output_tdata;
	uwire  m_axis_output_tvalid;
	logic  m_axis_output_tready;

	mvu_vvu_axi #(
		.IS_MVU(1),
		.COMPUTE_CORE("mvu_4sx4u_dsp48e2"),
		.MH(MH), .MW(MW),
		.PE(PE), .SIMD(SIMD),

		.ACTIVATION_WIDTH(ACTIVATION_WIDTH),
		.WEIGHT_WIDTH(WEIGHT_WIDTH),
		.ACCU_WIDTH(ACCU_WIDTH)
		//int unsigned SEGMENTLEN = 0,
		//bit FORCE_BEHAVIORAL = 0,
	) dut (
		// The double-rate clock input is deliberately driven with 'x here.
		.ap_clk(clk), .ap_clk2x('x), .ap_rst_n(!rst),
		.s_axis_weights_tdata, .s_axis_weights_tvalid, .s_axis_weights_tready,
		.s_axis_input_tdata, .s_axis_input_tvalid, .s_axis_input_tready,
		.m_axis_output_tdata, .m_axis_output_tvalid, .m_axis_output_tready
	);

	//-----------------------------------------------------------------------
	// Stimuli

	//- Infinite Weight Feed ------------
	typedef logic signed [WEIGHT_WIDTH-1:0]  weights_t[MH][MW];

	// Produces the random weight matrix used for the whole simulation.
	// Aborts the simulation if the randomization call reports failure so
	// that the test never proceeds with an unrandomized (all-X) matrix.
	function weights_t calc_WEIGHTS();
		automatic weights_t  ret;
		if(!std::randomize(ret)) begin
			$fatal(1, "Failed to randomize weights.");
		end
		return  ret;
	endfunction : calc_WEIGHTS
	weights_t  WEIGHTS = calc_WEIGHTS();

	// Streams the weight matrix tile by tile (PE rows x SIMD columns per
	// transfer) and restarts from the first tile after the last one.
	initial begin
		s_axis_weights_tdata  = 'x;
		s_axis_weights_tvalid =  0;
		@(posedge clk iff !rst);

		forever begin
			for(int unsigned  h = 0; h < MH; h+=PE) begin
				for(int unsigned  w = 0; w < MW; w+=SIMD) begin
					for(int unsigned  pe = 0; pe < PE; pe++) begin
						for(int unsigned  simd = 0; simd < SIMD; simd++) begin
							s_axis_weights_tdata[pe][simd] <= WEIGHTS[h+pe][w+simd];
						end
					end
					s_axis_weights_tvalid <= 1;
					// Hold the tile until the DUT accepts it
					@(posedge clk iff s_axis_weights_tready);
					s_axis_weights_tvalid <=  0;
					s_axis_weights_tdata  <= 'x;
				end
			end
		end
	end

	//- Input Feed and Reference Computation
	typedef logic [PE-1:0][ACCU_WIDTH-1:0]  outvec_t;
	// Expected output words in the order the checker pops them
	outvec_t  Q_ref[$] = {};

	// Sends ROUNDS random activation vectors. While feeding each element,
	// the matching column of WEIGHTS is accumulated into all MH reference
	// accumulators. Expected outputs are queued once a vector is complete.
	initial begin
		s_axis_input_tdata  = 'x;
		s_axis_input_tvalid =  0;
		@(posedge clk iff !rst);

		repeat(ROUNDS) begin : blkRounds
			automatic logic [MH-1:0][ACCU_WIDTH-1:0]  accus = '{ default: 0 };

			for(int unsigned  w = 0; w < MW; w+=SIMD) begin : blkSF
				for(int unsigned  simd = 0; simd < SIMD; simd++) begin : blkSIMD
					automatic logic [ACTIVATION_WIDTH-1:0]  act = $urandom();
					for(int unsigned  h = 0; h < MH; h++) begin : blkMH
						// Zero-extend the activation so that it enters the signed
						// multiplication as a non-negative value.
						automatic logic signed [ACCU_WIDTH-1:0]  prod = WEIGHTS[h][w+simd] * $signed({1'b0, act});
						accus[h] += prod;
					end : blkMH
					s_axis_input_tdata[simd] <= act;
				end : blkSIMD
				s_axis_input_tvalid <= 1;
				// Hold the word until the DUT accepts it
				@(posedge clk iff s_axis_input_tready);
				s_axis_input_tvalid <=  0;
				s_axis_input_tdata  <= 'x;
			end : blkSF

			// One expected output word per group of PE accumulators
			for(int unsigned  h = 0; h < MH; h+=PE) begin
				Q_ref.push_back(accus[h+:PE]);
			end

		end : blkRounds
	end

	//- Output Checker
	// Accepts output on every cycle and compares it with the reference queue.
	// The test ends after MW/SIMD+16 consecutive cycles without valid output
	// and then requires the reference queue to be fully drained.
	initial begin
		automatic int  timeout = 0;

		m_axis_output_tready = 0;
		@(posedge clk iff !rst);

		m_axis_output_tready <= 1;
		while(timeout < MW/SIMD+16) begin
			@(posedge clk);
			if(!m_axis_output_tvalid)  timeout++;
			else begin
				automatic outvec_t  exp;

				// Output without a queued expectation
				assert(Q_ref.size()) else begin
					$error("Spurious output.");
					$stop;
				end

				exp = Q_ref.pop_front();
				assert(m_axis_output_tdata === exp) else begin
					$error("Mismatched output %p instead of %p.", m_axis_output_tdata, exp);
					$stop;
				end

				// Any valid output restarts the idle countdown
				timeout = 0;
			end
		end
		m_axis_output_tready <= 0;

		// Every expected word must have been produced by the DUT
		assert(Q_ref.size() == 0) else begin
			$error("Missing output.");
			$stop;
		end

		$display("Test completed.");
		$finish;
	end

endmodule : mvu_3sx3u_tb
Loading

0 comments on commit 50045ff

Please sign in to comment.