import argparse
from pynq import Overlay
import numpy as np
from pynq import allocate
import time
from finn.util.data_packing import (
finnpy_to_packed_bytearray,
packed_bytearray_to_finnpy
)
from finn.core.datatype import DataType
class FINNAccelDriver():
    """Host-side driver for a FINN-generated accelerator on a PYNQ overlay.

    The driver holds the input/output shapes in their normal, folded and
    packed forms, loads the bitfile, and owns the two PYNQ buffers that are
    handed to the DMA engine. The helper methods convert data between the
    three layouts; ``execute`` runs one DMA round trip.
    """

    def __init__(self, N, bitfile):
        """Instantiate the FINN accelerator driver.

        Gets batchsize (N) as integer and path to bitfile as string.
        Loading the overlay and writing the iteration count register
        happen here, so constructing the driver touches the hardware.
        """
        self.N = N
        # input FINN DataType
        self.idt = DataType.BINARY
        # output FINN DataType
        self.odt = DataType.UINT32
        # input and output shapes
        self.ishape_normal = (N, 784)
        self.oshape_normal = (N, 10)
        self.ishape_folded = (N, 16, 49)
        self.oshape_folded = (N, 1, 10)
        self.ishape_packed = (N, 16, 7)  # datatype np.uint8
        self.oshape_packed = (N, 1, 40)  # datatype np.uint8
        # load bitfile and set up accelerator
        self.ol = Overlay(bitfile)
        self.dma = self.ol.axi_dma_0
        self.ctrl_regs = self.ol.resize_accel_0
        # neuron folding factor of output = iterations per sample
        self.itersPerSample = self.oshape_packed[-2]
        # AXI lite register offset for number of iterations
        # used by TLastMarker to signal end of transmission for AXI CDMA
        self.REG_OFFSET_NUM_ITERS = 0x10
        # set up TLastMarker with correct num. samples
        self.ctrl_regs.write(
            self.REG_OFFSET_NUM_ITERS, self.N * self.itersPerSample
        )
        # allocate one PYNQ buffer for the packed input and one for the
        # packed output
        self.ibuf_packed_device = allocate(
            shape=self.ishape_packed, dtype=np.uint8
        )
        self.obuf_packed_device = allocate(
            shape=self.oshape_packed, dtype=np.uint8
        )

    def fold_input(self, ibuf_normal):
        """Reshape input from the normal shape into the folded shape.

        Gets input data (ibuf_normal) and checks that it has exactly the
        expected normal shape. Returns the folded input.

        Raises AssertionError if the shape does not match. The check is
        written as an explicit raise rather than an ``assert`` statement so
        that it is still performed when Python runs with ``-O``.
        """
        if ibuf_normal.shape != self.ishape_normal:
            raise AssertionError(
                "input shape %s does not match expected shape %s"
                % (ibuf_normal.shape, self.ishape_normal)
            )
        # convert to folded form
        ibuf_folded = ibuf_normal.reshape(self.ishape_folded)
        return ibuf_folded

    def pack_input(self, ibuf_folded):
        """Pack folded input, reversing both SIMD dim and endianness.

        Gets input data in folded shape and returns packed input data.
        """
        ibuf_packed = finnpy_to_packed_bytearray(
            ibuf_folded, self.idt, reverse_endian=True, reverse_inner=True
        )
        return ibuf_packed

    def unpack_output(self, obuf_packed):
        """Unpack the packed output buffer from the accelerator.

        Gets packed output and returns output data in folded shape.
        """
        obuf_folded = packed_bytearray_to_finnpy(
            obuf_packed,
            self.odt,
            self.oshape_folded,
            reverse_endian=True,
            reverse_inner=True,
        )
        return obuf_folded

    def unfold_output(self, obuf_folded):
        """Unfold output data to normal shape.

        Gets folded output data and returns output data in normal shape.
        """
        obuf_normal = obuf_folded.reshape(self.oshape_normal)
        return obuf_normal

    def copy_input_data_to_device(self, data):
        """Copy given (packed) input data into the PYNQ input buffer."""
        np.copyto(self.ibuf_packed_device, data)

    def execute(self):
        """Run the accelerator once.

        Starts the DMA send transfer of the input buffer and the DMA
        receive transfer into the output buffer, then blocks until both
        channels report completion. Uses only member variables and
        returns nothing.
        """
        dma = self.dma
        dma.sendchannel.transfer(self.ibuf_packed_device)
        dma.recvchannel.transfer(self.obuf_packed_device)
        dma.sendchannel.wait()
        dma.recvchannel.wait()
if __name__ == "__main__":
    # Command-line entry point. In "execute" mode one batch is loaded from an
    # .npy file, run through the accelerator and the result is saved to an
    # .npy file. In "throughput_test" mode the accelerator is run once on the
    # (unfilled) input buffer and timing metrics are written to
    # nw_metrics.txt.
    parser = argparse.ArgumentParser(description='Set exec mode, batchsize N, bitfile name, inputfile name and outputfile name')
    parser.add_argument('--exec_mode', help='Please select functional verification ("execute") or throughput test ("throughput_test")', default="execute")
    parser.add_argument('--batchsize', help='number of samples for inference', type=int, default=1)
    parser.add_argument('--bitfile', help='name of bitfile (i.e. "resizer.bit")', default="resizer.bit")
    parser.add_argument('--inputfile', help='name of input npy file (i.e. "input.npy")', default="input.npy")
    parser.add_argument('--outputfile', help='name of output npy file (i.e. "output.npy")', default="output.npy")
    # parse arguments
    args = parser.parse_args()
    exec_mode = args.exec_mode
    N = args.batchsize
    bitfile = args.bitfile
    inputfile = args.inputfile
    outputfile = args.outputfile

    # reject unknown modes up front, before the bitfile is loaded onto the
    # device by the driver constructor
    if exec_mode not in ("execute", "throughput_test"):
        raise Exception("Exec mode has to be set to execute or throughput_test")

    # instantiate FINN accelerator driver and pass batchsize and bitfile
    finnDriver = FINNAccelDriver(N, bitfile)

    if exec_mode == "execute":
        # load the input .npy file, fold and pack it, and copy it into the
        # PYNQ input buffer
        ibuf_normal = np.load(inputfile)
        ibuf_folded = finnDriver.fold_input(ibuf_normal)
        ibuf_packed = finnDriver.pack_input(ibuf_folded)
        finnDriver.copy_input_data_to_device(ibuf_packed)
        # execute accelerator
        finnDriver.execute()
        # unpack, unfold and save output to output npy file
        obuf_folded = finnDriver.unpack_output(finnDriver.obuf_packed_device)
        obuf_normal = finnDriver.unfold_output(obuf_folded)
        np.save(outputfile, obuf_normal)
    else:
        # throughput test: time a single accelerator run
        # dictionary for results of throughput test
        res = {}
        # perf_counter is monotonic, so the interval cannot be distorted by
        # wall-clock adjustments during the run
        start = time.perf_counter()
        finnDriver.execute()
        end = time.perf_counter()
        runtime = end - start
        res["runtime[ms]"] = runtime*1000
        res["throughput[images/s]"] = N / runtime
        # NOTE: the packed shapes count np.uint8 elements, so these figures
        # are 10^6 bytes per second even though the keys say "Mb/s"; the
        # keys are kept unchanged for anything that parses the metrics file.
        res["DRAM_in_bandwidth[Mb/s]"] = np.prod(finnDriver.ishape_packed)*0.000001 / runtime
        res["DRAM_out_bandwidth[Mb/s]"] = np.prod(finnDriver.oshape_packed)*0.000001 / runtime
        # context manager guarantees the file is closed even if write fails
        with open("nw_metrics.txt", "w") as metrics_file:
            metrics_file.write(str(res))
・ Inserting the IP into a PYNQ Overlay Shell
・ Synthesis, Place and Route
・ Driver Generation
・ Deployment and Remote Execution
・ Throughput Test on PYNQ Board
ip_config.tcl resizer.cache resizer.ip_user_files resizer.xpr
make_project.sh resizer.hw resizer.srcs synth_project.sh
================================================================
== Vivado HLS Report for 'StreamingFCLayer_Batch_0'
================================================================
* Date: Tue Jun 2 19:58:23 2020
* Version: 2019.2 (Build 2698951 on Thu Oct 24 19:15:34 MDT 2019)
* Project: project_StreamingFCLayer_Batch_0
* Solution: sol1
* Product family: zynq
* Target device: xc7z020-clg400-1
================================================================
== Performance Estimates
================================================================
+ Timing:
* Summary:
+--------+----------+----------+------------+
| Clock | Target | Estimated| Uncertainty|
+--------+----------+----------+------------+
|ap_clk | 10.00 ns | 8.488 ns | 1.25 ns |
+--------+----------+----------+------------+
+ Latency:
* Summary:
+---------+---------+----------+----------+-----+-----+---------+
| Latency (cycles) | Latency (absolute) | Interval | Pipeline|
| min | max | min | max | min | max | Type |
+---------+---------+----------+----------+-----+-----+---------+
| 70| 70| 0.700 us | 0.700 us | 70| 70| none |
+---------+---------+----------+----------+-----+-----+---------+
+ Detail:
* Instance:
+--------------------------------+----------------------+---------+---------+----------+----------+-----+-----+---------+
| | | Latency (cycles) | Latency (absolute) | Interval | Pipeline|
| Instance | Module | min | max | min | max | min | max | Type |
+--------------------------------+----------------------+---------+---------+----------+----------+-----+-----+---------+
|grp_Matrix_Vector_Activa_fu_28 |Matrix_Vector_Activa | 67| 67| 0.670 us | 0.670 us | 67| 67| none |
+--------------------------------+----------------------+---------+---------+----------+----------+-----+-----+---------+
* Loop:
N/A
================================================================
== Utilization Estimates
================================================================
* Summary:
+-----------------+---------+-------+--------+-------+-----+
| Name | BRAM_18K| DSP48E| FF | LUT | URAM|
+-----------------+---------+-------+--------+-------+-----+
|DSP | -| -| -| -| -|
|Expression | -| -| 0| 2| -|
|FIFO | -| -| -| -| -|
|Instance | -| -| 2530| 25464| -|
|Memory | -| -| -| -| -|
|Multiplexer | -| -| -| 45| -|
|Register | -| -| 5| -| -|
+-----------------+---------+-------+--------+-------+-----+
|Total | 0| 0| 2535| 25511| 0|
+-----------------+---------+-------+--------+-------+-----+
|Available | 280| 220| 106400| 53200| 0|
+-----------------+---------+-------+--------+-------+-----+
|Utilization (%) | 0| 0| 2| 47| 0|
+-----------------+---------+-------+--------+-------+-----+
+ Detail:
* Instance:
+--------------------------------+----------------------+---------+-------+------+-------+-----+
| Instance | Module | BRAM_18K| DSP48E| FF | LUT | URAM|
+--------------------------------+----------------------+---------+-------+------+-------+-----+
|grp_Matrix_Vector_Activa_fu_28 |Matrix_Vector_Activa | 0| 0| 2530| 25464| 0|
+--------------------------------+----------------------+---------+-------+------+-------+-----+
|Total | | 0| 0| 2530| 25464| 0|
+--------------------------------+----------------------+---------+-------+------+-------+-----+
* DSP48E:
N/A
* Memory:
N/A
* FIFO:
N/A
* Expression:
+-----------------------------------------------+----------+-------+---+----+------------+------------+
| Variable Name | Operation| DSP48E| FF| LUT| Bitwidth P0| Bitwidth P1|
+-----------------------------------------------+----------+-------+---+----+------------+------------+
|grp_Matrix_Vector_Activa_fu_28_out_V_V_TREADY | and | 0| 0| 2| 1| 1|
+-----------------------------------------------+----------+-------+---+----+------------+------------+
|Total | | 0| 0| 2| 1| 1|
+-----------------------------------------------+----------+-------+---+----+------------+------------+
* Multiplexer:
+------------------------+----+-----------+-----+-----------+
| Name | LUT| Input Size| Bits| Total Bits|
+------------------------+----+-----------+-----+-----------+
|ap_NS_fsm | 27| 5| 1| 5|
|in0_V_V_TREADY_int | 9| 2| 1| 2|
|weights_V_V_TREADY_int | 9| 2| 1| 2|
+------------------------+----+-----------+-----+-----------+
|Total | 45| 9| 3| 9|
+------------------------+----+-----------+-----+-----------+
* Register:
+---------------------------------------------+---+----+-----+-----------+
| Name | FF| LUT| Bits| Const Bits|
+---------------------------------------------+---+----+-----+-----------+
|ap_CS_fsm | 4| 0| 4| 0|
|grp_Matrix_Vector_Activa_fu_28_ap_start_reg | 1| 0| 1| 0|
+---------------------------------------------+---+----+-----+-----------+
|Total | 5| 0| 5| 0|
+---------------------------------------------+---+----+-----+-----------+
================================================================
== Interface
================================================================
* Summary:
+--------------------+-----+-----+--------------+--------------------------+--------------+
| RTL Ports | Dir | Bits| Protocol | Source Object | C Type |
+--------------------+-----+-----+--------------+--------------------------+--------------+
|ap_clk | in | 1| ap_ctrl_none | StreamingFCLayer_Batch_0 | return value |
|ap_rst_n | in | 1| ap_ctrl_none | StreamingFCLayer_Batch_0 | return value |
|in0_V_V_TDATA | in | 56| axis | in0_V_V | pointer |
|in0_V_V_TVALID | in | 1| axis | in0_V_V | pointer |
|in0_V_V_TREADY | out | 1| axis | in0_V_V | pointer |
|weights_V_V_TDATA | in | 784| axis | weights_V_V | pointer |
|weights_V_V_TVALID | in | 1| axis | weights_V_V | pointer |
|weights_V_V_TREADY | out | 1| axis | weights_V_V | pointer |
|out_V_V_TDATA | out | 16| axis | out_V_V | pointer |
|out_V_V_TVALID | out | 1| axis | out_V_V | pointer |
|out_V_V_TREADY | in | 1| axis | out_V_V | pointer |
+--------------------+-----+-----+--------------+--------------------------+--------------+
void StreamingFCLayer_Batch_0(
hls::stream<ap_uint<49>> &in0,
hls::stream<ap_uint<784>> &weights,
hls::stream<ap_uint<16>> &out
)
entity StreamingFCLayer_Batch_0_StreamingFCLayer_Batch_0 is
port (
ap_clk : IN STD_LOGIC;
ap_rst_n : IN STD_LOGIC;
in0_V_V_TDATA : IN STD_LOGIC_VECTOR (55 downto 0);
in0_V_V_TVALID : IN STD_LOGIC;
in0_V_V_TREADY : OUT STD_LOGIC;
weights_V_V_TDATA : IN STD_LOGIC_VECTOR (783 downto 0);
weights_V_V_TVALID : IN STD_LOGIC;
weights_V_V_TREADY : OUT STD_LOGIC;
out_V_V_TDATA : OUT STD_LOGIC_VECTOR (15 downto 0);
out_V_V_TVALID : OUT STD_LOGIC;
out_V_V_TREADY : IN STD_LOGIC );
end;
================================================================
== Vivado HLS Report for 'StreamingDataWidthConverter_Batch_0'
================================================================
* Date: Tue Jun 2 19:55:07 2020
* Version: 2019.2 (Build 2698951 on Thu Oct 24 19:15:34 MDT 2019)
* Project: project_StreamingDataWidthConverter_Batch_0
* Solution: sol1
* Product family: zynq
* Target device: xc7z020-clg400-1
================================================================
== Performance Estimates
================================================================
+ Timing:
* Summary:
+--------+----------+----------+------------+
| Clock | Target | Estimated| Uncertainty|
+--------+----------+----------+------------+
|ap_clk | 10.00 ns | 5.723 ns | 1.25 ns |
+--------+----------+----------+------------+
+ Latency:
* Summary:
+---------+---------+-----------+-----------+-----+-----+---------+
| Latency (cycles) | Latency (absolute) | Interval | Pipeline|
| min | max | min | max | min | max | Type |
+---------+---------+-----------+-----------+-----+-----+---------+
| 7| 7| 70.000 ns | 70.000 ns | 7| 7| none |
+---------+---------+-----------+-----------+-----+-----+---------+
+ Detail:
* Instance:
+----------------------------------+------------------------+---------+---------+-----------+-----------+-----+-----+---------+
| | | Latency (cycles) | Latency (absolute) | Interval | Pipeline|
| Instance | Module | min | max | min | max | min | max | Type |
+----------------------------------+------------------------+---------+---------+-----------+-----------+-----+-----+---------+
|grp_StreamingDataWidthCo_1_fu_26 |StreamingDataWidthCo_1 | 4| 4| 40.000 ns | 40.000 ns | 4| 4| none |
+----------------------------------+------------------------+---------+---------+-----------+-----------+-----+-----+---------+
* Loop:
N/A
================================================================
== Utilization Estimates
================================================================
* Summary:
+-----------------+---------+-------+--------+-------+-----+
| Name | BRAM_18K| DSP48E| FF | LUT | URAM|
+-----------------+---------+-------+--------+-------+-----+
|DSP | -| -| -| -| -|
|Expression | -| -| 0| 2| -|
|FIFO | -| -| -| -| -|
|Instance | -| -| 65| 241| -|
|Memory | -| -| -| -| -|
|Multiplexer | -| -| -| 36| -|
|Register | -| -| 5| -| -|
+-----------------+---------+-------+--------+-------+-----+
|Total | 0| 0| 70| 279| 0|
+-----------------+---------+-------+--------+-------+-----+
|Available | 280| 220| 106400| 53200| 0|
+-----------------+---------+-------+--------+-------+-----+
|Utilization (%) | 0| 0| ~0 | ~0 | 0|
+-----------------+---------+-------+--------+-------+-----+
+ Detail:
* Instance:
+----------------------------------+------------------------+---------+-------+----+-----+-----+
| Instance | Module | BRAM_18K| DSP48E| FF | LUT | URAM|
+----------------------------------+------------------------+---------+-------+----+-----+-----+
|grp_StreamingDataWidthCo_1_fu_26 |StreamingDataWidthCo_1 | 0| 0| 65| 241| 0|
+----------------------------------+------------------------+---------+-------+----+-----+-----+
|Total | | 0| 0| 65| 241| 0|
+----------------------------------+------------------------+---------+-------+----+-----+-----+
* DSP48E:
N/A
* Memory:
N/A
* FIFO:
N/A
* Expression:
+-------------------------------------------------+----------+-------+---+----+------------+------------+
| Variable Name | Operation| DSP48E| FF| LUT| Bitwidth P0| Bitwidth P1|
+-------------------------------------------------+----------+-------+---+----+------------+------------+
|grp_StreamingDataWidthCo_1_fu_26_out_V_V_TREADY | and | 0| 0| 2| 1| 1|
+-------------------------------------------------+----------+-------+---+----+------------+------------+
|Total | | 0| 0| 2| 1| 1|
+-------------------------------------------------+----------+-------+---+----+------------+------------+
* Multiplexer:
+--------------------+----+-----------+-----+-----------+
| Name | LUT| Input Size| Bits| Total Bits|
+--------------------+----+-----------+-----+-----------+
|ap_NS_fsm | 27| 5| 1| 5|
|in0_V_V_TREADY_int | 9| 2| 1| 2|
+--------------------+----+-----------+-----+-----------+
|Total | 36| 7| 2| 7|
+--------------------+----+-----------+-----+-----------+
* Register:
+-----------------------------------------------+---+----+-----+-----------+
| Name | FF| LUT| Bits| Const Bits|
+-----------------------------------------------+---+----+-----+-----------+
|ap_CS_fsm | 4| 0| 4| 0|
|grp_StreamingDataWidthCo_1_fu_26_ap_start_reg | 1| 0| 1| 0|
+-----------------------------------------------+---+----+-----+-----------+
|Total | 5| 0| 5| 0|
+-----------------------------------------------+---+----+-----+-----------+
================================================================
== Interface
================================================================
* Summary:
+----------------+-----+-----+--------------+-------------------------------------+--------------+
| RTL Ports | Dir | Bits| Protocol | Source Object | C Type |
+----------------+-----+-----+--------------+-------------------------------------+--------------+
|ap_clk | in | 1| ap_ctrl_none | StreamingDataWidthConverter_Batch_0 | return value |
|ap_rst_n | in | 1| ap_ctrl_none | StreamingDataWidthConverter_Batch_0 | return value |
|in0_V_V_TDATA | in | 16| axis | in0_V_V | pointer |
|in0_V_V_TVALID | in | 1| axis | in0_V_V | pointer |
|in0_V_V_TREADY | out | 1| axis | in0_V_V | pointer |
|out_V_V_TDATA | out | 8| axis | out_V_V | pointer |
|out_V_V_TVALID | out | 1| axis | out_V_V | pointer |
|out_V_V_TREADY | in | 1| axis | out_V_V | pointer |
+----------------+-----+-----+--------------+-------------------------------------+--------------+
void StreamingDataWidthConverter_Batch_0(hls::stream<ap_uint<16>> &in0, hls::stream<ap_uint<8>> &out)
entity StreamingDataWidthConverter_Batch_0_StreamingDataWidthConverter_Batch_0 is
port (
ap_clk : IN STD_LOGIC;
ap_rst_n : IN STD_LOGIC;
in0_V_V_TDATA : IN STD_LOGIC_VECTOR (15 downto 0);
in0_V_V_TVALID : IN STD_LOGIC;
in0_V_V_TREADY : OUT STD_LOGIC;
out_V_V_TDATA : OUT STD_LOGIC_VECTOR (7 downto 0);
out_V_V_TVALID : OUT STD_LOGIC;
out_V_V_TREADY : IN STD_LOGIC );
end;
drwx------ 3 masaaki docker 4096 Jun 2 19:55 code_gen_ipgen_StreamingFCLayer_Batch_0_av35euos
drwx------ 3 masaaki docker 4096 Jun 2 19:54 code_gen_ipgen_StreamingDataWidthConverter_Batch_0_nzm_awqb
drwx------ 3 masaaki docker 4096 Jun 2 19:53 code_gen_ipgen_StreamingFCLayer_Batch_1_ze13mbt0
drwx------ 3 masaaki docker 4096 Jun 2 19:52 code_gen_ipgen_StreamingFCLayer_Batch_2_cc9g0rxo
drwx------ 3 masaaki docker 4096 Jun 2 19:51 code_gen_ipgen_StreamingFCLayer_Batch_3_8fn3j9gt
drwx------ 3 masaaki docker 4096 Jun 2 19:50 code_gen_ipgen_TLastMarker_0_dkbjhrkw
drwx------ 3 masaaki docker 4096 Jun 2 19:46 code_gen_ipgen_StreamingFIFO_5_rpmfwai3
drwx------ 3 masaaki docker 4096 Jun 2 19:46 code_gen_ipgen_StreamingFIFO_4_hkgtx9lw
drwx------ 3 masaaki docker 4096 Jun 2 19:46 code_gen_ipgen_StreamingFIFO_3_iz1kpch8
drwx------ 3 masaaki docker 4096 Jun 2 19:46 code_gen_ipgen_StreamingFIFO_2_7lkicamh
drwx------ 3 masaaki docker 4096 Jun 2 19:46 code_gen_ipgen_StreamingFIFO_1_eimmu84e
drwx------ 3 masaaki docker 4096 Jun 2 19:46 code_gen_ipgen_StreamingFIFO_0_qnzwb_sa
drwx------ 2 masaaki docker 4096 Jun 1 19:27 dataflow_partition_evha3xbg
日 | 月 | 火 | 水 | 木 | 金 | 土 |
---|---|---|---|---|---|---|
- | - | - | - | 1 | 2 | 3 |
4 | 5 | 6 | 7 | 8 | 9 | 10 |
11 | 12 | 13 | 14 | 15 | 16 | 17 |
18 | 19 | 20 | 21 | 22 | 23 | 24 |
25 | 26 | 27 | 28 | 29 | 30 | - |