// DMA2axis_3buf.h
// 2022/07/26 by marsee
//
#ifndef __DMA2AXIS_3BUF_H__
#define __DMA2AXIS_3BUF_H__
#define NUM_FRAME_BUFFER 3
#define DMA_WRITE_MODE 0
#define FREE_RUN_MODE 1
#define ROW_SIZE 600
#define COL_SIZE 800
#endif
// DMA2axis_3buf.cpp
// 2022/07/22 by marsee
//
// frame_buffer0, frame_buffer1, frame_buffer2 hold the addresses of the three frame buffers
// mode = 0 : watch the DMA Write IP's active_frame and DMA-read the frame one before it (DMA_WRITE_MODE)
// mode = 1 : free-run mode (FREE_RUN_MODE); DMA-read the three frame buffers in order
//
#include <stdio.h>
#include <stdint.h>
#include <string.h>
#include <ap_int.h>
#include <hls_stream.h>
#include <ap_axi_sdata.h>
#include "DMA2axis_3buf.h"
int DMA2axis_3buf(int32_t *fb0, int32_t *fb1, int32_t *fb2, hls::stream<ap_axis<32,1,1,1> >& outs,
ap_uint<2> active_frame, ap_uint<1> mode){
#pragma HLS DATAFLOW
#pragma HLS INTERFACE mode=s_axilite port=mode
#pragma HLS INTERFACE mode=m_axi depth=480000 port=fb2 offset=slave
#pragma HLS INTERFACE mode=m_axi depth=480000 port=fb1 offset=slave
#pragma HLS INTERFACE mode=m_axi depth=480000 port=fb0 offset=slave
#pragma HLS INTERFACE mode=ap_none port=active_frame register
#pragma HLS INTERFACE mode=axis register_mode=both port=outs register
#pragma HLS INTERFACE mode=s_axilite port=return
ap_axis<32,1,1,1> vals;
int32_t *fb;
static int n = 0;
if (mode == DMA_WRITE_MODE){
n = (int)active_frame;
}else{
n++;
if (n > 2)
n = 0;
}
if(n == 0)
fb = fb2;
else if(n == 1)
fb = fb0;
else // n == 2
fb = fb1;
vals.keep = 0x7;
vals.strb = 0x7;
for (int y=0; y<ROW_SIZE; y++){
#pragma HLS LOOP_TRIPCOUNT avg=600 max=1080 min=480
for (int x=0; x<COL_SIZE; x++){
#pragma HLS LOOP_TRIPCOUNT avg=800 max=1920 min=640
#pragma HLS PIPELINE II=1
vals.data = fb[y*COL_SIZE+x];
if (y==0 && x==0)
vals.user = 1;
else
vals.user = 0;
if (x == (COL_SIZE-1))
vals.last = 1;
else
vals.last = 0;
outs << vals;
}
}
return 0;
}
// DMA2axis_3buf_tb.cpp
// 2022/07/22 by marsee
//
#include <stdio.h>
#include <stdint.h>
#include <string.h>
#include <ap_int.h>
#include <hls_stream.h>
#include <ap_axi_sdata.h>
#include "DMA2axis_3buf.h"
int DMA2axis_3buf(int32_t *fb0, int32_t *fb1, int32_t *fb2, hls::stream<ap_axis<32,1,1,1> >& outs,
ap_uint<2> active_frame, ap_uint<1> mode);
int main(){
using namespace std;
hls::stream<ap_axis<32,1,1,1> > outs;
ap_axis<32,1,1,1> vals;
int32_t *frame_buffer;
ap_uint<2> active_frame;
// Allocate the frame buffer: reserve three frames' worth of memory and split it into three
if ((frame_buffer =(int32_t *)malloc(NUM_FRAME_BUFFER * sizeof(int32_t) * (COL_SIZE * ROW_SIZE))) == NULL){
fprintf(stderr, "Can't allocate frame_buffer0 ~ 2\n");
exit(1);
}
// Fill the frame buffers with data
for(int i=0; i<NUM_FRAME_BUFFER; i++){
for (int y=0; y<ROW_SIZE; y++){
for (int x=0; x<COL_SIZE; x++){
frame_buffer[i*(COL_SIZE * ROW_SIZE)+y*COL_SIZE+x] = y*COL_SIZE+x;
}
}
}
for(int i=0; i<3; i++){
active_frame = i;
DMA2axis_3buf(frame_buffer,
&frame_buffer[COL_SIZE * ROW_SIZE],
&frame_buffer[2 * (COL_SIZE * ROW_SIZE)],
outs, active_frame, DMA_WRITE_MODE);
}
// Compare
for(int i=0; i<NUM_FRAME_BUFFER; i++){
for(int y=0; y<ROW_SIZE; y++){
for(int x=0; x<COL_SIZE; x++){
outs >> vals;
if(vals.data != y*COL_SIZE+x){
printf("vals.data = %x, correct data = %x\n", vals.data, y*COL_SIZE+x);
exit(1);
}
}
}
}
printf("Success\n");
//free(frame_buffer);
return 0;
}
// vflip_dma_write.h
// 2022/07/21 by marsee
#ifndef __VFLIP_DMA_WRITE_H__
#define __VFLIP_DMA_WRITE_H__
#include <stdint.h>
#include "ap_int.h"
#include "hls_stream.h"
#include "ap_axi_sdata.h"
#define MAX_WIDTH 800
#define MAX_HEIGHT 600
typedef hls::stream<ap_axiu<32,1,1,1> > AXI_STREAM;
typedef ap_axiu<32,1,1,1> AP_AXIU32;
#endif
// vflip_dma_write.cpp
// 2022/07/21 by marsee
// Resolutions of up to 3840 x 2160 can be used
#include "vflip_dma_write.h"
int vflip_dma_write(AXI_STREAM & ins, ap_int<32> *fb0, ap_int<32> *fb1, ap_int<32> *fb2,
volatile ap_uint<2> &active_frame){
#pragma HLS INTERFACE mode=ap_none port=active_frame register
#pragma HLS INTERFACE mode=m_axi depth=480000 port=fb0 offset=slave
#pragma HLS INTERFACE mode=m_axi depth=480000 port=fb1 offset=slave
#pragma HLS INTERFACE mode=m_axi depth=480000 port=fb2 offset=slave
#pragma HLS INTERFACE mode=axis register_mode=both port=ins register
#pragma HLS INTERFACE mode=s_axilite port=return
AP_AXIU32 pix;
int max_fb_chk;
active_frame = 0;
LOOP_WAIT0: do { // the frame starts when user becomes 1
#pragma HLS LOOP_TRIPCOUNT min=1 max=1 avg=1
ins >> pix;
} while(pix.user == 0);
LOOP_Y0: for (int y=MAX_HEIGHT-1; y>=0; y--){ // vflip
#pragma HLS LOOP_TRIPCOUNT avg=600 max=1080 min=480
LOOP_X0: for (int x=0; x<MAX_WIDTH; x++){
#pragma HLS LOOP_TRIPCOUNT avg=800 max=1920 min=640
#pragma HLS PIPELINE II=1 rewind
if (!(x==0 && y==MAX_HEIGHT-1)) // the first pixel has already been read
ins >> pix; // input from the AXI4-Stream
fb0[(y*MAX_WIDTH)+x] = pix.data;
}
}
active_frame = 1;
LOOP_WAIT1: do { // the frame starts when user becomes 1
#pragma HLS LOOP_TRIPCOUNT min=1 max=1 avg=1
ins >> pix;
} while(pix.user == 0);
LOOP_Y1: for (int y=MAX_HEIGHT-1; y>=0; y--){ // vflip
#pragma HLS LOOP_TRIPCOUNT avg=600 max=1080 min=480
LOOP_X1: for (int x=0; x<MAX_WIDTH; x++){
#pragma HLS LOOP_TRIPCOUNT avg=800 max=1920 min=640
#pragma HLS PIPELINE II=1 rewind
if (!(x==0 && y==MAX_HEIGHT-1)) // the first pixel has already been read
ins >> pix; // input from the AXI4-Stream
fb1[(y*MAX_WIDTH)+x] = pix.data;
}
}
active_frame = 2;
LOOP_WAIT2: do { // the frame starts when user becomes 1
#pragma HLS LOOP_TRIPCOUNT min=1 max=1 avg=1
ins >> pix;
} while(pix.user == 0);
LOOP_Y2: for (int y=MAX_HEIGHT-1; y>=0; y--){ // vflip
#pragma HLS LOOP_TRIPCOUNT avg=600 max=1080 min=480
LOOP_X2: for (int x=0; x<MAX_WIDTH; x++){
#pragma HLS LOOP_TRIPCOUNT avg=800 max=1920 min=640
#pragma HLS PIPELINE II=1 rewind
if (!(x==0 && y==MAX_HEIGHT-1)) // the first pixel has already been read
ins >> pix; // input from the AXI4-Stream
fb2[(y*MAX_WIDTH)+x] = pix.data;
}
}
end:
return(0);
}
// vflip_dma_write_tb_tb.cpp
// 2019/01/01 by marsee
//
#include <iostream>
#include <fstream>
#include "vflip_dma_write.h"
int vflip_dma_write(AXI_STREAM & ins, ap_int<32> *fb0, ap_int<32> *fb1, ap_int<32> *fb2,
volatile ap_uint<2> &active_frame);
#define NUM_FRAME_BUFFER 3
int main()
{
AXI_STREAM ins;
AP_AXIU32 pix;
ap_uint<2> active_frame;
int *frame_buffer;
// Convert from Mat format to an AXI4-Stream, three frames' worth
for(int i=0; i<NUM_FRAME_BUFFER; i++){
for (int y=0; y<MAX_HEIGHT; y++){
for (int x=0; x<MAX_WIDTH; x++){
pix.data = i*(MAX_WIDTH * MAX_HEIGHT)+y*MAX_WIDTH+x;
if(y==0 && x==0)
pix.user = 1;
else
pix.user = 0;
ins << pix;
}
}
}
// Allocate the frame buffer: reserve three frames' worth of memory and split it into three
if ((frame_buffer =(int *)malloc(NUM_FRAME_BUFFER * sizeof(int) * (MAX_WIDTH * MAX_HEIGHT))) == NULL){
fprintf(stderr, "Can't allocate frame_buffer0 ~ 2\n");
exit(1);
}
vflip_dma_write(ins, (ap_int<32> *)frame_buffer,
(ap_int<32> *)&frame_buffer[MAX_WIDTH * MAX_HEIGHT],
(ap_int<32> *)&frame_buffer[2 * (MAX_WIDTH * MAX_HEIGHT)],
active_frame);
// Compare
for(int i=0; i<NUM_FRAME_BUFFER; i++){
for(int y=0; y<MAX_HEIGHT; y++){
for(int x=0; x<MAX_WIDTH; x++){
int rgb = frame_buffer[i*(MAX_WIDTH * MAX_HEIGHT)+y*MAX_WIDTH+x];
if(rgb != i*(MAX_WIDTH * MAX_HEIGHT)+(MAX_HEIGHT-1-y)*MAX_WIDTH+x){
printf("rgb = %x, correct data = %x\n", rgb, i*(MAX_WIDTH * MAX_HEIGHT)+y*MAX_WIDTH+x);
exit(1);
}
}
}
}
printf("Success\n");
//free(frame_buffer);
return 0;
}
// vflip_dma_write.h
// 2022/07/21 by marsee
#ifndef __VFLIP_DMA_WRITE_H__
#define __VFLIP_DMA_WRITE_H__
#include <stdint.h>
#include "ap_int.h"
#include "hls_stream.h"
#include "ap_axi_sdata.h"
typedef hls::stream<ap_axiu<32,1,1,1> > AXI_STREAM;
typedef ap_axiu<32,1,1,1> AP_AXIU32;
#endif
// vflip_dma_write.cpp
// 2022/07/21 by marsee
#include "vflip_dma_write.h"
int vflip_dma_write(AXI_STREAM & ins, ap_int<32> *fb0, ap_int<32> *fb1, ap_int<32> *fb2,
int32_t row_size, int32_t col_size, volatile ap_uint<2> &active_frame){
#pragma HLS INTERFACE mode=s_axilite port=col_size
#pragma HLS INTERFACE mode=s_axilite port=row_size
#pragma HLS INTERFACE mode=ap_none port=active_frame register
#pragma HLS INTERFACE mode=m_axi depth=19200 port=fb0 offset=slave
#pragma HLS INTERFACE mode=m_axi depth=19200 port=fb1 offset=slave
#pragma HLS INTERFACE mode=m_axi depth=19200 port=fb2 offset=slave
#pragma HLS INTERFACE mode=axis register_mode=both port=ins register
#pragma HLS INTERFACE mode=s_axilite port=return
AP_AXIU32 pix;
int max_fb_chk;
active_frame = 0;
LOOP_WAIT0: do { // the frame starts when user becomes 1
#pragma HLS LOOP_TRIPCOUNT min=1 max=1 avg=1
ins >> pix;
} while(pix.user == 0);
LOOP_Y0: for (int y=row_size-1; y>=0; y--){ // vflip
#pragma HLS LOOP_TRIPCOUNT avg=600 max=1080 min=480
LOOP_X0: for (int x=0; x<col_size; x++){
#pragma HLS LOOP_TRIPCOUNT avg=800 max=1920 min=640
#pragma HLS PIPELINE II=1 rewind
if (!(x==0 && y==row_size-1)) // the first pixel has already been read
ins >> pix; // input from the AXI4-Stream
fb0[(y*col_size)+x] = pix.data;
}
}
active_frame = 1;
LOOP_WAIT1: do { // the frame starts when user becomes 1
#pragma HLS LOOP_TRIPCOUNT min=1 max=1 avg=1
ins >> pix;
} while(pix.user == 0);
LOOP_Y1: for (int y=row_size-1; y>=0; y--){ // vflip
#pragma HLS LOOP_TRIPCOUNT avg=600 max=1080 min=480
LOOP_X1: for (int x=0; x<col_size; x++){
#pragma HLS LOOP_TRIPCOUNT avg=800 max=1920 min=640
#pragma HLS PIPELINE II=1 rewind
if (!(x==0 && y==row_size-1)) // the first pixel has already been read
ins >> pix; // input from the AXI4-Stream
fb1[(y*col_size)+x] = pix.data;
}
}
active_frame = 2;
LOOP_WAIT2: do { // the frame starts when user becomes 1
#pragma HLS LOOP_TRIPCOUNT min=1 max=1 avg=1
ins >> pix;
} while(pix.user == 0);
LOOP_Y2: for (int y=row_size-1; y>=0; y--){ // vflip
#pragma HLS LOOP_TRIPCOUNT avg=600 max=1080 min=480
LOOP_X2: for (int x=0; x<col_size; x++){
#pragma HLS LOOP_TRIPCOUNT avg=800 max=1920 min=640
#pragma HLS PIPELINE II=1 rewind
if (!(x==0 && y==row_size-1)) // the first pixel has already been read
ins >> pix; // input from the AXI4-Stream
fb2[(y*col_size)+x] = pix.data;
}
}
end:
return(0);
}
// vflip_dma_write_tb_tb.cpp
// 2019/01/01 by marsee
//
#include <iostream>
#include <fstream>
#include "vflip_dma_write.h"
int vflip_dma_write(AXI_STREAM & ins, ap_int<32> *fb0, ap_int<32> *fb1, ap_int<32> *fb2,
int32_t row_size, int32_t col_size, volatile ap_uint<2> &active_frame);
#define NUM_FRAME_BUFFER 3
#define MAX_HEIGHT 240
#define MAX_WIDTH 320
int main()
{
AXI_STREAM ins;
AP_AXIU32 pix;
ap_uint<2> active_frame;
int *frame_buffer;
// Convert from Mat format to an AXI4-Stream, three frames' worth
for(int i=0; i<NUM_FRAME_BUFFER; i++){
for (int y=0; y<MAX_HEIGHT; y++){
for (int x=0; x<MAX_WIDTH; x++){
pix.data = i*(MAX_WIDTH * MAX_HEIGHT)+y*MAX_WIDTH+x;
if(y==0 && x==0)
pix.user = 1;
else
pix.user = 0;
ins << pix;
}
}
}
// Allocate the frame buffer: reserve three frames' worth of memory and split it into three
if ((frame_buffer =(int *)malloc(NUM_FRAME_BUFFER * sizeof(int) * (MAX_WIDTH * MAX_HEIGHT))) == NULL){
fprintf(stderr, "Can't allocate frame_buffer0 ~ 2\n");
exit(1);
}
vflip_dma_write(ins, (ap_int<32> *)frame_buffer,
(ap_int<32> *)&frame_buffer[MAX_WIDTH * MAX_HEIGHT],
(ap_int<32> *)&frame_buffer[2 * (MAX_WIDTH * MAX_HEIGHT)],
MAX_HEIGHT, MAX_WIDTH,
active_frame);
// Compare
for(int i=0; i<NUM_FRAME_BUFFER; i++){
for(int y=0; y<MAX_HEIGHT; y++){
for(int x=0; x<MAX_WIDTH; x++){
int rgb = frame_buffer[i*(MAX_WIDTH * MAX_HEIGHT)+y*MAX_WIDTH+x];
if(rgb != i*(MAX_WIDTH * MAX_HEIGHT)+(MAX_HEIGHT-1-y)*MAX_WIDTH+x){
printf("rgb = %x, correct data = %x\n", rgb, i*(MAX_WIDTH * MAX_HEIGHT)+y*MAX_WIDTH+x);
exit(1);
}
}
}
}
printf("Success\n");
//free(frame_buffer);
return 0;
}
from PIL import Image
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline
from pynq import allocate, Overlay
sobel_filter = Overlay("./sobel.bit")
dma = sobel_filter.axi_dma_0
sobel = sobel_filter.sobel_axis_RGB24_0
image_path = "./test2.jpg"
original_image = Image.open(image_path)
canvas = plt.gcf()
size = canvas.get_size_inches()
canvas.set_size_inches(size*2)
width, height = original_image.size
print("Image size: {}x{} pixels.".format(width, height))
plt.figure(figsize=(6, 5));
_ = plt.imshow(original_image)
in_buffer = allocate(shape=(height, width, 3),
dtype=np.uint8, cacheable=1)
out_buffer = allocate(shape=(height, width, 3),
dtype=np.uint8, cacheable=1)
in_buffer[:] = np.array(original_image)
def run_kernel():
    dma.sendchannel.transfer(in_buffer)
    dma.recvchannel.transfer(out_buffer)
    sobel.write(0x00,0x01) # start
    dma.sendchannel.wait()
    dma.recvchannel.wait()
print(height)
print(width)
sobel.register_map.row_size = height
sobel.register_map.col_size = width
sobel.register_map.function_r = 3 # SOBELwAxiDma
run_kernel()
sobel_image = Image.fromarray(out_buffer)
print("Image size: {}x{} pixels.".format(width, height))
plt.figure(figsize=(6, 5));
_ = plt.imshow(sobel_image)
del in_buffer
del out_buffer
-I/usr/local/include
-L/usr/local/lib -lopencv_core -lopencv_imgcodecs -lopencv_imgproc
U-Boot 2020.01 (Apr 29 2022 - 04:00:55 +0000)
CPU: Zynq 7z020
Silicon: v3.1
DRAM: ECC disabled 512 MiB
Flash: 0 Bytes
NAND: 0 MiB
MMC: mmc@e0100000: 0
Loading Environment from SPI Flash... SF: Detected s25fl128s with page size 256 Bytes, erase size 64 KiB, total 16 MiB
*** Warning - bad CRC, using default environment
In: serial@e0001000
Out: serial@e0001000
Err: serial@e0001000
Net:
ZYNQ GEM: e000b000, mdio bus e000b000, phyaddr 1, interface rgmii-id
SF: Detected s25fl128s with page size 256 Bytes, erase size 64 KiB, total 16 MiB
Warning: ethernet@e000b000 using MAC address from ROM
eth0: ethernet@e000b000
Hit any key to stop autoboot: 0
switch to partitions #0, OK
mmc0 is current device
Scanning mmc 0:1...
Found U-Boot script /boot.scr
1636 bytes read in 12 ms (132.8 KiB/s)
## Executing script at 03000000
5891020 bytes read in 337 ms (16.7 MiB/s)
## Loading kernel from FIT Image at 10000000 ...
Using 'conf@1' configuration
Verifying Hash Integrity ... OK
Trying 'kernel@0' kernel subimage
Description: Linux Kernel
Type: Kernel Image
Compression: uncompressed
Data Start: 0x100000d4
Data Size: 5869432 Bytes = 5.6 MiB
Architecture: ARM
OS: Linux
Load Address: 0x00080000
Entry Point: 0x00080000
Hash algo: sha1
Hash value: 7beea257b23b2b39f4ac933e0ec614468b0ee346
Verifying Hash Integrity ... sha1+ OK
## Loading fdt from FIT Image at 10000000 ...
Using 'conf@1' configuration
Verifying Hash Integrity ... OK
Trying 'fdt@0' fdt subimage
Description: Flattened Device Tree blob
Type: Flat Device Tree
Compression: uncompressed
Data Start: 0x10599140
Data Size: 19771 Bytes = 19.3 KiB
Architecture: ARM
Hash algo: sha1
Hash value: f18dc7415d970642feff826f1fe9f4d261311c87
Verifying Hash Integrity ... sha1+ OK
Booting using the fdt blob at 0x10599140
Loading Kernel Image
Loading Device Tree to 1eb00000, end 1eb07d3a ... OK
Starting kernel ...
Booting Linux on physical CPU 0x0
Linux version 5.4.0-xilinx-v2020.2 (oe-user@oe-host) (gcc version 9.2.0 (GCC)) #1 SMP PREEMPT Fri Apr 29 02:08:29 UTC 2022
CPU: ARMv7 Processor [413fc090] revision 0 (ARMv7), cr=18c5387d
CPU: PIPT / VIPT nonaliasing data cache, VIPT aliasing instruction cache
OF: fdt: Machine model: xlnx,zynq-7000
Memory policy: Data cache writealloc
cma: Reserved 128 MiB at 0x16800000
percpu: Embedded 15 pages/cpu s32332 r8192 d20916 u61440
Built 1 zonelists, mobility grouping on. Total pages: 129920
Kernel command line: root=/dev/mmcblk0p2 rw earlyprintk rootfstype=ext4 rootwait devtmpfs.mount=1 uio_pdrv_genirq.of_id="generic-uio" clk_ignore_unused
Dentry cache hash table entries: 65536 (order: 6, 262144 bytes, linear)
Inode-cache hash table entries: 32768 (order: 5, 131072 bytes, linear)
mem auto-init: stack:off, heap alloc:off, heap free:off
Memory: 374392K/524288K available (8192K kernel code, 269K rwdata, 2360K rodata, 1024K init, 298K bss, 18824K reserved, 131072K cma-reserved, 0K highmem)
rcu: Preemptible hierarchical RCU implementation.
rcu: RCU restricting CPUs from NR_CPUS=4 to nr_cpu_ids=2.
Tasks RCU enabled.
rcu: RCU calculated value of scheduler-enlistment delay is 10 jiffies.
rcu: Adjusting geometry for rcu_fanout_leaf=16, nr_cpu_ids=2
NR_IRQS: 16, nr_irqs: 16, preallocated irqs: 16
efuse mapped to (ptrval)
slcr mapped to (ptrval)
L2C: platform modifies aux control register: 0x72360000 -> 0x72760000
L2C: DT/platform modifies aux control register: 0x72360000 -> 0x72760000
L2C-310 erratum 769419 enabled
L2C-310 enabling early BRESP for Cortex-A9
L2C-310 full line of zeros enabled for Cortex-A9
L2C-310 ID prefetch enabled, offset 1 lines
L2C-310 dynamic clock gating enabled, standby mode enabled
L2C-310 cache controller enabled, 8 ways, 512 kB
L2C-310: CACHE_ID 0x410000c8, AUX_CTRL 0x76760001
random: get_random_bytes called from start_kernel+0x260/0x444 with crng_init=0
zynq_clock_init: clkc starts at (ptrval)
Zynq clock init
sched_clock: 64 bits at 324MHz, resolution 3ns, wraps every 4398046511102ns
clocksource: arm_global_timer: mask: 0xffffffffffffffff max_cycles: 0x4af472fdc8, max_idle_ns: 440795202990 ns
Switching to timer-based delay loop, resolution 3ns
Console: colour dummy device 80x30
printk: console [tty0] enabled
Calibrating delay loop (skipped), value calculated using timer frequency.. 649.99 BogoMIPS (lpj=3249996)
pid_max: default: 32768 minimum: 301
Mount-cache hash table entries: 1024 (order: 0, 4096 bytes, linear)
Mountpoint-cache hash table entries: 1024 (order: 0, 4096 bytes, linear)
CPU: Testing write buffer coherency: ok
CPU0: Spectre v2: using BPIALL workaround
CPU0: thread -1, cpu 0, socket 0, mpidr 80000000
Setting up static identity map for 0x100000 - 0x100060
rcu: Hierarchical SRCU implementation.
smp: Bringing up secondary CPUs ...
CPU1: thread -1, cpu 1, socket 0, mpidr 80000001
CPU1: Spectre v2: using BPIALL workaround
smp: Brought up 1 node, 2 CPUs
SMP: Total of 2 processors activated (1299.99 BogoMIPS).
CPU: All CPU(s) started in SVC mode.
devtmpfs: initialized
VFP support v0.3: implementor 41 architecture 3 part 30 variant 9 rev 4
clocksource: jiffies: mask: 0xffffffff max_cycles: 0xffffffff, max_idle_ns: 19112604462750000 ns
futex hash table entries: 512 (order: 3, 32768 bytes, linear)
xor: measuring software checksum speed
arm4regs : 991.200 MB/sec
8regs : 754.800 MB/sec
32regs : 797.600 MB/sec
xor: using function: arm4regs (991.200 MB/sec)
pinctrl core: initialized pinctrl subsystem
NET: Registered protocol family 16
DMA: preallocated 256 KiB pool for atomic coherent allocations
cpuidle: using governor menu
hw-breakpoint: found 5 (+1 reserved) breakpoint and 1 watchpoint registers.
hw-breakpoint: maximum watchpoint size is 4 bytes.
zynq-ocm f800c000.ocmc: ZYNQ OCM pool: 256 KiB @ 0x(ptrval)
e0001000.serial: ttyPS0 at MMIO 0xe0001000 (irq = 26, base_baud = 6249993) is a xuartps
printk: console [ttyPS0] enabled
raid6: int32x8 gen() 102 MB/s
raid6: int32x8 xor() 74 MB/s
raid6: int32x4 gen() 92 MB/s
raid6: int32x4 xor() 76 MB/s
raid6: int32x2 gen() 111 MB/s
raid6: int32x2 xor() 116 MB/s
raid6: int32x1 gen() 109 MB/s
raid6: int32x1 xor() 113 MB/s
raid6: using algorithm int32x2 gen() 111 MB/s
raid6: .... xor() 116 MB/s, rmw enabled
raid6: using intx1 recovery algorithm
vgaarb: loaded
SCSI subsystem initialized
usbcore: registered new interface driver usbfs
usbcore: registered new interface driver hub
usbcore: registered new device driver usb
mc: Linux media interface: v0.10
videodev: Linux video capture interface: v2.00
pps_core: LinuxPPS API ver. 1 registered
pps_core: Software ver. 5.3.6 - Copyright 2005-2007 Rodolfo Giometti <giometti@linux.it>
PTP clock support registered
EDAC MC: Ver: 3.0.0
FPGA manager framework
Advanced Linux Sound Architecture Driver Initialized.
clocksource: Switched to clocksource arm_global_timer
thermal_sys: Registered thermal governor 'step_wise'
NET: Registered protocol family 2
tcp_listen_portaddr_hash hash table entries: 512 (order: 0, 6144 bytes, linear)
TCP established hash table entries: 4096 (order: 2, 16384 bytes, linear)
TCP bind hash table entries: 4096 (order: 3, 32768 bytes, linear)
TCP: Hash tables configured (established 4096 bind 4096)
UDP hash table entries: 256 (order: 1, 8192 bytes, linear)
UDP-Lite hash table entries: 256 (order: 1, 8192 bytes, linear)
NET: Registered protocol family 1
RPC: Registered named UNIX socket transport module.
RPC: Registered udp transport module.
RPC: Registered tcp transport module.
RPC: Registered tcp NFSv4.1 backchannel transport module.
PCI: CLS 0 bytes, default 64
hw perfevents: no interrupt-affinity property for /pmu@f8891000, guessing.
hw perfevents: enabled with armv7_cortex_a9 PMU driver, 7 counters available
Initialise system trusted keyrings
workingset: timestamp_bits=14 max_order=17 bucket_order=3
squashfs: version 4.0 (2009/01/31) Phillip Lougher
jffs2: version 2.2. (NAND) (SUMMARY) © 2001-2006 Red Hat, Inc.
Key type asymmetric registered
Asymmetric key parser 'x509' registered
io scheduler mq-deadline registered
io scheduler kyber registered
zynq-pinctrl 700.pinctrl: zynq pinctrl initialized
dma-pl330 f8003000.dmac: Loaded driver for PL330 DMAC-241330
dma-pl330 f8003000.dmac: DBUFF-128x8bytes Num_Chans-8 Num_Peri-4 Num_Events-16
brd: module loaded
loop: module loaded
spi_master spi0: cannot find modalias for /amba/spi@e000d000/flash@0
spi_master spi0: Failed to create SPI device for /amba/spi@e000d000/flash@0
libphy: Fixed MDIO Bus: probed
tun: Universal TUN/TAP device driver, 1.6
CAN device driver interface
libphy: MACB_mii_bus: probed
RTL8211E Gigabit Ethernet e000b000.ethernet-ffffffff:01: attached PHY driver [RTL8211E Gigabit Ethernet] (mii_bus:phy_addr=e000b000.ethernet-ffffffff:01, irq=POLL)
macb e000b000.ethernet eth0: Cadence GEM rev 0x00020118 at 0xe000b000 irq 28 (00:18:3e:02:89:52)
e1000e: Intel(R) PRO/1000 Network Driver - 3.2.6-k
e1000e: Copyright(c) 1999 - 2015 Intel Corporation.
ehci_hcd: USB 2.0 'Enhanced' Host Controller (EHCI) Driver
ehci-pci: EHCI PCI platform driver
usbcore: registered new interface driver cdc_acm
cdc_acm: USB Abstract Control Model driver for USB modems and ISDN adapters
usbcore: registered new interface driver cdc_wdm
usbcore: registered new interface driver usb-storage
usbcore: registered new interface driver usbserial_generic
usbserial: USB Serial support registered for generic
usbcore: registered new interface driver usb_serial_simple
usbserial: USB Serial support registered for carelink
usbserial: USB Serial support registered for zio
usbserial: USB Serial support registered for funsoft
usbserial: USB Serial support registered for flashloader
usbserial: USB Serial support registered for google
usbserial: USB Serial support registered for libtransistor
usbserial: USB Serial support registered for vivopay
usbserial: USB Serial support registered for moto_modem
usbserial: USB Serial support registered for motorola_tetra
usbserial: USB Serial support registered for novatel_gps
usbserial: USB Serial support registered for hp4x
usbserial: USB Serial support registered for suunto
usbserial: USB Serial support registered for siemens_mpi
chipidea-usb2 e0002000.usb: e0002000.usb supply vbus not found, using dummy regulator
ULPI transceiver vendor/product ID 0x0424/0x0007
Found SMSC USB3320 ULPI transceiver.
ULPI integrity check: passed.
i2c /dev entries driver
cdns-i2c e0004000.i2c: 100 kHz mmio e0004000 irq 22
cdns-i2c e0005000.i2c: 100 kHz mmio e0005000 irq 23
cdns-wdt f8005000.watchdog: Xilinx Watchdog Timer with timeout 10s
device-mapper: ioctl: 4.41.0-ioctl (2019-09-16) initialised: dm-devel@redhat.com
EDAC MC: ECC not enabled
Xilinx Zynq CpuIdle Driver started
sdhci: Secure Digital Host Controller Interface driver
sdhci: Copyright(c) Pierre Ossman
sdhci-pltfm: SDHCI platform and OF driver helper
mmc0: SDHCI controller on e0100000.mmc [e0100000.mmc] using ADMA
ledtrig-cpu: registered to indicate activity on CPUs
clocksource: ttc_clocksource: mask: 0xffff max_cycles: 0xffff, max_idle_ns: 551318127 ns
timer #0 at (ptrval), irq=41
usbcore: registered new interface driver usbhid
usbhid: USB HID core driver
xlnk xlnk: Major 243
xlnk xlnk: xlnk driver loaded
xlnk xlnk: xlnk_pdev is not null
fpga_manager fpga0: Xilinx Zynq FPGA Manager registered
IPVS: Registered protocols (TCP, UDP)
IPVS: Connection hash table configured (size=4096, memory=32Kbytes)
IPVS: ipvs loaded.
IPVS: [rr] scheduler registered.
Initializing XFRM netlink socket
NET: Registered protocol family 10
Segment Routing with IPv6
sit: IPv6, IPv4 and MPLS over IPv4 tunneling driver
NET: Registered protocol family 17
can: controller area network core (rev 20170425 abi 9)
NET: Registered protocol family 29
can: raw protocol (rev 20170425)
can: broadcast manager protocol (rev 20170425 t)
can: netlink gateway (rev 20190810) max_hops=1
Registering SWP/SWPB emulation handler
mmc0: new high speed SDHC card at address 1234
mmcblk0: mmc0:1234 SA16G 14.4 GiB
Loading compiled-in X.509 certificates
Btrfs loaded, crc32c=crc32c-generic
mmcblk0: p1 p2
of-fpga-region fpga-full: FPGA Region probed
hctosys: unable to open rtc device (rtc0)
of_cfs_init
of_cfs_init: OK
cfg80211: Loading compiled-in X.509 certificates for regulatory database
cfg80211: Loaded X.509 cert 'sforshee: 00b28ddf47aef9cea7'
platform regulatory.0: Direct firmware load for regulatory.db failed with error -2
clk: Not disabling unused clocks
cfg80211: failed to load regulatory.db
ALSA device list:
No soundcards found.
EXT4-fs (mmcblk0p2): mounted filesystem with ordered data mode. Opts: (null)
VFS: Mounted root (ext4 filesystem) on device 179:2.
devtmpfs: mounted
Freeing unused kernel memory: 1024K
Run /sbin/init as init process
random: fast init done
systemd[1]: System time before build time, advancing clock.
systemd[1]: Failed to find module 'autofs4'
systemd[1]: systemd 245.4-4ubuntu3 running in system mode. (+PAM +AUDIT +SELINUX +IMA +APPARMOR +SMACK +SYSVINIT +UTMP +LIBCRYPTSETUP +GCRYPT +GNUTLS +ACL +XZ +LZ4 +SECCOMP +BLKID +ELFUTILS +KMOD +IDN2 -IDN +PCRE2 default-hierarchy=hybrid)
systemd[1]: Detected architecture arm.
Welcome to PynqLinux, based on Ubuntu 20.04!
systemd[1]: Set hostname to <pynq>.
systemd[1]: /lib/systemd/system/dbus.socket:5: ListenStream= references a path below legacy directory /var/run/, updating /var/run/dbus/system_bus_socket → /run/dbus/system_bus_socket; please update the unit file accordingly.
random: systemd: uninitialized urandom read (16 bytes read)
systemd[1]: Created slice system-getty.slice.
[ OK ] Created slice system-getty.slice.
random: systemd: uninitialized urandom read (16 bytes read)
systemd[1]: Created slice system-modprobe.slice.
[ OK ] Created slice system-modprobe.slice.
random: systemd: uninitialized urandom read (16 bytes read)
systemd[1]: Created slice system-serial\x2dgetty.slice.
[ OK ] Created slice system-serial\x2dgetty.slice.
systemd[1]: Created slice User and Session Slice.
[ OK ] Created slice User and Session Slice.
systemd[1]: Started Dispatch Password Requests to Console Directory Watch.
[ OK ] Started Dispatch Password …ts to Console Directory Watch.
systemd[1]: Started Forward Password Requests to Wall Directory Watch.
[ OK ] Started Forward Password R…uests to Wall Directory Watch.
systemd[1]: Condition check resulted in Arbitrary Executable File Formats File System Automount Point being skipped.
systemd[1]: Reached target Local Encrypted Volumes.
[ OK ] Reached target Local Encrypted Volumes.
systemd[1]: Reached target Remote File Systems.
[ OK ] Reached target Remote File Systems.
systemd[1]: Reached target Slices.
[ OK ] Reached target Slices.
systemd[1]: Listening on Syslog Socket.
[ OK ] Listening on Syslog Socket.
systemd[1]: Listening on initctl Compatibility Named Pipe.
[ OK ] Listening on initctl Compatibility Named Pipe.
systemd[1]: Condition check resulted in Journal Audit Socket being skipped.
systemd[1]: Listening on Journal Socket (/dev/log).
[ OK ] Listening on Journal Socket (/dev/log).
systemd[1]: Listening on Journal Socket.
[ OK ] Listening on Journal Socket.
systemd[1]: Listening on udev Control Socket.
[ OK ] Listening on udev Control Socket.
systemd[1]: Listening on udev Kernel Socket.
[ OK ] Listening on udev Kernel Socket.
systemd[1]: Condition check resulted in Huge Pages File System being skipped.
systemd[1]: Mounting POSIX Message Queue File System...
Mounting POSIX Message Queue File System...
systemd[1]: Mounting Kernel Debug File System...
Mounting Kernel Debug File System...
systemd[1]: Condition check resulted in Kernel Trace File System being skipped.
systemd[1]: Starting Journal Service...
Starting Journal Service...
systemd[1]: Starting Restore / save the current clock...
Starting Restore / save the current clock...
systemd[1]: Condition check resulted in Create list of static device nodes for the current kernel being skipped.
systemd[1]: Condition check resulted in Load Kernel Module drm being skipped.
systemd[1]: Started Nameserver information manager.
[ OK ] Started Nameserver information manager.
systemd[1]: Reached target Network (Pre).
[ OK ] Reached target Network (Pre).
systemd[1]: Condition check resulted in Set Up Additional Binary Formats being skipped.
systemd[1]: Starting Load Kernel Modules...
Starting Load Kernel Modules...
systemd[1]: Starting Remount Root and Kernel File Systems...
Starting Remount Root and Kernel File Systems...
systemd[1]: Starting udev Coldplug all Devices...
Starting udev Coldplug all Devices...
systemd[1]: Mounted POSIX Message Queue File System.
[ OK ] Mounted POSIX Message Queue File System.
systemd[1]: Mounted Kernel Debug File System.
[ OK ] Mounted Kernel Debug File System.
systemd[1]: Finished Restore / save the current clock.
[ OK ] Finished Restore / save the current clock.
systemd[1]: Finished Load Kernel Modules.
[ OK ] Finished Load Kernel Modules.
systemd[1]: Finished Remount Root and Kernel File Systems.
[ OK ] Finished Remount Root and Kernel File Systems.
systemd[1]: Started Journal Service.
[ OK ] Started Journal Service.
Activating swap /var/swap...
Mounting Kernel Configuration File System...
Starting Flush Journal to Persistent Storage...
Starting Load/Save Random Seed...
Starting Apply Kernel Variables...
systemd-journald[122]: Received client request to flush runtime journal.
Starting Create System Users...
systemd-journald[122]: File /var/log/journal/236dbcc4a6dba1caa1d5d5bc61955d45/system.journal corrupted or uncleanly shut down, renaming and replacing.
[ OK ] Mounted Kernel Configuration File System.
Adding 524284k swap on /var/swap. Priority:-2 extents:1 across:524284k SS
[ OK ] Activated swap /var/swap.
[ OK ] Finished udev Coldplug all Devices.
[ OK ] Reached target Swap.
Starting Helper to synchronize boot up for ifupdown...
[ OK ] Finished Apply Kernel Variables.
[ OK ] Finished Create System Users.
Starting Create Static Device Nodes in /dev...
[ OK ] Finished Flush Journal to Persistent Storage.
[ OK ] Finished Create Static Device Nodes in /dev.
[ OK ] Reached target Local File Systems (Pre).
[ OK ] Reached target Local File Systems.
Starting Enable support fo…l executable binary formats...
Starting Create Volatile Files and Directories...
Starting udev Kernel Device Manager...
[ OK ] Finished Enable support fo…nal executable binary formats.
[ OK ] Finished Create Volatile Files and Directories.
Starting Network Name Resolution...
Starting Network Time Synchronization...
Starting Update UTMP about System Boot/Shutdown...
[ OK ] Finished Update UTMP about System Boot/Shutdown.
[ OK ] Started udev Kernel Device Manager.
uio_pdrv_genirq 43c00000.audio-codec-ctrl: IRQ index 0 not found
[ OK ] Found device /dev/ttyPS0.
zocl-drm amba:zyxclmm_drm: IRQ index 0 not found
[ OK ] Started Network Time Synchronization.
[ OK ] Finished Helper to synchronize boot up for ifupdown.
[ OK ] Started Network Name Resolution.
[ OK ] Finished Load/Save Random Seed.
[ OK ] Reached target Host and Network Name Lookups.
[ OK ] Reached target System Time Set.
[ OK ] Reached target System Time Synchronized.
[ OK ] Started Entropy daemon using the HAVEGE algorithm.
[ OK ] Reached target System Initialization.
[ OK ] Started resolvconf-pull-resolved.path.
[ OK ] Started Daily apt download activities.
[ OK ] Started Daily apt upgrade and clean activities.
[ OK ] Started Periodic ext4 Onli…ata Check for All Filesystems.
[ OK ] Started Discard unused blocks once a week.
[ OK ] Started Daily rotation of log files.
[ OK ] Started Daily man-db regeneration.
[ OK ] Started Message of the Day.
[ OK ] Started Daily Cleanup of Temporary Directories.
[ OK ] Reached target Paths.
[ OK ] Reached target Timers.
[ OK ] Listening on Avahi mDNS/DNS-SD Stack Activation Socket.
[ OK ] Listening on D-Bus System Message Bus Socket.
[ OK ] Listening on UUID daemon activation socket.
[ OK ] Reached target Sockets.
[ OK ] Reached target Basic System.
Starting LSB: automatic crash report generation...
Starting Avahi mDNS/DNS-SD Stack...
[ OK ] Started Regular background program processing daemon.
[ OK ] Started D-Bus System Message Bus.
Starting Network Manager...
[ OK ] Started Save initial kernel messages after boot.
Starting Remove Stale Onli…t4 Metadata Check Snapshots...
[ OK ] Started ifup for eth0.
Starting Jupyter Notebook Server...
Starting LSB: Load kernel …d to enable cpufreq scaling...
Starting Dispatcher daemon for systemd-networkd...
Starting Raise network interfaces...
[ OK ] Started Set the CPU Frequency Scaling governor.
Starting PYNQ PL Server...
Starting Authorization Manager...
Starting Restore /etc/reso… the ppp link was shut down...
Starting Resize Filesystem on SD card...
Starting resolvconf-pull-resolved.service...
Starting System Logging Service...
Starting Login Service...
[ OK ] Started PYNQ PL Server.
[ OK ] Finished Restore /etc/reso…re the ppp link was shut down.
[ OK ] Finished Resize Filesystem on SD card.
[ OK ] Started System Logging Service.
[ OK ] Started LSB: automatic crash report generation.
[ OK ] Started Avahi mDNS/DNS-SD Stack.
[ OK ] Started Network Manager.
Starting Network Manager Wait Online...
[ OK ] Started LSB: Load kernel m…ded to enable cpufreq scaling.
Starting LSB: set CPUFreq kernel parameters...
Starting Hostname Service...
[ OK ] Started Authorization Manager.
[ OK ] Finished resolvconf-pull-resolved.service.
Starting Modem Manager...
[ OK ] Started LSB: set CPUFreq kernel parameters.
[ OK ] Started Login Service.
[ OK ] Started Modem Manager.
[ OK ] Started Hostname Service.
[ OK ] Started Dispatcher daemon for systemd-networkd.
Starting Network Manager Script Dispatcher Service...
[ OK ] Started Network Manager Script Dispatcher Service.
[ OK ] Finished Network Manager Wait Online.
[ OK ] Finished Remove Stale Onli…ext4 Metadata Check Snapshots.
Stopping Network Name Resolution...
[ OK ] Stopped Network Name Resolution.
Starting Network Name Resolution...
[ OK ] Started Network Name Resolution.
Starting resolvconf-pull-resolved.service...
[ OK ] Finished resolvconf-pull-resolved.service.
[ OK ] Finished Raise network interfaces.
[ OK ] Reached target Network.
[ OK ] Reached target Network is Online.
[ OK ] Started ISC DHCP IPv4 server.
[ OK ] Started ISC DHCP IPv6 server.
Starting Samba NMB Daemon...
Starting OpenBSD Secure Shell server...
Starting Permit User Sessions...
[ OK ] Started Unattended Upgrades Shutdown.
[ OK ] Finished Permit User Sessions.
[ OK ] Started Getty on tty1.
[ OK ] Started Serial Getty on ttyPS0.
[ OK ] Reached target Login Prompts.
PYNQ Linux, based on Ubuntu 20.04 pynq ttyPS0
pynq login: xilinx (automatic login)
Welcome to PYNQ Linux, based on Ubuntu 20.04 (GNU/Linux 5.4.0-xilinx-v2020.2 armv7l)
Last login: Wed Nov 17 20:00:41 UTC 2021 on ttyPS0
xilinx@pynq:~$
C:\Users
\AppData\Roaming\Xilinx\Vivado\tclapp\mainfest.tcl
This tutorial provides an end-to-end demo of running Darknet YoloV3-tiny inference on the VTA accelerator design to perform an image detection task. It introduces Relay as a front-end compiler that tailors the compute graph to the hardware target by applying quantization (VTA only supports int8/32 inference) and graph packing (to enable tensorization on the core).
cfg_path = /home/masaaki/.tvm_test_data/darknet/yolov3-tiny.cfg
weights_path = /home/masaaki/.tvm_test_data/darknet/yolov3-tiny.weights
coco_path = /home/masaaki/.tvm_test_data/data/coco.names
font_path = /home/masaaki/.tvm_test_data/data/arial.ttf
Fetch a vision model from the Gluon model zoo and compile it with Relay. The compilation steps are as follows (a rough sketch in code appears after this list):
1. The front end converts the model from MxNet into a Relay module.
2. Apply 8-bit quantization: here the first conv layer and the dense layer, which run in fp32 on the CPU, are skipped.
3. Perform graph packing to change the data layout for tensorization.
4. Perform constant folding to reduce the number of operators (for example, eliminating batch-norm multiplications).
5. Perform a Relay build to an object file.
6. Load the object file onto the remote (the FPGA device).
7. Create the graph executor m.
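Below is a rough, non-authoritative sketch of those seven steps, loosely following the TVM VTA deploy tutorials. The PYNQ host name, the model name, and the graph-pack start/stop operator names are my assumptions, not values taken from this log.

import tvm
from tvm import relay, rpc
from tvm.contrib import graph_executor, utils
from mxnet.gluon.model_zoo import vision
import vta
from vta.top import graph_pack

env = vta.get_env()

# Connect to the RPC server on the PYNQ board and program the FPGA
# ("pynq" is a placeholder host name; port 9091 matches the RPC server log below)
remote = rpc.connect("pynq", 9091)
vta.reconfig_runtime(remote)
vta.program_fpga(remote, bitstream=None)  # default VTA bitstream

# 1. Front end: convert the Gluon (MxNet) model into a Relay module
gluon_model = vision.get_model("resnet18_v1", pretrained=True)
shape_dict = {"data": (env.BATCH, 3, 224, 224)}
mod, params = relay.frontend.from_mxnet(gluon_model, shape_dict)

with tvm.transform.PassContext(opt_level=3):
    # 2. 8-bit quantization; the first conv layer stays in fp32 on the CPU
    with relay.quantize.qconfig(global_scale=8.0, skip_conv_layers=[0]):
        mod = relay.quantize.quantize(mod, params=params)
    # 3. Graph packing: retile the data layout so the VTA core can tensorize
    relay_prog = graph_pack(mod["main"], env.BATCH, env.BLOCK_OUT, env.WGT_WIDTH,
                            start_name="nn.max_pool2d",        # assumed pack boundaries
                            stop_name="nn.global_avg_pool2d")

# 4./5. Constant folding is applied during the build; compile to a deployable library
with vta.build_config(opt_level=3, disabled_pass={"AlterOpLayout"}):
    lib = relay.build(relay_prog,
                      target=tvm.target.Target(env.target, host=env.target_host),
                      params=params)

# 6. Ship the compiled object to the remote FPGA device and load it
temp = utils.tempdir()
lib.export_library(temp.relpath("graphlib.tar"))
remote.upload(temp.relpath("graphlib.tar"))
loaded_lib = remote.load_module("graphlib.tar")

# 7. Create the graph executor m on the remote VTA context
ctx = remote.ext_dev(0)
m = graph_executor.GraphModule(loaded_lib["default"](ctx))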
(base) masaaki@masaaki-H110M4-M01:/media/masaaki/Ubuntu_Disk/DNN/tvm$ python3 vta/tests/python/pynq/test_program_rpc.py
(base) masaaki@masaaki-H110M4-M01:/media/masaaki/Ubuntu_Disk/DNN/tvm$ python3 vta/tests/python/integration/deploy_detection.py
cfg_path = /home/masaaki/.tvm_test_data/darknet/yolov3-tiny.cfg
weights_path = /home/masaaki/.tvm_test_data/darknet/yolov3-tiny.weights
coco_path = /home/masaaki/.tvm_test_data/data/coco.names
font_path = /home/masaaki/.tvm_test_data/data/arial.ttf
Reconfigured FPGA and RPC runtime in 3.17s!
/media/masaaki/Ubuntu_Disk/DNN/tvm/python/tvm/driver/build_module.py:263: UserWarning: target_host parameter is going to be deprecated. Please pass in tvm.target.Target(target, host=target_host) instead.
warnings.warn(
[04:32:33] /media/masaaki/Ubuntu_Disk/DNN/tvm/src/tir/transforms/arg_binder.cc:95: Warning: Trying to bind buffer to another one with lower alignment requirement required_alignment=256, provided_alignment=128
[04:32:33] /media/masaaki/Ubuntu_Disk/DNN/tvm/src/tir/transforms/arg_binder.cc:95: Warning: Trying to bind buffer to another one with lower alignment requirement required_alignment=256, provided_alignment=128
[04:32:33] /media/masaaki/Ubuntu_Disk/DNN/tvm/src/tir/transforms/arg_binder.cc:95: Warning: Trying to bind buffer to another one with lower alignment requirement required_alignment=256, provided_alignment=128
[04:32:33] /media/masaaki/Ubuntu_Disk/DNN/tvm/src/tir/transforms/arg_binder.cc:95: Warning: Trying to bind buffer to another one with lower alignment requirement required_alignment=256, provided_alignment=128
[04:32:34] /media/masaaki/Ubuntu_Disk/DNN/tvm/src/tir/transforms/arg_binder.cc:95: Warning: Trying to bind buffer to another one with lower alignment requirement required_alignment=256, provided_alignment=128
[04:32:34] /media/masaaki/Ubuntu_Disk/DNN/tvm/src/tir/transforms/arg_binder.cc:95: Warning: Trying to bind buffer to another one with lower alignment requirement required_alignment=256, provided_alignment=128
[04:32:34] /media/masaaki/Ubuntu_Disk/DNN/tvm/src/tir/transforms/arg_binder.cc:95: Warning: Trying to bind buffer to another one with lower alignment requirement required_alignment=256, provided_alignment=128
[04:32:34] /media/masaaki/Ubuntu_Disk/DNN/tvm/src/tir/transforms/arg_binder.cc:95: Warning: Trying to bind buffer to another one with lower alignment requirement required_alignment=256, provided_alignment=128
[04:32:35] /media/masaaki/Ubuntu_Disk/DNN/tvm/src/tir/transforms/arg_binder.cc:95: Warning: Trying to bind buffer to another one with lower alignment requirement required_alignment=256, provided_alignment=128
[04:32:35] /media/masaaki/Ubuntu_Disk/DNN/tvm/src/tir/transforms/arg_binder.cc:95: Warning: Trying to bind buffer to another one with lower alignment requirement required_alignment=256, provided_alignment=128
[04:32:35] /media/masaaki/Ubuntu_Disk/DNN/tvm/src/tir/transforms/arg_binder.cc:95: Warning: Trying to bind buffer to another one with lower alignment requirement required_alignment=256, provided_alignment=128
yolov3-tiny inference graph built in 19.06s!
Performed inference in 658.92ms (std = 0.37) for 1 samples
Average per sample inference time: 658.92ms
Date: August 3, 1:00 PM to 5:00 PM (planned)
Cost: Free
Lecturer: Ono, of "FPGAの部屋"
Format: Held online using Webex Meeting
(base) masaaki@masaaki-H110M4-M01:/media/masaaki/Ubuntu_Disk/DNN/tvm$ python3 vta/tests/python/integration/deploy_detection.py
Reconfigured FPGA and RPC runtime in 2.88s!
/media/masaaki/Ubuntu_Disk/DNN/tvm/python/tvm/driver/build_module.py:263: UserWarning: target_host parameter is going to be deprecated. Please pass in tvm.target.Target(target, host=target_host) instead.
warnings.warn(
[04:24:30] /media/masaaki/Ubuntu_Disk/DNN/tvm/src/tir/transforms/arg_binder.cc:95: Warning: Trying to bind buffer to another one with lower alignment requirement required_alignment=256, provided_alignment=128
[04:24:30] /media/masaaki/Ubuntu_Disk/DNN/tvm/src/tir/transforms/arg_binder.cc:95: Warning: Trying to bind buffer to another one with lower alignment requirement required_alignment=256, provided_alignment=128
[04:24:30] /media/masaaki/Ubuntu_Disk/DNN/tvm/src/tir/transforms/arg_binder.cc:95: Warning: Trying to bind buffer to another one with lower alignment requirement required_alignment=256, provided_alignment=128
[04:24:30] /media/masaaki/Ubuntu_Disk/DNN/tvm/src/tir/transforms/arg_binder.cc:95: Warning: Trying to bind buffer to another one with lower alignment requirement required_alignment=256, provided_alignment=128
[04:24:30] /media/masaaki/Ubuntu_Disk/DNN/tvm/src/tir/transforms/arg_binder.cc:95: Warning: Trying to bind buffer to another one with lower alignment requirement required_alignment=256, provided_alignment=128
[04:24:31] /media/masaaki/Ubuntu_Disk/DNN/tvm/src/tir/transforms/arg_binder.cc:95: Warning: Trying to bind buffer to another one with lower alignment requirement required_alignment=256, provided_alignment=128
[04:24:31] /media/masaaki/Ubuntu_Disk/DNN/tvm/src/tir/transforms/arg_binder.cc:95: Warning: Trying to bind buffer to another one with lower alignment requirement required_alignment=256, provided_alignment=128
[04:24:31] /media/masaaki/Ubuntu_Disk/DNN/tvm/src/tir/transforms/arg_binder.cc:95: Warning: Trying to bind buffer to another one with lower alignment requirement required_alignment=256, provided_alignment=128
[04:24:31] /media/masaaki/Ubuntu_Disk/DNN/tvm/src/tir/transforms/arg_binder.cc:95: Warning: Trying to bind buffer to another one with lower alignment requirement required_alignment=256, provided_alignment=128
[04:24:32] /media/masaaki/Ubuntu_Disk/DNN/tvm/src/tir/transforms/arg_binder.cc:95: Warning: Trying to bind buffer to another one with lower alignment requirement required_alignment=256, provided_alignment=128
[04:24:32] /media/masaaki/Ubuntu_Disk/DNN/tvm/src/tir/transforms/arg_binder.cc:95: Warning: Trying to bind buffer to another one with lower alignment requirement required_alignment=256, provided_alignment=128
yolov3-tiny inference graph built in 18.91s!
Performed inference in 660.54ms (std = 2.14) for 1 samples
Average per sample inference time: 660.54ms
xilinx@pynq:~$ cd tvm
xilinx@pynq:~/tvm$ sudo ./apps/vta_rpc/start_rpc_server.sh
INFO:RPCServer:bind to 0.0.0.0:9091
INFO:RPCServer:connection from ('192.168.3.10', 43720)
INFO:root:Program FPGA with 1x16_i8w8a32_15_15_18_17.bit
INFO:RPCServer:Finish serving ('192.168.3.10', 43720)
INFO:RPCServer:connection from ('192.168.3.10', 43722)
INFO:root:Skip reconfig_runtime due to same config.
INFO:RPCServer:Finish serving ('192.168.3.10', 43722)
INFO:RPCServer:connection from ('192.168.3.10', 43724)
INFO:root:Skip reconfig_runtime due to same config.
INFO:root:Program FPGA with 1x16_i8w8a32_15_15_18_17.bit
INFO:root:Loading VTA library: /home/xilinx/tvm/vta/python/vta/../../../build/libvta.so
INFO:RPCServer:load_module /tmp/tmp9vv5ioxf/graphlib.tar
This tutorial provides an end-to-end demo of running ImageNet classification inference on the VTA accelerator to perform an ImageNet classification task. It introduces Relay as a front-end compiler that builds a compute graph tailored to the hardware target by applying quantization (VTA only supports int8/32 inference) and graph packing (to enable tensorization on the core).
Fetch a vision model from the Gluon model zoo and compile it with Relay. The compilation steps are:
1. The front end converts the model from MxNet into a Relay module.
2. Apply 8-bit quantization: here the first conv layer and the dense layer, which run in fp32 on the CPU, are skipped.
3. Perform graph packing to change the data layout for tensorization.
4. Perform constant folding to reduce the number of operators (for example, eliminating batch-norm multiplications).
5. Perform a Relay build to an object file.
6. Load the object file onto the remote (the FPGA device).
7. Create the graph executor m, which is then used to obtain the predictions as sketched after this list.
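A small sketch of how the top-5 prediction listing below is obtained from the graph executor m; the input tensor, its layer name "data", and the synset category list are assumptions carried over from the tutorial, not values from this log.

import numpy as np

# Assumed available from the compilation steps above: the graph executor m,
# a preprocessed float32 image tensor `image` of shape (1, 3, 224, 224),
# and `synset`, the list of ImageNet category names.
m.set_input("data", image)   # bind the input tensor
m.run()                      # run inference on the VTA accelerator
scores = m.get_output(0).numpy()[0]

# Print the top-5 categories, as in the "prediction for sample 0" listing below
for rank, idx in enumerate(np.argsort(scores)[::-1][:5], start=1):
    print("#{}: {}".format(rank, synset[idx]))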
resnet18_v1 inference graph built in 26.26s!
Performed inference in 408.14ms (std = 2.18) for 1 samples
Average per sample inference time: 408.14ms
resnet18_v1 prediction for sample 0
#1: tiger cat
#2: Egyptian cat
#3: tabby, tabby cat
#4: lynx, catamount
#5: weasel
(base) masaaki@masaaki-H110M4-M01:/media/masaaki/Ubuntu_Disk/DNN/tvm$ python3 vta/tests/python/integration/deploy_classification.py
Reconfigured FPGA and RPC runtime in 2.89s!
Downloading /home/masaaki/.mxnet/models/resnet18_v1-a0666292.zipefe2e9c7-23b8-4e5f-9eb8-62f6771a59a7 from https://apache-mxnet.s3-accelerate.dualstack.amazonaws.com/gluon/models/resnet18_v1-a0666292.zip...
/media/masaaki/Ubuntu_Disk/DNN/tvm/python/tvm/driver/build_module.py:263: UserWarning: target_host parameter is going to be deprecated. Please pass in tvm.target.Target(target, host=target_host) instead.
warnings.warn(
[04:08:34] /media/masaaki/Ubuntu_Disk/DNN/tvm/src/tir/transforms/arg_binder.cc:95: Warning: Trying to bind buffer to another one with lower alignment requirement required_alignment=256, provided_alignment=128
[04:08:34] /media/masaaki/Ubuntu_Disk/DNN/tvm/src/tir/transforms/arg_binder.cc:95: Warning: Trying to bind buffer to another one with lower alignment requirement required_alignment=256, provided_alignment=128
[04:08:34] /media/masaaki/Ubuntu_Disk/DNN/tvm/src/tir/transforms/arg_binder.cc:95: Warning: Trying to bind buffer to another one with lower alignment requirement required_alignment=256, provided_alignment=128
[04:08:34] /media/masaaki/Ubuntu_Disk/DNN/tvm/src/tir/transforms/arg_binder.cc:95: Warning: Trying to bind buffer to another one with lower alignment requirement required_alignment=256, provided_alignment=128
[04:08:34] /media/masaaki/Ubuntu_Disk/DNN/tvm/src/tir/transforms/arg_binder.cc:95: Warning: Trying to bind buffer to another one with lower alignment requirement required_alignment=256, provided_alignment=128
[04:08:34] /media/masaaki/Ubuntu_Disk/DNN/tvm/src/tir/transforms/arg_binder.cc:95: Warning: Trying to bind buffer to another one with lower alignment requirement required_alignment=256, provided_alignment=128
[04:08:35] /media/masaaki/Ubuntu_Disk/DNN/tvm/src/tir/transforms/arg_binder.cc:95: Warning: Trying to bind buffer to another one with lower alignment requirement required_alignment=256, provided_alignment=128
[04:08:35] /media/masaaki/Ubuntu_Disk/DNN/tvm/src/tir/transforms/arg_binder.cc:95: Warning: Trying to bind buffer to another one with lower alignment requirement required_alignment=256, provided_alignment=128
[04:08:35] /media/masaaki/Ubuntu_Disk/DNN/tvm/src/tir/transforms/arg_binder.cc:95: Warning: Trying to bind buffer to another one with lower alignment requirement required_alignment=256, provided_alignment=128
[04:08:35] /media/masaaki/Ubuntu_Disk/DNN/tvm/src/tir/transforms/arg_binder.cc:95: Warning: Trying to bind buffer to another one with lower alignment requirement required_alignment=256, provided_alignment=128
[04:08:35] /media/masaaki/Ubuntu_Disk/DNN/tvm/src/tir/transforms/arg_binder.cc:95: Warning: Trying to bind buffer to another one with lower alignment requirement required_alignment=256, provided_alignment=128
[04:08:35] /media/masaaki/Ubuntu_Disk/DNN/tvm/src/tir/transforms/arg_binder.cc:95: Warning: Trying to bind buffer to another one with lower alignment requirement required_alignment=256, provided_alignment=128
[04:08:35] /media/masaaki/Ubuntu_Disk/DNN/tvm/src/tir/transforms/arg_binder.cc:95: Warning: Trying to bind buffer to another one with lower alignment requirement required_alignment=256, provided_alignment=128
[04:08:36] /media/masaaki/Ubuntu_Disk/DNN/tvm/src/tir/transforms/arg_binder.cc:95: Warning: Trying to bind buffer to another one with lower alignment requirement required_alignment=256, provided_alignment=128
[04:08:36] /media/masaaki/Ubuntu_Disk/DNN/tvm/src/tir/transforms/arg_binder.cc:95: Warning: Trying to bind buffer to another one with lower alignment requirement required_alignment=256, provided_alignment=128
[04:08:36] /media/masaaki/Ubuntu_Disk/DNN/tvm/src/tir/transforms/arg_binder.cc:95: Warning: Trying to bind buffer to another one with lower alignment requirement required_alignment=256, provided_alignment=128
vta/tests/python/integration/deploy_classification.py:212: DeprecationWarning: legacy graph executor behavior of producing json / lib / params will be removed in the next release. Please see documents of tvm.contrib.graph_executor.GraphModule for the new recommended usage.
graph, lib, params = relay.build(
resnet18_v1 inference graph built in 27.19s!
resnet18_v1 inference graph built in 26.26s!
Performed inference in 408.14ms (std = 2.18) for 1 samples
Average per sample inference time: 408.14ms
resnet18_v1 prediction for sample 0
#1: tiger cat
#2: Egyptian cat
#3: tabby, tabby cat
#4: lynx, catamount
#5: weasel
Steps to run the module (a small sketch follows this list):
1. Create the remote context (again for remote execution on the Pynq)
2. Format the data with tvm.nd.array
3. Run the computation with f()
4. numpy() copies the result array back into an interpretable form and returns it
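A minimal sketch of those four steps for this tutorial's matrix multiply; the remote session, the compiled function f, the placeholders A/B/C, the tile factors o/m/n, and env are assumed to exist from the earlier tutorial steps, and the random test data is only illustrative.

import numpy as np
import tvm

# 1. Create the remote context (execution happens on the Pynq)
ctx = remote.ext_dev(0)

# 2. Format input and output data with tvm.nd.array (random int8 test data)
A_np = np.random.randint(-128, 128, size=(o, n, env.BATCH, env.BLOCK_IN)).astype(A.dtype)
B_np = np.random.randint(-128, 128, size=(m, n, env.BLOCK_OUT, env.BLOCK_IN)).astype(B.dtype)
A_nd = tvm.nd.array(A_np, ctx)
B_nd = tvm.nd.array(B_np, ctx)
C_nd = tvm.nd.array(np.zeros((o, m, env.BATCH, env.BLOCK_OUT)).astype(C.dtype), ctx)

# 3. Run the computation
f(A_nd, B_nd, C_nd)

# 4. numpy() copies the result back into an ordinary numpy array
C_np = C_nd.numpy()
print(C_np.shape)   # (1, 16, 1, 16), matching the shape shown below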
The above was displayed. The multiplication is correct. Successful matrix multiply test!
C_nd.numpy().shape = (1, 16, 1, 16)
[[[[ -98 45 -126 8 101 -6 -124 82 -19 -39 47 53 -74
-76 -4 84]]
[[ 120 -69 -106 89 -121 -3 125 103 -51 61 -42 -54 127
-80 -103 -1]]
[[ 17 103 63 -8 -126 9 -79 -112 -118 17 104 -14 -69
62 -100 -38]]
[[ 125 -82 95 115 27 0 -19 21 -8 -28 35 101 -109
-27 -72 -123]]
[[ 72 -26 34 101 -64 29 -83 8 23 -62 111 52 -117
-46 50 -118]]
[[ 58 -47 -113 62 29 -78 -101 62 -116 -25 22 -84 37
-40 -65 4]]
[[-109 52 -33 -114 -118 -93 106 73 102 -122 -88 -98 64
-44 -36 -52]]
[[ 84 33 -17 -19 -71 104 67 7 -102 90 91 8 -111
4 -91 48]]
[[ -20 116 79 41 8 8 62 -71 -64 -25 -78 13 -72
-75 -88 -56]]
[[-110 11 -38 -58 -77 -34 42 69 98 -51 -95 -53 -21
75 -81 3]]
[[ 78 102 91 -73 -69 -47 -86 16 122 91 -105 -8 106
37 82 -103]]
[[ -87 84 116 -25 -64 67 -70 85 36 -3 65 59 14
26 93 -16]]
[[ 11 -37 -104 -5 43 -94 -78 -71 37 -44 -37 -103 -34
110 84 -83]]
[[ 109 81 63 65 -44 122 -77 -57 -24 -72 -4 -99 95
-26 86 46]]
[[ 92 -127 -55 -1 -46 -79 -18 114 46 64 55 -90 -83
-93 -79 -77]]
[[ -32 -31 8 21 -43 -71 50 -126 59 63 69 43 -78
112 18 116]]]]
We are now ready to describe the matrix-multiply result tensor C with another compute operation. The compute function takes the shape of the tensor and a lambda function that describes the computation rule for each position in the tensor.
To implement the matrix multiplication, the lambda function needs a reduction expression over the input-channel dimension axes. To create a reduction expression, declare the reduction axes with te.reduce_axis; te.sum takes the expression to be reduced together with the reduction axes and computes the sum of the values over all k in the declared ranges.
Note that this reduction has to be carried out over the 32-bit env.acc_dtype accumulator data type.
No computation happens in this phase; we are only declaring how the computation should be done. (A sketch of this declaration follows the printed tensor below.)
C_buf = Tensor(shape=[1, 16, 1, 16], op.name=C_buf)
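A sketch of that reduction declaration, assuming the tiled copy buffers A_buf and B_buf, the tile factors o/m/n, and env from earlier in the tutorial; treat the exact names as assumptions rather than this post's literal code.

from tvm import te

# Declare the reduction axes over the input-channel dimensions
ko = te.reduce_axis((0, n), name="ko")             # outer input-channel axis
ki = te.reduce_axis((0, env.BLOCK_IN), name="ki")  # inner input-channel axis

# Describe the matrix-multiply result; the reduction accumulates in the 32-bit
# env.acc_dtype type. Nothing is computed here - this only declares the rule.
C_buf = te.compute(
    (o, m, env.BATCH, env.BLOCK_OUT),
    lambda bo, co, bi, ci: te.sum(
        A_buf[bo, ko, bi, ki].astype(env.acc_dtype)
        * B_buf[co, ko, ci, ki].astype(env.acc_dtype),
        axis=[ko, ki],
    ),
    name="C_buf",
)
print(C_buf)   # -> Tensor(shape=[1, 16, 1, 16], op.name=C_buf), as shown above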
A schedule is a set of transformations applied to the original computation that change how the computation is implemented without affecting its correctness. This simple VTA programming tutorial aims to demonstrate the basic schedule transformations that map the original schedule down to VTA hardware primitives.
@main = primfn(A_1: handle, B_1: handle, C_1: handle) -> ()
attr = {"from_legacy_te_schedule": True, "global_symbol": "main", "tir.noalias": True}
buffers = {A: Buffer(A_2: Pointer(int8), int8, [256], []),
B: Buffer(B_2: Pointer(int8), int8, [65536], []),
C: Buffer(C_2: Pointer(int8), int8, [256], [])}
buffer_map = {A_1: A, B_1: B, C_1: C}
preflattened_buffer_map = {A_1: A_3: Buffer(A_2, int8, [1, 16, 1, 16], []), B_1: B_3: Buffer(B_2, int8, [16, 16, 16, 16], []), C_1: C_3: Buffer(C_2, int8, [1, 16, 1, 16], [])} {
allocate(A_buf: Pointer(global int8), int8, [256]), storage_scope = global;
allocate(B_buf: Pointer(global int8), int8, [65536]), storage_scope = global;
allocate(C_buf: Pointer(global int32), int32, [256]), storage_scope = global {
for (i1: int32, 0, 16) {
for (i3: int32, 0, 16) {
let cse_var_1: int32 = ((i1*16) + i3)
A_buf_1: Buffer(A_buf, int8, [256], [])[cse_var_1] = A[cse_var_1]
}
}
for (i0: int32, 0, 16) {
for (i1_1: int32, 0, 16) {
for (i2: int32, 0, 16) {
for (i3_1: int32, 0, 16) {
let cse_var_2: int32 = ((((i0*4096) + (i1_1*256)) + (i2*16)) + i3_1)
B_buf_1: Buffer(B_buf, int8, [65536], [])[cse_var_2] = B[cse_var_2]
}
}
}
}
for (co: int32, 0, 16) {
for (ci: int32, 0, 16) {
C_buf_1: Buffer(C_buf, int32, [256], [])[((co*16) + ci)] = 0
for (ko: int32, 0, 16) {
for (ki: int32, 0, 16) {
let cse_var_3: int32 = ((co*16) + ci)
C_buf_1[cse_var_3] = (C_buf_1[cse_var_3] + (cast(int32, A_buf_1[((ko*16) + ki)])*cast(int32, B_buf_1[((((co*4096) + (ko*256)) + (ci*16)) + ki)])))
}
}
}
}
for (i1_2: int32, 0, 16) {
for (i3_2: int32, 0, 16) {
let cse_var_4: int32 = ((i1_2*16) + i3_2)
C[cse_var_4] = cast(int8, C_buf_1[cse_var_4])
}
}
}
}
This schedule makes sense, but it cannot be compiled to VTA as it stands. To obtain correct code generation, we need to apply scheduling primitives and code annotations that transform the schedule into one that can be lowered directly onto VTA's hardware intrinsics. These are:
・DMA copy operations, which take a globally scoped tensor and copy it into a locally scoped tensor.
・A tensor operation that performs the matrix multiplication.
VTA's on-chip SRAMs
VTA has three memory scopes, each corresponding to a different on-chip SRAM buffer.
env.inp_scope : the input buffer, a read-only SRAM buffer that stores input matrices of shape (env.BATCH, env.BLOCK_IN) and type env.inp_dtype. The input buffer holds 2 ^ LOG_INP_BUFF_SIZE matrix elements (as specified in the vta_config.json file).
env.wgt_scope : a read-only SRAM buffer that stores weight matrices of shape (env.BLOCK_OUT, env.BLOCK_IN) and type env.wgt_dtype. The weight buffer holds 2 ^ LOG_WGT_BUFF_SIZE matrix elements.
env.acc_scope : a read/write SRAM buffer that stores accumulator matrices of shape (env.BATCH, env.BLOCK_OUT) and type env.acc_dtype. The accumulator buffer is VTA's general-purpose register file; it holds the intermediate results of convolutions and matrix multiplications as well as the intermediate results of pooling, batch normalization, and activation layers. The accumulator buffer holds 2 ^ LOG_ACC_BUFF_SIZE matrix elements.
The final step of the schedule transformation is to apply tensorization to the schedule. Tensorization is analogous to vectorization, but extends the concept to a higher-dimensional compute unit. As a consequence, tensorization imposes data-layout constraints, as discussed when declaring the data-layout input placeholders. Since we have already arranged the tensors in a tiled layout, the next thing to do is reorder the loops to accommodate tensorization.
Here we chose to move the outermost reduction axis all the way out, so we iterate first over the input channels, then the batch dimension, and finally the output channels. Lastly, we apply the tensorize scheduling primitive along the outer axis of the innermost matrix-multiply tensor block. The final schedule is printed, ready for code generation by the VTA runtime JIT compiler. (A sketch of these scheduling calls appears next, followed by the printed schedule.)
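For reference, a sketch of those scheduling steps (SRAM scopes, DMA-copy pragmas, the loop reorder, and tensorize), assuming the schedule s, the buffers A_buf/B_buf/C_buf, the output C, and the reduction axes ko/ki declared earlier in the tutorial:

# Move the copy buffers into VTA's on-chip SRAM scopes
s[A_buf].set_scope(env.inp_scope)   # input buffer SRAM
s[B_buf].set_scope(env.wgt_scope)   # weight buffer SRAM
s[C_buf].set_scope(env.acc_scope)   # accumulator SRAM (register file)

# Compute the SRAM copies inside the outer reduction loop and tag them,
# along with the result store, as DMA transfers
s[A_buf].compute_at(s[C_buf], ko)
s[B_buf].compute_at(s[C_buf], ko)
s[A_buf].pragma(s[A_buf].op.axis[0], env.dma_copy)
s[B_buf].pragma(s[B_buf].op.axis[0], env.dma_copy)
s[C].pragma(s[C].op.axis[0], env.dma_copy)

# Move the outer reduction axis all the way out, then tensorize the innermost
# matrix-multiply block onto VTA's GEMM intrinsic
s[C_buf].reorder(ko,
                 s[C_buf].op.axis[0], s[C_buf].op.axis[1],
                 s[C_buf].op.axis[2], s[C_buf].op.axis[3],
                 ki)
s[C_buf].tensorize(s[C_buf].op.axis[2], env.gemm)

print(vta.lower(s, [A, B, C], simple_mode=True))   # inspect the transformed schedule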
@main = primfn(A_1: handle, B_1: handle, C_1: handle) -> ()
attr = {"from_legacy_te_schedule": True, "global_symbol": "main", "tir.noalias": True}
buffers = {A: Buffer(A_2: Pointer(int8), int8, [256], []),
B: Buffer(B_2: Pointer(int8), int8, [65536], []),
C: Buffer(C_2: Pointer(int8), int8, [256], [])}
buffer_map = {A_1: A, B_1: B, C_1: C}
preflattened_buffer_map = {A_1: A_3: Buffer(A_2, int8, [1, 16, 1, 16], []), B_1: B_3: Buffer(B_2, int8, [16, 16, 16, 16], []), C_1: C_3: Buffer(C_2, int8, [1, 16, 1, 16], [])} {
allocate(C_buf: Pointer(local.acc_buffer int32), int32, [256]), storage_scope = local.acc_buffer;
allocate(A_buf: Pointer(local.inp_buffer int8), int8, [16]), storage_scope = local.inp_buffer;
allocate(B_buf: Pointer(local.wgt_buffer int8), int8, [16]), storage_scope = local.wgt_buffer {
for (co: int32, 0, 16) {
for (ci: int32, 0, 16) {
C_buf_1: Buffer(C_buf, int32, [256], [], scope="local.acc_buffer", align=16)[((co*16) + ci)] = 0
for (ko: int32, 0, 16) {
attr [IterVar(i0: int32, (nullptr), "DataPar", "")] "pragma_dma_copy" = 1;
for (i3: int32, 0, 16) {
A_buf_1: Buffer(A_buf, int8, [16], [], scope="local.inp_buffer", align=16)[i3] = A[((ko*16) + i3)]
}
attr [IterVar(i0_1: int32, (nullptr), "DataPar", "")] "pragma_dma_copy" = 1;
for (i3_1: int32, 0, 16) {
B_buf_1: Buffer(B_buf, int8, [16], [], scope="local.wgt_buffer", align=256)[i3_1] = B[((((co*4096) + (ko*256)) + (ci*16)) + i3_1)]
}
for (ki: int32, 0, 16) {
let cse_var_1: int32 = ((co*16) + ci)
C_buf_1[cse_var_1] = (C_buf_1[cse_var_1] + (cast(int32, A_buf_1[ki])*cast(int32, B_buf_1[ki])))
}
}
}
}
attr [IterVar(i0_2: int32, (nullptr), "DataPar", "")] "pragma_dma_copy" = 1;
for (i1: int32, 0, 16) {
for (i3_2: int32, 0, 16) {
let cse_var_2: int32 = ((i1*16) + i3_2)
C[cse_var_2] = cast(int8, C_buf_1[cse_var_2])
}
}
}
}
One source of complexity when targeting accelerators is making sure that the data layout matches the layout imposed by the accelerator design. VTA is designed around a tensor core that, as shown in the figure below, performs one matrix-matrix operation per cycle between an activation matrix and a weight matrix, adding the resulting matrix into an accumulator matrix.
The dimensions of that matrix-matrix multiplication are specified in the configuration file vta_config.json. The activation matrix has shape (BATCH, BLOCK_IN) and the transposed weight matrix has shape (BLOCK_OUT, BLOCK_IN); the resulting output matrix therefore has shape (BATCH, BLOCK_OUT). Consequently, the input and output tensors processed by VTA need to be tiled according to these dimensions.
The figure below shows the effect of data tiling on a matrix that originally has shape (4, 8). Tiling it with a (2, 2) tile shape guarantees that the data within each tile is contiguous. The resulting tiled tensor has shape (2, 4, 2, 2).
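To make the tiling concrete, here is a small numpy illustration (my own, not part of the tutorial code) of reshaping a (4, 8) matrix into (2, 4, 2, 2) tiles:

import numpy as np

x = np.arange(4 * 8).reshape(4, 8)   # original matrix of shape (4, 8)

# Split rows and columns into (2, 2) tiles: (4, 8) -> (2, 2, 4, 2) -> (2, 4, 2, 2).
# After the copy, the elements of each (2, 2) tile are contiguous in memory.
tiled = x.reshape(2, 2, 4, 2).transpose(0, 2, 1, 3).copy()

print(tiled.shape)   # (2, 4, 2, 2)
print(tiled[0, 0])   # the top-left (2, 2) tile: [[0, 1], [8, 9]]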
env.BATCH = 1
env.BLOCK_OUT = 16
env.BLOCK_IN = 16
A = Tensor(shape=[1, 16, 1, 16], op.name=A)
B = Tensor(shape=[16, 16, 16, 16], op.name=B)
A_buf = Tensor(shape=[1, 16, 1, 16], op.name=A_buf)
B_buf = Tensor(shape=[16, 16, 16, 16], op.name=B_buf)
C_buf = Tensor(shape=[1, 16, 1, 16], op.name=C_buf)
(base) masaaki@masaaki-H110M4-M01:/media/masaaki/Ubuntu_Disk/DNN/tvm$ python3 vta/tests/python/integration/matrix_multiply.py
@main = primfn(A_1: handle, B_1: handle, C_1: handle) -> ()
attr = {"from_legacy_te_schedule": True, "global_symbol": "main", "tir.noalias": True}
buffers = {A: Buffer(A_2: Pointer(int8), int8, [256], []),
B: Buffer(B_2: Pointer(int8), int8, [65536], []),
C: Buffer(C_2: Pointer(int8), int8, [256], [])}
buffer_map = {A_1: A, B_1: B, C_1: C}
preflattened_buffer_map = {A_1: A_3: Buffer(A_2, int8, [1, 16, 1, 16], []), B_1: B_3: Buffer(B_2, int8, [16, 16, 16, 16], []), C_1: C_3: Buffer(C_2, int8, [1, 16, 1, 16], [])} {
allocate(A_buf: Pointer(global int8), int8, [256]), storage_scope = global;
allocate(B_buf: Pointer(global int8), int8, [65536]), storage_scope = global;
allocate(C_buf: Pointer(global int32), int32, [256]), storage_scope = global {
for (i1: int32, 0, 16) {
for (i3: int32, 0, 16) {
let cse_var_1: int32 = ((i1*16) + i3)
A_buf_1: Buffer(A_buf, int8, [256], [])[cse_var_1] = A[cse_var_1]
}
}
for (i0: int32, 0, 16) {
for (i1_1: int32, 0, 16) {
for (i2: int32, 0, 16) {
for (i3_1: int32, 0, 16) {
let cse_var_2: int32 = ((((i0*4096) + (i1_1*256)) + (i2*16)) + i3_1)
B_buf_1: Buffer(B_buf, int8, [65536], [])[cse_var_2] = B[cse_var_2]
}
}
}
}
for (co: int32, 0, 16) {
for (ci: int32, 0, 16) {
C_buf_1: Buffer(C_buf, int32, [256], [])[((co*16) + ci)] = 0
for (ko: int32, 0, 16) {
for (ki: int32, 0, 16) {
let cse_var_3: int32 = ((co*16) + ci)
C_buf_1[cse_var_3] = (C_buf_1[cse_var_3] + (cast(int32, A_buf_1[((ko*16) + ki)])*cast(int32, B_buf_1[((((co*4096) + (ko*256)) + (ci*16)) + ki)])))
}
}
}
}
for (i1_2: int32, 0, 16) {
for (i3_2: int32, 0, 16) {
let cse_var_4: int32 = ((i1_2*16) + i3_2)
C[cse_var_4] = cast(int8, C_buf_1[cse_var_4])
}
}
}
}
@main = primfn(A_1: handle, B_1: handle, C_1: handle) -> ()
attr = {"from_legacy_te_schedule": True, "global_symbol": "main", "tir.noalias": True}
buffers = {A: Buffer(A_2: Pointer(int8), int8, [256], []),
B: Buffer(B_2: Pointer(int8), int8, [65536], []),
C: Buffer(C_2: Pointer(int8), int8, [256], [])}
buffer_map = {A_1: A, B_1: B, C_1: C}
preflattened_buffer_map = {A_1: A_3: Buffer(A_2, int8, [1, 16, 1, 16], []), B_1: B_3: Buffer(B_2, int8, [16, 16, 16, 16], []), C_1: C_3: Buffer(C_2, int8, [1, 16, 1, 16], [])} {
allocate(C_buf: Pointer(local.acc_buffer int32), int32, [256]), storage_scope = local.acc_buffer;
allocate(A_buf: Pointer(local.inp_buffer int8), int8, [16]), storage_scope = local.inp_buffer;
allocate(B_buf: Pointer(local.wgt_buffer int8), int8, [16]), storage_scope = local.wgt_buffer {
for (co: int32, 0, 16) {
for (ci: int32, 0, 16) {
C_buf_1: Buffer(C_buf, int32, [256], [], scope="local.acc_buffer", align=16)[((co*16) + ci)] = 0
for (ko: int32, 0, 16) {
attr [IterVar(i0: int32, (nullptr), "DataPar", "")] "pragma_dma_copy" = 1;
for (i3: int32, 0, 16) {
A_buf_1: Buffer(A_buf, int8, [16], [], scope="local.inp_buffer", align=16)[i3] = A[((ko*16) + i3)]
}
attr [IterVar(i0_1: int32, (nullptr), "DataPar", "")] "pragma_dma_copy" = 1;
for (i3_1: int32, 0, 16) {
B_buf_1: Buffer(B_buf, int8, [16], [], scope="local.wgt_buffer", align=256)[i3_1] = B[((((co*4096) + (ko*256)) + (ci*16)) + i3_1)]
}
for (ki: int32, 0, 16) {
let cse_var_1: int32 = ((co*16) + ci)
C_buf_1[cse_var_1] = (C_buf_1[cse_var_1] + (cast(int32, A_buf_1[ki])*cast(int32, B_buf_1[ki])))
}
}
}
}
attr [IterVar(i0_2: int32, (nullptr), "DataPar", "")] "pragma_dma_copy" = 1;
for (i1: int32, 0, 16) {
for (i3_2: int32, 0, 16) {
let cse_var_2: int32 = ((i1*16) + i3_2)
C[cse_var_2] = cast(int8, C_buf_1[cse_var_2])
}
}
}
}
[04:36:57] /media/masaaki/Ubuntu_Disk/DNN/tvm/src/tir/transforms/arg_binder.cc:95: Warning: Trying to bind buffer to another one with lower alignment requirement required_alignment=256, provided_alignment=128
@main = primfn(A_1: handle, B_1: handle, C_1: handle) -> ()
attr = {"from_legacy_te_schedule": True, "global_symbol": "main", "tir.noalias": True}
buffers = {A: Buffer(A_2: Pointer(int8), int8, [256], []),
B: Buffer(B_2: Pointer(int8), int8, [65536], []),
C: Buffer(C_2: Pointer(int8), int8, [256], [])}
buffer_map = {A_1: A, B_1: B, C_1: C}
preflattened_buffer_map = {A_1: A_3: Buffer(A_2, int8, [1, 16, 1, 16], []), B_1: B_3: Buffer(B_2, int8, [16, 16, 16, 16], []), C_1: C_3: Buffer(C_2, int8, [1, 16, 1, 16], [])} {
attr [IterVar(vta: int32, (nullptr), "ThreadIndex", "vta")] "coproc_scope" = 2 {
attr [IterVar(vta, (nullptr), "ThreadIndex", "vta")] "coproc_uop_scope" = "VTAPushGEMMOp" {
@tir.call_extern("VTAUopLoopBegin", 16, 1, 0, 0, dtype=int32)
@tir.vta.uop_push(0, 1, 0, 0, 0, 0, 0, 0, dtype=int32)
@tir.call_extern("VTAUopLoopEnd", dtype=int32)
}
@tir.vta.coproc_dep_push(2, 1, dtype=int32)
}
for (ko: int32, 0, 16) {
attr [IterVar(vta, (nullptr), "ThreadIndex", "vta")] "coproc_scope" = 1 {
@tir.vta.coproc_dep_pop(2, 1, dtype=int32)
@tir.call_extern("VTALoadBuffer2D", @tir.tvm_thread_context(@tir.vta.command_handle(, dtype=handle), dtype=handle), A_2, ko, 1, 1, 1, 0, 0, 0, 0, 0, 2, dtype=int32)
@tir.call_extern("VTALoadBuffer2D", @tir.tvm_thread_context(@tir.vta.command_handle(, dtype=handle), dtype=handle), B_2, ko, 1, 16, 16, 0, 0, 0, 0, 0, 1, dtype=int32)
@tir.vta.coproc_dep_push(1, 2, dtype=int32)
}
attr [IterVar(vta, (nullptr), "ThreadIndex", "vta")] "coproc_scope" = 2 {
@tir.vta.coproc_dep_pop(1, 2, dtype=int32)
attr [IterVar(vta, (nullptr), "ThreadIndex", "vta")] "coproc_uop_scope" = "VTAPushGEMMOp" {
@tir.call_extern("VTAUopLoopBegin", 16, 1, 0, 1, dtype=int32)
@tir.vta.uop_push(0, 0, 0, 0, 0, 0, 0, 0, dtype=int32)
@tir.call_extern("VTAUopLoopEnd", dtype=int32)
}
@tir.vta.coproc_dep_push(2, 1, dtype=int32)
}
}
@tir.vta.coproc_dep_push(2, 3, dtype=int32)
@tir.vta.coproc_dep_pop(2, 1, dtype=int32)
attr [IterVar(vta, (nullptr), "ThreadIndex", "vta")] "coproc_scope" = 3 {
@tir.vta.coproc_dep_pop(2, 3, dtype=int32)
@tir.call_extern("VTAStoreBuffer2D", @tir.tvm_thread_context(@tir.vta.command_handle(, dtype=handle), dtype=handle), 0, 4, C_2, 0, 16, 1, 16, dtype=int32)
}
@tir.vta.coproc_sync(, dtype=int32)
}
[04:36:57] /media/masaaki/Ubuntu_Disk/DNN/tvm/src/tir/transforms/arg_binder.cc:95: Warning: Trying to bind buffer to another one with lower alignment requirement required_alignment=256, provided_alignment=128
/media/masaaki/Ubuntu_Disk/DNN/tvm/python/tvm/driver/build_module.py:263: UserWarning: target_host parameter is going to be deprecated. Please pass in tvm.target.Target(target, host=target_host) instead.
warnings.warn(
Successful matrix multiply test!
xilinx@pynq:~/tvm$ sudo ./apps/vta_rpc/start_rpc_server.sh
[sudo] password for xilinx:
INFO:RPCServer:bind to 0.0.0.0:9091
INFO:RPCServer:connection from ('192.168.3.10', 56482)
INFO:root:Program FPGA with 1x16_i8w8a32_15_15_18_17.bit
INFO:RPCServer:Finish serving ('192.168.3.10', 56482)
INFO:RPCServer:connection from ('192.168.3.10', 56484)
INFO:root:Skip reconfig_runtime due to same config.
INFO:RPCServer:Finish serving ('192.168.3.10', 56484)
INFO:RPCServer:connection from ('192.168.3.10', 56486)
INFO:root:Skip reconfig_runtime due to same config.
INFO:root:Program FPGA with 1x16_i8w8a32_15_15_18_17.bit
INFO:root:Loading VTA library: /home/xilinx/tvm/vta/python/vta/../../../build/libvta.so
INFO:RPCServer:load_module /tmp/tmpow_b_kzg/gemm.o
INFO:RPCServer:Finish serving ('192.168.3.10', 56486)
The steps to run the module are:
1. Create a remote context (for remote execution, including on the Pynq)
2. Format the data with tvm.nd.array
3. Execute the computation with f()
4. numpy() copies the result array back into an interpretable format and returns it
A minimal sketch of this flow is shown after the list.
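The sketch below follows the four steps above. The names remote, f, A_packed and B_packed, and the (1, 64, 1, 16) shape, are assumptions based on the vta_get_started.py run shown in this post; they are not code copied from it.

import numpy as np
import tvm

# 1. create a remote device context on the Pynq (remote is an existing RPC session)
ctx = remote.ext_dev(0)
# 2. format the input/output data with tvm.nd.array
A_nd = tvm.nd.array(A_packed.astype("int32"), ctx)
B_nd = tvm.nd.array(B_packed.astype("int32"), ctx)
C_nd = tvm.nd.array(np.zeros((1, 64, 1, 16), dtype="int8"), ctx)
# 3. execute the computation with the loaded function f
f(A_nd, B_nd, C_nd)
# 4. numpy() copies the result back into a NumPy array we can inspect
print(C_nd.numpy().shape)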
A_orig.shape = (1, 1024)
A_packed.shape = (1, 64, 1, 16)
"Successful vector add test!" was output.
C_nd.numpy().shape = (1, 64, 1, 16)
[[[[ 26 -75 -59 ... -76 118 99]]
[[ 22 -98 31 ... -28 66 -50]]
[[ 8 23 82 ... -5 -4 120]]
...
[[ 36 83 8 ... -47 45 -105]]
[[ 63 -28 94 ... 69 119 113]]
[[-116 -31 -124 ... -110 -58 -57]]]]
(base) masaaki@masaaki-H110M4-M01:/media/masaaki/Ubuntu_Disk/DNN/tvm$ python3 vta/tests/python/integration/vta_get_started.py
A.shape = [1, 64, 1, 16]
A = Tensor(shape=[1, 64, 1, 16], op.name=A)
B = Tensor(shape=[1, 64, 1, 16], op.name=B)
env.BATCH = 1
env.BLOCK_OUT = 16
A_buf = Tensor(shape=[1, 64, 1, 16], op.name=A_buf)
B_buf = Tensor(shape=[1, 64, 1, 16], op.name=B_buf)
C_buf = Tensor(shape=[1, 64, 1, 16], op.name=C_buf)
C = Tensor(shape=[1, 64, 1, 16], op.name=C)
@main = primfn(A_1: handle, B_1: handle, C_1: handle) -> ()
attr = {"from_legacy_te_schedule": True, "global_symbol": "main", "tir.noalias": True}
buffers = {A: Buffer(A_2: Pointer(int32), int32, [1024], []),
B: Buffer(B_2: Pointer(int32), int32, [1024], []),
C: Buffer(C_2: Pointer(int8), int8, [1024], [])}
buffer_map = {A_1: A, B_1: B, C_1: C}
preflattened_buffer_map = {A_1: A_3: Buffer(A_2, int32, [1, 64, 1, 16], []), B_1: B_3: Buffer(B_2, int32, [1, 64, 1, 16], []), C_1: C_3: Buffer(C_2, int8, [1, 64, 1, 16], [])} {
allocate(A_buf: Pointer(global int32), int32, [1024]), storage_scope = global;
allocate(B_buf: Pointer(global int32), int32, [1024]), storage_scope = global {
for (i1: int32, 0, 64) {
for (i3: int32, 0, 16) {
let cse_var_1: int32 = ((i1*16) + i3)
A_buf_1: Buffer(A_buf, int32, [1024], [])[cse_var_1] = A[cse_var_1]
}
}
for (i1_1: int32, 0, 64) {
for (i3_1: int32, 0, 16) {
let cse_var_2: int32 = ((i1_1*16) + i3_1)
B_buf_1: Buffer(B_buf, int32, [1024], [])[cse_var_2] = B[cse_var_2]
}
}
for (i1_2: int32, 0, 64) {
for (i3_2: int32, 0, 16) {
let cse_var_3: int32 = ((i1_2*16) + i3_2)
A_buf_2: Buffer(A_buf, int32, [1024], [])[cse_var_3] = (A_buf_1[cse_var_3] + B_buf_1[cse_var_3])
}
}
for (i1_3: int32, 0, 64) {
for (i3_3: int32, 0, 16) {
let cse_var_4: int32 = ((i1_3*16) + i3_3)
C[cse_var_4] = cast(int8, A_buf_2[cse_var_4])
}
}
}
}
@main = primfn(A_1: handle, B_1: handle, C_1: handle) -> ()
attr = {"from_legacy_te_schedule": True, "global_symbol": "main", "tir.noalias": True}
buffers = {A: Buffer(A_2: Pointer(int32), int32, [1024], []),
B: Buffer(B_2: Pointer(int32), int32, [1024], []),
C: Buffer(C_2: Pointer(int8), int8, [1024], [])}
buffer_map = {A_1: A, B_1: B, C_1: C}
preflattened_buffer_map = {A_1: A_3: Buffer(A_2, int32, [1, 64, 1, 16], []), B_1: B_3: Buffer(B_2, int32, [1, 64, 1, 16], []), C_1: C_3: Buffer(C_2, int8, [1, 64, 1, 16], [])} {
attr [IterVar(vta: int32, (nullptr), "ThreadIndex", "vta")] "coproc_scope" = 2 {
@tir.call_extern("VTALoadBuffer2D", @tir.tvm_thread_context(@tir.vta.command_handle(, dtype=handle), dtype=handle), A_2, 0, 64, 1, 64, 0, 0, 0, 0, 0, 3, dtype=int32)
@tir.call_extern("VTALoadBuffer2D", @tir.tvm_thread_context(@tir.vta.command_handle(, dtype=handle), dtype=handle), B_2, 0, 64, 1, 64, 0, 0, 0, 0, 64, 3, dtype=int32)
attr [IterVar(vta, (nullptr), "ThreadIndex", "vta")] "coproc_uop_scope" = "VTAPushALUOp" {
@tir.call_extern("VTAUopLoopBegin", 64, 1, 1, 0, dtype=int32)
@tir.vta.uop_push(1, 0, 0, 64, 0, 2, 0, 0, dtype=int32)
@tir.call_extern("VTAUopLoopEnd", dtype=int32)
}
@tir.vta.coproc_dep_push(2, 3, dtype=int32)
}
attr [IterVar(vta, (nullptr), "ThreadIndex", "vta")] "coproc_scope" = 3 {
@tir.vta.coproc_dep_pop(2, 3, dtype=int32)
@tir.call_extern("VTAStoreBuffer2D", @tir.tvm_thread_context(@tir.vta.command_handle(, dtype=handle), dtype=handle), 0, 4, C_2, 0, 64, 1, 64, dtype=int32)
}
@tir.vta.coproc_sync(, dtype=int32)
}
/media/masaaki/Ubuntu_Disk/DNN/tvm/python/tvm/driver/build_module.py:263: UserWarning: target_host parameter is going to be deprecated. Please pass in tvm.target.Target(target, host=target_host) instead.
warnings.warn(
A_orig.shape = (1, 1024)
A_packed.shape = (1, 64, 1, 16)
Successful vector add test!
C_nd.numpy().shape = (1, 64, 1, 16)
[[[[ 26 -75 -59 ... -76 118 99]]
[[ 22 -98 31 ... -28 66 -50]]
[[ 8 23 82 ... -5 -4 120]]
...
[[ 36 83 8 ... -47 45 -105]]
[[ 63 -28 94 ... 69 119 113]]
[[-116 -31 -124 ... -110 -58 -57]]]]
C = Tensor(shape=[1, 64, 1, 16], op.name=C)
@main = primfn(A_1: handle, B_1: handle, C_1: handle) -> ()
attr = {"from_legacy_te_schedule": True, "global_symbol": "main", "tir.noalias": True}
buffers = {A: Buffer(A_2: Pointer(int32), int32, [1024], []),
B: Buffer(B_2: Pointer(int32), int32, [1024], []),
C: Buffer(C_2: Pointer(int8), int8, [1024], [])}
buffer_map = {A_1: A, B_1: B, C_1: C}
preflattened_buffer_map = {A_1: A_3: Buffer(A_2, int32, [1, 64, 1, 16], []), B_1: B_3: Buffer(B_2, int32, [1, 64, 1, 16], []), C_1: C_3: Buffer(C_2, int8, [1, 64, 1, 16], [])} {
allocate(A_buf: Pointer(global int32), int32, [1024]), storage_scope = global;
allocate(B_buf: Pointer(global int32), int32, [1024]), storage_scope = global {
for (i1: int32, 0, 64) {
for (i3: int32, 0, 16) {
let cse_var_1: int32 = ((i1*16) + i3)
A_buf_1: Buffer(A_buf, int32, [1024], [])[cse_var_1] = A[cse_var_1]
}
}
for (i1_1: int32, 0, 64) {
for (i3_1: int32, 0, 16) {
let cse_var_2: int32 = ((i1_1*16) + i3_1)
B_buf_1: Buffer(B_buf, int32, [1024], [])[cse_var_2] = B[cse_var_2]
}
}
for (i1_2: int32, 0, 64) {
for (i3_2: int32, 0, 16) {
let cse_var_3: int32 = ((i1_2*16) + i3_2)
A_buf_2: Buffer(A_buf, int32, [1024], [])[cse_var_3] = (A_buf_1[cse_var_3] + B_buf_1[cse_var_3])
}
}
for (i1_3: int32, 0, 64) {
for (i3_3: int32, 0, 16) {
let cse_var_4: int32 = ((i1_3*16) + i3_3)
C[cse_var_4] = cast(int8, A_buf_2[cse_var_4])
}
}
}
}
@main = primfn(A_1: handle, B_1: handle, C_1: handle) -> ()
attr = {"from_legacy_te_schedule": True, "global_symbol": "main", "tir.noalias": True}
buffers = {A: Buffer(A_2: Pointer(int32), int32, [1024], []),
B: Buffer(B_2: Pointer(int32), int32, [1024], []),
C: Buffer(C_2: Pointer(int8), int8, [1024], [])}
buffer_map = {A_1: A, B_1: B, C_1: C}
preflattened_buffer_map = {A_1: A_3: Buffer(A_2, int32, [1, 64, 1, 16], []), B_1: B_3: Buffer(B_2, int32, [1, 64, 1, 16], []), C_1: C_3: Buffer(C_2, int8, [1, 64, 1, 16], [])} {
attr [IterVar(vta: int32, (nullptr), "ThreadIndex", "vta")] "coproc_scope" = 2 {
@tir.call_extern("VTALoadBuffer2D", @tir.tvm_thread_context(@tir.vta.command_handle(, dtype=handle), dtype=handle), A_2, 0, 64, 1, 64, 0, 0, 0, 0, 0, 3, dtype=int32)
@tir.call_extern("VTALoadBuffer2D", @tir.tvm_thread_context(@tir.vta.command_handle(, dtype=handle), dtype=handle), B_2, 0, 64, 1, 64, 0, 0, 0, 0, 64, 3, dtype=int32)
attr [IterVar(vta, (nullptr), "ThreadIndex", "vta")] "coproc_uop_scope" = "VTAPushALUOp" {
@tir.call_extern("VTAUopLoopBegin", 64, 1, 1, 0, dtype=int32)
@tir.vta.uop_push(1, 0, 0, 64, 0, 2, 0, 0, dtype=int32)
@tir.call_extern("VTAUopLoopEnd", dtype=int32)
}
@tir.vta.coproc_dep_push(2, 3, dtype=int32)
}
attr [IterVar(vta, (nullptr), "ThreadIndex", "vta")] "coproc_scope" = 3 {
@tir.vta.coproc_dep_pop(2, 3, dtype=int32)
@tir.call_extern("VTAStoreBuffer2D", @tir.tvm_thread_context(@tir.vta.command_handle(, dtype=handle), dtype=handle), 0, 4, C_2, 0, 64, 1, 64, dtype=int32)
}
@tir.vta.coproc_sync(, dtype=int32)
}
A.shape = [1, 64, 1, 16]
A = Tensor(shape=[1, 64, 1, 16], op.name=A)
B = Tensor(shape=[1, 64, 1, 16], op.name=B)
env.BATCH = 1
env.BLOCK_OUT = 16
A_buf = Tensor(shape=[1, 64, 1, 16], op.name=A_buf)
B_buf = Tensor(shape=[1, 64, 1, 16], op.name=B_buf)
Now we are ready to describe the vector-add result tensor C with another compute operation. The compute function takes the tensor shape and a lambda function that describes the computation rule for each position of the tensor.
At this stage we are only declaring how the computation should be performed; no computation actually takes place. A sketch of such a declaration follows.
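This sketch is my paraphrase of what vta_get_started.py does rather than a verbatim quote; the shape factors o = 1 and m = 64 and the buffers A_buf and B_buf are assumed from earlier in that script.

from tvm import te
import vta

env = vta.get_env()
o, m = 1, 64  # tiled output shape factors, matching (1, 64, 1, 16) below
# Declare C_buf element-wise from A_buf and B_buf; only the rule for each
# output position is recorded here, nothing is computed yet.
C_buf = te.compute(
    (o, m, env.BATCH, env.BLOCK_OUT),
    lambda *i: A_buf(*i).astype(env.acc_dtype) + B_buf(*i).astype(env.acc_dtype),
    name="C_buf",
)

Printing this C_buf is what produces the Tensor(shape=[1, 64, 1, 16], op.name=C_buf) line below.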
C_buf = Tensor(shape=[1, 64, 1, 16], op.name=C_buf)
(base) masaaki@masaaki-H110M4-M01:/media/masaaki/Ubuntu_Disk/DNN/tvm$ python3 vta/tests/python/integration/test_benchmark_topi_group_conv2d.py
GroupConv2DWorkload(batch=1, height=112, width=112, in_filter=32, out_filter=32, groups=2, hkernel=3, wkernel=3, hpad=1, wpad=1, hstride=1, wstride=1)
/media/masaaki/Ubuntu_Disk/DNN/tvm/python/tvm/driver/build_module.py:263: UserWarning: target_host parameter is going to be deprecated. Please pass in tvm.target.Target(target, host=target_host) instead.
warnings.warn(
CPU GROUP CONV2D TEST PASSED: Time cost = 0.285452 sec/op, 0.404991 GOPS
GroupConv2DWorkload(batch=1, height=112, width=112, in_filter=64, out_filter=64, groups=4, hkernel=3, wkernel=3, hpad=1, wpad=1, hstride=2, wstride=2)
CPU GROUP CONV2D TEST PASSED: Time cost = 0.160443 sec/op, 0.36027 GOPS
GroupConv2DWorkload(batch=1, height=56, width=56, in_filter=128, out_filter=128, groups=8, hkernel=3, wkernel=3, hpad=1, wpad=1, hstride=1, wstride=1)
CPU GROUP CONV2D TEST PASSED: Time cost = 0.283096 sec/op, 0.408361 GOPS
GroupConv2DWorkload(batch=1, height=56, width=56, in_filter=128, out_filter=128, groups=8, hkernel=3, wkernel=3, hpad=1, wpad=1, hstride=2, wstride=2)
CPU GROUP CONV2D TEST PASSED: Time cost = 0.077123 sec/op, 0.374744 GOPS
GroupConv2DWorkload(batch=1, height=28, width=28, in_filter=256, out_filter=256, groups=16, hkernel=3, wkernel=3, hpad=1, wpad=1, hstride=1, wstride=1)
CPU GROUP CONV2D TEST PASSED: Time cost = 0.128987 sec/op, 0.448128 GOPS
GroupConv2DWorkload(batch=1, height=28, width=28, in_filter=256, out_filter=256, groups=16, hkernel=3, wkernel=3, hpad=1, wpad=1, hstride=2, wstride=2)
CPU GROUP CONV2D TEST PASSED: Time cost = 0.0342869 sec/op, 0.421464 GOPS
GroupConv2DWorkload(batch=1, height=14, width=14, in_filter=512, out_filter=512, groups=32, hkernel=3, wkernel=3, hpad=1, wpad=1, hstride=1, wstride=1)
CPU GROUP CONV2D TEST PASSED: Time cost = 0.0639032 sec/op, 0.452268 GOPS
GroupConv2DWorkload(batch=1, height=14, width=14, in_filter=512, out_filter=512, groups=32, hkernel=3, wkernel=3, hpad=1, wpad=1, hstride=2, wstride=2)
CPU GROUP CONV2D TEST PASSED: Time cost = 0.0164063 sec/op, 0.440401 GOPS
GroupConv2DWorkload(batch=1, height=7, width=7, in_filter=1024, out_filter=1024, groups=64, hkernel=3, wkernel=3, hpad=1, wpad=1, hstride=1, wstride=1)
CPU GROUP CONV2D TEST PASSED: Time cost = 0.0320891 sec/op, 0.450331 GOPS
GroupConv2DWorkload(batch=1, height=112, width=112, in_filter=32, out_filter=32, groups=2, hkernel=3, wkernel=3, hpad=1, wpad=1, hstride=1, wstride=1)
[04:00:30] /media/masaaki/Ubuntu_Disk/DNN/tvm/src/tir/transforms/arg_binder.cc:95: Warning: Trying to bind buffer to another one with lower alignment requirement required_alignment=256, provided_alignment=128
VTA GROUP CONV2D TEST PASSED: Time cost = 0.00531863 sec/op, 21.7359 GOPS
GroupConv2DWorkload(batch=1, height=112, width=112, in_filter=64, out_filter=64, groups=4, hkernel=3, wkernel=3, hpad=1, wpad=1, hstride=2, wstride=2)
[04:00:31] /media/masaaki/Ubuntu_Disk/DNN/tvm/src/tir/transforms/arg_binder.cc:95: Warning: Trying to bind buffer to another one with lower alignment requirement required_alignment=256, provided_alignment=128
VTA GROUP CONV2D TEST PASSED: Time cost = 0.00412703 sec/op, 14.0059 GOPS
GroupConv2DWorkload(batch=1, height=56, width=56, in_filter=128, out_filter=128, groups=8, hkernel=3, wkernel=3, hpad=1, wpad=1, hstride=1, wstride=1)
[04:00:32] /media/masaaki/Ubuntu_Disk/DNN/tvm/src/tir/transforms/arg_binder.cc:95: Warning: Trying to bind buffer to another one with lower alignment requirement required_alignment=256, provided_alignment=128
VTA GROUP CONV2D TEST PASSED: Time cost = 0.00527315 sec/op, 21.9234 GOPS
GroupConv2DWorkload(batch=1, height=56, width=56, in_filter=128, out_filter=128, groups=8, hkernel=3, wkernel=3, hpad=1, wpad=1, hstride=2, wstride=2)
[04:00:33] /media/masaaki/Ubuntu_Disk/DNN/tvm/src/tir/transforms/arg_binder.cc:95: Warning: Trying to bind buffer to another one with lower alignment requirement required_alignment=256, provided_alignment=128
VTA GROUP CONV2D TEST PASSED: Time cost = 0.00197861 sec/op, 14.6069 GOPS
GroupConv2DWorkload(batch=1, height=28, width=28, in_filter=256, out_filter=256, groups=16, hkernel=3, wkernel=3, hpad=1, wpad=1, hstride=1, wstride=1)
[04:00:34] /media/masaaki/Ubuntu_Disk/DNN/tvm/src/tir/transforms/arg_binder.cc:95: Warning: Trying to bind buffer to another one with lower alignment requirement required_alignment=256, provided_alignment=128
VTA GROUP CONV2D TEST PASSED: Time cost = 0.00267508 sec/op, 21.6079 GOPS
GroupConv2DWorkload(batch=1, height=28, width=28, in_filter=256, out_filter=256, groups=16, hkernel=3, wkernel=3, hpad=1, wpad=1, hstride=2, wstride=2)
[04:00:35] /media/masaaki/Ubuntu_Disk/DNN/tvm/src/tir/transforms/arg_binder.cc:95: Warning: Trying to bind buffer to another one with lower alignment requirement required_alignment=256, provided_alignment=128
VTA GROUP CONV2D TEST PASSED: Time cost = 0.00104579 sec/op, 13.818 GOPS
GroupConv2DWorkload(batch=1, height=14, width=14, in_filter=512, out_filter=512, groups=32, hkernel=3, wkernel=3, hpad=1, wpad=1, hstride=1, wstride=1)
[04:00:36] /media/masaaki/Ubuntu_Disk/DNN/tvm/src/tir/transforms/arg_binder.cc:95: Warning: Trying to bind buffer to another one with lower alignment requirement required_alignment=256, provided_alignment=128
VTA GROUP CONV2D TEST PASSED: Time cost = 0.00162464 sec/op, 17.7894 GOPS
GroupConv2DWorkload(batch=1, height=14, width=14, in_filter=512, out_filter=512, groups=32, hkernel=3, wkernel=3, hpad=1, wpad=1, hstride=2, wstride=2)
[04:00:36] /media/masaaki/Ubuntu_Disk/DNN/tvm/src/tir/transforms/arg_binder.cc:95: Warning: Trying to bind buffer to another one with lower alignment requirement required_alignment=256, provided_alignment=128
VTA GROUP CONV2D TEST PASSED: Time cost = 0.000917008 sec/op, 7.87926 GOPS
GroupConv2DWorkload(batch=1, height=7, width=7, in_filter=1024, out_filter=1024, groups=64, hkernel=3, wkernel=3, hpad=1, wpad=1, hstride=1, wstride=1)
[04:00:37] /media/masaaki/Ubuntu_Disk/DNN/tvm/src/tir/transforms/arg_binder.cc:95: Warning: Trying to bind buffer to another one with lower alignment requirement required_alignment=256, provided_alignment=128
VTA GROUP CONV2D TEST PASSED: Time cost = 0.00141807 sec/op, 10.1904 GOPS
xilinx@pynq:~/tvm$ sudo ./apps/vta_rpc/start_rpc_server.sh
INFO:RPCServer:bind to 0.0.0.0:9091
INFO:RPCServer:connection from ('192.168.3.10', 40778)
INFO:root:Program FPGA with 1x16_i8w8a32_15_15_18_17.bit
INFO:RPCServer:Finish serving ('192.168.3.10', 40778)
INFO:RPCServer:connection from ('192.168.3.10', 40780)
INFO:root:Skip reconfig_runtime due to same config.
INFO:RPCServer:Finish serving ('192.168.3.10', 40780)
INFO:RPCServer:connection from ('192.168.3.10', 40784)
INFO:root:Loading VTA library: /home/xilinx/tvm/vta/python/vta/../../../build/libvta.so
INFO:RPCServer:load_module /tmp/tmpaxnl7q04/conv2d.o
INFO:root:Loading VTA library: /home/xilinx/tvm/vta/python/vta/../../../build/libvta.so
INFO:RPCServer:load_module /tmp/tmpaxnl7q04/conv2d.o
INFO:root:Loading VTA library: /home/xilinx/tvm/vta/python/vta/../../../build/libvta.so
INFO:RPCServer:load_module /tmp/tmpaxnl7q04/conv2d.o
INFO:root:Loading VTA library: /home/xilinx/tvm/vta/python/vta/../../../build/libvta.so
INFO:RPCServer:load_module /tmp/tmpaxnl7q04/conv2d.o
INFO:root:Loading VTA library: /home/xilinx/tvm/vta/python/vta/../../../build/libvta.so
INFO:RPCServer:load_module /tmp/tmpaxnl7q04/conv2d.o
INFO:root:Loading VTA library: /home/xilinx/tvm/vta/python/vta/../../../build/libvta.so
INFO:RPCServer:load_module /tmp/tmpaxnl7q04/conv2d.o
INFO:root:Loading VTA library: /home/xilinx/tvm/vta/python/vta/../../../build/libvta.so
INFO:RPCServer:load_module /tmp/tmpaxnl7q04/conv2d.o
INFO:root:Loading VTA library: /home/xilinx/tvm/vta/python/vta/../../../build/libvta.so
INFO:RPCServer:load_module /tmp/tmpaxnl7q04/conv2d.o
INFO:root:Loading VTA library: /home/xilinx/tvm/vta/python/vta/../../../build/libvta.so
INFO:RPCServer:load_module /tmp/tmpaxnl7q04/conv2d.o
INFO:RPCServer:Finish serving ('192.168.3.10', 40784)
INFO:RPCServer:connection from ('192.168.3.10', 40786)
INFO:root:Program FPGA with 1x16_i8w8a32_15_15_18_17.bit
INFO:root:Skip reconfig_runtime due to same config.
INFO:root:Loading VTA library: /home/xilinx/tvm/vta/python/vta/../../../build/libvta.so
INFO:RPCServer:load_module /tmp/tmpqkivkeds/conv2d.o
INFO:root:Loading VTA library: /home/xilinx/tvm/vta/python/vta/../../../build/libvta.so
INFO:RPCServer:load_module /tmp/tmpqkivkeds/conv2d.o
INFO:root:Loading VTA library: /home/xilinx/tvm/vta/python/vta/../../../build/libvta.so
INFO:RPCServer:load_module /tmp/tmpqkivkeds/conv2d.o
INFO:root:Loading VTA library: /home/xilinx/tvm/vta/python/vta/../../../build/libvta.so
INFO:RPCServer:load_module /tmp/tmpqkivkeds/conv2d.o
INFO:root:Loading VTA library: /home/xilinx/tvm/vta/python/vta/../../../build/libvta.so
INFO:RPCServer:load_module /tmp/tmpqkivkeds/conv2d.o
INFO:root:Loading VTA library: /home/xilinx/tvm/vta/python/vta/../../../build/libvta.so
INFO:RPCServer:load_module /tmp/tmpqkivkeds/conv2d.o
INFO:root:Loading VTA library: /home/xilinx/tvm/vta/python/vta/../../../build/libvta.so
INFO:RPCServer:load_module /tmp/tmpqkivkeds/conv2d.o
INFO:root:Loading VTA library: /home/xilinx/tvm/vta/python/vta/../../../build/libvta.so
INFO:RPCServer:load_module /tmp/tmpqkivkeds/conv2d.o
INFO:root:Loading VTA library: /home/xilinx/tvm/vta/python/vta/../../../build/libvta.so
INFO:RPCServer:load_module /tmp/tmpqkivkeds/conv2d.o
INFO:RPCServer:Finish serving ('192.168.3.10', 40786)
The result was "VTA DENSE TEST PASSED: Time cost = 0.126375 sec/op, 0.130683 GOPS", which feels considerably slow.
(base) masaaki@masaaki-H110M4-M01:/media/masaaki/Ubuntu_Disk/DNN/tvm$ python3 vta/tests/python/integration/test_benchmark_topi_dense.py
Cannot find config for target=ext_dev -keys=vta,cpu -device=vta -model=pynq_1x16_i8w8a32_15_15_18_17, workload=('dense_packed.vta', ('TENSOR', (16, 32, 1, 16), 'int8'), ('TENSOR', (63, 32, 16, 16), 'int8'), None, 'int32'). A fallback configuration is used, which may bring great performance regression.
[03:54:12] /media/masaaki/Ubuntu_Disk/DNN/tvm/src/tir/transforms/arg_binder.cc:95: Warning: Trying to bind buffer to another one with lower alignment requirement required_alignment=256, provided_alignment=128
@main = primfn(data_1: handle, kernel_1: handle, compute_1: handle) -> ()
attr = {"from_legacy_te_schedule": True, "global_symbol": "main", "tir.noalias": True}
buffers = {data: Buffer(data_2: Pointer(int8), int8, [8192], []),
kernel: Buffer(kernel_2: Pointer(int8), int8, [516096], []),
compute: Buffer(compute_2: Pointer(int8), int8, [16128], [])}
buffer_map = {data_1: data, kernel_1: kernel, compute_1: compute}
preflattened_buffer_map = {data_1: data_3: Buffer(data_2, int8, [16, 32, 1, 16], []), kernel_1: kernel_3: Buffer(kernel_2, int8, [63, 32, 16, 16], []), compute_1: compute_3: Buffer(compute_2, int8, [16, 63, 1, 16], [])} {
@tir.vta.coproc_dep_push(3, 2, dtype=int32)
for (i0.outer: int32, 0, 16) {
for (i1.outer: int32, 0, 63) {
attr [IterVar(vta: int32, (nullptr), "ThreadIndex", "vta")] "coproc_scope" = 2 {
@tir.vta.coproc_dep_pop(3, 2, dtype=int32)
attr [IterVar(vta, (nullptr), "ThreadIndex", "vta")] "coproc_uop_scope" = "VTAPushGEMMOp";
@tir.vta.uop_push(0, 1, 0, 0, 0, 0, 0, 0, dtype=int32)
@tir.vta.coproc_dep_push(2, 1, dtype=int32)
}
for (k_o.outer: int32, 0, 32) {
attr [IterVar(vta, (nullptr), "ThreadIndex", "vta")] "coproc_scope" = 1 {
@tir.vta.coproc_dep_pop(2, 1, dtype=int32)
@tir.call_extern("VTALoadBuffer2D", @tir.tvm_thread_context(@tir.vta.command_handle(, dtype=handle), dtype=handle), data_2, ((i0.outer*32) + k_o.outer), 1, 1, 1, 0, 0, 0, 0, 0, 2, dtype=int32)
@tir.call_extern("VTALoadBuffer2D", @tir.tvm_thread_context(@tir.vta.command_handle(, dtype=handle), dtype=handle), kernel_2, ((i1.outer*32) + k_o.outer), 1, 1, 1, 0, 0, 0, 0, 0, 1, dtype=int32)
@tir.vta.coproc_dep_push(1, 2, dtype=int32)
}
attr [IterVar(vta, (nullptr), "ThreadIndex", "vta")] "coproc_scope" = 2 {
@tir.vta.coproc_dep_pop(1, 2, dtype=int32)
attr [IterVar(vta, (nullptr), "ThreadIndex", "vta")] "coproc_uop_scope" = "VTAPushGEMMOp";
@tir.vta.uop_push(0, 0, 0, 0, 0, 0, 0, 0, dtype=int32)
@tir.vta.coproc_dep_push(2, 1, dtype=int32)
}
}
@tir.vta.coproc_dep_pop(2, 1, dtype=int32)
attr [IterVar(vta, (nullptr), "ThreadIndex", "vta")] "coproc_scope" = 2 {
attr [IterVar(vta, (nullptr), "ThreadIndex", "vta")] "coproc_uop_scope" = "VTAPushALUOp";
@tir.vta.uop_push(1, 0, 0, 0, 0, 3, 1, 8, dtype=int32)
attr [IterVar(vta, (nullptr), "ThreadIndex", "vta")] "coproc_uop_scope" = "VTAPushALUOp";
@tir.vta.uop_push(1, 0, 0, 0, 0, 0, 1, 127, dtype=int32)
attr [IterVar(vta, (nullptr), "ThreadIndex", "vta")] "coproc_uop_scope" = "VTAPushALUOp";
@tir.vta.uop_push(1, 0, 0, 0, 0, 1, 1, 0, dtype=int32)
@tir.vta.coproc_dep_push(2, 3, dtype=int32)
}
attr [IterVar(vta, (nullptr), "ThreadIndex", "vta")] "coproc_scope" = 3 {
@tir.vta.coproc_dep_pop(2, 3, dtype=int32)
@tir.call_extern("VTAStoreBuffer2D", @tir.tvm_thread_context(@tir.vta.command_handle(, dtype=handle), dtype=handle), 0, 4, compute_2, ((i0.outer*63) + i1.outer), 1, 1, 1, dtype=int32)
@tir.vta.coproc_dep_push(3, 2, dtype=int32)
}
}
}
@tir.vta.coproc_sync(, dtype=int32)
@tir.vta.coproc_dep_pop(3, 2, dtype=int32)
}
[03:54:13] /media/masaaki/Ubuntu_Disk/DNN/tvm/src/tir/transforms/arg_binder.cc:95: Warning: Trying to bind buffer to another one with lower alignment requirement required_alignment=256, provided_alignment=128
/media/masaaki/Ubuntu_Disk/DNN/tvm/python/tvm/driver/build_module.py:263: UserWarning: target_host parameter is going to be deprecated. Please pass in tvm.target.Target(target, host=target_host) instead.
warnings.warn(
VTA DENSE TEST PASSED: Time cost = 0.126375 sec/op, 0.130683 GOPS
xilinx@pynq:~/tvm$ sudo ./apps/vta_rpc/start_rpc_server.sh
INFO:RPCServer:bind to 0.0.0.0:9091
INFO:RPCServer:connection from ('192.168.3.10', 40252)
INFO:root:Program FPGA with 1x16_i8w8a32_15_15_18_17.bit
INFO:RPCServer:Finish serving ('192.168.3.10', 40252)
INFO:RPCServer:connection from ('192.168.3.10', 40254)
INFO:root:Skip reconfig_runtime due to same config.
INFO:RPCServer:Finish serving ('192.168.3.10', 40254)
INFO:RPCServer:connection from ('192.168.3.10', 40262)
INFO:root:Program FPGA with 1x16_i8w8a32_15_15_18_17.bit
INFO:root:Skip reconfig_runtime due to same config.
INFO:root:Loading VTA library: /home/xilinx/tvm/vta/python/vta/../../../build/libvta.so
INFO:RPCServer:load_module /tmp/tmpl5cso_hi/dense.o
INFO:RPCServer:Finish serving ('192.168.3.10', 40262)