// output_layer.h
// 2018/03/11 by marsee
// 2018/03/12 : 出力フォーマットを変更
// 2018/04/30 : HLS Streamに変更
//
#ifndef __OUTPUT_LAYER_H__
#define __OUTPUT_LAYER_H__
#include <ap_fixed.h>
// Number of values the output layer handles per frame (and number of
// candidates for the arg-max performed by output_layer()).
static const size_t NUMBER_OF_OUTPUT_LAYER = 3;
// Total bit width (W) and integer bit width (I) of the ap_fixed values used
// by this layer.
static const size_t W = 12;
static const size_t I = 7;
// One frame of fixed-point output-layer values.
typedef struct {
ap_fixed<W,I,AP_TRN,AP_WRAP> data [NUMBER_OF_OUTPUT_LAYER];
} mdata_type;
// Float counterpart of mdata_type, used by the software reference model.
typedef struct {
float data [NUMBER_OF_OUTPUT_LAYER];
} fmdata_type;
// Fixed-point type of a single output-layer value.
typedef ap_fixed<W,I,AP_TRN,AP_WRAP> out_affine_type;
// 2-bit unsigned value that carries the index of the largest output.
typedef ap_uint<2> output_type;
#endif
// output_layer.cpp
// 2018/03/11 by marsee
// 2018/03/12 : 出力フォーマットを変更
// 2018/04/30 : HLS Streamに変更
//
#include <ap_int.h>
#include <hls_stream.h>
#include <ap_axi_sdata.h>
#include <hls_video.h>
#include "layer_general.h"
#include "output_layer.h"
// Output layer: reads NUMBER_OF_OUTPUT_LAYER values from the input stream and
// reports which one is the largest.
//   ins    : input stream; only data[0] and user of each element are used.
//   output : receives the index of the largest value (the lowest index wins
//            on ties because the comparison is a strict '<').
//   dot2   : receives a copy of the values that were read.
// Always returns 0.
int output_layer(hls::stream<ap_fixed_axis<W,I,1,1> >& ins, output_type& output,
out_affine_type dot2[NUMBER_OF_OUTPUT_LAYER]){
#pragma HLS ARRAY_PARTITION variable=dot2 complete dim=1
#pragma HLS DATA_PACK variable=ins
ap_fixed_axis<W,I,1,1> stdata;
mdata_type af2;
int max_num;
out_affine_type max_val;
// Discard stream elements until one with user != 0 arrives.
Loop1: do {
#pragma HLS LOOP_TRIPCOUNT min=1 max=1 avg=1
// The frame starts when user becomes 1
ins >> stdata;
} while(stdata.user == 0);
// Collect the frame; the element that ended Loop1 is its first value.
Loop2: for(int i=0; i<NUMBER_OF_OUTPUT_LAYER; i++){
#pragma HLS PIPELINE II=1
if(i != 0) // the first input has already been read
ins >> stdata; // input from the AXI4-Stream
af2.data[i] = stdata.data[0];
}
// This initial value is overwritten in the i == 0 iteration below.
max_val = 0;
// Copy the values out and track the index of the maximum.
Loop3: for(int i=0; i<NUMBER_OF_OUTPUT_LAYER; i++){
#pragma HLS UNROLL
dot2[i] = af2.data[i];
if(i == 0){
max_val = af2.data[0];
max_num = 0;
} else if (max_val < af2.data[i]){
max_val = af2.data[i];
max_num = i;
}
}
output = output_type(max_num);
return(0);
}
// output_layer_tb.cpp
// 2018/03/12 by marsee
// 2018/03/12 : 出力フォーマットを変更
// 2018/04/30 : HLS Streamに変更
//
#include <stdio.h>
#include <stdlib.h>
#include <time.h>
#include <string.h>
#include <ap_int.h>
#include <hls_stream.h>
#include <iostream>
#include <fstream>
#include <iomanip>
#include <math.h>
#include <ap_axi_sdata.h>
#include <hls_video.h>
#include "layer_general.h"
#include "output_layer.h"
#include "affine_layer2_output.h"
// Fixed-point implementation under test.
int output_layer(hls::stream<ap_fixed_axis<W,I,1,1> >& ins, output_type& output,
out_affine_type dot2[NUMBER_OF_OUTPUT_LAYER]);
// Float reference model, defined after main() in this test bench.
int output_layer_soft(hls::stream<float_axis<1,1> >& ins, output_type& output,
float dot2[NUMBER_OF_OUTPUT_LAYER]);
// Test bench for output_layer().
// Pushes five dummy beats (user == 0) followed by one frame taken from
// affine2_out[] / affine2_fout[] into the fixed-point implementation and into
// the float reference model, then prints and compares both results.
// NOTE(review): a mismatch is only reported on stdout; the return value is
// always 0, exactly as before (the early return was already commented out).
int main(){
    using namespace std;

    hls::stream<ap_fixed_axis<W,I,1,1> > ins;
    hls::stream<float_axis<1,1> > ins_soft;
    ap_fixed_axis<W,I,1,1> pix;
    float_axis<1,1> fpix;
    output_type out, out_soft;
    out_affine_type dot2[NUMBER_OF_OUTPUT_LAYER];
    float fdot2[NUMBER_OF_OUTPUT_LAYER];

    // Dummy beats that precede the frame; both functions must skip them.
    for(int i=0; i<5; i++){
        pix.user = 0;
        pix.last = 0; // previously never assigned for the dummy beats
        pix.data[0] = (out_affine_type)i;
        ins << pix;

        fpix.user = 0;
        fpix.last = 0; // previously never assigned for the dummy beats
        fpix.data[0] = (float)i;
        ins_soft << fpix;
    }

    // Push one frame into ins and ins_soft.
    for(int i=0; i < NUMBER_OF_OUTPUT_LAYER; i++){
        pix.data[0] = affine2_out[i];
        fpix.data[0] = affine2_fout[i];

        if (i == 0){ // TUSER is 1 on the first value only
            pix.user = 1;
            fpix.user = 1;
        } else {
            pix.user = 0;
            fpix.user = 0;
        }

        if (i == NUMBER_OF_OUTPUT_LAYER-1){ // TLAST is 1 on the last value only
            pix.last = 1;
            fpix.last = 1;
        } else {
            pix.last = 0;
            fpix.last = 0;
        }

        ins << pix;
        ins_soft << fpix;
    }

    output_layer(ins, out, dot2);
    output_layer_soft(ins_soft, out_soft, fdot2);

    // Compare out and out_soft.
    cout << "out" << " = " << int(out) << " out_soft" " = " << int(out_soft) << endl;
    for(int i=0; i<NUMBER_OF_OUTPUT_LAYER; i++){
        cout << "dot2[" << i << "] = " << float(dot2[i]) << " fdot2[" << i << "] = " << fdot2[i] << endl;
    }

    if(out != out_soft){
        cout << "error: out" << " = " << int(out) << " out_soft" " = " << int(out_soft) << endl;
        //return(1);
    }

    return(0);
}
// Float reference model of output_layer().
// Skips stream elements until one with user != 0 arrives, takes that element
// and the following ones as the NUMBER_OF_OUTPUT_LAYER values of the frame,
// copies them to dot2[] and writes the index of the largest one to output
// (the lowest index wins on ties). Always returns 0.
int output_layer_soft(hls::stream<float_axis<1,1> >& ins, output_type& output,
        float dot2[NUMBER_OF_OUTPUT_LAYER]){
    float_axis<1,1> beat;
    fmdata_type frame;

    // Frame synchronisation: consume elements until user is non-zero.
    bool synced = false;
    while (!synced){
        ins >> beat;
        synced = !(beat.user == 0);
    }

    // The element that ended the synchronisation is the first value.
    frame.data[0] = beat.data[0];
    for (int idx = 1; idx < NUMBER_OF_OUTPUT_LAYER; ++idx){
        ins >> beat;
        frame.data[idx] = beat.data[0];
    }

    // Copy out the values while searching for the maximum.
    int best_idx = 0;
    float best_val = frame.data[0];
    dot2[0] = frame.data[0];
    for (int idx = 1; idx < NUMBER_OF_OUTPUT_LAYER; ++idx){
        dot2[idx] = frame.data[idx];
        if (best_val < frame.data[idx]){
            best_val = frame.data[idx];
            best_idx = idx;
        }
    }

    output = output_type(best_idx);
    return(0);
}
// affine_layer1.h
// 2018/04/27 by marsee (HLS stream)
//
#ifndef __AFFINE_LAYER1_H__
#define __AFFINE_LAYER1_H__
#include <ap_fixed.h>
#include "af1_weight.h"
#include "af1_bias.h"

// Size of the feature map delivered by the preceding layer (rows x columns).
static const size_t V_PRE_LAYER_HIGHT = 3;
static const size_t H_PRE_LAYER_WIDTH = 26;
// Number of values produced by this layer per frame.
static const size_t NUMBER_OF_MIDDLE_LAYER = 100;
// Number of data elements carried by every input stream beat.
static const size_t NUMBER_OF_KERNEL = 2;
// NOTE(review): not referenced by the affine layer code shown with this
// header; presumably kept for consistency with the other layers - confirm.
static const size_t ARRAY_SIZE = 5;
// Total / integer bit widths of the input values (conv_type).
static const size_t CW = 16;
static const size_t CI = 6;
// Total / integer bit widths of the accumulated output values (affine_type).
static const size_t NW = 19;
static const size_t NI = 7;

// One frame of fixed-point layer outputs.
typedef struct {
    ap_fixed<NW,NI,AP_TRN,AP_WRAP> data [NUMBER_OF_MIDDLE_LAYER];
} mdata_type;

// Float counterpart of mdata_type, used by the software reference model.
typedef struct {
    float data [NUMBER_OF_MIDDLE_LAYER];
} fmdata_type;

typedef ap_fixed<NW,NI,AP_TRN,AP_WRAP> affine_type;
typedef ap_fixed<CW,CI,AP_TRN,AP_WRAP> conv_type;
#endif
// affine_layer1.cpp
// 2018/04/27 by marsee (HLS stream)
//
#include <ap_int.h>
#include <hls_stream.h>
#include <ap_axi_sdata.h>
#include <hls_video.h>
#include "layer_general.h"
#include "affine_layer1.h"
// First affine (fully connected) layer.
// Consumes one frame of V_PRE_LAYER_HIGHT x H_PRE_LAYER_WIDTH stream beats,
// each carrying NUMBER_OF_KERNEL values, and accumulates for every output
// column the products of the inputs with af1_weight. While the last input
// beat is processed, af1_bias is added and the NUMBER_OF_MIDDLE_LAYER results
// are written to outs (user = 1 on the first result, last = 1 on the final
// one). Always returns 0.
int affine_layer1(hls::stream<ap_fixed_axis<CW,CI,NUMBER_OF_KERNEL,1> >& ins,
hls::stream<ap_fixed_axis<NW,NI,1,1> >& outs){
//#pragma HLS ARRAY_PARTITION variable=af1_weight complete dim=1
#pragma HLS DATA_PACK variable=outs
#pragma HLS DATA_PACK variable=ins
ap_fixed_axis<CW,CI,NUMBER_OF_KERNEL,1> stdata;
affine_type dot[NUMBER_OF_MIDDLE_LAYER];
//#pragma HLS ARRAY_PARTITION variable=dot complete dim=1
ap_fixed_axis<NW,NI,1,1> outd;
// Discard stream elements until one with user != 0 arrives.
Loop1: do {
#pragma HLS PIPELINE II=1
#pragma HLS LOOP_TRIPCOUNT min=1 max=1 avg=1
// The frame starts when user becomes 1
ins >> stdata;
} while(stdata.user == 0);
Loop2: for (int y=0; y<V_PRE_LAYER_HIGHT; y++){
Loop3: for (int x=0; x<H_PRE_LAYER_WIDTH; x++){
//#pragma HLS PIPELINE II=1
if (!(x==0 && y==0)) // the first input has already been read
ins >> stdata; // input from the AXI4-Stream
Loop4: for (int col=0; col<NUMBER_OF_MIDDLE_LAYER; col++){
#pragma HLS PIPELINE II=1
if (x==0 && y==0) // clear the accumulator on the first input
dot[col] = 0;
// Sum the contributions of all values in this beat before adding them
// to the accumulator; the weight row index is i*(rows*cols) + y*cols + x.
affine_type dot_temp = (affine_type)0;
for (int i=0; i<NUMBER_OF_KERNEL; i++){
dot_temp += stdata.data[i] * af1_weight[V_PRE_LAYER_HIGHT*H_PRE_LAYER_WIDTH*i+y*H_PRE_LAYER_WIDTH+x][col];
}
dot[col] += dot_temp;
if (y==V_PRE_LAYER_HIGHT-1 && x==H_PRE_LAYER_WIDTH-1){ // on the last input, add the bias and emit the result
dot[col] += af1_bias[col];
outd.data[0] = dot[col];
if(col == 0)
outd.user = 1;
else
outd.user = 0;
if(col == NUMBER_OF_MIDDLE_LAYER-1)
outd.last = 1;
else
outd.last = 0;
outs << outd;
}
}
}
}
return(0);
}
// affine_layer1_tb.cpp
// 2018/04/27 by marsee (HLS stream)
//
#include <stdio.h>
#include <stdlib.h>
#include <time.h>
#include <string.h>
#include <ap_int.h>
#include <hls_stream.h>
#include <iostream>
#include <fstream>
#include <iomanip>
#include <math.h>
#include <ap_axi_sdata.h>
#include <hls_video.h>
#include "layer_general.h"
#include "affine_layer1.h"
#include "max_pooling_output.h"
// Fixed-point implementation under test.
int affine_layer1(hls::stream<ap_fixed_axis<CW,CI,NUMBER_OF_KERNEL,1> >& ins,
hls::stream<ap_fixed_axis<NW,NI,1,1> >& outs);
// Second fixed-point implementation, coded differently for cross-checking
// (defined after main() in this test bench).
int affine_layer1_2(hls::stream<ap_fixed_axis<CW,CI,NUMBER_OF_KERNEL,1> >& ins,
hls::stream<ap_fixed_axis<NW,NI,1,1> >& outs);
// Float reference model (defined after main() in this test bench).
int affine_layer1_soft(hls::stream<float_axis<NUMBER_OF_KERNEL,1> >& ins,
hls::stream<float_axis<1,1> >& outs);
// Test bench for affine_layer1().
// Feeds five dummy beats plus one frame from mp_out[] / mp_fout[] to
// affine_layer1(), affine_layer1_2() and affine_layer1_soft(), prints all
// three results, counts the positions where the two fixed-point
// implementations disagree, and writes the float and fixed-point results to
// affine_layer1_output.h.
// NOTE(review): the return value is always 0, even when errcnt is non-zero,
// exactly as before (the early return was already commented out).
int main(){
    using namespace std;

    hls::stream<ap_fixed_axis<CW,CI,NUMBER_OF_KERNEL,1> > ins;
    hls::stream<ap_fixed_axis<CW,CI,NUMBER_OF_KERNEL,1> > ins2;
    hls::stream<ap_fixed_axis<NW,NI,1,1> > outs;
    hls::stream<ap_fixed_axis<NW,NI,1,1> > outs2;
    hls::stream<float_axis<NUMBER_OF_KERNEL,1> > ins_soft;
    hls::stream<float_axis<1,1> > outs_soft;

    mdata_type dot;
    mdata_type dot2;
    fmdata_type fdot;

    ap_fixed_axis<CW,CI,NUMBER_OF_KERNEL,1> pix;
    float_axis<NUMBER_OF_KERNEL,1> fpix;
    ap_fixed_axis<NW,NI,1,1> pdata;
    ap_fixed_axis<NW,NI,1,1> pdata2;
    float_axis<1,1> fpdata;

    // Dummy beats that precede the frame; all implementations must skip them.
    for(int i=0; i<5; i++){
        pix.user = 0;
        pix.last = 0; // previously never assigned for the dummy beats
        for(int k=0; k<NUMBER_OF_KERNEL; k++){
            pix.data[k] = (affine_type)i;
        }
        ins << pix;
        ins2 << pix;

        fpix.user = 0;
        fpix.last = 0; // previously never assigned for the dummy beats
        for(int k=0; k<NUMBER_OF_KERNEL; k++){
            fpix.data[k] = (float)i;
        }
        ins_soft << fpix;
    }

    // Push one frame into ins, ins2 and ins_soft.
    for(int j=0; j < V_PRE_LAYER_HIGHT; j++){
        for(int i=0; i < H_PRE_LAYER_WIDTH; i++){
            for(int k=0; k<NUMBER_OF_KERNEL; k++){
                pix.data[k] = mp_out[j*H_PRE_LAYER_WIDTH+i][k];
                fpix.data[k] = mp_fout[j*H_PRE_LAYER_WIDTH+i][k];
            }

            if (j==0 && i==0){ // TUSER is 1 on the first beat only
                pix.user = 1;
                fpix.user = 1;
            } else {
                pix.user = 0;
                fpix.user = 0;
            }

            if (i == H_PRE_LAYER_WIDTH-1){ // TLAST is 1 at the end of every row
                pix.last = 1;
                fpix.last = 1;
            } else {
                pix.last = 0;
                fpix.last = 0;
            }

            ins << pix;
            ins2 << pix;
            ins_soft << fpix;
        }
    }

    affine_layer1(ins, outs);
    affine_layer1_2(ins2, outs2);
    affine_layer1_soft(ins_soft, outs_soft);

    // Collect outs / outs2 / outs_soft and compare the two fixed-point results.
    int errcnt = 0;
    for(int i=0; i<NUMBER_OF_MIDDLE_LAYER; i++){
        outs >> pdata;
        outs2 >> pdata2;
        outs_soft >> fpdata;

        dot.data[i] = pdata.data[0];
        dot2.data[i] = pdata2.data[0];
        fdot.data[i] = fpdata.data[0];

        printf("i = %d, HW = %f, HW2 = %f, SW = %f\n", i, (float)dot.data[i], (float)dot2.data[i], fdot.data[i]);
        if(dot.data[i] != dot2.data[i]){ // the two fixed-point implementations disagree
            printf("ERROR HW and HW2 results mismatch i = %d, HW = %f, HW2 = %f, SW = %f\n", i, (float)dot.data[i], (float)dot2.data[i], fdot.data[i]);
            errcnt++;
            //return(1);
        }
    }
    cout << "Error Count = " << errcnt << endl;
    cout << endl;

    // Write the affine_layer1 results to a header file.
    ofstream OH("affine_layer1_output.h");
    OH << "// affine_layer1_output.h" << endl;
    time_t now = time(0);
    struct tm* localNow = localtime(&now);
    OH << "// " << localNow->tm_year+1900 << "/" << localNow->tm_mon+1 << "/" << localNow->tm_mday;
    // setw() only affects the next insertion, so it is repeated per field.
    OH << " " << setfill('0') << setw(2) << localNow->tm_hour << ":" << setw(2) << localNow->tm_min << ":" << setw(2) << localNow->tm_sec << " by marsee" << endl;
    OH << "//" << endl;
    OH << endl;
    OH << "#ifndef __AFFINE_LAYER1_OUTPUT_H__" << endl;
    OH << "#define __AFFINE_LAYER1_OUTPUT_H__" << endl;
    OH << endl;

    OH << "const float affine1_fout[" << NUMBER_OF_MIDDLE_LAYER << "] = {" << endl;
    for (int i=0; i<NUMBER_OF_MIDDLE_LAYER ; i++){
        OH << " " << fixed << setprecision(14) << fdot.data[i];
        if (i == NUMBER_OF_MIDDLE_LAYER-1)
            OH << endl;
        else
            OH << "," << endl;
    }
    OH << "};" << endl << endl;

    // The emitted type follows NW / NI so it stays in sync with affine_type.
    OH << "const ap_fixed<" << NW << "," << NI << ",AP_TRN,AP_WRAP> affine1_out[" << NUMBER_OF_MIDDLE_LAYER << "] = {" << endl;
    for (int i=0; i<NUMBER_OF_MIDDLE_LAYER ; i++){
        OH << " " << fixed << setprecision(14) << (float)dot.data[i];
        if (i == NUMBER_OF_MIDDLE_LAYER-1)
            OH << endl;
        else
            OH << "," << endl;
    }
    OH << "};" << endl << endl;
    OH << "#endif" << endl;

    return(0);
}
// Float reference model of affine_layer1().
// Skips stream elements until one with user != 0 arrives, then consumes one
// frame of V_PRE_LAYER_HIGHT x H_PRE_LAYER_WIDTH beats. For every output
// column the products of the inputs with af1_fweight are accumulated; while
// the last beat is processed af1_fbias is added and the result is written to
// outs (user = 1 on the first result, last = 1 on the final one).
// Always returns 0.
int affine_layer1_soft(hls::stream<float_axis<NUMBER_OF_KERNEL,1> >& ins,
        hls::stream<float_axis<1,1> >& outs){
    float_axis<NUMBER_OF_KERNEL,1> stdata;
    // Sized by the layer constant instead of a literal 100 so that it cannot
    // drift from the value used for the TLAST test below.
    float dot[NUMBER_OF_MIDDLE_LAYER];
    float_axis<1,1> outd;

    Loop1: do {
        // The frame starts when user becomes 1
        ins >> stdata;
    } while(stdata.user == 0);

    Loop2: for (int y=0; y<V_PRE_LAYER_HIGHT; y++){
        Loop3: for (int x=0; x<H_PRE_LAYER_WIDTH; x++){
            if (!(x==0 && y==0)) // the first input has already been read
                ins >> stdata; // input from the AXI4-Stream

            Loop4: for (int col=0; col<NUMBER_OF_MIDDLE_LAYER; col++){
                if (x==0 && y==0) // clear the accumulator on the first input
                    dot[col] = 0;

                for (int i=0; i<NUMBER_OF_KERNEL; i++){
                    dot[col] += stdata.data[i] * af1_fweight[V_PRE_LAYER_HIGHT*H_PRE_LAYER_WIDTH*i+y*H_PRE_LAYER_WIDTH+x][col];
                }

                if (y==V_PRE_LAYER_HIGHT-1 && x==H_PRE_LAYER_WIDTH-1){ // on the last input, add the bias and emit
                    dot[col] += af1_fbias[col];
                    outd.data[0] = dot[col];

                    if(col == 0)
                        outd.user = 1;
                    else
                        outd.user = 0;

                    if(col == NUMBER_OF_MIDDLE_LAYER-1)
                        outd.last = 1;
                    else
                        outd.last = 0;

                    outs << outd;
                }
            }
        }
    }
    return(0);
}
// affine_layer1_2(): verification variant of affine_layer1().
// Deliberately coded differently from affine_layer1(): the whole frame is
// buffered first, then every output node is computed in one pass over the
// buffered data (kernel-major, then row, then column), the bias is added and
// the result is written to outs. Always returns 0.
int affine_layer1_2(hls::stream<ap_fixed_axis<CW,CI,NUMBER_OF_KERNEL,1> >& ins,
        hls::stream<ap_fixed_axis<NW,NI,1,1> >& outs){
    ap_fixed_axis<CW,CI,NUMBER_OF_KERNEL,1> beat;
    conv_type frame[NUMBER_OF_KERNEL][V_PRE_LAYER_HIGHT][H_PRE_LAYER_WIDTH];
    ap_fixed_axis<NW,NI,1,1> result;

    // Frame synchronisation: consume elements until user is non-zero.
    bool synced = false;
    while (!synced){
        ins >> beat;
        synced = !(beat.user == 0);
    }

    // Buffer the frame; the element that ended the synchronisation is (0,0).
    for (int row = 0; row < V_PRE_LAYER_HIGHT; ++row){
        for (int column = 0; column < H_PRE_LAYER_WIDTH; ++column){
            if (row != 0 || column != 0)
                ins >> beat;
            for (int ch = 0; ch < NUMBER_OF_KERNEL; ++ch){
                frame[ch][row][column] = beat.data[ch];
            }
        }
    }

    // One accumulator per output node; accumulation order is ch, row, column.
    for (int node = 0; node < NUMBER_OF_MIDDLE_LAYER; ++node){
        affine_type acc = 0;
        for (int ch = 0; ch < NUMBER_OF_KERNEL; ++ch){
            for (int row = 0; row < V_PRE_LAYER_HIGHT; ++row){
                for (int column = 0; column < H_PRE_LAYER_WIDTH; ++column){
                    acc += frame[ch][row][column]*af1_weight[ch*V_PRE_LAYER_HIGHT*H_PRE_LAYER_WIDTH+row*H_PRE_LAYER_WIDTH+column][node];
                }
            }
        }
        acc += af1_bias[node];

        result.data[0] = acc;
        result.user = (node == 0) ? 1 : 0;
        result.last = (node == NUMBER_OF_MIDDLE_LAYER-1) ? 1 : 0;
        outs << result;
    }
    return(0);
}
// max_pooling.h
// 2018/04/19 by marsee (HLS stream)
//
#ifndef __MAX_POOLING_H__
#define __MAX_POOLING_H__
#include <ap_fixed.h>
// Input feature-map size (columns, rows).
static const size_t H_PIXEL_WIDTH_IN = 52;
static const size_t V_PIXEL_WIDTH_IN = 6;
// Output feature-map size (columns, rows).
static const size_t H_PIXEL_WIDTH_OUT = 26;
static const size_t V_PIXEL_WIDTH_OUT = 3;
// Number of data elements carried by every stream beat.
static const size_t NUMBER_OF_KERNEL = 2;
// Side length of the square pooling window held in pix_mat.
static const size_t ARRAY_SIZE = 2;
// Total bit width (W) and integer bit width (I) of conv_type.
static const size_t W = 16;
static const size_t I = 6;
// Horizontal and vertical pooling stride.
static const size_t X_STRIDE = 2;
static const size_t Y_STRIDE = 2;
typedef ap_fixed<W, I, AP_TRN, AP_WRAP> conv_type;
#endif
// max_pooling.cpp
// 2018/04/19 by marsee (HLS stream)
// 2018/04/20 : bug fix
// 2018/04/25 : Loop10 bug fix
//
#include <ap_int.h>
#include <hls_stream.h>
#include <ap_axi_sdata.h>
#include <hls_video.h>
#include "layer_general.h"
#include "max_pooling.h"
// Streaming max pooling.
// Consumes one frame of V_PIXEL_WIDTH_IN x H_PIXEL_WIDTH_IN beats, each
// carrying NUMBER_OF_KERNEL values. For every beat and every channel an
// ARRAY_SIZE x ARRAY_SIZE window is updated from a line buffer and its
// maximum is computed; the result is written to outs only at positions
// selected by X_STRIDE / Y_STRIDE. Always returns 0.
int max_pooling(hls::stream<ap_fixed_axis<W,I,NUMBER_OF_KERNEL,1> >& ins,
hls::stream<ap_fixed_axis<W,I,NUMBER_OF_KERNEL,1> >& outs){
#pragma HLS DATA_PACK variable=outs
#pragma HLS DATA_PACK variable=ins
ap_fixed_axis<W,I,NUMBER_OF_KERNEL,1> pix;
ap_fixed_axis<W,I,NUMBER_OF_KERNEL,1> mp_out;
// Per channel: the previous ARRAY_SIZE-1 rows, indexed by column.
conv_type line_buf[NUMBER_OF_KERNEL][ARRAY_SIZE-1][H_PIXEL_WIDTH_IN];
#pragma HLS ARRAY_PARTITION variable=line_buf block factor=2 dim=1
#pragma HLS ARRAY_PARTITION variable=line_buf block factor=1 dim=2
// Per channel: the current pooling window.
conv_type pix_mat[NUMBER_OF_KERNEL][ARRAY_SIZE][ARRAY_SIZE];
#pragma HLS array_partition variable=pix_mat complete
conv_type val[NUMBER_OF_KERNEL], conv_data;
// Discard stream elements until one with user != 0 arrives.
Loop1: do {
#pragma HLS PIPELINE II=1
#pragma HLS LOOP_TRIPCOUNT min=1 max=1 avg=1
// The frame starts when user becomes 1
ins >> pix;
} while(pix.user == 0);
Loop2: for (int y=0; y<V_PIXEL_WIDTH_IN; y++){
Loop3: for (int x=0; x<H_PIXEL_WIDTH_IN; x++){
#pragma HLS PIPELINE II=1
if (!(x==0 && y==0)) // the first input has already been read
ins >> pix; // input from the AXI4-Stream
Loop4: for (int n=0; n<NUMBER_OF_KERNEL; n++){
#pragma HLS UNROLL
conv_data = pix.data[n];
// Shift the 2-D window one column to the left
Loop5 : for (int k=0; k<ARRAY_SIZE; k++){
#pragma HLS UNROLL
Loop6 : for (int m=0; m<ARRAY_SIZE-1; m++){
pix_mat[n][k][m] = pix_mat[n][k][m+1];
}
}
Loop7: for (int i=0; i<ARRAY_SIZE-1; i++){ // load the data of the previous rows from line_buf
pix_mat[n][i][ARRAY_SIZE-1] = line_buf[n][i][x];
}
pix_mat[n][ARRAY_SIZE-1][ARRAY_SIZE-1] = conv_data; // store the new data in the last position of pix_mat
Loop8: for (int i=0; i<ARRAY_SIZE-2; i++){ // move the buffered rows up by one
line_buf[n][i][x] = line_buf[n][i+1][x];
}
line_buf[n][ARRAY_SIZE-2][x] = conv_data;
// Search the window for its maximum
Loop9 : for (int k=0; k<ARRAY_SIZE; k++){
#pragma HLS UNROLL
Loop10 : for (int m=0; m<ARRAY_SIZE; m++){
if (k==0 && m==0){
val[n] = pix_mat[n][k][m];
} else if (val[n] < pix_mat[n][k][m]){
val[n] = pix_mat[n][k][m];
}
}
}
mp_out.data[n] = val[n];
if (x==X_STRIDE-1 && y==Y_STRIDE-1){ // assert TUSER on the first output data
mp_out.user = 1;
} else {
mp_out.user = 0;
}
if (x == H_PIXEL_WIDTH_IN-1){ // assert TLAST at the end of a row
mp_out.last = 1;
} else {
mp_out.last = 0;
}
}
if (x%X_STRIDE==X_STRIDE-1 && y%Y_STRIDE==Y_STRIDE-1){ // stride: emit only at the last position of each window
outs << mp_out;
}
}
}
return(0);
}
// max_pooling_tb.cpp
// 2018/04/19 by marsee (HLS stream)
//
#include <stdio.h>
#include <stdlib.h>
#include <time.h>
#include <string.h>
#include <ap_int.h>
#include <hls_stream.h>
#include <iostream>
#include <fstream>
#include <iomanip>
#include <math.h>
#include <ap_axi_sdata.h>
#include <hls_video.h>
#include "layer_general.h"
#include "max_pooling.h"
#include "relu_output.h"
// Fixed-point implementation under test.
int max_pooling(hls::stream<ap_fixed_axis<W,I,NUMBER_OF_KERNEL,1> >& ins,
hls::stream<ap_fixed_axis<W,I,NUMBER_OF_KERNEL,1> >& outs);
// Second fixed-point implementation used for cross-checking (defined after
// main() in this test bench).
int max_pooling2(hls::stream<ap_fixed_axis<W,I,NUMBER_OF_KERNEL,1> >& ins,
hls::stream<ap_fixed_axis<W,I,NUMBER_OF_KERNEL,1> >& outs);
// Float reference model (defined after main() in this test bench).
int max_pooling_soft(hls::stream<float_axis<NUMBER_OF_KERNEL,1> >& ins,
hls::stream<float_axis<NUMBER_OF_KERNEL,1> >& outs);
// Test bench for max_pooling().
// Feeds five dummy beats plus one frame from relu_out[] / relu_fout[] to
// max_pooling(), max_pooling_soft() and max_pooling2(), dumps channel 0 of
// the input to two CSV files, counts the values where the two fixed-point
// implementations disagree, and writes the float and fixed-point results to
// max_pooling_output.h.
// NOTE(review): the return value is always 0, even when errcnt is non-zero,
// exactly as before (the early return was already commented out).
int main(){
    using namespace std;

    hls::stream<ap_fixed_axis<W,I,NUMBER_OF_KERNEL,1> > ins;
    hls::stream<ap_fixed_axis<W,I,NUMBER_OF_KERNEL,1> > ins2;
    hls::stream<float_axis<NUMBER_OF_KERNEL,1> > ins_soft;
    hls::stream<ap_fixed_axis<W,I,NUMBER_OF_KERNEL,1> > outs;
    hls::stream<ap_fixed_axis<W,I,NUMBER_OF_KERNEL,1> > outs2;
    hls::stream<float_axis<NUMBER_OF_KERNEL,1> > outs_soft;

    float mp_fout[H_PIXEL_WIDTH_OUT*V_PIXEL_WIDTH_OUT][NUMBER_OF_KERNEL];
    conv_type mp_out[H_PIXEL_WIDTH_OUT*V_PIXEL_WIDTH_OUT][NUMBER_OF_KERNEL];

    ap_fixed_axis<W,I,NUMBER_OF_KERNEL,1> pix;
    ap_fixed_axis<W,I,NUMBER_OF_KERNEL,1> pix2;
    float_axis<NUMBER_OF_KERNEL,1> fpix;

    // Dummy beats that precede the frame; all implementations must skip them.
    for(int i=0; i<5; i++){
        pix.user = 0;
        pix.last = 0; // previously never assigned for the dummy beats
        for(int k=0; k<NUMBER_OF_KERNEL; k++){
            pix.data[k] = (conv_type)i;
        }
        ins << pix;
        ins2 << pix;

        fpix.user = 0;
        fpix.last = 0; // previously never assigned for the dummy beats
        for(int k=0; k<NUMBER_OF_KERNEL; k++){
            fpix.data[k] = (float)i;
        }
        ins_soft << fpix;
    }

    // Push one frame into ins, ins2 and ins_soft, and dump channel 0 as CSV.
    ofstream OHX("relu_output_X0.csv");
    ofstream OHF("relu_output_F0.csv");
    for(int j=0; j < V_PIXEL_WIDTH_IN; j++){
        for(int i=0; i < H_PIXEL_WIDTH_IN; i++){
            for(int k=0; k<NUMBER_OF_KERNEL; k++){
                pix.data[k] = relu_out[j*H_PIXEL_WIDTH_IN+i][k];
                fpix.data[k] = relu_fout[j*H_PIXEL_WIDTH_IN+i][k];
            }

            OHX << pix.data[0];
            if(i != H_PIXEL_WIDTH_IN-1)
                OHX << ",";
            else
                OHX << endl;

            OHF << fpix.data[0];
            if(i != H_PIXEL_WIDTH_IN-1)
                OHF << ",";
            else
                OHF << endl;

            if (j==0 && i==0){ // TUSER is 1 on the first beat only
                pix.user = 1;
                fpix.user = 1;
            } else {
                pix.user = 0;
                fpix.user = 0;
            }

            if (i == H_PIXEL_WIDTH_IN-1){ // TLAST is 1 at the end of every row
                pix.last = 1;
                fpix.last = 1;
            } else {
                pix.last = 0;
                fpix.last = 0;
            }

            ins << pix;
            ins2 << pix;
            ins_soft << fpix;
        }
    }

    max_pooling(ins, outs);
    max_pooling_soft(ins_soft, outs_soft);
    max_pooling2(ins2, outs2);

    // Collect outs / outs_soft into mp_out[][] / mp_fout[][] and compare the
    // two fixed-point implementations.
    int errcnt = 0;
    for(int j=0; j < V_PIXEL_WIDTH_OUT; j++){
        for(int i=0; i < H_PIXEL_WIDTH_OUT; i++){
            outs >> pix;
            outs2 >> pix2;
            outs_soft >> fpix;

            for(int k=0; k<NUMBER_OF_KERNEL; k++){
                mp_out[j*H_PIXEL_WIDTH_OUT+i][k] = pix.data[k];
                mp_fout[j*H_PIXEL_WIDTH_OUT+i][k] = fpix.data[k];

                printf("%d, %d, data[%d] = %f, fdata[%d] = %f\n", j, i, k, (float)pix.data[k], k, fpix.data[k]);
                if (pix.data[k] != pix2.data[k]){
                    // i and j are int, so they are printed with %d (was %ld).
                    printf("ERROR HW and HW2 results mismatch i = %d, j = %d, HW[%d] = %f, HW2[%d] = %f, SW[%d] = %f\n", i, j, k, (float)pix.data[k], k, (float)pix2.data[k], k,fpix.data[k]);
                    errcnt++;
                    //return(1);
                }
            }
        }
    }
    cout << "Error Count = " << errcnt << endl;
    if (errcnt == 0) // only claim success when nothing mismatched
        cout << "Success HW and SW results match" << endl;
    cout << endl;

    // Write the max_pooling results to a header file.
    ofstream OH("max_pooling_output.h");
    OH << "// max_pooling_output.h" << endl;
    time_t now = time(0);
    struct tm* localNow = localtime(&now);
    OH << "// " << localNow->tm_year+1900 << "/" << localNow->tm_mon+1 << "/" << localNow->tm_mday;
    // setw() only affects the next insertion, so it is repeated per field.
    OH << " " << setfill('0') << setw(2) << localNow->tm_hour << ":" << setw(2) << localNow->tm_min << ":" << setw(2) << localNow->tm_sec << " by marsee" << endl;
    OH << "//" << endl;
    OH << endl;
    OH << "#ifndef __MAX_POOLING_OUTPUT_H__" << endl;
    OH << "#define __MAX_POOLING_OUTPUT_H__" << endl;
    OH << endl;

    OH << "const float mp_fout[" << V_PIXEL_WIDTH_OUT*H_PIXEL_WIDTH_OUT << "][" << NUMBER_OF_KERNEL << "] = {" << endl;
    for (int y=0; y<V_PIXEL_WIDTH_OUT ; y++){
        for (int x=0; x<H_PIXEL_WIDTH_OUT ; x++){
            OH << " {" << fixed << setprecision(12) << mp_fout[H_PIXEL_WIDTH_OUT*y+x][0];
            for (int i=1; i<NUMBER_OF_KERNEL; ++i)
            {
                OH << ", " << mp_fout[H_PIXEL_WIDTH_OUT*y+x][i];
            }
            OH << "}";
            if (y==V_PIXEL_WIDTH_OUT-1 && x==H_PIXEL_WIDTH_OUT-1)
                OH << endl;
            else
                OH << "," << endl;
        }
    }
    OH << "};" << endl << endl;

    // The emitted type follows W / I so it stays in sync with conv_type.
    OH << "const ap_fixed<" << W << ", " << I << ", AP_TRN, AP_WRAP> mp_out[" << V_PIXEL_WIDTH_OUT*H_PIXEL_WIDTH_OUT << "][" << NUMBER_OF_KERNEL << "] = {" << endl;
    for (int y=0; y<V_PIXEL_WIDTH_OUT ; y++){
        for (int x=0; x<H_PIXEL_WIDTH_OUT ; x++){
            OH << " {" << fixed << setprecision(12) << (float)mp_out[H_PIXEL_WIDTH_OUT*y+x][0];
            for(int i=1; i<NUMBER_OF_KERNEL; i++){
                OH << ", " << (float)mp_out[H_PIXEL_WIDTH_OUT*y+x][i];
            }
            OH << "}";
            if (y==V_PIXEL_WIDTH_OUT -1 && x==H_PIXEL_WIDTH_OUT -1)
                OH << endl;
            else
                OH << "," << endl;
        }
    }
    OH << "};" << endl << endl;
    OH << "#endif" << endl;

    return(0);
}
// Float reference model of max_pooling().
// Skips stream elements until one with user != 0 arrives, buffers one frame
// of V_PIXEL_WIDTH_IN x H_PIXEL_WIDTH_IN beats, then walks the frame in steps
// of Y_STRIDE / X_STRIDE and writes the maximum of every Y_STRIDE x X_STRIDE
// window to outs. user is 1 on the first output and last is 1 on the final
// output of every row. Always returns 0.
int max_pooling_soft(hls::stream<float_axis<NUMBER_OF_KERNEL,1> >& ins,
        hls::stream<float_axis<NUMBER_OF_KERNEL,1> >& outs){
    float_axis<NUMBER_OF_KERNEL,1> fpix;
    float fpixd_ary[NUMBER_OF_KERNEL][V_PIXEL_WIDTH_IN][H_PIXEL_WIDTH_IN];
    float fval[NUMBER_OF_KERNEL];

    do {
        // The frame starts when user becomes 1
        ins >> fpix;
    } while(fpix.user == 0);

    // Buffer the whole frame.
    for (int y=0; y<V_PIXEL_WIDTH_IN; y++){
        for (int x=0; x<H_PIXEL_WIDTH_IN; x++){
            if (!(x==0 && y==0)) // the first input has already been read
                ins >> fpix;
            for(int i=0; i<NUMBER_OF_KERNEL; i++){
                fpixd_ary[i][y][x] = fpix.data[i];
            }
        }
    }

    // (x, y) is the top-left corner of the current pooling window.
    for (int y=0; y<V_PIXEL_WIDTH_IN-1; y+=Y_STRIDE){
        for (int x=0; x<H_PIXEL_WIDTH_IN-1; x+=X_STRIDE){
            for(int p=0; p<NUMBER_OF_KERNEL; p++){
                for(int m=0; m<Y_STRIDE; m++){
                    for(int n=0; n<X_STRIDE; n++){
                        if(m==0 && n==0){
                            fval[p] = fpixd_ary[p][y][x];
                        } else if(fval[p] < fpixd_ary[p][y+m][x+n]){
                            fval[p] = fpixd_ary[p][y+m][x+n];
                        }
                    }
                }
            }
            for(int i=0; i<NUMBER_OF_KERNEL; i++){
                fpix.data[i] = fval[i];
            }

            if(x==0 && y==0)
                fpix.user = 1;
            else
                fpix.user = 0;

            // The last window of a row starts X_STRIDE columns before the end
            // of the input row. The former test against
            // V_PIXEL_WIDTH_OUT - X_STRIDE (= 1) could never match because x
            // only takes multiples of X_STRIDE, so last was never asserted.
            if(x == H_PIXEL_WIDTH_IN - X_STRIDE)
                fpix.last = 1;
            else
                fpix.last = 0;

            outs << fpix;
        }
    }
    return(0);
}
// Second fixed-point max pooling, used to cross-check max_pooling().
// Buffers one complete frame, computes the maximum of every
// Y_STRIDE x X_STRIDE window per channel into an intermediate array, and then
// streams that array out row by row (user = 1 on the first output, last = 1
// at the end of every output row). Always returns 0.
int max_pooling2(hls::stream<ap_fixed_axis<W,I,NUMBER_OF_KERNEL,1> >& ins,
        hls::stream<ap_fixed_axis<W,I,NUMBER_OF_KERNEL,1> >& outs){
    ap_fixed_axis<W,I,NUMBER_OF_KERNEL,1> beat;
    conv_type frame[NUMBER_OF_KERNEL][V_PIXEL_WIDTH_IN][H_PIXEL_WIDTH_IN];
    conv_type pooled[NUMBER_OF_KERNEL][V_PIXEL_WIDTH_OUT][H_PIXEL_WIDTH_OUT];
    ap_fixed_axis<W,I,NUMBER_OF_KERNEL,1> result;

    // Frame synchronisation: consume elements until user is non-zero.
    bool synced = false;
    while (!synced){
        ins >> beat;
        synced = !(beat.user == 0);
    }

    // Buffer the frame; the element that ended the synchronisation is (0,0).
    for (int row = 0; row < V_PIXEL_WIDTH_IN; ++row){
        for (int column = 0; column < H_PIXEL_WIDTH_IN; ++column){
            if (row != 0 || column != 0)
                ins >> beat;
            for (int ch = 0; ch < NUMBER_OF_KERNEL; ++ch){
                frame[ch][row][column] = beat.data[ch];
            }
        }
    }

    // Pooling: (top, left) is the first element of each window.
    for (int ch = 0; ch < NUMBER_OF_KERNEL; ++ch){
        for (int top = 0; top < V_PIXEL_WIDTH_IN; top += Y_STRIDE){
            for (int left = 0; left < H_PIXEL_WIDTH_IN; left += X_STRIDE){
                conv_type best = frame[ch][top][left];
                for (int dy = 0; dy < Y_STRIDE; ++dy){
                    for (int dx = 0; dx < X_STRIDE; ++dx){
                        if (best < frame[ch][top+dy][left+dx])
                            best = frame[ch][top+dy][left+dx];
                    }
                }
                pooled[ch][top/Y_STRIDE][left/X_STRIDE] = best;
            }
        }
    }

    // Stream the pooled frame out.
    for (int row = 0; row < V_PIXEL_WIDTH_OUT; ++row){
        for (int column = 0; column < H_PIXEL_WIDTH_OUT; ++column){
            for (int ch = 0; ch < NUMBER_OF_KERNEL; ++ch){
                result.data[ch] = pooled[ch][row][column];
            }
            result.user = (column == 0 && row == 0) ? 1 : 0;
            result.last = (column == (H_PIXEL_WIDTH_OUT-1)) ? 1 : 0;
            outs << result;
        }
    }
    return(0);
}
// conv_layer_tb.cpp
// 2018/02/13 by marsee (HLS stream)
// 2018/04/14 : HLS ストリーム対応
// 2018/04/24 : 検証用に異なる実装のconv_layer2()と比較
//
#include <stdio.h>
#include <stdlib.h>
#include <time.h>
#include <string.h>
#include <ap_int.h>
#include <hls_stream.h>
#include <iostream>
#include <fstream>
#include <iomanip>
#include <math.h>
#include <ap_axi_sdata.h>
#include <hls_video.h>
#include "layer_general.h"
#include "conv_layer.h"
#include "conv1_weight.h"
#include "conv1_bias.h"
#include "bmp_header.h"
// Fixed-point implementation under test.
int conv_layer(hls::stream<ap_axiu<32,1,1,1> >& ins,
hls::stream<ap_fixed_axis<W,I,NUMBER_OF_KERNEL,1> >& outs);
// Float reference model (defined after main() in this test bench).
int conv_layer_soft(hls::stream<ap_axiu<32,1,1,1> >& ins,
hls::stream<float_axis<NUMBER_OF_KERNEL,1> >& outs);
// Second fixed-point implementation used for cross-checking.
int conv_layer2(hls::stream<ap_axiu<32,1,1,1> >&ins,
hls::stream<ap_fixed_axis<W,I,NUMBER_OF_KERNEL,1> >&outs);
// Input image read by main().
#define BMP_FILE_NAME "straight_RED_rect0_00_rgb.bmp"
int main(){
using namespace std;
hls::stream<ap_axiu<32,1,1,1> > ins;
hls::stream<ap_axiu<32,1,1,1> > ins2;
hls::stream<ap_axiu<32,1,1,1> > ins_soft;
hls::stream<ap_fixed_axis<W,I,NUMBER_OF_KERNEL,1> > outs;
hls::stream<ap_fixed_axis<W,I,NUMBER_OF_KERNEL,1> > outs2;
hls::stream<float_axis<NUMBER_OF_KERNEL,1> > outs_soft;
ap_axiu<32,1,1,1> pix;
ap_fixed_axis<W,I,NUMBER_OF_KERNEL,1> vals;
ap_fixed_axis<W,I,NUMBER_OF_KERNEL,1> vals2;
float_axis<NUMBER_OF_KERNEL,1> vals_soft;
BITMAPFILEHEADER bmpfhr; // BMPファイルのファイルヘッダ(for Read)
BITMAPINFOHEADER bmpihr; // BMPファイルのINFOヘッダ(for Read)
FILE *fbmpr, *fbmpw, *fbmpwf;
int *rd_bmp;
int *hw_conv[NUMBER_OF_KERNEL];
int *sw_conv[NUMBER_OF_KERNEL];
float *hw_convf[NUMBER_OF_KERNEL];
float *sw_convf[NUMBER_OF_KERNEL];
int blue, green, red;
ap_uint<2> r_l;
char fhname[100];
char fsname[100];
if ((fbmpr = fopen(BMP_FILE_NAME, "rb")) == NULL){ // test.bmp をオープン
fprintf(stderr, "Can't open straight_RED_rect0_00.bmp by binary read mode\n");
exit(1);
}
// bmpヘッダの読み出し
fread(&bmpfhr.bfType, sizeof(uint16_t), 1, fbmpr);
fread(&bmpfhr.bfSize, sizeof(uint32_t), 1, fbmpr);
fread(&bmpfhr.bfReserved1, sizeof(uint16_t), 1, fbmpr);
fread(&bmpfhr.bfReserved2, sizeof(uint16_t), 1, fbmpr);
fread(&bmpfhr.bfOffBits, sizeof(uint32_t), 1, fbmpr);
fread(&bmpihr, sizeof(BITMAPINFOHEADER), 1, fbmpr);
// ピクセルを入れるメモリをアロケートする
if ((rd_bmp =(int *)malloc(sizeof(int) * (bmpihr.biWidth * bmpihr.biHeight))) == NULL){
fprintf(stderr, "Can't allocate rd_bmp memory\n");
exit(1);
}
for(int i=0; i<NUMBER_OF_KERNEL; i++){
if ((hw_conv[i] =(int *)malloc(sizeof(int) * (bmpihr.biWidth * bmpihr.biHeight))) == NULL){
fprintf(stderr, "Can't allocate hw_conv[%d] memory\n", i);
exit(1);
}
if ((sw_conv[i] =(int *)malloc(sizeof(int) * (bmpihr.biWidth * bmpihr.biHeight))) == NULL){
fprintf(stderr, "Can't allocate sw_conv[%d] memory\n", i);
exit(1);
}
}
for(int i=0; i<NUMBER_OF_KERNEL; i++){
if ((hw_convf[i] =(float *)malloc(sizeof(float) * (bmpihr.biWidth * bmpihr.biHeight))) == NULL){
fprintf(stderr, "Can't allocate hw_convf[%d] memory\n", i);
exit(1);
}
if ((sw_convf[i] =(float *)malloc(sizeof(float) * (bmpihr.biWidth * bmpihr.biHeight))) == NULL){
fprintf(stderr, "Can't allocate sw_convf[%d] memory\n", i);
exit(1);
}
}
// rd_bmp にBMPのピクセルを代入。その際に、行を逆転する必要がある
for (int y=0; y<bmpihr.biHeight; y++){
for (int x=0; x<bmpihr.biWidth; x++){
blue = fgetc(fbmpr);
green = fgetc(fbmpr);
red = fgetc(fbmpr);
rd_bmp[((bmpihr.biHeight-1)-y)*bmpihr.biWidth+x] = (blue & 0xff) | ((green & 0xff)<<8) | ((red & 0xff)<<16);
}
}
fclose(fbmpr);
// ins に入力データを用意する
for(int i=0; i<5; i++){ // dummy data
pix.user = 0;
pix.data = i;
ins << pix;
ins2 << pix;
ins_soft << pix;
}
// 1 画面分のデータを ins、ins_soft に入力する
for(int j=0; j < bmpihr.biHeight; j++){
for(int i=0; i < bmpihr.biWidth; i++){
pix.data = (ap_uint<32>)rd_bmp[(j*bmpihr.biWidth)+i];
if (j==0 && i==0) // 最初のデータの時に TUSER を 1 にする
pix.user = 1;
else
pix.user = 0;
if (i == bmpihr.biWidth-1) // 行の最後でTLASTをアサートする
pix.last = 1;
else
pix.last = 0;
ins << pix;
ins2 << pix;
ins_soft << pix;
}
}
// 畳み込み演算
conv_layer(ins, outs);
conv_layer2(ins2, outs2);
conv_layer_soft(ins_soft, outs_soft);
// 画像サイズの縮小(畳み込みをすると行、列共に -4
bmpfhr.bfSize = (HORIZONTAL_PIXEL_WIDTH-4) * (VERTICAL_PIXEL_WIDTH-4) * 3 + 54;
bmpihr.biHeight = VERTICAL_PIXEL_WIDTH - 4;
bmpihr.biWidth = HORIZONTAL_PIXEL_WIDTH - 4;
// ハードウェアとソフトウェアのラプラシアン・フィルタの値のチェック
out_type val[NUMBER_OF_KERNEL];
out_type val2[NUMBER_OF_KERNEL];
float val_soft[NUMBER_OF_KERNEL];
cout << endl;
cout << "outs" << endl;
int errcnt=0;
for(int j=0; j < bmpihr.biHeight; j++){
for(int i=0; i < bmpihr.biWidth; i++){
outs >> vals;
outs2 >> vals2;
outs_soft >> vals_soft;
for(int k=0; k<NUMBER_OF_KERNEL; k++){
val[k] = vals.data[k];
val2[k] = vals2.data[k];
val_soft[k] = vals_soft.data[k];
int *hw_convp = hw_conv[k];
int *sw_convp = sw_conv[k];
hw_convp[(j*bmpihr.biWidth)+i] = ((int)val[k]+32)*4; // 32を足して負の符号を排除し、整数部6ビットなので、2ビット分補正する
sw_convp[(j*bmpihr.biWidth)+i] = ((int)val_soft[k]+32)*4;
float *hw_convfp = hw_convf[k];
float *sw_convfp = sw_convf[k];
hw_convfp[(j*bmpihr.biWidth)+i] = (float)val[k];
sw_convfp[(j*bmpihr.biWidth)+i] = val_soft[k];
if (val[k] != val2[k]){
printf("ERROR val and val2 results mismatch i = %d, j = %d, val[%d] = %f, val2[%d] = %f\n", i, j, k, (float)val[k], k, (float)val2[k]);
errcnt++;
//return(1);
}
printf("HW and SW results i = %d, j = %d, HW[%d] = %f, HW2[%d] = %f, SW[%d] = %f\n", i, j, k, (float)val[k], k, (float)val2[k], k, val_soft[k]);
}
}
}
cout << "Error Count = " << errcnt << endl;
cout << "Success HW and SW results match" << endl;
cout << endl;
// ハードウェアの畳み込み演算の結果を temp_conv0.bmp, temp_conv1.bmp に出力する
for (int k=0; k<NUMBER_OF_KERNEL; k++){
if (k==0){
if ((fbmpw=fopen("temp_conv0.bmp", "wb")) == NULL){
fprintf(stderr, "Can't open temp_conv0.bmp by binary write mode\n");
exit(1);
}
} else {
if ((fbmpw=fopen("temp_conv1.bmp", "wb")) == NULL){
fprintf(stderr, "Can't open temp_conv1.bmp by binary write mode\n");
exit(1);
}
}
// BMPファイルヘッダの書き込み
fwrite(&bmpfhr.bfType, sizeof(uint16_t), 1, fbmpw);
fwrite(&bmpfhr.bfSize, sizeof(uint32_t), 1, fbmpw);
fwrite(&bmpfhr.bfReserved1, sizeof(uint16_t), 1, fbmpw);
fwrite(&bmpfhr.bfReserved2, sizeof(uint16_t), 1, fbmpw);
fwrite(&bmpfhr.bfOffBits, sizeof(uint32_t), 1, fbmpw);
fwrite(&bmpihr, sizeof(BITMAPINFOHEADER), 1, fbmpw);
// RGB データの書き込み、逆順にする
for (int y=0; y<bmpihr.biHeight; y++){
for (int x=0; x<bmpihr.biWidth; x++){
int *hw_convp = hw_conv[k];
blue = hw_convp[((bmpihr.biHeight-1)-y)*bmpihr.biWidth+x] & 0xff;
green = blue;
red = blue;
fputc(blue, fbmpw);
fputc(green, fbmpw);
fputc(red, fbmpw);
}
}
fclose(fbmpw);
}
// ソフトウェアの畳み込み演算の結果を temp_conv_float0.bmp, temp_conv_float1.bmp に出力する
for(int k=0; k<2; k++){
if (k == 0){
if ((fbmpwf=fopen("temp_conv_float0.bmp", "wb")) == NULL){
fprintf(stderr, "Can't open temp_conv_float0.bmp by binary write mode\n");
exit(1);
}
} else {
if ((fbmpwf=fopen("temp_conv_float1.bmp", "wb")) == NULL){
fprintf(stderr, "Can't open temp_conv_float1.bmp by binary write mode\n");
exit(1);
}
}
// BMPファイルヘッダの書き込み
fwrite(&bmpfhr.bfType, sizeof(uint16_t), 1, fbmpwf);
fwrite(&bmpfhr.bfSize, sizeof(uint32_t), 1, fbmpwf);
fwrite(&bmpfhr.bfReserved1, sizeof(uint16_t), 1, fbmpwf);
fwrite(&bmpfhr.bfReserved2, sizeof(uint16_t), 1, fbmpwf);
fwrite(&bmpfhr.bfOffBits, sizeof(uint32_t), 1, fbmpwf);
fwrite(&bmpihr, sizeof(BITMAPINFOHEADER), 1, fbmpwf);
// RGB データの書き込み、逆順にする
for (int y=0; y<bmpihr.biHeight; y++){
for (int x=0; x<bmpihr.biWidth; x++){
int *sw_convp = sw_conv[k];
blue = sw_convp[((bmpihr.biHeight-1)-y)*bmpihr.biWidth+x] & 0xff;
green = blue;
red = blue;
fputc(blue, fbmpwf);
fputc(green, fbmpwf);
fputc(red, fbmpwf);
}
}
fclose(fbmpwf);
}
// ヘッダ出力
ofstream OH("conv_layer_output.h");
OH << "// conv_layer_output.h" << endl;
time_t now = time(0);
struct tm* localNow = localtime(&now);
OH << "// " << localNow->tm_year+1900 << "/" << localNow->tm_mon+1 << "/" << localNow->tm_mday;
OH << " " << setw(2) << setfill('0') << localNow->tm_hour << ":" << localNow->tm_min << ":" << localNow->tm_sec << " by marsee" << endl;
OH << "//" << endl;
OH << endl;
OH << "#ifndef __CONV_LAYER_OUTPUT_H__" << endl;
OH << "#define __CONV_LAYER_OUTPUT_H__" << endl;
OH << endl;
OH << "const float conv_layer_fout[" << bmpihr.biHeight*bmpihr.biWidth << "][" << NUMBER_OF_KERNEL << "] = {" << endl;
for (int y=0; y<bmpihr.biHeight; y++){
for (int x=0; x<bmpihr.biWidth; x++){
OH << " {" << fixed << setprecision(12) << sw_convf[0][bmpihr.biWidth*y+x];
for(int i=1; i<NUMBER_OF_KERNEL; i++){
OH << ", " << sw_convf[i][bmpihr.biWidth*y+x];
}
OH << "}";
if (y==bmpihr.biHeight-1 && x==bmpihr.biWidth-1)
OH << endl;
else
OH << "," << endl;
}
}
OH << "};" << endl << endl;
OH << "const ap_fixed<16, 6, AP_TRN, AP_WRAP> conv_layer_out[" << bmpihr.biHeight*bmpihr.biWidth << "][" << NUMBER_OF_KERNEL << "] = {" << endl;
for (int y=0; y<bmpihr.biHeight; y++){
for (int x=0; x<bmpihr.biWidth; x++){
OH << " {" << hw_convf[0][bmpihr.biWidth*y+x];
for(int i=1; i<NUMBER_OF_KERNEL; i++){
OH << ", " << hw_convf[i][bmpihr.biWidth*y+x];
}
OH << "}";
if (y==bmpihr.biHeight-1 && x==bmpihr.biWidth-1)
OH << endl;
else
OH << "," << endl;
}
}
OH << "};" << endl << endl;
OH << "#endif" << endl;
free(rd_bmp);
for(int k=0; k<NUMBER_OF_KERNEL; k++){
free(hw_conv[k]);
free(sw_conv[k]);
free(hw_convf[k]);
free(sw_convf[k]);
}
return(0);
}
/**
 * Floating-point software model of the convolution layer (testbench reference).
 *
 * Consumes one frame of VERTICAL_PIXEL_WIDTH x HORIZONTAL_PIXEL_WIDTH pixels
 * from `ins`, keeps an ARRAY_SIZE x ARRAY_SIZE window of the most recent
 * samples and evaluates every kernel on it. A result is emitted only when
 * both the row and the column index are at least ARRAY_SIZE-1.
 *
 * @param ins  Input pixel stream. Elements are discarded until one arrives
 *             with user != 0; that element is used as the first pixel.
 * @param outs Output stream. user is 1 on the first emitted element, last is
 *             1 on the final column of every emitted row.
 * @return Always 0.
 */
int conv_layer_soft(hls::stream<ap_axiu<32,1,1,1> >& ins,
        hls::stream<float_axis<NUMBER_OF_KERNEL,1> >& outs){
    ap_axiu<32,1,1,1> in_beat;
    float_axis<NUMBER_OF_KERNEL,1> out_beat;
    hls::LineBuffer<ARRAY_SIZE-1, HORIZONTAL_PIXEL_WIDTH, float> prev_rows;
    hls::Window<ARRAY_SIZE, ARRAY_SIZE, float> window;
    float sample;

    // Wait for the start of the frame: skip elements whose user field is 0.
    ins >> in_beat;
    while (in_beat.user == 0)
        ins >> in_beat;

    Loop1: for (int row=0; row<VERTICAL_PIXEL_WIDTH; row++){
        Loop2: for (int col=0; col<HORIZONTAL_PIXEL_WIDTH; col++){
            // The very first pixel was already fetched by the wait loop above.
            if (row != 0 || col != 0)
                ins >> in_beat;

            // Only the low 8 bits of the input word are used, divided by 256.
            sample = (float)(in_beat.data & 0xff) / 256.0;

            // Slide the window one column to the left and fill its right-most
            // column from the stored rows plus the new sample.
            window.shift_pixels_left();
            for (int r=0; r<ARRAY_SIZE-1; r++){
                window.insert_pixel(prev_rows.getval(r,col), r, ARRAY_SIZE-1);
            }
            window.insert_pixel(sample, ARRAY_SIZE-1, ARRAY_SIZE-1);

            // Update the line buffer for this column.
            prev_rows.shift_pixels_up(col);
            prev_rows.insert_bottom_row(sample, col);

            // Evaluate each kernel: weighted sum over the window plus bias.
            for (int kernel=0; kernel<NUMBER_OF_KERNEL; kernel++){
                float acc = 0.0;
                for (int wr=0; wr<ARRAY_SIZE; wr++){
                    for (int wc=0; wc<ARRAY_SIZE; wc++){
                        acc += window.getval(wr,wc) * conv1_fweight[kernel][0][wr][wc];
                    }
                }
                acc += conv1_fbias[kernel];
                out_beat.data[kernel] = acc;
            }

            // The first ARRAY_SIZE-1 rows and the first ARRAY_SIZE-1 columns of
            // every row do not have a complete window, so nothing is emitted.
            if (col>=(ARRAY_SIZE-1) && row>=(ARRAY_SIZE-1)){
                // Assert user on the first valid element, last at the end of a row.
                out_beat.user = (col==(ARRAY_SIZE-1) && row==(ARRAY_SIZE-1)) ? 1 : 0;
                out_beat.last = (col == (HORIZONTAL_PIXEL_WIDTH-1)) ? 1 : 0;
                outs << out_beat;
            }
        }
    }
    return(0);
}
// conv_layer2(): verification model.
// Coded differently from conv_layer() on purpose so the two can be compared.
/**
 * Fixed-point convolution computed frame-at-a-time.
 *
 * Phase 1 reads a whole frame from `ins` into a local array, phase 2
 * evaluates every kernel at every window position that fits inside the
 * frame, phase 3 streams the results out in raster order.
 *
 * @param ins  Input pixel stream. Elements are discarded until one arrives
 *             with user != 0; that element is used as the first pixel.
 * @param outs Output stream. user is 1 on the first element, last is 1 on
 *             the final column of every output row.
 * @return Always 0.
 */
int conv_layer2(hls::stream<ap_axiu<32,1,1,1> >&ins,
        hls::stream<ap_fixed_axis<W,I,NUMBER_OF_KERNEL,1> >&outs){
    ap_axiu<32,1,1,1> in_beat;
    val_type conv_val[NUMBER_OF_KERNEL][VERTICAL_PIXEL_WIDTH][HORIZONTAL_PIXEL_WIDTH];
    in_type frame[VERTICAL_PIXEL_WIDTH][HORIZONTAL_PIXEL_WIDTH];
    ap_fixed_axis<W,I,NUMBER_OF_KERNEL,1> out_beat;

    // Number of window positions per column / per row.
    const int out_rows = VERTICAL_PIXEL_WIDTH-(ARRAY_SIZE-1);
    const int out_cols = HORIZONTAL_PIXEL_WIDTH-(ARRAY_SIZE-1);

    // Wait for the start of the frame: skip elements whose user field is 0.
    ins >> in_beat;
    Loop1: while (in_beat.user == 0)
        ins >> in_beat;

    // Phase 1: capture the frame (low 8 bits of each word, divided by 256).
    Loop2: for (int row=0; row<VERTICAL_PIXEL_WIDTH; row++){
        Loop3: for (int col=0; col<HORIZONTAL_PIXEL_WIDTH; col++){
            // The very first pixel was already fetched by the wait loop above.
            if (row != 0 || col != 0)
                ins >> in_beat;
            frame[row][col] = (in_type)((ap_ufixed<16, 8, AP_TRN, AP_WRAP>)(in_beat.data & 0xff) / 256);
        }
    }

    // Phase 2: weighted sum plus bias for every kernel and window position.
    for (int kernel=0; kernel<NUMBER_OF_KERNEL; kernel++){
        for (int row=0; row<out_rows; row++){
            for (int col=0; col<out_cols; col++){
                val_type acc = 0;
                for (int m=0; m<ARRAY_SIZE; m++){
                    for (int n=0; n<ARRAY_SIZE; n++){
                        acc += (val_type)frame[row+m][col+n] * (val_type)conv1_weight[kernel][0][m][n];
                    }
                }
                acc += (val_type)conv1_bias[kernel];
                conv_val[kernel][row][col] = acc;
            }
        }
    }

    // Phase 3: stream the results out in raster order.
    for (int row=0; row<out_rows; row++){
        for (int col=0; col<out_cols; col++){
            for (int kernel=0; kernel<NUMBER_OF_KERNEL; kernel++){
                out_beat.data[kernel] = conv_val[kernel][row][col];
            }
            // Assert user on the first element, last at the end of each row.
            out_beat.user = (col==0 && row==0) ? 1 : 0;
            out_beat.last = (col == out_cols-1) ? 1 : 0;
            outs << out_beat;
        }
    }
    return(0);
}
-- Port declaration of the `relu` entity.
-- NOTE(review): presumably the RTL interface Vivado HLS generated for the C++
-- relu() function that follows in this file -- confirm against the HLS report.
entity relu is
port (
-- Clock and reset inputs.
ap_clk : IN STD_LOGIC;
ap_rst : IN STD_LOGIC;
-- Start/done/idle/ready control and status signals.
ap_start : IN STD_LOGIC;
ap_done : OUT STD_LOGIC;
ap_idle : OUT STD_LOGIC;
ap_ready : OUT STD_LOGIC;
-- Input side: 34-bit data word with empty_n / read signals.
ins_V_dout : IN STD_LOGIC_VECTOR (33 downto 0);
ins_V_empty_n : IN STD_LOGIC;
ins_V_read : OUT STD_LOGIC;
-- Output side: 34-bit data word with full_n / write signals.
outs_V_din : OUT STD_LOGIC_VECTOR (33 downto 0);
outs_V_full_n : IN STD_LOGIC;
outs_V_write : OUT STD_LOGIC;
-- 32-bit return value.
ap_return : OUT STD_LOGIC_VECTOR (31 downto 0) );
end;
// relu.h
// 2018/02/20 by marsee (HLS stream)
//
// Shared constants and types for the ReLU layer and its testbench.
// This header includes nothing itself: size_t and ap_fixed must already be
// declared by the including file before this point.
#ifndef __RELU_H__
#define __RELU_H__
// Frame geometry handled by relu(): elements per row and number of rows.
static const size_t HORIZONTAL_PIXEL_WIDTH = 52;
static const size_t VERTICAL_PIXEL_WIDTH = 6;
// Total number of elements in one frame.
static const size_t ALL_PIXELS = HORIZONTAL_PIXEL_WIDTH * VERTICAL_PIXEL_WIDTH;
// Number of values carried by each stream element.
static const size_t NUMBER_OF_KERNEL = 2;
static const size_t ARRAY_SIZE = 2;
// Fixed-point format of the data: W total bits, I integer bits.
static const size_t W = 16;
static const size_t I = 6;
// Data type of one value: truncating quantization, wrap-around overflow.
typedef ap_fixed<W, I, AP_TRN, AP_WRAP> conv_type;
#endif
// relu.cpp
// 2018/04/15 by marsee (HLS stream)
//
#include <ap_int.h>
#include <hls_stream.h>
#include <ap_axi_sdata.h>
#include "layer_general.h"
#include "relu.h"
/**
 * ReLU layer: replaces every negative value of a frame with zero.
 *
 * Processes VERTICAL_PIXEL_WIDTH x HORIZONTAL_PIXEL_WIDTH stream elements,
 * each carrying NUMBER_OF_KERNEL values; the user and last fields of every
 * element are forwarded unchanged.
 *
 * @param ins  Input stream. Elements are discarded until one arrives with
 *             user != 0; that element is used as the first of the frame.
 * @param outs Output stream, one element written per input element of the frame.
 * @return Always 0.
 */
int relu(hls::stream<ap_fixed_axis<W,I,NUMBER_OF_KERNEL,1> >& ins,
        hls::stream<ap_fixed_axis<W,I,NUMBER_OF_KERNEL,1> >& outs){
#pragma HLS DATA_PACK variable=outs
#pragma HLS DATA_PACK variable=ins
    ap_fixed_axis<W,I,NUMBER_OF_KERNEL,1> beat;
    const conv_type zero(0.0);

    // Wait for the start of the frame: skip elements whose user field is 0.
    do {
#pragma HLS PIPELINE II=1
#pragma HLS LOOP_TRIPCOUNT min=1 max=1 avg=1
        ins >> beat;
    } while(beat.user == 0);

    Loop1: for (int row=0; row<VERTICAL_PIXEL_WIDTH; row++){
        Loop2: for (int col=0; col<HORIZONTAL_PIXEL_WIDTH; col++){
#pragma HLS PIPELINE II=1
            // The very first element was already fetched by the wait loop above.
            if (row != 0 || col != 0)
                ins >> beat;

            // Clamp negative values to zero; non-negative values pass through.
            for (int ch=0; ch<NUMBER_OF_KERNEL; ch++){
                if (beat.data[ch] < zero)
                    beat.data[ch] = zero;
            }
            outs << beat;
        }
    }
    return(0);
}
// relu_tb.cpp
// 2018/02/20 by marsee (HLS stream)
//
#include <stdio.h>
#include <stdlib.h>
#include <time.h>
#include <string.h>
#include <ap_int.h>
#include <hls_stream.h>
#include <iostream>
#include <fstream>
#include <iomanip>
#include <math.h>
#include <ap_axi_sdata.h>
#include <hls_video.h>
#include "layer_general.h"
#include "relu.h"
#include "conv_layer_output.h"
int relu(hls::stream<ap_fixed_axis<W,I,NUMBER_OF_KERNEL,1> >& ins,
hls::stream<ap_fixed_axis<W,I,NUMBER_OF_KERNEL,1> >& outs);
int relu_soft( hls::stream<float_axis<NUMBER_OF_KERNEL,1> >& ins,
hls::stream<float_axis<NUMBER_OF_KERNEL,1> >& outs);
/**
 * Testbench for relu().
 *
 * Feeds the same frame (taken from conv_layer_output.h) to the fixed-point
 * relu() and to the floating-point relu_soft(), compares the two outputs
 * element by element, and writes both result sets to relu_output.h.
 *
 * @return 0 when every squared HW/SW difference is at most 4, 1 on the first
 *         larger difference or if relu_output.h cannot be opened.
 */
int main(){
    using namespace std;
    hls::stream<ap_fixed_axis<W,I,NUMBER_OF_KERNEL,1> > ins;
    hls::stream<float_axis<NUMBER_OF_KERNEL,1> > ins_soft;
    hls::stream<ap_fixed_axis<W,I,NUMBER_OF_KERNEL,1> > outs;
    hls::stream<float_axis<NUMBER_OF_KERNEL,1> > outs_soft;
    float relu_fout[ALL_PIXELS][NUMBER_OF_KERNEL];
    conv_type relu_out[ALL_PIXELS][NUMBER_OF_KERNEL];
    ap_fixed_axis<W,I,NUMBER_OF_KERNEL,1> pix;
    float_axis<NUMBER_OF_KERNEL,1> fpix;

    // Prepare the input: five dummy elements with user = 0 come first so the
    // frame-start search in relu()/relu_soft() is exercised.
    for(int i=0; i<5; i++){
        pix.user = 0;
        for(int j=0; j<NUMBER_OF_KERNEL; j++){
            pix.data[j] = (conv_type)i;
        }
        ins << pix;
        fpix.user = 0;
        for(int j=0; j<NUMBER_OF_KERNEL; j++){
            fpix.data[j] = (float)i;
        }
        ins_soft << fpix;
    }

    // Push one frame of data into ins and ins_soft.
    for(int j=0; j < VERTICAL_PIXEL_WIDTH; j++){
        for(int i=0; i < HORIZONTAL_PIXEL_WIDTH; i++){
            for(int k=0; k<NUMBER_OF_KERNEL; k++){
                pix.data[k] = conv_layer_out[j*HORIZONTAL_PIXEL_WIDTH+i][k];
                fpix.data[k] = conv_layer_fout[j*HORIZONTAL_PIXEL_WIDTH+i][k];
            }
            if (j==0 && i==0){ // set TUSER to 1 on the first element
                pix.user = 1;
                fpix.user = 1;
            } else {
                pix.user = 0;
                fpix.user = 0;
            }
            if (i == HORIZONTAL_PIXEL_WIDTH-1){ // assert TLAST at the end of each row
                pix.last = 1;
                fpix.last = 1;
            } else {
                pix.last = 0;
                fpix.last = 0;
            }
            ins << pix;
            ins_soft << fpix;
        }
    }

    relu(ins, outs);
    relu_soft(ins_soft, outs_soft);

    // Drain outs and outs_soft into relu_out[][] and relu_fout[][], comparing as we go.
    int errcnt=0;
    for(int j=0; j < VERTICAL_PIXEL_WIDTH; j++){
        for(int i=0; i < HORIZONTAL_PIXEL_WIDTH; i++){
            outs >> pix;
            outs_soft >> fpix;
            for(int k=0; k<NUMBER_OF_KERNEL; k++){
                relu_out[j*HORIZONTAL_PIXEL_WIDTH+i][k] = pix.data[k];
                relu_fout[j*HORIZONTAL_PIXEL_WIDTH+i][k] = fpix.data[k];
                const double diff = (double)pix.data[k]-(double)fpix.data[k];
                if (diff*diff > 4){ // squared error larger than 4
                    printf("ERROR HW and SW results mismatch i = %d, j = %d, HW[%d] = %f, SW[%d] = %f\n", i, j, k, (float)pix.data[k], k, fpix.data[k]);
                    errcnt++;
                    return(1);
                }
                printf("HW and SW results i = %d, j = %d, HW[%d] = %f, SW[%d] = %f\n", i, j, k, (float)pix.data[k], k, fpix.data[k]);
            }
        }
    }
    cout << "Error Count = " << errcnt << endl;
    cout << "Success HW and SW results match" << endl;
    cout << endl;

    // Write the ReLU results to a header file.
    ofstream OH("relu_output.h");
    if (!OH){
        fprintf(stderr, "Can't open relu_output.h for writing\n");
        return(1);
    }
    OH << "// relu_output.h" << endl;
    time_t now = time(0);
    struct tm* localNow = localtime(&now);
    OH << "// " << localNow->tm_year+1900 << "/" << localNow->tm_mon+1 << "/" << localNow->tm_mday;
    // setw() only affects the next inserted field, so it is repeated for
    // hour, minute and second to zero-pad all three.
    OH << " " << setfill('0') << setw(2) << localNow->tm_hour << ":" << setw(2) << localNow->tm_min << ":" << setw(2) << localNow->tm_sec << " by marsee" << endl;
    OH << "//" << endl;
    OH << endl;
    OH << "#ifndef __RELU_OUTPUT_H__" << endl;
    OH << "#define __RELU_OUTPUT_H__" << endl;
    OH << endl;
    OH << "const float relu_fout[" << VERTICAL_PIXEL_WIDTH*HORIZONTAL_PIXEL_WIDTH << "][" << NUMBER_OF_KERNEL << "] = {" << endl;
    for (int y=0; y<VERTICAL_PIXEL_WIDTH ; y++){
        for (int x=0; x<HORIZONTAL_PIXEL_WIDTH ; x++){
            OH << " {" << fixed << setprecision(12) << relu_fout[HORIZONTAL_PIXEL_WIDTH*y+x][0];
            for(int i=1; i<NUMBER_OF_KERNEL; i++){
                OH << ", " << relu_fout[HORIZONTAL_PIXEL_WIDTH*y+x][i];
            }
            OH << "}";
            if (y==VERTICAL_PIXEL_WIDTH-1 && x==HORIZONTAL_PIXEL_WIDTH-1)
                OH << endl;
            else
                OH << "," << endl;
        }
    }
    OH << "};" << endl << endl;
    OH << "const ap_fixed<16, 6, AP_TRN, AP_WRAP> relu_out[" << VERTICAL_PIXEL_WIDTH*HORIZONTAL_PIXEL_WIDTH << "][" << NUMBER_OF_KERNEL << "] = {" << endl;
    for (int y=0; y<VERTICAL_PIXEL_WIDTH ; y++){
        for (int x=0; x<HORIZONTAL_PIXEL_WIDTH ; x++){
            OH << " {" << (float)relu_out[HORIZONTAL_PIXEL_WIDTH*y+x][0];
            for(int i=1; i<NUMBER_OF_KERNEL; i++){
                // Index by the loop variable, not a fixed 1, so every kernel is written.
                OH << ", " << (float)relu_out[HORIZONTAL_PIXEL_WIDTH*y+x][i];
            }
            OH << "}";
            if (y==VERTICAL_PIXEL_WIDTH -1 && x==HORIZONTAL_PIXEL_WIDTH -1)
                OH << endl;
            else
                OH << "," << endl;
        }
    }
    OH << "};" << endl << endl;
    OH << "#endif" << endl;
    return(0);
}
int relu_soft(hls::stream<float_axis<2,1> >& ins,
hls::stream<float_axis<2,1> >& outs){
float_axis<2,1> fpix;
do {
// user が 1になった時にフレームがスタートする
ins >> fpix;
} while(fpix.user == 0);
Loop1: for (int y=0; y<VERTICAL_PIXEL_WIDTH; y++){
Loop2: for (int x=0; x<HORIZONTAL_PIXEL_WIDTH; x++){
if (!(x==0 && y==0)) // 最初の入力はすでに入力されている
ins >> fpix; // AXI4-Stream からの入力
for(int i=0; i<NUMBER_OF_KERNEL; i++){
if (fpix.data[i] < 0.0) // データが 0 以下だったら 0 にする
fpix.data[i] = 0.0;
}
outs << fpix;
}
}
return(0);
}
masaaki@masaaki-H110M4-M01:~/DNN/darknet$ ./darknet classify cfg/tiny.cfg tiny.weights data/dog.jpg
layer filters size input output
0 conv 16 3 x 3 / 1 224 x 224 x 3 -> 224 x 224 x 16 0.043 BFLOPs
1 max 2 x 2 / 2 224 x 224 x 16 -> 112 x 112 x 16
2 conv 32 3 x 3 / 1 112 x 112 x 16 -> 112 x 112 x 32 0.116 BFLOPs
3 max 2 x 2 / 2 112 x 112 x 32 -> 56 x 56 x 32
4 conv 16 1 x 1 / 1 56 x 56 x 32 -> 56 x 56 x 16 0.003 BFLOPs
5 conv 128 3 x 3 / 1 56 x 56 x 16 -> 56 x 56 x 128 0.116 BFLOPs
6 conv 16 1 x 1 / 1 56 x 56 x 128 -> 56 x 56 x 16 0.013 BFLOPs
7 conv 128 3 x 3 / 1 56 x 56 x 16 -> 56 x 56 x 128 0.116 BFLOPs
8 max 2 x 2 / 2 56 x 56 x 128 -> 28 x 28 x 128
9 conv 32 1 x 1 / 1 28 x 28 x 128 -> 28 x 28 x 32 0.006 BFLOPs
10 conv 256 3 x 3 / 1 28 x 28 x 32 -> 28 x 28 x 256 0.116 BFLOPs
11 conv 32 1 x 1 / 1 28 x 28 x 256 -> 28 x 28 x 32 0.013 BFLOPs
12 conv 256 3 x 3 / 1 28 x 28 x 32 -> 28 x 28 x 256 0.116 BFLOPs
13 max 2 x 2 / 2 28 x 28 x 256 -> 14 x 14 x 256
14 conv 64 1 x 1 / 1 14 x 14 x 256 -> 14 x 14 x 64 0.006 BFLOPs
15 conv 512 3 x 3 / 1 14 x 14 x 64 -> 14 x 14 x 512 0.116 BFLOPs
16 conv 64 1 x 1 / 1 14 x 14 x 512 -> 14 x 14 x 64 0.013 BFLOPs
17 conv 512 3 x 3 / 1 14 x 14 x 64 -> 14 x 14 x 512 0.116 BFLOPs
18 conv 128 1 x 1 / 1 14 x 14 x 512 -> 14 x 14 x 128 0.026 BFLOPs
19 conv 1000 1 x 1 / 1 14 x 14 x 128 -> 14 x 14 x1000 0.050 BFLOPs
20 avg 14 x 14 x1000 -> 1000
21 softmax 1000
22 cost 1000
Loading weights from tiny.weights...Done!
data/dog.jpg: Predicted in 0.388883 seconds.
14.51%: malamute
6.09%: Newfoundland
5.59%: dogsled
4.55%: standard schnauzer
4.05%: Eskimo dog
data/eagle.jpg: Predicted in 0.384116 seconds.
54.11%: bald eagle
12.01%: ruddy turnstone
11.61%: kite
8.80%: hen
4.15%: vulture
data/giraffe.jpg: Predicted in 0.378156 seconds.
29.71%: zebra
8.75%: tiger cat
7.81%: great grey owl
6.33%: prairie chicken
4.63%: bustard
masaaki@masaaki-H110M4-M01:~/DNN/darknet$ ./darknet classifier predict cfg/imagenet1k.data cfg/extraction.cfg extraction.weights data/dog.jpg
layer filters size input output
0 conv 64 7 x 7 / 2 224 x 224 x 3 -> 112 x 112 x 64 0.236 BFLOPs
1 max 2 x 2 / 2 112 x 112 x 64 -> 56 x 56 x 64
2 conv 192 3 x 3 / 1 56 x 56 x 64 -> 56 x 56 x 192 0.694 BFLOPs
3 max 2 x 2 / 2 56 x 56 x 192 -> 28 x 28 x 192
4 conv 128 1 x 1 / 1 28 x 28 x 192 -> 28 x 28 x 128 0.039 BFLOPs
5 conv 256 3 x 3 / 1 28 x 28 x 128 -> 28 x 28 x 256 0.462 BFLOPs
6 conv 256 1 x 1 / 1 28 x 28 x 256 -> 28 x 28 x 256 0.103 BFLOPs
7 conv 512 3 x 3 / 1 28 x 28 x 256 -> 28 x 28 x 512 1.850 BFLOPs
8 max 2 x 2 / 2 28 x 28 x 512 -> 14 x 14 x 512
9 conv 256 1 x 1 / 1 14 x 14 x 512 -> 14 x 14 x 256 0.051 BFLOPs
10 conv 512 3 x 3 / 1 14 x 14 x 256 -> 14 x 14 x 512 0.462 BFLOPs
11 conv 256 1 x 1 / 1 14 x 14 x 512 -> 14 x 14 x 256 0.051 BFLOPs
12 conv 512 3 x 3 / 1 14 x 14 x 256 -> 14 x 14 x 512 0.462 BFLOPs
13 conv 256 1 x 1 / 1 14 x 14 x 512 -> 14 x 14 x 256 0.051 BFLOPs
14 conv 512 3 x 3 / 1 14 x 14 x 256 -> 14 x 14 x 512 0.462 BFLOPs
15 conv 256 1 x 1 / 1 14 x 14 x 512 -> 14 x 14 x 256 0.051 BFLOPs
16 conv 512 3 x 3 / 1 14 x 14 x 256 -> 14 x 14 x 512 0.462 BFLOPs
17 conv 512 1 x 1 / 1 14 x 14 x 512 -> 14 x 14 x 512 0.103 BFLOPs
18 conv 1024 3 x 3 / 1 14 x 14 x 512 -> 14 x 14 x1024 1.850 BFLOPs
19 max 2 x 2 / 2 14 x 14 x1024 -> 7 x 7 x1024
20 conv 512 1 x 1 / 1 7 x 7 x1024 -> 7 x 7 x 512 0.051 BFLOPs
21 conv 1024 3 x 3 / 1 7 x 7 x 512 -> 7 x 7 x1024 0.462 BFLOPs
22 conv 512 1 x 1 / 1 7 x 7 x1024 -> 7 x 7 x 512 0.051 BFLOPs
23 conv 1024 3 x 3 / 1 7 x 7 x 512 -> 7 x 7 x1024 0.462 BFLOPs
24 conv 1000 1 x 1 / 1 7 x 7 x1024 -> 7 x 7 x1000 0.100 BFLOPs
25 avg 7 x 7 x1000 -> 1000
26 softmax 1000
27 cost 1000
Loading weights from extraction.weights...Done!
data/dog.jpg: Predicted in 2.876122 seconds.
12.77%: malamute
10.03%: Siberian husky
7.23%: Eskimo dog
4.91%: miniature schnauzer
4.88%: Afghan hound
Loading weights from extraction.weights...Done!
data/eagle.jpg: Predicted in 2.890507 seconds.
61.74%: bald eagle
36.86%: kite
0.48%: vulture
0.19%: ptarmigan
0.14%: hen
Loading weights from extraction.weights...Done!
data/giraffe.jpg: Predicted in 2.897918 seconds.
28.03%: zebra
14.40%: bustard
11.90%: gazelle
6.38%: cheetah
5.97%: impala
-- Port declaration of the `conv_layer` entity.
-- NOTE(review): presumably the RTL interface Vivado HLS generated for the C++
-- conv_layer() function in this file -- confirm against the HLS report.
entity conv_layer is
port (
-- Clock and active-low-named reset inputs.
ap_clk : IN STD_LOGIC;
ap_rst_n : IN STD_LOGIC;
-- Start/done/idle/ready control and status signals.
ap_start : IN STD_LOGIC;
ap_done : OUT STD_LOGIC;
ap_idle : OUT STD_LOGIC;
ap_ready : OUT STD_LOGIC;
-- Input side: AXI4-Stream style signals with 32-bit TDATA and 1-bit
-- TUSER / TLAST / TID / TDEST side channels.
ins_TDATA : IN STD_LOGIC_VECTOR (31 downto 0);
ins_TVALID : IN STD_LOGIC;
ins_TREADY : OUT STD_LOGIC;
ins_TKEEP : IN STD_LOGIC_VECTOR (3 downto 0);
ins_TSTRB : IN STD_LOGIC_VECTOR (3 downto 0);
ins_TUSER : IN STD_LOGIC_VECTOR (0 downto 0);
ins_TLAST : IN STD_LOGIC_VECTOR (0 downto 0);
ins_TID : IN STD_LOGIC_VECTOR (0 downto 0);
ins_TDEST : IN STD_LOGIC_VECTOR (0 downto 0);
-- Output side: 34-bit data word with full_n / write signals.
outs_V_din : OUT STD_LOGIC_VECTOR (33 downto 0);
outs_V_full_n : IN STD_LOGIC;
outs_V_write : OUT STD_LOGIC;
-- 32-bit return value.
ap_return : OUT STD_LOGIC_VECTOR (31 downto 0) );
end;
/opt/Xilinx/Vivado/2017.4/include/floating_point_v7_0_bitacc_cmodel.h:245:45: error: ‘mpfr_srcptr’ has not been declared
/opt/Xilinx/Vivado/2017.4/include/floating_point_v7_0_bitacc_cmodel.h:246:53: error: ‘mpfr_srcptr’ has not been declared
make: *** [obj/conv_layer.cpp_pre.cpp.tb.o] エラー 1
ERROR: [COSIM 212-317] C++ compile error.
ERROR: [COSIM 212-321] EXE file generate failed.
ERROR: [COSIM 212-321] EXE file generate failed.
ERROR: [COSIM 212-331] Aborting co-simulation: C simulation failed, compilation errors.
ERROR: [COSIM 212-4] *** C/RTL co-simulation finished: FAIL ***
command 'ap_source' returned error code
while executing
"source /home/masaaki/Vivado_HLS/ZYBO-Z7-20/hlss_cnn/conv_layer/solution1/cosim.tcl"
invoked from within
"hls::main /home/masaaki/Vivado_HLS/ZYBO-Z7-20/hlss_cnn/conv_layer/solution1/cosim.tcl"
("uplevel" body line 1)
invoked from within
"uplevel 1 hls::main {*}$args"
(procedure "hls_proc" line 5)
invoked from within
"hls_proc $argv"
Finished C/RTL cosimulation.
// conv_layer_tb.cpp
// 2018/02/13 by marsee
// 2018/04/14 : HLS ストリーム対応
// 2018/04/24 : 検証用に異なる実装のconv_layer2()と比較
//
#include <stdio.h>
#include <stdlib.h>
#include <time.h>
#include <string.h>
#include <ap_int.h>
#include <hls_stream.h>
#include <iostream>
#include <fstream>
#include <iomanip>
#include <math.h>
#include <ap_axi_sdata.h>
#include <hls_video.h>
#include "layer_general.h"
#include "conv_layer.h"
#include "bmp_header.h"
int conv_layer(hls::stream<ap_axiu<32,1,1,1> >& ins,
hls::stream<ap_fixed_axis<W,I,NUMBER_OF_KERNEL,1> >& outs);
int conv_layer_soft(hls::stream<ap_axiu<32,1,1,1> >& ins,
hls::stream<float_axis<NUMBER_OF_KERNEL,1> >& outs);
int conv_layer2(hls::stream<ap_axiu<32,1,1,1> >&ins,
hls::stream<ap_fixed_axis<W,I,NUMBER_OF_KERNEL,1> >&outs);
#define BMP_FILE_NAME "straight_RED_rect0_00_rgb.bmp"
int main(){
using namespace std;
hls::stream<ap_axiu<32,1,1,1> > ins;
hls::stream<ap_axiu<32,1,1,1> > ins2;
hls::stream<ap_axiu<32,1,1,1> > ins_soft;
hls::stream<ap_fixed_axis<W,I,NUMBER_OF_KERNEL,1> > outs;
hls::stream<ap_fixed_axis<W,I,NUMBER_OF_KERNEL,1> > outs2;
hls::stream<float_axis<NUMBER_OF_KERNEL,1> > outs_soft;
ap_axiu<32,1,1,1> pix;
ap_fixed_axis<W,I,NUMBER_OF_KERNEL,1> vals;
ap_fixed_axis<W,I,NUMBER_OF_KERNEL,1> vals2;
float_axis<NUMBER_OF_KERNEL,1> vals_soft;
BITMAPFILEHEADER bmpfhr; // BMPファイルのファイルヘッダ(for Read)
BITMAPINFOHEADER bmpihr; // BMPファイルのINFOヘッダ(for Read)
FILE *fbmpr, *fbmpw, *fbmpwf;
int *rd_bmp;
int *hw_conv[NUMBER_OF_KERNEL];
int *sw_conv[NUMBER_OF_KERNEL];
float *hw_convf[NUMBER_OF_KERNEL];
float *sw_convf[NUMBER_OF_KERNEL];
int blue, green, red;
ap_uint<2> r_l;
char fhname[100];
char fsname[100];
if ((fbmpr = fopen(BMP_FILE_NAME, "rb")) == NULL){ // test.bmp をオープン
fprintf(stderr, "Can't open straight_RED_rect0_00.bmp by binary read mode\n");
exit(1);
}
// bmpヘッダの読み出し
fread(&bmpfhr.bfType, sizeof(uint16_t), 1, fbmpr);
fread(&bmpfhr.bfSize, sizeof(uint32_t), 1, fbmpr);
fread(&bmpfhr.bfReserved1, sizeof(uint16_t), 1, fbmpr);
fread(&bmpfhr.bfReserved2, sizeof(uint16_t), 1, fbmpr);
fread(&bmpfhr.bfOffBits, sizeof(uint32_t), 1, fbmpr);
fread(&bmpihr, sizeof(BITMAPINFOHEADER), 1, fbmpr);
// ピクセルを入れるメモリをアロケートする
if ((rd_bmp =(int *)malloc(sizeof(int) * (bmpihr.biWidth * bmpihr.biHeight))) == NULL){
fprintf(stderr, "Can't allocate rd_bmp memory\n");
exit(1);
}
for(int i=0; i<NUMBER_OF_KERNEL; i++){
if ((hw_conv[i] =(int *)malloc(sizeof(int) * (bmpihr.biWidth * bmpihr.biHeight))) == NULL){
fprintf(stderr, "Can't allocate hw_conv[%d] memory\n", i);
exit(1);
}
if ((sw_conv[i] =(int *)malloc(sizeof(int) * (bmpihr.biWidth * bmpihr.biHeight))) == NULL){
fprintf(stderr, "Can't allocate sw_conv[%d] memory\n", i);
exit(1);
}
}
for(int i=0; i<NUMBER_OF_KERNEL; i++){
if ((hw_convf[i] =(float *)malloc(sizeof(float) * (bmpihr.biWidth * bmpihr.biHeight))) == NULL){
fprintf(stderr, "Can't allocate hw_convf[%d] memory\n", i);
exit(1);
}
if ((sw_convf[i] =(float *)malloc(sizeof(float) * (bmpihr.biWidth * bmpihr.biHeight))) == NULL){
fprintf(stderr, "Can't allocate sw_convf[%d] memory\n", i);
exit(1);
}
}
// rd_bmp にBMPのピクセルを代入。その際に、行を逆転する必要がある
for (int y=0; y<bmpihr.biHeight; y++){
for (int x=0; x<bmpihr.biWidth; x++){
blue = fgetc(fbmpr);
green = fgetc(fbmpr);
red = fgetc(fbmpr);
rd_bmp[((bmpihr.biHeight-1)-y)*bmpihr.biWidth+x] = (blue & 0xff) | ((green & 0xff)<<8) | ((red & 0xff)<<16);
}
}
fclose(fbmpr);
// ins に入力データを用意する
for(int i=0; i<5; i++){ // dummy data
pix.user = 0;
pix.data = i;
ins << pix;
ins2 << pix;
ins_soft << pix;
}
// 1 画面分のデータを ins、ins_soft に入力する
for(int j=0; j < bmpihr.biHeight; j++){
for(int i=0; i < bmpihr.biWidth; i++){
pix.data = (ap_uint<32>)rd_bmp[(j*bmpihr.biWidth)+i];
if (j==0 && i==0) // 最初のデータの時に TUSER を 1 にする
pix.user = 1;
else
pix.user = 0;
if (i == bmpihr.biWidth-1) // 行の最後でTLASTをアサートする
pix.last = 1;
else
pix.last = 0;
ins << pix;
ins2 << pix;
ins_soft << pix;
}
}
// 畳み込み演算
conv_layer(ins, outs);
conv_layer2(ins2, outs2);
conv_layer_soft(ins_soft, outs_soft);
// 画像サイズの縮小(畳み込みをすると行、列共に -4
bmpfhr.bfSize = (HORIZONTAL_PIXEL_WIDTH-4) * (VERTICAL_PIXEL_WIDTH-4) * 3 + 54;
bmpihr.biHeight = VERTICAL_PIXEL_WIDTH - 4;
bmpihr.biWidth = HORIZONTAL_PIXEL_WIDTH - 4;
// ハードウェアとソフトウェアのラプラシアン・フィルタの値のチェック
out_type val[NUMBER_OF_KERNEL];
out_type val2[NUMBER_OF_KERNEL];
float val_soft[NUMBER_OF_KERNEL];
cout << endl;
cout << "outs" << endl;
int errcnt=0;
for(int j=0; j < bmpihr.biHeight; j++){
for(int i=0; i < bmpihr.biWidth; i++){
outs >> vals;
outs2 >> vals2;
outs_soft >> vals_soft;
for(int k=0; k<NUMBER_OF_KERNEL; k++){
val[k] = vals.data[k];
val2[k] = vals2.data[k];
val_soft[k] = vals_soft.data[k];
int *hw_convp = hw_conv[k];
int *sw_convp = sw_conv[k];
hw_convp[(j*bmpihr.biWidth)+i] = ((int)val[k]+32)*4; // 32を足して負の符号を排除し、整数部6ビットなので、2ビット分補正する
sw_convp[(j*bmpihr.biWidth)+i] = ((int)val_soft[k]+32)*4;
float *hw_convfp = hw_convf[k];
float *sw_convfp = sw_convf[k];
hw_convfp[(j*bmpihr.biWidth)+i] = (float)val[k];
sw_convfp[(j*bmpihr.biWidth)+i] = val_soft[k];
if (val[k] != val2[k]){
printf("ERROR val and val2 results mismatch i = %d, j = %d, val[%d] = %f, val2[%d] = %f\n", i, j, k, (float)val[k], k, (float)val2[k]);
errcnt++;
//return(1);
}
printf("HW and SW results i = %d, j = %d, HW[%d] = %f, HW2[%d] = %f, SW[%d] = %f\n", i, j, k, (float)val[k], k, (float)val2[k], k, val_soft[k]);
}
}
}
cout << "Error Count = " << errcnt << endl;
cout << "Success HW and SW results match" << endl;
cout << endl;
// ハードウェアの畳み込み演算の結果を temp_conv0.bmp, temp_conv1.bmp に出力する
for (int k=0; k<NUMBER_OF_KERNEL; k++){
if (k==0){
if ((fbmpw=fopen("temp_conv0.bmp", "wb")) == NULL){
fprintf(stderr, "Can't open temp_conv0.bmp by binary write mode\n");
exit(1);
}
} else {
if ((fbmpw=fopen("temp_conv1.bmp", "wb")) == NULL){
fprintf(stderr, "Can't open temp_conv1.bmp by binary write mode\n");
exit(1);
}
}
// BMPファイルヘッダの書き込み
fwrite(&bmpfhr.bfType, sizeof(uint16_t), 1, fbmpw);
fwrite(&bmpfhr.bfSize, sizeof(uint32_t), 1, fbmpw);
fwrite(&bmpfhr.bfReserved1, sizeof(uint16_t), 1, fbmpw);
fwrite(&bmpfhr.bfReserved2, sizeof(uint16_t), 1, fbmpw);
fwrite(&bmpfhr.bfOffBits, sizeof(uint32_t), 1, fbmpw);
fwrite(&bmpihr, sizeof(BITMAPINFOHEADER), 1, fbmpw);
// RGB データの書き込み、逆順にする
for (int y=0; y<bmpihr.biHeight; y++){
for (int x=0; x<bmpihr.biWidth; x++){
int *hw_convp = hw_conv[k];
blue = hw_convp[((bmpihr.biHeight-1)-y)*bmpihr.biWidth+x] & 0xff;
green = blue;
red = blue;
fputc(blue, fbmpw);
fputc(green, fbmpw);
fputc(red, fbmpw);
}
}
fclose(fbmpw);
}
// ソフトウェアの畳み込み演算の結果を temp_conv_float0.bmp, temp_conv_float1.bmp に出力する
for(int k=0; k<2; k++){
if (k == 0){
if ((fbmpwf=fopen("temp_conv_float0.bmp", "wb")) == NULL){
fprintf(stderr, "Can't open temp_conv_float0.bmp by binary write mode\n");
exit(1);
}
} else {
if ((fbmpwf=fopen("temp_conv_float1.bmp", "wb")) == NULL){
fprintf(stderr, "Can't open temp_conv_float1.bmp by binary write mode\n");
exit(1);
}
}
// BMPファイルヘッダの書き込み
fwrite(&bmpfhr.bfType, sizeof(uint16_t), 1, fbmpwf);
fwrite(&bmpfhr.bfSize, sizeof(uint32_t), 1, fbmpwf);
fwrite(&bmpfhr.bfReserved1, sizeof(uint16_t), 1, fbmpwf);
fwrite(&bmpfhr.bfReserved2, sizeof(uint16_t), 1, fbmpwf);
fwrite(&bmpfhr.bfOffBits, sizeof(uint32_t), 1, fbmpwf);
fwrite(&bmpihr, sizeof(BITMAPINFOHEADER), 1, fbmpwf);
// RGB データの書き込み、逆順にする
for (int y=0; y<bmpihr.biHeight; y++){
for (int x=0; x<bmpihr.biWidth; x++){
int *sw_convp = sw_conv[k];
blue = sw_convp[((bmpihr.biHeight-1)-y)*bmpihr.biWidth+x] & 0xff;
green = blue;
red = blue;
fputc(blue, fbmpwf);
fputc(green, fbmpwf);
fputc(red, fbmpwf);
}
}
fclose(fbmpwf);
}
// ヘッダ出力
ofstream OH("conv_layer_output.h");
OH << "// conv_layer_output.h" << endl;
time_t now = time(0);
struct tm* localNow = localtime(&now);
OH << "// " << localNow->tm_year+1900 << "/" << localNow->tm_mon+1 << "/" << localNow->tm_mday;
OH << " " << setw(2) << setfill('0') << localNow->tm_hour << ":" << localNow->tm_min << ":" << localNow->tm_sec << " by marsee" << endl;
OH << "//" << endl;
OH << endl;
OH << "#ifndef __CONV_LAYER_OUTPUT_H__" << endl;
OH << "#define __CONV_LAYER_OUTPUT_H__" << endl;
OH << endl;
OH << "const float conv_layer_fout[" << bmpihr.biHeight*bmpihr.biWidth << "][" << NUMBER_OF_KERNEL << "] = {" << endl;
for (int y=0; y<bmpihr.biHeight; y++){
for (int x=0; x<bmpihr.biWidth; x++){
OH << " {" << fixed << setprecision(12) << sw_convf[0][bmpihr.biWidth*y+x];
for(int i=1; i<NUMBER_OF_KERNEL; i++){
OH << ", " << sw_convf[i][bmpihr.biWidth*y+x];
}
OH << "}";
if (y==bmpihr.biHeight-1 && x==bmpihr.biWidth-1)
OH << endl;
else
OH << "," << endl;
}
}
OH << "};" << endl << endl;
OH << "const ap_fixed<16, 6, AP_TRN, AP_WRAP> conv_layer_out[" << bmpihr.biHeight*bmpihr.biWidth << "][" << NUMBER_OF_KERNEL << "] = {" << endl;
for (int y=0; y<bmpihr.biHeight; y++){
for (int x=0; x<bmpihr.biWidth; x++){
OH << " {" << hw_convf[0][bmpihr.biWidth*y+x];
for(int i=1; i<NUMBER_OF_KERNEL; i++){
OH << ", " << hw_convf[i][bmpihr.biWidth*y+x];
}
OH << "}";
if (y==bmpihr.biHeight-1 && x==bmpihr.biWidth-1)
OH << endl;
else
OH << "," << endl;
}
}
OH << "};" << endl << endl;
OH << "#endif" << endl;
free(rd_bmp);
for(int k=0; k<NUMBER_OF_KERNEL; k++){
free(hw_conv[k]);
free(sw_conv[k]);
free(hw_convf[k]);
free(sw_convf[k]);
}
return(0);
}
/**
 * Floating-point software model of the convolution layer (testbench reference).
 *
 * Consumes one frame of VERTICAL_PIXEL_WIDTH x HORIZONTAL_PIXEL_WIDTH pixels
 * from `ins`, keeps an ARRAY_SIZE x ARRAY_SIZE window of the most recent
 * samples and evaluates every kernel on it. A result is emitted only when
 * both the row and the column index are at least ARRAY_SIZE-1.
 *
 * @param ins  Input pixel stream. Elements are discarded until one arrives
 *             with user != 0; that element is used as the first pixel.
 * @param outs Output stream. user is 1 on the first emitted element, last is
 *             1 on the final column of every emitted row.
 * @return Always 0.
 */
int conv_layer_soft(hls::stream<ap_axiu<32,1,1,1> >& ins,
        hls::stream<float_axis<NUMBER_OF_KERNEL,1> >& outs){
    ap_axiu<32,1,1,1> in_beat;
    float_axis<NUMBER_OF_KERNEL,1> out_beat;
    hls::LineBuffer<ARRAY_SIZE-1, HORIZONTAL_PIXEL_WIDTH, float> prev_rows;
    hls::Window<ARRAY_SIZE, ARRAY_SIZE, float> window;
    float sample;

    // Wait for the start of the frame: skip elements whose user field is 0.
    ins >> in_beat;
    while (in_beat.user == 0)
        ins >> in_beat;

    Loop1: for (int row=0; row<VERTICAL_PIXEL_WIDTH; row++){
        Loop2: for (int col=0; col<HORIZONTAL_PIXEL_WIDTH; col++){
            // The very first pixel was already fetched by the wait loop above.
            if (row != 0 || col != 0)
                ins >> in_beat;

            // Only the low 8 bits of the input word are used, divided by 256.
            sample = (float)(in_beat.data & 0xff) / 256.0;

            // Slide the window one column to the left and fill its right-most
            // column from the stored rows plus the new sample.
            window.shift_pixels_left();
            for (int r=0; r<ARRAY_SIZE-1; r++){
                window.insert_pixel(prev_rows.getval(r,col), r, ARRAY_SIZE-1);
            }
            window.insert_pixel(sample, ARRAY_SIZE-1, ARRAY_SIZE-1);

            // Update the line buffer for this column.
            prev_rows.shift_pixels_up(col);
            prev_rows.insert_bottom_row(sample, col);

            // Evaluate each kernel: weighted sum over the window plus bias.
            for (int kernel=0; kernel<NUMBER_OF_KERNEL; kernel++){
                float acc = 0.0;
                for (int wr=0; wr<ARRAY_SIZE; wr++){
                    for (int wc=0; wc<ARRAY_SIZE; wc++){
                        acc += window.getval(wr,wc) * conv1_fweight[kernel][0][wr][wc];
                    }
                }
                acc += conv1_fbias[kernel];
                out_beat.data[kernel] = acc;
            }

            // The first ARRAY_SIZE-1 rows and the first ARRAY_SIZE-1 columns of
            // every row do not have a complete window, so nothing is emitted.
            if (col>=(ARRAY_SIZE-1) && row>=(ARRAY_SIZE-1)){
                // Assert user on the first valid element, last at the end of a row.
                out_beat.user = (col==(ARRAY_SIZE-1) && row==(ARRAY_SIZE-1)) ? 1 : 0;
                out_beat.last = (col == (HORIZONTAL_PIXEL_WIDTH-1)) ? 1 : 0;
                outs << out_beat;
            }
        }
    }
    return(0);
}
// conv_layer2(): verification model.
// Coded differently from conv_layer() on purpose so the two can be compared.
/**
 * Fixed-point convolution computed frame-at-a-time.
 *
 * Phase 1 reads a whole frame from `ins` into a local array, phase 2
 * evaluates every kernel at every window position that fits inside the
 * frame, phase 3 streams the results out in raster order.
 *
 * @param ins  Input pixel stream. Elements are discarded until one arrives
 *             with user != 0; that element is used as the first pixel.
 * @param outs Output stream. user is 1 on the first element, last is 1 on
 *             the final column of every output row.
 * @return Always 0.
 */
int conv_layer2(hls::stream<ap_axiu<32,1,1,1> >&ins,
        hls::stream<ap_fixed_axis<W,I,NUMBER_OF_KERNEL,1> >&outs){
    ap_axiu<32,1,1,1> in_beat;
    val_type conv_val[NUMBER_OF_KERNEL][VERTICAL_PIXEL_WIDTH][HORIZONTAL_PIXEL_WIDTH];
    in_type frame[VERTICAL_PIXEL_WIDTH][HORIZONTAL_PIXEL_WIDTH];
    ap_fixed_axis<W,I,NUMBER_OF_KERNEL,1> out_beat;

    // Number of window positions per column / per row.
    const int out_rows = VERTICAL_PIXEL_WIDTH-(ARRAY_SIZE-1);
    const int out_cols = HORIZONTAL_PIXEL_WIDTH-(ARRAY_SIZE-1);

    // Wait for the start of the frame: skip elements whose user field is 0.
    ins >> in_beat;
    Loop1: while (in_beat.user == 0)
        ins >> in_beat;

    // Phase 1: capture the frame (low 8 bits of each word, divided by 256).
    Loop2: for (int row=0; row<VERTICAL_PIXEL_WIDTH; row++){
        Loop3: for (int col=0; col<HORIZONTAL_PIXEL_WIDTH; col++){
            // The very first pixel was already fetched by the wait loop above.
            if (row != 0 || col != 0)
                ins >> in_beat;
            frame[row][col] = (in_type)((ap_ufixed<16, 8, AP_TRN, AP_WRAP>)(in_beat.data & 0xff) / 256);
        }
    }

    // Phase 2: weighted sum plus bias for every kernel and window position.
    for (int kernel=0; kernel<NUMBER_OF_KERNEL; kernel++){
        for (int row=0; row<out_rows; row++){
            for (int col=0; col<out_cols; col++){
                val_type acc = 0;
                for (int m=0; m<ARRAY_SIZE; m++){
                    for (int n=0; n<ARRAY_SIZE; n++){
                        acc += (val_type)frame[row+m][col+n] * (val_type)conv1_weight[kernel][0][m][n];
                    }
                }
                acc += (val_type)conv1_bias[kernel];
                conv_val[kernel][row][col] = acc;
            }
        }
    }

    // Phase 3: stream the results out in raster order.
    for (int row=0; row<out_rows; row++){
        for (int col=0; col<out_cols; col++){
            for (int kernel=0; kernel<NUMBER_OF_KERNEL; kernel++){
                out_beat.data[kernel] = conv_val[kernel][row][col];
            }
            // Assert user on the first element, last at the end of each row.
            out_beat.user = (col==0 && row==0) ? 1 : 0;
            out_beat.last = (col == out_cols-1) ? 1 : 0;
            outs << out_beat;
        }
    }
    return(0);
}
// layer_general.h
// 2018/04/12 by marsee (HLS stream)
//
#ifndef __LAYER_GENERAL_H__
#define __LAYER_GENERAL_H__
#include <ap_fixed.h>
// Stream element carrying N signed fixed-point values plus side-band flags.
//   W, I : total bit width and integer bit width of each value
//   N    : number of values per element
//   U    : bit width of the user field
// Producers in this file set user to 1 on the first element of a frame and
// last to 1 on the final element of each row.
template<int W, int I, int N, int U>
struct ap_fixed_axis{
ap_fixed<W, I, AP_TRN, AP_WRAP> data[N];
ap_uint<U> user;
ap_uint<1> last;
};
// Stream element carrying N unsigned fixed-point values plus side-band flags.
//   W, I : total bit width and integer bit width of each value
//   N    : number of values per element
//   U    : bit width of the user field
template<int W, int I, int N, int U>
struct ap_ufixed_axis{
ap_ufixed<W, I, AP_TRN, AP_WRAP> data[N];
ap_uint<U> user;
ap_uint<1> last;
};
// Stream element carrying N float values plus side-band flags.
//   N : number of values per element
//   U : bit width of the user field
// NOTE(review): the testbenches in this file refer to float_axis<N,U>, which
// is not defined in this header -- confirm which name is intended.
template<int N, int U>
struct ap_float_axis{
float data[N];
ap_uint<U> user;
ap_uint<1> last;
};
#endif
// conv_layer.h
// 2018/04/12 by marsee (HLS stream)
#ifndef __CONV_LAYER_H__
#define __CONV_LAYER_H__#include "conv1_weight.h"#include "conv1_bias.h"
#define HORIZONTAL_PIXEL_WIDTH 56
#define VERTICAL_PIXEL_WIDTH 10
static const size_t NUMBER_OF_KERNEL = 2;
static const size_t ARRAY_SIZE = 5;
static const size_t W = 16;
static const size_t I = 6;
typedef ap_ufixed<8, 0, AP_TRN, AP_WRAP> in_type;
typedef ap_fixed<22, 6, AP_TRN, AP_WRAP> val_type;
typedef ap_fixed<16, 6, AP_TRN, AP_WRAP> out_type;
#endif
// conv_layer.cpp
// 2018/04/12 by marsee (HLS stream)
//
#include "ap_int.h"
#include "hls_stream.h"
#include "layer_general.h"
#include <ap_axi_sdata.h>
#include "conv_layer.h"
// Fixed-point convolution layer (streaming implementation).
//
// Reads one frame of VERTICAL_PIXEL_WIDTH x HORIZONTAL_PIXEL_WIDTH pixels from
// `ins`, maintains an ARRAY_SIZE x ARRAY_SIZE window (pix_mat) fed from a line
// buffer of the previous ARRAY_SIZE-1 rows, and for every window position
// that lies fully inside the frame writes one element with NUMBER_OF_KERNEL
// results to `outs`.
//
// ins  : input pixel stream; elements are discarded until one arrives with
//        user != 0, which is used as the first pixel of the frame.
// outs : output stream; user is 1 on the first valid element, last is 1 on
//        the final column of each row.
// Returns 0 always.
int conv_layer(hls::stream<ap_axiu<32,1,1,1> >&ins,
hls::stream<ap_fixed_axis<W,I,NUMBER_OF_KERNEL,1> >&outs){
#pragma HLS INTERFACE axis register both port=ins
#pragma HLS DATA_PACK variable=outs
ap_axiu<32,1,1,1> pix;
ap_fixed_axis<W,I,NUMBER_OF_KERNEL,1> conv_out;
// Previous ARRAY_SIZE-1 rows, indexed [row][column].
in_type line_buf[ARRAY_SIZE-1][HORIZONTAL_PIXEL_WIDTH];
// factor=4 equals ARRAY_SIZE-1 for the ARRAY_SIZE = 5 declared in conv_layer.h.
#pragma HLS ARRAY_PARTITION variable=line_buf block factor=4 dim=1
#pragma HLS resource variable=line_buf core=RAM_2P
// Current convolution window, indexed [row][column].
in_type pix_mat[ARRAY_SIZE][ARRAY_SIZE];
#pragma HLS array_partition variable=pix_mat complete
in_type ap_uf_pix;
val_type val;
Loop1: do {
#pragma HLS LOOP_TRIPCOUNT min=1 max=1 avg=1
// The frame starts when user becomes 1.
ins >> pix;
} while(pix.user == 0);
Loop2: for (int y=0; y<VERTICAL_PIXEL_WIDTH; y++){
Loop3: for (int x=0; x<HORIZONTAL_PIXEL_WIDTH; x++){
#pragma HLS PIPELINE II=1
if (!(x==0 && y==0)) // the first pixel has already been read by Loop1
ins >> pix; // read from the AXI4-Stream input
// Only the low 8 bits of the input word are used, divided by 256.
ap_uf_pix = (in_type)((ap_ufixed<16, 8, AP_TRN, AP_WRAP>)(pix.data & 0xff) / 256);
// Shift the window contents one column to the left.
Loop4 : for (int k=0; k<ARRAY_SIZE; k++){
Loop5 : for (int m=0; m<ARRAY_SIZE-1; m++){
#pragma HLS UNROLL
pix_mat[k][m] = pix_mat[k][m+1];
}
}
Loop6: for (int i=0; i<ARRAY_SIZE-1; i++){ // load the previous rows' data from line_buf
pix_mat[i][ARRAY_SIZE-1] = line_buf[i][x];
}
pix_mat[ARRAY_SIZE-1][ARRAY_SIZE-1] = ap_uf_pix; // put the new sample in the last cell of pix_mat
Loop7: for (int i=0; i<ARRAY_SIZE-2; i++){ // move every stored row up by one
line_buf[i][x] = line_buf[i+1][x];
}
line_buf[ARRAY_SIZE-2][x] = ap_uf_pix;
// Convolution: weighted sum over the window plus bias, per kernel.
for (int k=0; k<NUMBER_OF_KERNEL; k++){
val = 0.0;
for (int j=0; j<ARRAY_SIZE; j++){
for (int i=0; i<ARRAY_SIZE; i++){
val += (val_type)pix_mat[j][i] * (val_type)conv1_weight[k][0][j][i];
}
}
val += (val_type)conv1_bias[k];
conv_out.data[k] = val;
}
// The first ARRAY_SIZE-1 rows, and the first ARRAY_SIZE-1 columns of every
// other row, are invalid data and are not output.
if (x<(ARRAY_SIZE-1) || y<(ARRAY_SIZE-1))
continue;
else { // valid data
if (x==(ARRAY_SIZE-1) && y==(ARRAY_SIZE-1)){ // assert TUSER on the first output element
conv_out.user = 1;
} else {
conv_out.user = 0;
}
if (x == (HORIZONTAL_PIXEL_WIDTH-1)){ // assert TLAST at the end of each row
conv_out.last = 1;
} else {
conv_out.last = 0;
}
outs << conv_out;
}
}
}
return(0);
}
// stream_test.cpp
// 2018/02/11 by marsee
// 2018/04/12 : DATA_PACK指示子を追加
//
#include <ap_int.h>
#include <hls_stream.h>
#include <ap_axi_sdata.h>
#include <hls_video.h>
#include "stream_test.h"
// stream_test: reads 10 x 56 elements from ins, scales their data lanes and
// forwards them to outs.
//
// ins  : input stream of ap_fixed_axis<16,6,2,1,1,1> elements.
// outs : output stream of the same element type.
// num  : number of data lanes to process; lane i is multiplied by (i + 2).
// Returns 0 always.
//
// Every output element is written with user = 1 and last = 0. No other member
// of the output element is assigned in this function.
// NOTE(review): num is not checked against the length of data[]; the only
// caller visible here passes 2 - confirm before calling with other values.
int stream_test(hls::stream<ap_fixed_axis<16,6,2,1,1,1> >& ins,
        hls::stream<ap_fixed_axis<16,6,2,1,1,1> >& outs, int num){
#pragma HLS DATA_PACK variable=outs
#pragma HLS DATA_PACK variable=ins
    typedef ap_fixed_axis<16,6,2,1,1,1> element_t;
    typedef ap_fixed<16, 6, AP_TRN, AP_WRAP> factor_t;

    element_t rx;
    element_t tx;

    for(int row=0; row<10; row++){
        for(int col=0; col<56; col++){
#pragma HLS PIPELINE II=1
            ins >> rx;
            // Scale each requested lane by its own factor: lane 0 -> x2, lane 1 -> x3, ...
            for(int lane=0; lane<num; lane++){
                tx.data[lane] = rx.data[lane] * (factor_t)(lane+2);
            }
            tx.last = 0;
            tx.user = 1;
            outs << tx;
        }
    }
    return 0;
}
// stream_top: chains two stream_test stages through an internal stream.
//
// ins  : input stream handed to the first stage.
// outs : output stream written by the second stage.
// Returns 0 always.
//
// Both stages run with num = 2, so lane 0 is multiplied by 2 twice and lane 1
// by 3 twice on the way from ins to outs.
int stream_top(hls::stream<ap_fixed_axis<16,6,2,1,1,1> >& ins,
        hls::stream<ap_fixed_axis<16,6,2,1,1,1> >& outs){
#pragma HLS DATA_PACK variable=outs
#pragma HLS DATA_PACK variable=ins
#pragma HLS DATAFLOW
    // Intermediate stream connecting the first stage's output to the second stage's input.
    hls::stream<ap_fixed_axis<16,6,2,1,1,1> > stage_link;

    stream_test(ins, stage_link, 2);
    stream_test(stage_link, outs, 2);
    return 0;
}
-- Port list of entity stream_top (entity declaration only; no architecture shown).
-- ins_V and outs_V each appear as a single 40-bit data bus with one set of
-- handshake signals (dout/empty_n/read on the input, din/full_n/write on the output).
-- NOTE(review): 40 bits equals 16+16+2+2+1+1+1+1, the sum of the per-field widths
-- in the other stream_top port list in this file; presumably this is the variant
-- produced with the DATA_PACK pragmas applied - confirm.
entity stream_top is
port (
    ins_V_dout : IN STD_LOGIC_VECTOR (39 downto 0);
    ins_V_empty_n : IN STD_LOGIC;
    ins_V_read : OUT STD_LOGIC;
    outs_V_din : OUT STD_LOGIC_VECTOR (39 downto 0);
    outs_V_full_n : IN STD_LOGIC;
    outs_V_write : OUT STD_LOGIC;
    -- Clock, reset and block-level start/done/ready/idle signals.
    ap_clk : IN STD_LOGIC;
    ap_rst : IN STD_LOGIC;
    ap_done : OUT STD_LOGIC;
    ap_start : IN STD_LOGIC;
    ap_ready : OUT STD_LOGIC;
    ap_idle : OUT STD_LOGIC;
    -- 32-bit return value (presumably the int returned by the C++ stream_top).
    ap_return : OUT STD_LOGIC_VECTOR (31 downto 0) );
end;
-- Port list of entity stream_top (entity declaration only; no architecture shown).
-- Here every field of the stream element has its own data port and its own
-- handshake signals: data_0, data_1 (16 bits each), keep, strb (2 bits each),
-- user, last, id, dest (1 bit each), for both ins_V and outs_V.
-- NOTE(review): presumably the variant produced without the DATA_PACK pragmas,
-- where the struct members are not merged into one bus - confirm.
entity stream_top is
port (
    -- Input stream: one dout/empty_n/read triple per field.
    ins_V_data_0_V_dout : IN STD_LOGIC_VECTOR (15 downto 0);
    ins_V_data_0_V_empty_n : IN STD_LOGIC;
    ins_V_data_0_V_read : OUT STD_LOGIC;
    ins_V_data_1_V_dout : IN STD_LOGIC_VECTOR (15 downto 0);
    ins_V_data_1_V_empty_n : IN STD_LOGIC;
    ins_V_data_1_V_read : OUT STD_LOGIC;
    ins_V_keep_V_dout : IN STD_LOGIC_VECTOR (1 downto 0);
    ins_V_keep_V_empty_n : IN STD_LOGIC;
    ins_V_keep_V_read : OUT STD_LOGIC;
    ins_V_strb_V_dout : IN STD_LOGIC_VECTOR (1 downto 0);
    ins_V_strb_V_empty_n : IN STD_LOGIC;
    ins_V_strb_V_read : OUT STD_LOGIC;
    ins_V_user_V_dout : IN STD_LOGIC_VECTOR (0 downto 0);
    ins_V_user_V_empty_n : IN STD_LOGIC;
    ins_V_user_V_read : OUT STD_LOGIC;
    ins_V_last_V_dout : IN STD_LOGIC_VECTOR (0 downto 0);
    ins_V_last_V_empty_n : IN STD_LOGIC;
    ins_V_last_V_read : OUT STD_LOGIC;
    ins_V_id_V_dout : IN STD_LOGIC_VECTOR (0 downto 0);
    ins_V_id_V_empty_n : IN STD_LOGIC;
    ins_V_id_V_read : OUT STD_LOGIC;
    ins_V_dest_V_dout : IN STD_LOGIC_VECTOR (0 downto 0);
    ins_V_dest_V_empty_n : IN STD_LOGIC;
    ins_V_dest_V_read : OUT STD_LOGIC;
    -- Output stream: one din/full_n/write triple per field.
    outs_V_data_0_V_din : OUT STD_LOGIC_VECTOR (15 downto 0);
    outs_V_data_0_V_full_n : IN STD_LOGIC;
    outs_V_data_0_V_write : OUT STD_LOGIC;
    outs_V_data_1_V_din : OUT STD_LOGIC_VECTOR (15 downto 0);
    outs_V_data_1_V_full_n : IN STD_LOGIC;
    outs_V_data_1_V_write : OUT STD_LOGIC;
    outs_V_keep_V_din : OUT STD_LOGIC_VECTOR (1 downto 0);
    outs_V_keep_V_full_n : IN STD_LOGIC;
    outs_V_keep_V_write : OUT STD_LOGIC;
    outs_V_strb_V_din : OUT STD_LOGIC_VECTOR (1 downto 0);
    outs_V_strb_V_full_n : IN STD_LOGIC;
    outs_V_strb_V_write : OUT STD_LOGIC;
    outs_V_user_V_din : OUT STD_LOGIC_VECTOR (0 downto 0);
    outs_V_user_V_full_n : IN STD_LOGIC;
    outs_V_user_V_write : OUT STD_LOGIC;
    outs_V_last_V_din : OUT STD_LOGIC_VECTOR (0 downto 0);
    outs_V_last_V_full_n : IN STD_LOGIC;
    outs_V_last_V_write : OUT STD_LOGIC;
    outs_V_id_V_din : OUT STD_LOGIC_VECTOR (0 downto 0);
    outs_V_id_V_full_n : IN STD_LOGIC;
    outs_V_id_V_write : OUT STD_LOGIC;
    outs_V_dest_V_din : OUT STD_LOGIC_VECTOR (0 downto 0);
    outs_V_dest_V_full_n : IN STD_LOGIC;
    outs_V_dest_V_write : OUT STD_LOGIC;
    -- Clock, reset and block-level start/done/ready/idle signals.
    ap_clk : IN STD_LOGIC;
    ap_rst : IN STD_LOGIC;
    ap_done : OUT STD_LOGIC;
    ap_start : IN STD_LOGIC;
    ap_ready : OUT STD_LOGIC;
    ap_idle : OUT STD_LOGIC;
    -- 32-bit return value (presumably the int returned by the C++ stream_top).
    ap_return : OUT STD_LOGIC_VECTOR (31 downto 0) );
end;
Transformation Matrix
1.01332 0.0172398 -19.5289
0.00700084 1.01997 -16.4525
0 0 0
elapsed time 14995626
Max_err: 4 Min_err: 1 Num_errs: 342 Num_errs > 1: 342
linux_boot_cmd=setenv bootargs console=ttyPS0,115200 root=/dev/mmcblk0p2 rw rootwait uio_pdrv_genirq.of_id=generic-uio && bootz 0x03000000 - 0x02A00000
linux_boot_cmd=setenv bootargs console=ttyPS0,115200 root=/dev/mmcblk0p2 rw rootwait uio_pdrv_genirq.of_id=generic-uio cma=64M && bootz 0x03000000 - 0x02A00000
elapsed time 9409130
ocv corner count = 98, Hls corner count = 122
Commmon = 98 Success = 80.327866 Loss = 0.000000 Gain = 19.672131
./sd_card/bilateral_ex.elf: error while loading shared libraries: libv4l2subdev.so.0: cannot open shared object file: No such file or directory
./sd_card/bilateral_ex.elf: error while loading shared libraries: libmediactl.so.0: cannot open shared object file: No such file or directory
sigma_color: 7.72211 sigma_space: 0.901059
elapsed time 14358244
Minimum error in intensity = 0
Maximum error in intensity = 1
Percentage of pixels above error threshold = 0.00236304 Count: 4
Failed to allocate memory
Failed to allocate memory
Message from syslogd@debian-fpga at Apr 6 05:03:36 ...
kernel:[ 4499.414190] Internal error: Oops: 17 [#1] PREEMPT SMP ARM
Message from syslogd@debian-fpga at Apr 6 05:03:36 ...
kernel:[ 4499.505378] Process dnp_of.elf (pid: 4117, stack limit = 0xe25b4218)
Message from syslogd@debian-fpga at Apr 6 05:03:36 ...
kernel:[ 4499.511714] Stack: (0xe25b5e00 to 0xe25b6000)
Message from syslogd@debian-fpga at Apr 6 05:03:36 ...
kernel:[ 4499.516061] 5e00: 00000000 c0113bc4 00000000 ee477600 000007e8 00000000 00000002 00000000
Message from syslogd@debian-fpga at Apr 6 05:03:36 ...
kernel:[ 4499.524220] 5e20: 001fa400 00000001 00000005 e32b007c 600f0013 00000000 00000014 ee7ed840
Message from syslogd@debian-fpga at Apr 6 05:03:36 ...
kernel:[ 4499.532378] 5e40: 83c00743 00000000 bebdf230 00000000 c0045808 bebdf230 e25b4000 00000000
Message from syslogd@debian-fpga at Apr 6 05:03:36 ...
kernel:[ 4499.540538] 5e60: 00000000 bf0376d0 00000000 e25b5ed0 00000000 00000001 e25b5ea4 00000000
Message from syslogd@debian-fpga at Apr 6 05:03:36 ...
kernel:[ 4499.548697] 5e80: e297adb8 83c00703 00000000 00000000 00000000 ffefe70c ef29a734 00000000
Message from syslogd@debian-fpga at Apr 6 05:03:36 ...
kernel:[ 4499.556856] 5ea0: efaac420 ee477300 d5480010 00000000 00000000 00000000 007e9000 00000000
Message from syslogd@debian-fpga at Apr 6 05:03:36 ...
kernel:[ 4499.565015] 5ec0: 00000000 00000000 00000002 00000000 00000000 b659a000 00032030 0000002c
Message from syslogd@debian-fpga at Apr 6 05:03:36 ...
kernel:[ 4499.573175] 5ee0: bebdf2b4 00000000 00000001 001fa400 00000000 0002f258 00032030 0000002c
Message from syslogd@debian-fpga at Apr 6 05:03:36 ...
kernel:[ 4499.581334] 5f00: 0016b790 bebdf230 ee6cc508 ee4f7780 c0045808 bebdf230 e25b4000 00000000
Message from syslogd@debian-fpga at Apr 6 05:03:36 ...
kernel:[ 4499.589492] 5f20: 00000000 c01f89fc bebdf230 c01f9288 00000000 00000000 ef29a700 00000000
Message from syslogd@debian-fpga at Apr 6 05:03:36 ...
kernel:[ 4499.597653] 5f40: ef085de0 ae43e000 e3324300 ef085de8 ef29a704 c01d0804 00000009 c02033ac
Message from syslogd@debian-fpga at Apr 6 05:03:36 ...
kernel:[ 4499.605811] 5f60: 00000009 0002f258 007e9000 c0045808 00000009 ee4f7781 ee4f7780 c0045808
Message from syslogd@debian-fpga at Apr 6 05:03:36 ...
kernel:[ 4499.613971] 5f80: bebdf230 e25b4000 00000000 c01f93cc 00000001 0002f258 007e9000 00000036
Message from syslogd@debian-fpga at Apr 6 05:03:36 ...
kernel:[ 4499.622130] 5fa0: c0107ac4 c01078e0 00000001 0002f258 00000009 c0045808 bebdf230 0002ff48
Message from syslogd@debian-fpga at Apr 6 05:03:36 ...
kernel:[ 4499.630288] 5fc0: 00000001 0002f258 007e9000 00000036 00000002 00000000 00000000 00000000
Message from syslogd@debian-fpga at Apr 6 05:03:36 ...
kernel:[ 4499.638448] 5fe0: 0002f2e4 bebdf22c 0001b66c b653f716 000f0030 00000009 04050607 00020103
Message from syslogd@debian-fpga at Apr 6 05:03:36 ...
kernel:[ 4499.686451] Code: eb44a1dc e15b0006 0a000011 e7990106 (e5903014)
^CApplication Terminated by User
-D ENABLE_PRECOMPILED_HEADERS=OFF にする必要があるということだ。
Gtk-Message: Failed to load module "canberra-gtk-module" が出ている。
// DMA_pow2: squares ten integers.
//
// in  : pointer to 10 readable ints.
// out : pointer to 10 writable ints; out[k] receives in[k] * in[k].
// Returns 0 always.
//
// Elements are processed in ascending order, reading in[k] before writing out[k].
#pragma SDS data zero_copy(in[0:10])
#pragma SDS data zero_copy(out[0:10])
int DMA_pow2(int *in, int *out){
    for (int idx = 0; idx < 10; ++idx) {
        const int value = in[idx];
        out[idx] = value * value;
    }
    return 0;
}
#include <stdio.h>
#include <stdlib.h>
#include "sds_lib.h"
int DMA_pow2(int *in, int *out);
// Test driver for DMA_pow2: allocates two 10-int buffers with sds_alloc, fills
// the input with 0..9, runs DMA_pow2 and prints each input/result pair.
// Exits with status 1 if either allocation fails; returns 0 otherwise.
int main(){
    int *data;
    int *result;
    int i;

    if((data=(int *)sds_alloc(sizeof(int)*10)) == NULL){
        fprintf(stderr, "Can't allocate data[10]\n");
        exit(1);
    }
    if((result=(int *)sds_alloc(sizeof(int)*10)) == NULL){
        fprintf(stderr, "Can't allocate result[10]\n");
        sds_free(data); // do not leak the first buffer on the error path
        exit(1);
    }

    for(i=0; i<10; i++){
        data[i] = i;
    }

    DMA_pow2(data, result);

    for(i=0; i<10; i++){
        printf("data[%d] = %d, result[%d] = %d\n", i, data[i], i, result[i]);
    }

    // Release the sds_alloc buffers before exiting.
    sds_free(result);
    sds_free(data);
    return 0;
}
ssh CentOSのIPアドレス -X で ZYBO-Z7-20 の Debian に入って、Xクライアントのコマンドを入力すれば良いようだ。
日 | 月 | 火 | 水 | 木 | 金 | 土 |
---|---|---|---|---|---|---|
1 | 2 | 3 | 4 | 5 | 6 | 7 |
8 | 9 | 10 | 11 | 12 | 13 | 14 |
15 | 16 | 17 | 18 | 19 | 20 | 21 |
22 | 23 | 24 | 25 | 26 | 27 | 28 |
29 | 30 | - | - | - | - | - |