// krnl_dma_write2.cpp
// 2020/01/30 by marsee
#include <ap_int.h>
#include <hls_stream.h>
#include <ap_axi_sdata.h>
#include <stdint.h>
// Fixed frame geometry handled by dma_write2: X_SIZE words per row, Y_SIZE rows
// (64*48 = 3072, which matches the m_axi depth given in the pragma below).
#define X_SIZE 64
#define Y_SIZE 48
extern "C" {
// dma_write2: stream-to-memory writer for one fixed-size frame.
// Reads exactly X_SIZE*Y_SIZE beats from `ins` and stores the data field of each
// beat to outm in row-major order (index X_SIZE*y + x).
//   ins  - input stream of ap_axiu<32,0,0,0> beats (declared as an axis port below)
//   outm - destination buffer (declared as an m_axi port in bundle gmem below);
//          it is written at indices 0 .. X_SIZE*Y_SIZE-1
// Only pix.data is used; the other fields of each beat (e.g. last) are not examined.
void dma_write2(hls::stream<ap_axiu<32,0,0,0> >& ins, volatile int32_t *outm){
#pragma HLS INTERFACE m_axi depth=3072 port=outm bundle=gmem
#pragma HLS INTERFACE axis register both port=ins
#pragma HLS INTERFACE s_axilite port=return bundle=control
ap_axiu<32,0,0,0> pix;
LOOP_DWY: for(int y=0; y<Y_SIZE; y++){
LOOP_DWX: for(int x=0; x<X_SIZE; x++){
#pragma HLS PIPELINE II=1
// one stream beat in, one memory word out per iteration
ins >> pix;
outm[X_SIZE*y+x] = pix.data;
}
}
}
}
// krnl_dma_write2_tb.cpp
// 2020/01/30 by marsee
#include <stdio.h>
#include <stdint.h>
#include "hls_opencv.h"
#include <ap_int.h>
#include <hls_stream.h>
#include <ap_axi_sdata.h>
void dma_write2(hls::stream<ap_axiu<32,0,0,0> >& ins, volatile int32_t *outm);
void dma_write_soft(hls::stream<ap_axiu<32,0,0,0> >& ins, volatile int32_t *outm,
        int32_t x_size, int32_t y_size);

const char INPUT_BMP_FILE[] = "test.bmp";
const char OUTPUT_BMP_FILE[] = "dma_write.bmp";

// Testbench for dma_write2.
// Loads INPUT_BMP_FILE, packs every pixel into one 32-bit word (B in bits 7..0,
// G in bits 15..8, R in bits 23..16), feeds the same beats to dma_write2 and to the
// software reference dma_write_soft, compares both output buffers word by word and
// finally writes the hardware result to OUTPUT_BMP_FILE.
// Returns 0 on success and 1 on any failure (unreadable image, wrong size, mismatch).
int main(){
    hls::stream<ap_axiu<32,0,0,0> > ins;
    hls::stream<ap_axiu<32,0,0,0> > ins_soft;
    ap_axiu<32,0,0,0> pix;

    // Read the BMP file into a Mat
    cv::Mat img = cv::imread(INPUT_BMP_FILE);
    if (img.empty()){
        fprintf(stderr, "Can't read %s\n", INPUT_BMP_FILE);
        return(1);
    }

    // dma_write2 always consumes and writes a fixed-size frame, so the image must
    // have exactly that size; otherwise the kernel would read an empty stream or
    // write past the end of the buffer.
    // NOTE(review): these mirror X_SIZE / Y_SIZE of the kernel source - keep in sync.
    const int KERNEL_COLS = 64;
    const int KERNEL_ROWS = 48;
    if (img.cols != KERNEL_COLS || img.rows != KERNEL_ROWS){
        fprintf(stderr, "%s must be %d x %d pixels, but is %d x %d\n",
                INPUT_BMP_FILE, KERNEL_COLS, KERNEL_ROWS, img.cols, img.rows);
        return(1);
    }

    // Allocate the pixel buffers. std::vector takes an element count, not a byte
    // count, and every buffer is indexed over the full cols*rows frame.
    const size_t num_pixels = (size_t)img.cols * (size_t)img.rows;
    std::vector<int32_t> rd_bmp(num_pixels);
    std::vector<int32_t> hw_dmaw(num_pixels);
    std::vector<int32_t> sw_dmaw(num_pixels);

    // Pack the BMP pixels into rd_bmp
    cv::Mat_<cv::Vec3b> dst_vec3b = cv::Mat_<cv::Vec3b>(img);
    for (int y=0; y<img.rows; y++){
        for (int x=0; x<img.cols; x++){
            cv::Vec3b pixel;
            pixel = dst_vec3b(y,x);
            // blue - pixel[0]; green - pixel[1]; red - pixel[2];
            rd_bmp[y*img.cols+x] = (pixel[0] & 0xff) | ((pixel[1] & 0xff)<<8) | ((pixel[2] & 0xff)<<16);
        }
    }

    // Prepare the input beats for both streams
    for(int j=0; j < img.rows; j++){
        for(int i=0; i < img.cols; i++){
            pix.data = (ap_int<32>)rd_bmp[(j*img.cols)+i];
            if ((i==img.cols-1) && (j==img.rows-1)) // assert last on the final beat of the frame
                pix.last = 1;
            else
                pix.last = 0;
            ins << pix;
            ins_soft << pix;
        }
    }

    dma_write2(ins, hw_dmaw.data());
    dma_write_soft(ins_soft, sw_dmaw.data(), img.cols, img.rows);

    // Compare the hardware and software dma_write results
    for (int y=0; y<img.rows; y++){
        for (int x=0; x<img.cols; x++){
            const int32_t hw_word = hw_dmaw[y*img.cols+x];
            const int32_t sw_word = sw_dmaw[y*img.cols+x];
            if (hw_word != sw_word){
                // x and y are int, so they are printed with %d (not %ld)
                printf("ERROR HW and SW results mismatch x = %d, y = %d, HW = %x, SW = %x\n",
                        x, y, (unsigned int)hw_word, (unsigned int)sw_word);
                return(1);
            }
        }
    }
    printf("Success HW and SW results match\n");

    const int dmaw_rows = img.rows;
    const int dmaw_cols = img.cols;
    cv::Mat wbmpf(dmaw_rows, dmaw_cols, CV_8UC3);

    // Unpack the hardware DMA-write result into wbmpf
    cv::Mat_<cv::Vec3b> lap_vec3b = cv::Mat_<cv::Vec3b>(wbmpf);
    for (int y=0; y<wbmpf.rows; y++){
        for (int x=0; x<wbmpf.cols; x++){
            cv::Vec3b pixel;
            pixel = lap_vec3b(y,x);
            int32_t rgb = hw_dmaw[y*wbmpf.cols+x];
            pixel[0] = (rgb & 0xff); // blue
            pixel[1] = (rgb & 0xff00) >> 8; // green
            pixel[2] = (rgb & 0xff0000) >> 16; // red
            lap_vec3b(y,x) = pixel;
        }
    }

    // Write the hardware DMA-write result to a bmp file
    cv::imwrite(OUTPUT_BMP_FILE, wbmpf);
    return(0);
}
// Software reference model of the DMA write: pulls x_size*y_size beats from `ins`
// and stores each beat's data word to outm in row-major order.
//   ins    - source stream, one beat per pixel
//   outm   - destination buffer, written at indices 0 .. x_size*y_size-1
//   x_size - number of words per row
//   y_size - number of rows
void dma_write_soft(hls::stream<ap_axiu<32,0,0,0> >& ins, volatile int32_t *outm,
        int32_t x_size, int32_t y_size){
    for (int row = 0; row < y_size; ++row) {
        for (int col = 0; col < x_size; ++col) {
            ap_axiu<32,0,0,0> beat;
            ins >> beat;
            const int dst_index = x_size*row + col;
            outm[dst_index] = beat.data;
        }
    }
}
// krnl_dma_write.cpp
// 2020/01/28 by marsee
#include <ap_int.h>
#include <hls_stream.h>
#include <ap_axi_sdata.h>
#include <stdint.h>
//extern "C" {
// dma_write: stream-to-memory writer for one frame of run-time selectable size.
// Reads x_size*y_size beats from `ins` and stores the data field of each beat to
// outm in row-major order (index x_size*y + x).
//   ins    - input stream of ap_axiu<32,0,0,0> beats (axis port per the pragma below)
//   outm   - destination buffer (m_axi port, bundle gmem); written at indices
//            0 .. x_size*y_size-1
//   x_size - number of words per row (s_axilite register, bundle control)
//   y_size - number of rows (s_axilite register, bundle control)
// Only pix.data is used; the last flag of the incoming beats is not examined.
// The LOOP_TRIPCOUNT pragmas give 48..600 rows and 64..800 columns as the expected range.
void dma_write(hls::stream<ap_axiu<32,0,0,0> >& ins, volatile int32_t *outm,
int32_t x_size, int32_t y_size){
#pragma HLS INTERFACE m_axi depth=3072 port=outm bundle=gmem
#pragma HLS INTERFACE axis register both port=ins
#pragma HLS INTERFACE s_axilite port=y_size bundle=control
#pragma HLS INTERFACE s_axilite port=x_size bundle=control
#pragma HLS INTERFACE s_axilite port=return bundle=control
ap_axiu<32,0,0,0> pix;
LOOP_DWY: for(int y=0; y<y_size; y++){
#pragma HLS LOOP_TRIPCOUNT min=48 max=600
LOOP_DWX: for(int x=0; x<x_size; x++){
#pragma HLS LOOP_TRIPCOUNT min=64 max=800
#pragma HLS PIPELINE II=1
// one stream beat in, one memory word out per iteration
ins >> pix;
outm[x_size*y+x] = pix.data;
}
}
}
//}
// krnl_dma_write_tb.cpp
// 2020/01/28 by marsee
#include <stdio.h>
#include <stdint.h>
#include "hls_opencv.h"
#include <ap_int.h>
#include <hls_stream.h>
#include <ap_axi_sdata.h>
void dma_write(hls::stream<ap_axiu<32,0,0,0> >& ins, volatile int32_t *outm,
        int32_t x_size, int32_t y_size);
void dma_write_soft(hls::stream<ap_axiu<32,0,0,0> >& ins, volatile int32_t *outm,
        int32_t x_size, int32_t y_size);

const char INPUT_BMP_FILE[] = "test.bmp";
const char OUTPUT_BMP_FILE[] = "dma_write.bmp";

// Testbench for dma_write.
// Loads INPUT_BMP_FILE, packs every pixel into one 32-bit word (B in bits 7..0,
// G in bits 15..8, R in bits 23..16), feeds the same beats to dma_write and to the
// software reference dma_write_soft, compares both output buffers word by word and
// finally writes the hardware result to OUTPUT_BMP_FILE.
// Returns 0 on success and 1 on any failure (unreadable image or mismatch).
int main(){
    hls::stream<ap_axiu<32,0,0,0> > ins;
    hls::stream<ap_axiu<32,0,0,0> > ins_soft;
    ap_axiu<32,0,0,0> pix;

    // Read the BMP file into a Mat
    cv::Mat img = cv::imread(INPUT_BMP_FILE);
    if (img.empty()){
        fprintf(stderr, "Can't read %s\n", INPUT_BMP_FILE);
        return(1);
    }

    // Allocate the pixel buffers. std::vector takes an element count, not a byte
    // count, and every buffer is indexed over the full cols*rows frame.
    const size_t num_pixels = (size_t)img.cols * (size_t)img.rows;
    std::vector<int32_t> rd_bmp(num_pixels);
    std::vector<int32_t> hw_dmaw(num_pixels);
    std::vector<int32_t> sw_dmaw(num_pixels);

    // Pack the BMP pixels into rd_bmp
    cv::Mat_<cv::Vec3b> dst_vec3b = cv::Mat_<cv::Vec3b>(img);
    for (int y=0; y<img.rows; y++){
        for (int x=0; x<img.cols; x++){
            cv::Vec3b pixel;
            pixel = dst_vec3b(y,x);
            // blue - pixel[0]; green - pixel[1]; red - pixel[2];
            rd_bmp[y*img.cols+x] = (pixel[0] & 0xff) | ((pixel[1] & 0xff)<<8) | ((pixel[2] & 0xff)<<16);
        }
    }

    // Prepare the input beats for both streams
    for(int j=0; j < img.rows; j++){
        for(int i=0; i < img.cols; i++){
            pix.data = (ap_int<32>)rd_bmp[(j*img.cols)+i];
            if ((i==img.cols-1) && (j==img.rows-1)) // assert last on the final beat of the frame
                pix.last = 1;
            else
                pix.last = 0;
            ins << pix;
            ins_soft << pix;
        }
    }

    dma_write(ins, hw_dmaw.data(), img.cols, img.rows);
    dma_write_soft(ins_soft, sw_dmaw.data(), img.cols, img.rows);

    // Compare the hardware and software dma_write results
    for (int y=0; y<img.rows; y++){
        for (int x=0; x<img.cols; x++){
            const int32_t hw_word = hw_dmaw[y*img.cols+x];
            const int32_t sw_word = sw_dmaw[y*img.cols+x];
            if (hw_word != sw_word){
                // x and y are int, so they are printed with %d (not %ld)
                printf("ERROR HW and SW results mismatch x = %d, y = %d, HW = %x, SW = %x\n",
                        x, y, (unsigned int)hw_word, (unsigned int)sw_word);
                return(1);
            }
        }
    }
    printf("Success HW and SW results match\n");

    const int dmaw_rows = img.rows;
    const int dmaw_cols = img.cols;
    cv::Mat wbmpf(dmaw_rows, dmaw_cols, CV_8UC3);

    // Unpack the hardware DMA-write result into wbmpf
    cv::Mat_<cv::Vec3b> lap_vec3b = cv::Mat_<cv::Vec3b>(wbmpf);
    for (int y=0; y<wbmpf.rows; y++){
        for (int x=0; x<wbmpf.cols; x++){
            cv::Vec3b pixel;
            pixel = lap_vec3b(y,x);
            int32_t rgb = hw_dmaw[y*wbmpf.cols+x];
            pixel[0] = (rgb & 0xff); // blue
            pixel[1] = (rgb & 0xff00) >> 8; // green
            pixel[2] = (rgb & 0xff0000) >> 16; // red
            lap_vec3b(y,x) = pixel;
        }
    }

    // Write the hardware DMA-write result to a bmp file
    cv::imwrite(OUTPUT_BMP_FILE, wbmpf);
    return(0);
}
// Software reference model of the DMA write: drains x_size*y_size beats from `ins`
// and copies each beat's data word to outm, row after row.
//   ins    - source stream, one beat per pixel
//   outm   - destination buffer, written at indices 0 .. x_size*y_size-1
//   x_size - number of words per row
//   y_size - number of rows
void dma_write_soft(hls::stream<ap_axiu<32,0,0,0> >& ins, volatile int32_t *outm,
        int32_t x_size, int32_t y_size){
    ap_axiu<32,0,0,0> beat;
    int row = 0;
    while (row < y_size) {
        int col = 0;
        while (col < x_size) {
            ins >> beat;
            outm[x_size*row + col] = beat.data;
            ++col;
        }
        ++row;
    }
}
// krnl_lap_filter.cpp
// 2020/01/26 by marsse
#include <ap_int.h>
#include <hls_stream.h>
#include <ap_axi_sdata.h>
#include <stdint.h>
// RGBからYへの変換
// RGBのフォーマットは、{8'd0, R(8bits), G(8bits), B(8bits)}, 1pixel = 32bits
// 輝度信号Yのみに変換する。変換式は、Y = 0.299R + 0.587G + 0.114B
// "YUVフォーマット及び YUV<->RGB変換"を参考にした。http://vision.kuee.kyoto-u.ac.jp/~hiroaki/firewire/yuv.html
// 2013/09/27 : float を止めて、すべてint にした
// Converts one packed pixel to an integer luminance value.
//   rgb - pixel with blue in bits 7..0, green in bits 15..8, red in bits 23..16;
//         bits 31..24 are ignored
// Returns (77*R + 150*G + 29*B) >> 8, i.e. the weights 0.299 / 0.587 / 0.114
// pre-multiplied by 256 so that no floating point is needed.
int32_t conv_rgb2y(int32_t rgb){
    const int32_t blue  = rgb & 0xff;
    const int32_t green = (rgb >> 8) & 0xff;
    const int32_t red   = (rgb >> 16) & 0xff;

    // weighted sum in 8.8 fixed point
    const int32_t weighted = 77*red + 150*green + 29*blue;

    // drop the 8 fractional bits (divide by 256)
    return weighted >> 8;
}
// ラプラシアンフィルタ
// x0y0 x1y0 x2y0 -1 -1 -1
// x0y1 x1y1 x2y1 -1 8 -1
// x0y2 x1y2 x2y2 -1 -1 -1
// Applies the 3x3 Laplacian kernel (centre weight 8, all eight neighbours -1) to a
// window of values; argument xCyR is the value in column C, row R of the window.
// Returns the magnitude of the response when it is negative, 255 when it exceeds
// 255, and the response itself otherwise.
// NOTE(review): a negative response is only negated, not clamped, so the result can
// exceed 255 (e.g. centre 0 with all neighbours 255 yields 2040) - confirm whether
// that is intended before relying on an 8-bit result.
int32_t laplacian_fil(int32_t x0y0, int32_t x1y0, int32_t x2y0, int32_t x0y1,
        int32_t x1y1, int32_t x2y1, int32_t x0y2, int32_t x1y2, int32_t x2y2)
{
    const int32_t neighbours = x0y0 + x1y0 + x2y0
                             + x0y1        + x2y1
                             + x0y2 + x1y2 + x2y2;
    const int32_t response = 8*x1y1 - neighbours;

    if (response < 0) {
        return -response;
    }
    return (response > 255) ? 255 : response;
}
//extern "C" {
// krnl_lap_filter: streaming 3x3 Laplacian filter over one frame.
// For each of the x_size*y_size input beats it converts the pixel to luminance with
// conv_rgb2y, slides a 3x3 window (pix_mat) fed by a two-row history (line_buf),
// calls laplacian_fil on the window and emits exactly one output beat.
//   ins    - input pixel stream (axis port per the pragma below)
//   outs   - output stream (axis port); one beat per input beat
//   x_size - pixels per row (s_axilite, bundle control); line_buf has 1920 columns
//            and is indexed with x, so x_size must not exceed 1920
//   y_size - number of rows (s_axilite, bundle control)
// The newest pixel sits at pix_mat[2][2], so the value emitted at (x, y) is the
// filter response centred on pix_mat[1][1], i.e. one row and one column earlier.
// After the frame, remaining input beats are consumed until one with last == 1 is seen.
// Only lap.data and lap.last are assigned here; the other fields of the output beat
// are left as they are.
// NOTE: the label LOOP_X is on the loop over y and LOOP_Y on the loop over x.
void krnl_lap_filter(hls::stream<ap_axiu<32,0,0,0> >& ins, hls::stream<ap_axiu<32,0,0,0> >& outs,
int32_t x_size, int32_t y_size){
#pragma HLS INTERFACE s_axilite port=y_size bundle=control
#pragma HLS INTERFACE s_axilite port=x_size bundle=control
#pragma HLS INTERFACE axis register both port=outs
#pragma HLS INTERFACE axis register both port=ins
#pragma HLS INTERFACE s_axilite port=return bundle=control
ap_axiu<32,0,0,0> pix;
ap_axiu<32,0,0,0> lap;
// luminance of the two previous rows: [0] is the older row, [1] the newer one
int32_t line_buf[2][1920]; // supported HD resolution
#pragma HLS array_partition variable=line_buf block factor=2 dim=1
#pragma HLS resource variable=line_buf core=RAM_2P
// 3x3 luminance window; column 2 holds the newest column
int32_t pix_mat[3][3];
#pragma HLS array_partition variable=pix_mat complete
int32_t lap_fil_val;
LOOP_X : for (int y=0; y<y_size; y++){
#pragma HLS LOOP_TRIPCOUNT min=48 max=600
LOOP_Y : for (int x=0; x<x_size; x++){
#pragma HLS LOOP_TRIPCOUNT min=64 max=800
#pragma HLS PIPELINE II=1
ins >> pix; // read one beat from the AXI4-Stream input
// shift the window one column to the left
Loop4 : for (int k=0; k<3; k++){
Loop5 : for (int m=0; m<2; m++){
#pragma HLS UNROLL
pix_mat[k][m] = pix_mat[k][m+1];
}
}
// fill the new right-hand column: two buffered rows plus the current pixel
pix_mat[0][2] = line_buf[0][x];
pix_mat[1][2] = line_buf[1][x];
int32_t y_val = conv_rgb2y(pix.data);
pix_mat[2][2] = y_val;
line_buf[0][x] = line_buf[1][x]; // shift the row history up by one row
line_buf[1][x] = y_val;
lap_fil_val = laplacian_fil( pix_mat[0][0], pix_mat[0][1], pix_mat[0][2],
pix_mat[1][0], pix_mat[1][1], pix_mat[1][2],
pix_mat[2][0], pix_mat[2][1], pix_mat[2][2]);
lap.data = (lap_fil_val<<16)+(lap_fil_val<<8)+lap_fil_val; // put the same value into the R, G and B bytes
if (x<2 || y<2) // the first two rows and the first two columns of every row are invalid data, so output 0
lap.data = 0;
if (x==(x_size-1) && y==(y_size-1)) // assert TLAST on the final beat of the frame
lap.last = 1;
else
lap.last = 0;
outs << lap; // write to the output stream
}
}
LOOP_WAIT_LAST: while(pix.last == 0) { // keep reading until a beat with last == 1 is seen
#pragma HLS PIPELINE II=1
#pragma HLS LOOP_TRIPCOUNT min=1 max=1 avg=1
ins >> pix;
};
}
//}
// krnl_lap_filter_tb.cpp
// 2020/01/26 by marsee
#include <stdio.h>
#include <stdint.h>
#include "hls_opencv.h"
#include <ap_int.h>
#include <hls_stream.h>
#include <ap_axi_sdata.h>
void krnl_lap_filter(hls::stream<ap_axiu<32,0,0,0> >& ins, hls::stream<ap_axiu<32,0,0,0> >& outs,
        int32_t x_size, int32_t y_size);
void krnl_lap_filter_soft(hls::stream<ap_axiu<32,0,0,0> >& ins, hls::stream<ap_axiu<32,0,0,0> >& outs,
        int32_t x_size, int32_t y_size);

const char INPUT_BMP_FILE[] = "test.bmp";
const char OUTPUT_BMP_FILE[] = "lap.bmp";

// Testbench for krnl_lap_filter.
// Loads INPUT_BMP_FILE, packs every pixel into one 32-bit word (B in bits 7..0,
// G in bits 15..8, R in bits 23..16), feeds the same beats to krnl_lap_filter and to
// the software reference krnl_lap_filter_soft, compares the two output streams beat
// by beat and finally writes the hardware result to OUTPUT_BMP_FILE.
// Returns 0 on success and 1 on any failure (unreadable image or mismatch).
int main(){
    hls::stream<ap_axiu<32,0,0,0> > ins;
    hls::stream<ap_axiu<32,0,0,0> > ins_soft;
    hls::stream<ap_axiu<32,0,0,0> > outs;
    hls::stream<ap_axiu<32,0,0,0> > outs_soft;
    ap_axiu<32,0,0,0> pix;
    ap_axiu<32,0,0,0> vals, vals_soft;

    // Read the BMP file into a Mat
    cv::Mat img = cv::imread(INPUT_BMP_FILE);
    if (img.empty()){
        fprintf(stderr, "Can't read %s\n", INPUT_BMP_FILE);
        return(1);
    }

    // Allocate the pixel buffers. std::vector takes an element count, not a byte
    // count; hw_lap is indexed over the full cols*rows frame below.
    const size_t num_pixels = (size_t)img.cols * (size_t)img.rows;
    std::vector<int32_t> rd_bmp(num_pixels);
    std::vector<int32_t> hw_lap(num_pixels);

    // Pack the BMP pixels into rd_bmp
    cv::Mat_<cv::Vec3b> dst_vec3b = cv::Mat_<cv::Vec3b>(img);
    for (int y=0; y<img.rows; y++){
        for (int x=0; x<img.cols; x++){
            cv::Vec3b pixel;
            pixel = dst_vec3b(y,x);
            // blue - pixel[0]; green - pixel[1]; red - pixel[2];
            rd_bmp[y*img.cols+x] = (pixel[0] & 0xff) | ((pixel[1] & 0xff)<<8) | ((pixel[2] & 0xff)<<16);
        }
    }

    // Prepare the input beats for both streams
    for(int j=0; j < img.rows; j++){
        for(int i=0; i < img.cols; i++){
            pix.data = (ap_int<32>)rd_bmp[(j*img.cols)+i];
            if ((i==img.cols-1) && (j==img.rows-1)) // assert last on the final beat of the frame
                pix.last = 1;
            else
                pix.last = 0;
            ins << pix;
            ins_soft << pix;
        }
    }

    krnl_lap_filter(ins, outs,img.cols, img.rows); // hardware Laplacian filter
    krnl_lap_filter_soft(ins_soft, outs_soft,img.cols, img.rows); // software Laplacian filter

    // Compare the hardware and software Laplacian filter outputs
    for (int y=0; y<img.rows; y++){
        for (int x=0; x<img.cols; x++){
            outs >> vals;
            outs_soft >> vals_soft;
            // Convert to plain integers first: class-type ap_int values must not be
            // passed through printf's variadic argument list.
            const int32_t hw_word = (int32_t)vals.data;
            const int32_t sw_word = (int32_t)vals_soft.data;
            hw_lap[y*img.cols+x] = hw_word;
            if (hw_word != sw_word){
                // x and y are int, so they are printed with %d (not %ld)
                printf("ERROR HW and SW results mismatch x = %d, y = %d, HW = %x, SW = %x\n",
                        x, y, (unsigned int)hw_word, (unsigned int)sw_word);
                return(1);
            }
        }
    }
    printf("Success HW and SW results match\n");

    const int lap_rows = img.rows;
    const int lap_cols = img.cols;
    cv::Mat wbmpf(lap_rows, lap_cols, CV_8UC3);

    // Unpack the Laplacian-filtered image into wbmpf
    cv::Mat_<cv::Vec3b> lap_vec3b = cv::Mat_<cv::Vec3b>(wbmpf);
    for (int y=0; y<wbmpf.rows; y++){
        for (int x=0; x<wbmpf.cols; x++){
            cv::Vec3b pixel;
            pixel = lap_vec3b(y,x);
            int32_t rgb = hw_lap[y*wbmpf.cols+x];
            pixel[0] = (rgb & 0xff); // blue
            pixel[1] = (rgb & 0xff00) >> 8; // green
            pixel[2] = (rgb & 0xff0000) >> 16; // red
            lap_vec3b(y,x) = pixel;
        }
    }

    // Write the hardware Laplacian filter result to a bmp file
    cv::imwrite(OUTPUT_BMP_FILE, wbmpf);
    return(0);
}
// RGBからYへの変換
// RGBのフォーマットは、{8'd0, R(8bits), G(8bits), B(8bits)}, 1pixel = 32bits
// 輝度信号Yのみに変換する。変換式は、Y = 0.299R + 0.587G + 0.114B
// "YUVフォーマット及び YUV<->RGB変換"を参考にした。http://vision.kuee.kyoto-u.ac.jp/~hiroaki/firewire/yuv.html
// 2013/09/27 : float を止めて、すべてint にした
// Software reference: converts one packed pixel to an integer luminance value.
//   rgb - pixel with blue in bits 7..0, green in bits 15..8, red in bits 23..16;
//         bits 31..24 are ignored
// Returns (77*R + 150*G + 29*B) >> 8; the coefficients are 0.299 / 0.587 / 0.114
// scaled by 256 so the whole computation stays in integers.
int32_t conv_rgb2y_soft(int32_t rgb){
    const int32_t b = rgb & 0xff;
    const int32_t g = (rgb >> 8) & 0xff;
    const int32_t r = (rgb >> 16) & 0xff;
    // scale back down by 256 after the weighted sum
    return (77*r + 150*g + 29*b) >> 8;
}
// ラプラシアンフィルタ
// x0y0 x1y0 x2y0 -1 -1 -1
// x0y1 x1y1 x2y1 -1 8 -1
// x0y2 x1y2 x2y2 -1 -1 -1
// Software reference of the 3x3 Laplacian kernel (centre weight 8, all eight
// neighbours -1); argument xCyR is the value in column C, row R of the window.
// Returns the magnitude of the response when it is negative, 255 when it exceeds
// 255, and the response itself otherwise.
// NOTE(review): a negative response is only negated, not clamped, so the result can
// exceed 255 (e.g. centre 0 with all neighbours 255 yields 2040) - confirm whether
// that is intended before relying on an 8-bit result.
int32_t laplacian_fil_soft(int32_t x0y0, int32_t x1y0, int32_t x2y0, int32_t x0y1,
        int32_t x1y1, int32_t x2y1, int32_t x0y2, int32_t x1y2, int32_t x2y2)
{
    const int32_t top_row    = x0y0 + x1y0 + x2y0;
    const int32_t side_cells = x0y1 + x2y1;
    const int32_t bottom_row = x0y2 + x1y2 + x2y2;

    int32_t result = 8*x1y1 - (top_row + side_cells + bottom_row);
    if (result < 0) {
        result = -result;
    } else if (result > 255) {
        result = 255;
    }
    return result;
}
// krnl_lap_filter_soft: software reference of the streaming 3x3 Laplacian filter.
// For each of the x_size*y_size input beats it converts the pixel to luminance with
// conv_rgb2y_soft, slides a 3x3 window (pix_mat) fed by a two-row history
// (line_buf), calls laplacian_fil_soft on the window and emits one output beat.
//   ins    - input pixel stream
//   outs   - output stream; one beat per input beat
//   x_size - pixels per row; line_buf has 1920 columns and is indexed with x, so
//            x_size must not exceed 1920
//   y_size - number of rows
// line_buf and pix_mat are not initialised; the beats that would depend on the
// unfilled entries (x < 2 or y < 2) have their data forced to 0 below.
// After the frame, remaining input beats are consumed until one with last == 1 is seen.
// NOTE: the label LOOP_X is on the loop over y and LOOP_Y on the loop over x.
void krnl_lap_filter_soft(hls::stream<ap_axiu<32,0,0,0> >& ins, hls::stream<ap_axiu<32,0,0,0> >& outs,
int32_t x_size, int32_t y_size){
ap_axiu<32,0,0,0> pix;
ap_axiu<32,0,0,0> lap;
// luminance of the two previous rows: [0] is the older row, [1] the newer one
int32_t line_buf[2][1920]; // supported HD resolution
// 3x3 luminance window; column 2 holds the newest column
int32_t pix_mat[3][3];
int32_t lap_fil_val;
LOOP_X : for (int y=0; y<y_size; y++){
LOOP_Y : for (int x=0; x<x_size; x++){
ins >> pix; // read one beat from the AXI4-Stream input
// shift the window one column to the left
Loop4 : for (int k=0; k<3; k++){
Loop5 : for (int m=0; m<2; m++){
pix_mat[k][m] = pix_mat[k][m+1];
}
}
// fill the new right-hand column: two buffered rows plus the current pixel
pix_mat[0][2] = line_buf[0][x];
pix_mat[1][2] = line_buf[1][x];
int32_t y_val = conv_rgb2y_soft(pix.data);
pix_mat[2][2] = y_val;
line_buf[0][x] = line_buf[1][x]; // shift the row history up by one row
line_buf[1][x] = y_val;
lap_fil_val = laplacian_fil_soft( pix_mat[0][0], pix_mat[0][1], pix_mat[0][2],
pix_mat[1][0], pix_mat[1][1], pix_mat[1][2],
pix_mat[2][0], pix_mat[2][1], pix_mat[2][2]);
lap.data = (lap_fil_val<<16)+(lap_fil_val<<8)+lap_fil_val; // put the same value into the R, G and B bytes
if (x<2 || y<2) // the first two rows and the first two columns of every row are invalid data, so output 0
lap.data = 0;
if (x==(x_size-1) && y==(y_size-1)) // assert TLAST on the final beat of the frame
lap.last = 1;
else
lap.last = 0;
outs << lap; // write to the output stream
}
}
LOOP_WAIT_LAST: while(pix.last == 0) { // keep reading until a beat with last == 1 is seen
ins >> pix;
};
}
// square_root8.cpp
// 2020/01/22 by marsee
#include <stdint.h>
// Integer square root with an 8-bit result, found bit by bit from the MSB down.
//   val    - value whose square root is wanted
//   result - receives the largest r in 0..255 with r*r <= val (0 if val is negative)
// Always returns 0.
int square_root8(int32_t val, int32_t *result){
#pragma HLS PIPELINE II=1
    int32_t root = 0;
    int32_t step = 1 << 7; // weight of the bit currently being decided
    for (int n = 0; n < 8; ++n, step >>= 1) {
        // keep the bit only if the enlarged root still squares to at most val
        const int32_t trial = root + step;
        if (trial * trial <= val) {
            root = trial;
        }
    }
    *result = root;
    return 0;
}
// square_root8_tb.cpp
// 2020/01/22 by marsee
#include <stdio.h>
#include <stdint.h>
int square_root8(int32_t val, int32_t *result);

// Testbench for square_root8: prints the result for every input 0..255 and then
// for 255*255+1, one "i = ..., result = ..." line per input.
int main(){
    int32_t root;

    int input = 0;
    while (input < 256) {
        square_root8(input, &root);
        printf("i = %d, result = %d\n", input, root);
        ++input;
    }

    // one value just above the largest perfect square an 8-bit root can reach
    const int beyond = 255*255+1;
    square_root8(beyond, &root);
    printf("i = %d, result = %d\n", beyond, root);

    return 0;
}
// square_root8.cpp
// 2020/01/22 by marsee
#include <stdint.h>
// Integer square root with an 8-bit result, found bit by bit from the MSB down.
// This variant never undoes an addition: a candidate is formed and only adopted
// when its square does not exceed val.
//   val    - value whose square root is wanted
//   result - receives the largest r in 0..255 with r*r <= val (0 if val is negative)
// Always returns 0.
int square_root8(int32_t val, int32_t *result){
#pragma HLS PIPELINE II=1
    int32_t accepted = 0;
    for (int shift = 7; shift >= 0; shift--) {
        const int32_t candidate = accepted + (1 << shift);
        accepted = (candidate * candidate > val) ? accepted : candidate;
    }
    *result = accepted;
    return 0;
}
// square_root.cpp
// 2020/01/22 by marsee
#include <stdint.h>
// Integer square root with a selectable result width, found bit by bit from the
// MSB down.
//   val     - value whose square root is wanted
//   result  - receives the largest r below 2^bit_len with r*r <= val
//             (0 if val is negative or bit_len <= 0)
//   bit_len - number of result bits to resolve; values above 31 are treated as 31
//             because the result is an int32_t and a wider candidate would
//             overflow when squared
// Always returns 0.
int square_root(int64_t val, int32_t *result, int32_t bit_len){
    // Cap the width so that every candidate stays below 2^31 and its square below 2^62.
    const int32_t bits = (bit_len > 31) ? 31 : bit_len;
    int64_t root = 0;
    for(int i=(bits-1); i>=0; --i){
#pragma HLS LOOP_TRIPCOUNT min=1 max=16 avg=7
        // 64-bit shift: a plain `1 << i` is an int shift and is undefined for i >= 31
        const int64_t trial = root + (int64_t(1) << i);
        if(trial * trial <= val){
            root = trial;
        }
    }
    *result = int32_t(root);
    return(0);
}
// square_root_tb.cpp
// 2020/01/22 by marsee
#include <stdio.h>
#include <stdint.h>
int square_root(int64_t val, int32_t *result, int32_t bit_len);

// Testbench for square_root with an 8-bit result: prints the result for every input
// 0..255 and then for 255*255+1, one "i = ..., result = ..." line per input.
int main(){
    const int32_t result_bits = 8;
    int32_t root;

    for (int input = 0; input != 256; ++input) {
        square_root(input, &root, result_bits);
        printf("i = %d, result = %d\n", input, root);
    }

    // one value just above the largest perfect square an 8-bit root can reach
    const int beyond = 255*255+1;
    square_root(beyond, &root, result_bits);
    printf("i = %d, result = %d\n", beyond, root);

    return 0;
}
INFO: [SIM 2] *************** CSIM start ***************
INFO: [SIM 4] CSIM will launch GCC as the compiler.
Compiling ../../../square_root_tb.cpp in debug mode
Generating csim.exe
i = 0, result = 0
i = 1, result = 1
i = 2, result = 1
i = 3, result = 1
i = 4, result = 2
i = 5, result = 2
i = 6, result = 2
i = 7, result = 2
i = 8, result = 2
i = 9, result = 3
i = 10, result = 3
i = 11, result = 3
i = 12, result = 3
i = 13, result = 3
i = 14, result = 3
i = 15, result = 3
i = 16, result = 4
i = 17, result = 4
i = 18, result = 4
i = 19, result = 4
i = 20, result = 4
i = 21, result = 4
i = 22, result = 4
i = 23, result = 4
i = 24, result = 4
i = 25, result = 5
i = 26, result = 5
i = 27, result = 5
i = 28, result = 5
i = 29, result = 5
i = 30, result = 5
i = 31, result = 5
i = 32, result = 5
i = 33, result = 5
i = 34, result = 5
i = 35, result = 5
i = 36, result = 6
i = 37, result = 6
i = 38, result = 6
i = 39, result = 6
i = 40, result = 6
i = 41, result = 6
i = 42, result = 6
i = 43, result = 6
i = 44, result = 6
i = 45, result = 6
i = 46, result = 6
i = 47, result = 6
i = 48, result = 6
i = 49, result = 7
i = 50, result = 7
i = 51, result = 7
i = 52, result = 7
i = 53, result = 7
i = 54, result = 7
i = 55, result = 7
i = 56, result = 7
i = 57, result = 7
i = 58, result = 7
i = 59, result = 7
i = 60, result = 7
i = 61, result = 7
i = 62, result = 7
i = 63, result = 7
i = 64, result = 8
i = 65, result = 8
i = 66, result = 8
i = 67, result = 8
i = 68, result = 8
i = 69, result = 8
i = 70, result = 8
i = 71, result = 8
i = 72, result = 8
i = 73, result = 8
i = 74, result = 8
i = 75, result = 8
i = 76, result = 8
i = 77, result = 8
i = 78, result = 8
i = 79, result = 8
i = 80, result = 8
i = 81, result = 9
i = 82, result = 9
i = 83, result = 9
i = 84, result = 9
i = 85, result = 9
i = 86, result = 9
i = 87, result = 9
i = 88, result = 9
i = 89, result = 9
i = 90, result = 9
i = 91, result = 9
i = 92, result = 9
i = 93, result = 9
i = 94, result = 9
i = 95, result = 9
i = 96, result = 9
i = 97, result = 9
i = 98, result = 9
i = 99, result = 9
i = 100, result = 10
i = 101, result = 10
i = 102, result = 10
i = 103, result = 10
i = 104, result = 10
i = 105, result = 10
i = 106, result = 10
i = 107, result = 10
i = 108, result = 10
i = 109, result = 10
i = 110, result = 10
i = 111, result = 10
i = 112, result = 10
i = 113, result = 10
i = 114, result = 10
i = 115, result = 10
i = 116, result = 10
i = 117, result = 10
i = 118, result = 10
i = 119, result = 10
i = 120, result = 10
i = 121, result = 11
i = 122, result = 11
i = 123, result = 11
i = 124, result = 11
i = 125, result = 11
i = 126, result = 11
i = 127, result = 11
i = 128, result = 11
i = 129, result = 11
i = 130, result = 11
i = 131, result = 11
i = 132, result = 11
i = 133, result = 11
i = 134, result = 11
i = 135, result = 11
i = 136, result = 11
i = 137, result = 11
i = 138, result = 11
i = 139, result = 11
i = 140, result = 11
i = 141, result = 11
i = 142, result = 11
i = 143, result = 11
i = 144, result = 12
i = 145, result = 12
i = 146, result = 12
i = 147, result = 12
i = 148, result = 12
i = 149, result = 12
i = 150, result = 12
i = 151, result = 12
i = 152, result = 12
i = 153, result = 12
i = 154, result = 12
i = 155, result = 12
i = 156, result = 12
i = 157, result = 12
i = 158, result = 12
i = 159, result = 12
i = 160, result = 12
i = 161, result = 12
i = 162, result = 12
i = 163, result = 12
i = 164, result = 12
i = 165, result = 12
i = 166, result = 12
i = 167, result = 12
i = 168, result = 12
i = 169, result = 13
i = 170, result = 13
i = 171, result = 13
i = 172, result = 13
i = 173, result = 13
i = 174, result = 13
i = 175, result = 13
i = 176, result = 13
i = 177, result = 13
i = 178, result = 13
i = 179, result = 13
i = 180, result = 13
i = 181, result = 13
i = 182, result = 13
i = 183, result = 13
i = 184, result = 13
i = 185, result = 13
i = 186, result = 13
i = 187, result = 13
i = 188, result = 13
i = 189, result = 13
i = 190, result = 13
i = 191, result = 13
i = 192, result = 13
i = 193, result = 13
i = 194, result = 13
i = 195, result = 13
i = 196, result = 14
i = 197, result = 14
i = 198, result = 14
i = 199, result = 14
i = 200, result = 14
i = 201, result = 14
i = 202, result = 14
i = 203, result = 14
i = 204, result = 14
i = 205, result = 14
i = 206, result = 14
i = 207, result = 14
i = 208, result = 14
i = 209, result = 14
i = 210, result = 14
i = 211, result = 14
i = 212, result = 14
i = 213, result = 14
i = 214, result = 14
i = 215, result = 14
i = 216, result = 14
i = 217, result = 14
i = 218, result = 14
i = 219, result = 14
i = 220, result = 14
i = 221, result = 14
i = 222, result = 14
i = 223, result = 14
i = 224, result = 14
i = 225, result = 15
i = 226, result = 15
i = 227, result = 15
i = 228, result = 15
i = 229, result = 15
i = 230, result = 15
i = 231, result = 15
i = 232, result = 15
i = 233, result = 15
i = 234, result = 15
i = 235, result = 15
i = 236, result = 15
i = 237, result = 15
i = 238, result = 15
i = 239, result = 15
i = 240, result = 15
i = 241, result = 15
i = 242, result = 15
i = 243, result = 15
i = 244, result = 15
i = 245, result = 15
i = 246, result = 15
i = 247, result = 15
i = 248, result = 15
i = 249, result = 15
i = 250, result = 15
i = 251, result = 15
i = 252, result = 15
i = 253, result = 15
i = 254, result = 15
i = 255, result = 15
i = 65026, result = 255
INFO: [SIM 1] CSim done with 0 errors.
INFO: [SIM 3] *************** CSIM finish ***************
// krnl_dma_read.cpp
// 2020/01/21 by marsee
#include <stdint.h>
#include <ap_int.h>
#include <hls_stream.h>
#include <ap_axi_sdata.h>
//extern "C" {
// dma_read: memory-to-stream reader for one frame.
// Reads x_size*y_size words from inm in row-major order (index x_size*y + x) and
// writes one beat per word to `outs`, with last set to 1 only on the final beat.
//   inm    - source buffer (m_axi port, bundle gmem, offset=slave per the pragma below)
//   outs   - output stream (axis port)
//   x_size - number of words per row (s_axilite, bundle control)
//   y_size - number of rows (s_axilite, bundle control)
// Only pix.data and pix.last are assigned; the other fields of the beat are left
// as they are.
// The LOOP_TRIPCOUNT pragmas give 48..600 rows and 64..800 columns as the expected range.
void dma_read(volatile int32_t *inm, hls::stream<ap_axiu<32,0,0,0> >& outs, int32_t x_size, int32_t y_size){
#pragma HLS INTERFACE s_axilite port=return bundle=control
#pragma HLS INTERFACE s_axilite port=y_size bundle=control
#pragma HLS INTERFACE s_axilite port=x_size bundle=control
#pragma HLS INTERFACE axis register both port=outs
#pragma HLS INTERFACE m_axi depth=3072 port=inm offset=slave bundle=gmem
ap_axiu<32,0,0,0> pix;
LOOP_DRY: for(int y=0; y<y_size; y++){
#pragma HLS LOOP_TRIPCOUNT min=48 max=600
LOOP_DRX: for(int x=0; x<x_size; x++){
#pragma HLS LOOP_TRIPCOUNT min=64 max=800
#pragma HLS PIPELINE II=1
pix.data = inm[x_size*y+x];
// mark the final beat of the frame
if(x==(x_size-1) && y==(y_size-1))
pix.last = 1;
else
pix.last = 0;
outs << pix;
}
}
}
//}
// krnl_dma_read_tb.cpp
// 2020/01/21 by marsee
#include <stdio.h>
#include <stdint.h>
#include <stdlib.h>
#include <string.h>
#include <ap_int.h>
#include <hls_stream.h>
#include <iostream>
#include <fstream>
#include <ap_axi_sdata.h>
#include "bmp_header.h"
void dma_read(volatile int32_t *inm, hls::stream<ap_axiu<32,0,0,0> >& outs, int32_t x_size, int32_t y_size);

// Testbench for dma_read.
// Reads test.bmp (24 bits per pixel, rows stored bottom-up), packs every pixel into
// one 32-bit word (B in bits 7..0, G in bits 15..8, R in bits 23..16) with the rows
// flipped to top-down order, runs dma_read over the buffer, checks that every beat
// equals the source word and writes the streamed data back out as dma_read.bmp.
// Returns 0 on success, 1 on a data mismatch; exits with status 1 on I/O or
// allocation errors.
int main(){
    BITMAPFILEHEADER bmpfhr; // BMP file header (for read)
    BITMAPINFOHEADER bmpihr; // BMP info header (for read)
    FILE *fbmpr, *fbmpw;
    int32_t *rd_bmp, *dmar;
    int32_t blue, green, red;
    hls::stream<ap_axiu<32,0,0,0> > outs;
    ap_axiu<32,0,0,0> vals;

    if ((fbmpr = fopen("test.bmp", "rb")) == NULL){ // open test.bmp
        fprintf(stderr, "Can't open test.bmp by binary read mode\n");
        exit(1);
    }

    // Read the bmp headers. The file header is read field by field; the info header
    // is read as one record.
    size_t header_items = 0;
    header_items += fread(&bmpfhr.bfType, sizeof(uint16_t), 1, fbmpr);
    header_items += fread(&bmpfhr.bfSize, sizeof(uint32_t), 1, fbmpr);
    header_items += fread(&bmpfhr.bfReserved1, sizeof(uint16_t), 1, fbmpr);
    header_items += fread(&bmpfhr.bfReserved2, sizeof(uint16_t), 1, fbmpr);
    header_items += fread(&bmpfhr.bfOffBits, sizeof(uint32_t), 1, fbmpr);
    header_items += fread(&bmpihr, sizeof(BITMAPINFOHEADER), 1, fbmpr);
    if (header_items != 6){
        fprintf(stderr, "Can't read the BMP headers of test.bmp\n");
        exit(1);
    }
    // The pixel loops below read 3 bytes per pixel and flip the row order, so only
    // 24-bit images with positive dimensions are supported.
    if (bmpihr.biWidth <= 0 || bmpihr.biHeight <= 0 || bmpihr.biBitCount != 24){
        fprintf(stderr, "test.bmp must be a 24-bit BMP with positive width and height\n");
        exit(1);
    }

    // Each BMP row is padded to a multiple of 4 bytes.
    const int row_padding = (4 - (bmpihr.biWidth * 3) % 4) % 4;
    const size_t num_pixels = (size_t)bmpihr.biWidth * (size_t)bmpihr.biHeight;

    // Allocate the memory for the pixels
    if ((rd_bmp =(int32_t *)malloc(sizeof(int32_t) * num_pixels)) == NULL){
        fprintf(stderr, "Can't allocate rd_bmp memory\n");
        exit(1);
    }
    if ((dmar =(int32_t *)malloc(sizeof(int32_t) * num_pixels)) == NULL){
        fprintf(stderr, "Can't allocate dmar memory\n");
        exit(1);
    }

    // Store the BMP pixels in rd_bmp; the row order has to be reversed while doing so
    for(int y=0; y<bmpihr.biHeight; y++){
        for(int x=0; x<bmpihr.biWidth; x++){
            blue = fgetc(fbmpr);
            green = fgetc(fbmpr);
            red = fgetc(fbmpr);
            rd_bmp[((bmpihr.biHeight-1)-y)*bmpihr.biWidth+x] = (int32_t)((blue & 0xff) | ((green & 0xff)<<8) | ((red & 0xff)<<16));
        }
        for(int p=0; p<row_padding; p++){ // skip the padding at the end of the row
            fgetc(fbmpr);
        }
    }
    fclose(fbmpr);

    dma_read(rd_bmp, outs, bmpihr.biWidth, bmpihr.biHeight);

    // Check the values that were transferred by the DMA
    for(int y=0; y<bmpihr.biHeight; y++){
        for(int x=0; x<bmpihr.biWidth; x++){
            outs >> vals;
            dmar[(y*bmpihr.biWidth)+x] = (int32_t)vals.data;
            if ((int32_t)vals.data != rd_bmp[(y*bmpihr.biWidth)+x]){
                // x and y are int, so they are printed with %d (not %ld)
                printf("ERROR HW and SW results mismatch x = %d, y = %d, DMAR = %d, ORG = %d\n", x, y, (int)vals.data, (int)rd_bmp[(y*bmpihr.biWidth)+x]);
                return(1);
            }
        }
    }
    std::cout << "Success DMA READ results match" << std::endl;
    std::cout << std::endl;

    // Write the DMA read result to dma_read.bmp
    if ((fbmpw=fopen("dma_read.bmp", "wb")) == NULL){
        fprintf(stderr, "Can't open dma_read.bmp by binary write mode\n");
        exit(1);
    }
    // Write the BMP file headers
    fwrite(&bmpfhr.bfType, sizeof(uint16_t), 1, fbmpw);
    fwrite(&bmpfhr.bfSize, sizeof(uint32_t), 1, fbmpw);
    fwrite(&bmpfhr.bfReserved1, sizeof(uint16_t), 1, fbmpw);
    fwrite(&bmpfhr.bfReserved2, sizeof(uint16_t), 1, fbmpw);
    fwrite(&bmpfhr.bfOffBits, sizeof(uint32_t), 1, fbmpw);
    fwrite(&bmpihr, sizeof(BITMAPINFOHEADER), 1, fbmpw);

    // Write the RGB data with the row order reversed again
    for (int y=0; y<bmpihr.biHeight; y++){
        for (int x=0; x<bmpihr.biWidth; x++){
            const int32_t word = dmar[((bmpihr.biHeight-1)-y)*bmpihr.biWidth+x];
            blue = word & 0xff;
            green = (word >> 8) & 0xff;
            red = (word >> 16) & 0xff;
            fputc(blue, fbmpw);
            fputc(green, fbmpw);
            fputc(red, fbmpw);
        }
        for (int p=0; p<row_padding; p++){ // restore the padding at the end of the row
            fputc(0, fbmpw);
        }
    }
    fclose(fbmpw);
    free(rd_bmp);
    free(dmar);
    return(0);
}
// bmp_header.h
// BMP ファイルフォーマットから引用させて頂きました
// http://www.kk.iij4u.or.jp/~kondo/bmp/
//
// 2017/05/04 : takseiさんのご指摘によりintX_tを使った宣言に変更。takseiさんありがとうございました
// 変数の型のサイズの違いによってLinuxの64ビット版では動作しなかったためです
// http://marsee101.blog19.fc2.com/blog-entry-3354.html#comment2808
//
#include <stdio.h>
#include <stdint.h>
// BITMAPFILEHEADER 14bytes
// Leading header of a BMP file; the field comments follow the BMP file format.
// NOTE(review): the 14 bytes refer to the on-disk layout. In memory a compiler will
// typically insert padding before bfSize (a uint32_t following a uint16_t), so
// confirm sizeof before reading or writing this struct as a single block.
typedef struct tagBITMAPFILEHEADER {
uint16_t bfType;      // file type signature
uint32_t bfSize;      // size of the file in bytes
uint16_t bfReserved1; // reserved
uint16_t bfReserved2; // reserved
uint32_t bfOffBits;   // offset from the start of the file to the pixel data
} BITMAPFILEHEADER;
// BITMAPINFOHEADER 40bytes
// Image description header that follows the file header in a BMP file; the field
// comments follow the BMP file format. The members add up to 40 bytes (three
// 32-bit, two 16-bit and six more 32-bit fields).
typedef struct tagBITMAPINFOHEADER{
uint32_t biSize;          // size of this header in bytes
int32_t biWidth;          // image width in pixels
int32_t biHeight;         // image height in pixels
uint16_t biPlanes;        // number of colour planes
uint16_t biBitCount;      // bits per pixel
uint32_t biCompression;   // compression type
uint32_t biSizeImage;     // size of the pixel data in bytes
int32_t biXPixPerMeter;   // horizontal resolution in pixels per metre
int32_t biYPixPerMeter;   // vertical resolution in pixels per metre
uint32_t biClrUsed;       // number of palette colours used
uint32_t biClrImporant;   // number of important colours (member name kept as spelled)
} BITMAPINFOHEADER;
// One 24-bit pixel as three 8-bit channels, declared in blue, green, red order.
typedef struct BMP24bitsFORMAT {
uint8_t blue;
uint8_t green;
uint8_t red;
} BMP24FORMAT;
を入力した。--config ../src/krnl_stream_vadd_vmult.ini
ERROR: [CFGEN 83-2284] No stream resources found that can accomodate compute unit "krnl_stream_vadd_1.out"
ERROR: [SYSTEM_LINK 82-36] [20:11:30] cfgen failed
Time (s): cpu = 00:00:00.28 ; elapsed = 00:00:00.29 . Memory (MB): peak = 296.441 ; gain = 0.000 ; free physical = 14175 ; free virtual = 39277
ERROR: [SYSTEM_LINK 82-62] Error generating design file for /home/masaaki/Vitis_Work/2019.2/streaming_k2k_mm2/Hardware/krnl_stream_vadd_vmult.build/link/sys_link/cfgraph/cfgen_cfgraph.xml, command: /media/masaaki/Ubuntu_Disk/tools/Xilinx/Vitis/2019.2/bin/cfgen -nk krnl_stream_vadd:1 -nk krnl_stream_vmult:1 -dmclkid 0 -r /home/masaaki/Vitis_Work/2019.2/streaming_k2k_mm2/Hardware/krnl_stream_vadd_vmult.build/link/sys_link/_sysl/.cdb/xd_ip_db.xml -o /home/masaaki/Vitis_Work/2019.2/streaming_k2k_mm2/Hardware/krnl_stream_vadd_vmult.build/link/sys_link/cfgraph/cfgen_cfgraph.xml
ERROR: [SYSTEM_LINK 82-96] Error applying explicit connections to the system connectivity graph
ERROR: [SYSTEM_LINK 82-79] Unable to create system connectivity graph
INFO: [v++ 60-1442] [20:11:30] Run run_link: Step system_link: Failed
Time (s): cpu = 00:00:05 ; elapsed = 00:00:05 . Memory (MB): peak = 677.906 ; gain = 0.000 ; free physical = 14193 ; free virtual = 39295
ERROR: [v++ 60-661] v++ link run 'run_link' failed
ERROR: [v++ 60-626] Kernel link failed to complete
ERROR: [v++ 60-703] Failed to finish linking
makefile:94: recipe for target 'krnl_stream_vadd_vmult.xclbin' failed
make: *** [krnl_stream_vadd_vmult.xclbin] Error 1
[connectivity]
#stream_connect=<cu_name>.<output_port>:<cu_name>.<input_port>
stream_connect=vadd_1.stream_out:vadd_2.stream_in
v++ -l --config vadd_config.txt ...
// all_layers_template_host.cpp
// 2019/12/25 by marsee
//
// Vitis-Tutorials/docs/mixing-c-rtl-kernels/reference-files/src/host/host_step1.cpp のコードを引用します
// https://github.com/Xilinx/Vitis-Tutorials/blob/master/docs/mixing-c-rtl-kernels/reference-files/src/host/host_step1.cpp
#define CL_HPP_CL_1_2_DEFAULT_BUILD
#define CL_HPP_TARGET_OPENCL_VERSION 120
#define CL_HPP_MINIMUM_OPENCL_VERSION 120
#define CL_HPP_ENABLE_PROGRAM_CONSTRUCTION_FROM_ARRAY_COMPATIBILITY 1
#define CL_USE_DEPRECATED_OPENCL_1_2_APIS
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <vector>
#include <CL/cl2.hpp>
#include <iostream>
#include <fstream>
#include <CL/cl_ext_xilinx.h>
#include <unistd.h>
#include <limits.h>
#include <sys/stat.h>
#include <ap_int.h>
#include <hls_stream.h>
#include <ap_axi_sdata.h>
#include "layer_general.h"
#include "curve_data_0_100.h"
//#include "curve_data_2500_2600.h"
//#include "curve_data_5000_5100.h"
// Host-side configuration constants.
#define ALL_DATA_NUM 300
#define NUM_OF_KERNELS 2
// Input frame geometry: COULMN_PIXELS x ROW_PIXELS = 56 x 10 = 560 = ALL_PIXELS
// (the macro name is spelled COULMN in this file).
#define COULMN_PIXELS 56
#define ROW_PIXELS 10
#define ALL_PIXELS 560
// Number of elements in the dot2 arrays used below.
#define NUM_OF_OUTPUT 3
#define NUM_ITERATIONS 300 // C Simulation
//#define NUM_ITERATIONS 1 // C/RTL CoSimulation 2
// 2-bit unsigned type used for the output of all_layers_soft.
typedef ap_uint<2> output_type;
// 12-bit fixed point with 7 integer bits, truncation and wrap-around on overflow.
typedef ap_fixed<12,7,AP_TRN,AP_WRAP> out_affine_type;
// Prototypes of the two implementations; both are defined outside this section.
void all_layers_dnn(volatile uint32_t *inm, volatile uint32_t *output,
volatile int32_t *dot2, int32_t x_size, int32_t y_size);
int all_layers_soft(hls::stream<ap_axiu<32,1,1,1> >& ins, output_type& output,
float dot2[NUM_OF_OUTPUT]);
// printf-style format for reporting a CPU/device mismatch; expects three int
// arguments (index, CPU result, device result).
static const std::string error_message =
"Error: Result mismatch:\n"
"i = %d CPU result = %d Device result = %d\n";
//Some Library functions to be used.
// Minimal allocator that hands out storage aligned to a 4096-byte boundary, for use
// as the allocator argument of standard containers.
template <typename T>
struct aligned_allocator
{
    using value_type = T;

    // Returns storage for `num` objects of type T, aligned to 4096 bytes.
    // Throws std::bad_alloc when posix_memalign reports a failure.
    T* allocate(std::size_t num)
    {
        void* raw = nullptr;
        const int status = posix_memalign(&raw, 4096, num*sizeof(T));
        if (status != 0) {
            throw std::bad_alloc();
        }
        return reinterpret_cast<T*>(raw);
    }

    // Releases storage obtained from allocate(); the element count is not needed.
    void deallocate(T* p, std::size_t /*num*/)
    {
        free(p);
    }
};
// OCL_CHECK(error, call): runs `call`, then tests `error`; if it is not CL_SUCCESS,
// prints the file, line, the text of the call and the error code, and terminates
// the program with EXIT_FAILURE.
// `call` is expected to store its status in `error` (by assignment or through an
// output argument).
// The macro expands to two plain statements and is not wrapped in a block, so a
// variable declared inside `call` stays visible after the macro; for the same
// reason it must not be used as the unbraced body of an if/else or a loop.
#define OCL_CHECK(error,call) \
call; \
if (error != CL_SUCCESS) { \
printf("%s:%d Error calling " #call ", error code is: %d\n", \
__FILE__,__LINE__, error); \
exit(EXIT_FAILURE); \
}
namespace xcl {

// Returns the accelerator devices of the first OpenCL platform whose name equals
// vendor_name. Exits the program if no such platform exists or if an OpenCL call
// fails (see OCL_CHECK).
std::vector<cl::Device> get_devices(const std::string& vendor_name) {
    size_t i;
    cl_int err;
    std::vector<cl::Platform> platforms;
    OCL_CHECK(err, err = cl::Platform::get(&platforms));
    cl::Platform platform;
    for (i = 0 ; i < platforms.size(); i++){
        platform = platforms[i];
        OCL_CHECK(err, std::string platformName = platform.getInfo<CL_PLATFORM_NAME>(&err));
        if (platformName == vendor_name){
            std::cout << "Found Platform" << std::endl;
            std::cout << "Platform Name: " << platformName.c_str() << std::endl;
            break;
        }
    }
    // the loop ran to completion without a match
    if (i == platforms.size()) {
        std::cout << "Error: Failed to find Xilinx platform" << std::endl;
        exit(EXIT_FAILURE);
    }
    //Getting ACCELERATOR Devices and selecting 1st such device
    std::vector<cl::Device> devices;
    OCL_CHECK(err, err = platform.getDevices(CL_DEVICE_TYPE_ACCELERATOR, &devices));
    return devices;
}

// Convenience wrapper: devices of the platform named "Xilinx".
std::vector<cl::Device> get_xil_devices() {
    return get_devices("Xilinx");
}

// Loads a whole xclbin file into a newly allocated buffer.
//   xclbin_file_name - path of the file to read
//   nb               - receives the size of the file in bytes
// Returns a buffer of nb bytes allocated with new[]; the caller owns it.
// Exits the program if the file is not readable, cannot be opened, is too large
// for `nb`, or cannot be read completely.
char* read_binary_file(const std::string &xclbin_file_name, unsigned &nb)
{
    std::cout << "INFO: Reading " << xclbin_file_name << std::endl;
    if(access(xclbin_file_name.c_str(), R_OK) != 0) {
        printf("ERROR: %s xclbin not available please build\n", xclbin_file_name.c_str());
        exit(EXIT_FAILURE);
    }
    //Loading XCL Bin into char buffer
    std::cout << "Loading: '" << xclbin_file_name.c_str() << "'\n";
    std::ifstream bin_file(xclbin_file_name.c_str(), std::ifstream::binary);
    if (!bin_file.is_open()) {
        printf("ERROR: %s could not be opened\n", xclbin_file_name.c_str());
        exit(EXIT_FAILURE);
    }
    bin_file.seekg (0, bin_file.end);
    // tellg() reports -1 on failure; check before narrowing to unsigned so that a
    // failure cannot turn into a huge allocation size
    const std::streamoff length = bin_file.tellg();
    if (length < 0 || static_cast<unsigned long long>(length) > UINT_MAX) {
        printf("ERROR: cannot determine a usable size for %s\n", xclbin_file_name.c_str());
        exit(EXIT_FAILURE);
    }
    nb = static_cast<unsigned>(length);
    bin_file.seekg (0, bin_file.beg);
    char *buf = new char [nb];
    bin_file.read(buf, nb);
    if (!bin_file) {
        delete [] buf;
        printf("ERROR: failed to read %s\n", xclbin_file_name.c_str());
        exit(EXIT_FAILURE);
    }
    return buf;
}

} // namespace xcl
int main(int argc, char* argv[]){
hls::stream<ap_axiu<32,1,1,1> > ins_soft;
output_type output_soft;
float dot2_soft[NUM_OF_OUTPUT];
ap_axiu<32,1,1,1> pix;
int hw_err_cnt = 0;
int sw_err_cnt = 0;
const char* xclbinFilename;
if (argc==2) {
xclbinFilename = argv[1];
std::cout <<"Using FPGA binary file specfied through the command line: " << xclbinFilename << std::endl;
}
else {
xclbinFilename = "../lap_filter_axim.xclbin";
std::cout << "No FPGA binary file specified through the command line, using:" << xclbinFilename <<std::endl;
}
// t_train256[][]を入れるメモリをアロケート
std::vector<int32_t,aligned_allocator<int32_t>> pixel(ROW_PIXELS*COULMN_PIXELS);
size_t pixel_in_bytes = (ROW_PIXELS*COULMN_PIXELS) * sizeof(int32_t);
std::vector<uint32_t,aligned_allocator<uint32_t>> output(1);
size_t output_in_bytes = sizeof(uint32_t);
std::vector<int32_t,aligned_allocator<int32_t>> dot2(NUM_OF_OUTPUT);
size_t dot2_in_bytes = (NUM_OF_OUTPUT * sizeof(int32_t));
std::vector<cl::Device> devices = xcl::get_xil_devices();
cl::Device device = devices[0];
devices.resize(1);
// Creating Context and Command Queue for selected device
cl::Context context(device);
cl::CommandQueue q(context, device, CL_QUEUE_PROFILING_ENABLE);
// Load xclbin
std::cout << "Loading: '" << xclbinFilename << "'\n";
std::ifstream bin_file(xclbinFilename, std::ifstream::binary);
bin_file.seekg (0, bin_file.end);
unsigned nb = bin_file.tellg();
bin_file.seekg (0, bin_file.beg);
char *buf = new char [nb];
bin_file.read(buf, nb);
// Creating Program from Binary File
cl::Program::Binaries bins;
bins.push_back({buf,nb});
cl::Program program(context, devices, bins);
// This call will get the kernel object from program. A kernel is an
// OpenCL function that is executed on the FPGA.
cl::Kernel krnl_all_layers_dnn(program,"all_layers_dnn");
// These commands will allocate memory on the Device. The cl::Buffer objects can
// be used to reference the memory locations on the device.
cl::Buffer pixel_buf(context, CL_MEM_USE_HOST_PTR | CL_MEM_READ_ONLY,
pixel_in_bytes, pixel.data());
cl::Buffer output_buf(context, CL_MEM_USE_HOST_PTR | CL_MEM_READ_WRITE,
output_in_bytes, output.data());
cl::Buffer dot2_buf(context, CL_MEM_USE_HOST_PTR | CL_MEM_READ_ONLY,
dot2_in_bytes, dot2.data());
//set the kernel Arguments
krnl_all_layers_dnn.setArg(0,pixel_buf);
krnl_all_layers_dnn.setArg(1,output_buf);
krnl_all_layers_dnn.setArg(2,dot2_buf);
krnl_all_layers_dnn.setArg(3,COULMN_PIXELS);
krnl_all_layers_dnn.setArg(4,ROW_PIXELS);
for(int i=0; i<NUM_ITERATIONS; i++){
for(int y=0; y<ROW_PIXELS; y++){
for(int x=0; x<COULMN_PIXELS; x++){
// 1 画面分のデータを ins、ins_soft に入力する
pix.data = ap_uint<32>(t_train_256[i][y*COULMN_PIXELS+x]);
if (x==0 && y==0) // 最初のデータの時に TUSER を 1 にする
pix.user = 1;
else
pix.user = 0;
if (x == COULMN_PIXELS-1) // 行の最後でTLASTをアサートする
pix.last = 1;
else
pix.last = 0;
ins_soft << pix;
pixel[y*COULMN_PIXELS+x] = uint32_t(t_train_256[i][y*COULMN_PIXELS+x]);
}
}
// Data will be transferred from system memory over PCIe to the FPGA on-board
// DDR memory.
q.enqueueMigrateMemObjects({pixel_buf},0/* 0 means from host*/);
//Launch the Kernel
q.enqueueTask(krnl_all_layers_dnn);
//q.enqueueTask(krnl_all_layers_dnn);
// The result of the previous kernel execution will need to be retrieved in
// order to view the results. This call will transfer the data from FPGA to
// source_results vector
q.enqueueMigrateMemObjects({output_buf, dot2_buf},CL_MIGRATE_MEM_OBJECT_HOST);
all_layers_soft(ins_soft, output_soft, dot2_soft);
int t_test_num = 0;
for(int m=0; m<NUM_OF_OUTPUT; m++){
if(t_test[i][m] == 1.0f){
t_test_num = m;
break;
}
}
// out と out_soft を比較する
/* cout << "output" << " = " << int(output) << " output_soft = " << int(output_soft) << endl;
for(int j=0; j<NUM_OF_OUTPUT; j++){
cout << "dot2[" << j << "] = " << float(dot2[j]) << " dot2_soft[" << j << "] = " << dot2_soft[j] << endl;
} */
if(int(output[0]) != t_test_num){
std::cout << "hw_error: i = " << i << " output = " << int(output[0]) << " t_test_num = " << t_test_num << std::endl;
hw_err_cnt++;
//return(1);
}
if(int(output_soft) != t_test_num){
std::cout << "sw_error: i = "<< i << " output_soft = " << int(output_soft) << " t_test_num" " = " << t_test_num << std::endl;
sw_err_cnt++;
//return(1);
}
if(int(output[0]) != t_test_num || int(output_soft) != t_test_num){
for(int j=0; j<NUM_OF_OUTPUT; j++){
std::cout << "dot2[" << j << "] = " << std::fixed << std::setprecision(8) << float(dot2[j])/float(256.0) << " dot2_soft[" << j << "] = " << dot2_soft[j] << std::endl;
}
std::cout << std::endl;
}
}
q.finish();
std::cout << "hw_err_cnt = " << hw_err_cnt << " sw_err_cnt = " << sw_err_cnt << std::endl;
return(0);
}
だった。1.41.1
26076a4de974ead31f97692a0d32f90d735645c0
x64
C/C++
Japanese Language Pack
C++ Intellisense
Python
svls-vscode
verilog HDL/SystemVerilog
VHDL
INFO: [SIM 2] *************** CSIM start ***************
INFO: [SIM 4] CSIM will launch GCC as the compiler.
Compiling ../../../all_layers_template_axim.cpp in debug mode
Generating csim.exe
hw_error: i = 25 output = 2 t_test_num = 1
sw_error: i = 25 output_soft = 2 t_test_num = 1
dot2[0] = -5.59375000 dot2_soft[0] = -3.77501726
dot2[1] = 0.12500000 dot2_soft[1] = -0.13269189
dot2[2] = 0.25000000 dot2_soft[2] = 1.61074853
hw_error: i = 30 output = 2 t_test_num = 1
sw_error: i = 30 output_soft = 2 t_test_num = 1
dot2[0] = -6.53125000 dot2_soft[0] = -4.67336369
dot2[1] = 0.40625000 dot2_soft[1] = 0.12951475
dot2[2] = 0.43750000 dot2_soft[2] = 1.71587336
sw_error: i = 31 output_soft = 2 t_test_num = 1
dot2[0] = -7.31250000 dot2_soft[0] = -5.31440449
dot2[1] = 0.90625000 dot2_soft[1] = 0.69655895
dot2[2] = -0.25000000 dot2_soft[2] = 1.00723171
sw_error: i = 35 output_soft = 2 t_test_num = 1
dot2[0] = -7.12500000 dot2_soft[0] = -5.15462875
dot2[1] = 0.50000000 dot2_soft[1] = 0.19586089
dot2[2] = 0.43750000 dot2_soft[2] = 1.79063916
sw_error: i = 36 output_soft = 2 t_test_num = 1
dot2[0] = -7.68750000 dot2_soft[0] = -5.64889669
dot2[1] = 1.03125000 dot2_soft[1] = 0.69646239
dot2[2] = -0.25000000 dot2_soft[2] = 1.09402716
sw_error: i = 40 output_soft = 2 t_test_num = 1
dot2[0] = -7.21875000 dot2_soft[0] = -5.31394196
dot2[1] = 0.59375000 dot2_soft[1] = 0.30034199
dot2[2] = 0.09375000 dot2_soft[2] = 1.52586949
sw_error: i = 41 output_soft = 2 t_test_num = 1
dot2[0] = -8.12500000 dot2_soft[0] = -5.94443941
dot2[1] = 0.87500000 dot2_soft[1] = 0.61903512
dot2[2] = 0.06250000 dot2_soft[2] = 1.28180122
sw_error: i = 42 output_soft = 2 t_test_num = 1
dot2[0] = -10.21875000 dot2_soft[0] = -7.44187164
dot2[1] = 1.31250000 dot2_soft[1] = 1.10615981
dot2[2] = 0.37500000 dot2_soft[2] = 1.35738707
sw_error: i = 45 output_soft = 2 t_test_num = 1
dot2[0] = -8.21875000 dot2_soft[0] = -5.92508411
dot2[1] = 0.68750000 dot2_soft[1] = 0.44851223
dot2[2] = 0.18750000 dot2_soft[2] = 1.43742454
sw_error: i = 46 output_soft = 2 t_test_num = 1
dot2[0] = -10.37500000 dot2_soft[0] = -7.76649952
dot2[1] = 1.06250000 dot2_soft[1] = 0.82863915
dot2[2] = 0.90625000 dot2_soft[2] = 1.88942850
sw_error: i = 47 output_soft = 2 t_test_num = 1
dot2[0] = -12.12500000 dot2_soft[0] = -9.50911713
dot2[1] = 1.75000000 dot2_soft[1] = 1.48399019
dot2[2] = 0.75000000 dot2_soft[2] = 1.85759318
hw_error: i = 75 output = 2 t_test_num = 1
sw_error: i = 75 output_soft = 2 t_test_num = 1
dot2[0] = -5.96875000 dot2_soft[0] = -4.04238653
dot2[1] = -1.03125000 dot2_soft[1] = -1.22402656
dot2[2] = 2.12500000 dot2_soft[2] = 3.36929369
hw_error: i = 76 output = 2 t_test_num = 1
sw_error: i = 76 output_soft = 2 t_test_num = 1
dot2[0] = -6.18750000 dot2_soft[0] = -4.09871578
dot2[1] = -0.21875000 dot2_soft[1] = -0.46985394
dot2[2] = 0.40625000 dot2_soft[2] = 1.61257589
hw_error: i = 80 output = 2 t_test_num = 1
sw_error: i = 80 output_soft = 2 t_test_num = 1
dot2[0] = -6.37500000 dot2_soft[0] = -4.33292818
dot2[1] = -0.75000000 dot2_soft[1] = -0.96692348
dot2[2] = 1.78125000 dot2_soft[2] = 2.98383069
hw_error: i = 81 output = 2 t_test_num = 1
sw_error: i = 81 output_soft = 2 t_test_num = 1
dot2[0] = -6.46875000 dot2_soft[0] = -4.40864801
dot2[1] = 0.06250000 dot2_soft[1] = -0.15780880
dot2[2] = 0.09375000 dot2_soft[2] = 1.26864278
hw_error: i = 85 output = 2 t_test_num = 1
sw_error: i = 85 output_soft = 2 t_test_num = 1
dot2[0] = -6.15625000 dot2_soft[0] = -4.16326904
dot2[1] = -0.59375000 dot2_soft[1] = -0.84592772
dot2[2] = 1.21875000 dot2_soft[2] = 2.42255425
sw_error: i = 86 output_soft = 2 t_test_num = 1
dot2[0] = -6.50000000 dot2_soft[0] = -4.36515617
dot2[1] = 0.09375000 dot2_soft[1] = -0.08813666
dot2[2] = -0.28125000 dot2_soft[2] = 0.97706115
hw_error: i = 90 output = 2 t_test_num = 1
sw_error: i = 90 output_soft = 2 t_test_num = 1
dot2[0] = -5.81250000 dot2_soft[0] = -4.02276182
dot2[1] = -0.53125000 dot2_soft[1] = -0.66237617
dot2[2] = 0.46875000 dot2_soft[2] = 1.72938108
sw_error: i = 91 output_soft = 2 t_test_num = 1
dot2[0] = -5.71875000 dot2_soft[0] = -3.85103607
dot2[1] = 0.15625000 dot2_soft[1] = -0.09844255
dot2[2] = -0.78125000 dot2_soft[2] = 0.42963967
sw_error: i = 95 output_soft = 2 t_test_num = 1
dot2[0] = -6.00000000 dot2_soft[0] = -4.07760668
dot2[1] = -0.03125000 dot2_soft[1] = -0.30057180
dot2[2] = -0.43750000 dot2_soft[2] = 0.90393031
hw_err_cnt = 8 sw_err_cnt = 20
WARNING: Hls::stream 'hls::stream<ap_axiu<32, 1, 1, 1> >.1' contains leftover data, which may result in RTL simulation hanging.
INFO: [SIM 1] CSim done with 0 errors.
INFO: [SIM 3] *************** CSIM finish ***************
// all_layers_template_axim.cpp
// 2018/05/10 by marsee
// 2019/12/28: VitisのRTLカーネルととして使用するためにall_layers_dnnを追加
//
#include <ap_int.h>
#include <hls_stream.h>
#include <ap_axi_sdata.h>
#include <hls_video.h>
#include <stdint.h>
#include "layer_general.h"
#include "all_layers_template.h"
int input_layer(hls::stream<ap_axiu<32,1,1,1> >&ins,
hls::stream<ap_fixed_axis<9,1,1,1> >&outs);
int conv_layer1(hls::stream<ap_fixed_axis<9,1,1,1> >& ins,
hls::stream<ap_fixed_axis<16,6,2,1> >& outs);
int relu_conv1(hls::stream<ap_fixed_axis<16,6,2,1> >& ins,
hls::stream<ap_fixed_axis<16,6,2,1> >& outs);
int max_pooling(hls::stream<ap_fixed_axis<16,6,2,1> >& ins,
hls::stream<ap_fixed_axis<16,6,2,1> >& outs);
int affine_layer1(hls::stream<ap_fixed_axis<16,6,2,1> >& ins,
hls::stream<ap_fixed_axis<19,7,1,1> >& outs);
int relu_affine1(hls::stream<ap_fixed_axis<19,7,1,1> >& ins,
hls::stream<ap_fixed_axis<19,7,1,1> >& outs);
int affine_layer2(hls::stream<ap_fixed_axis<19,7,1,1> >& ins,
hls::stream<ap_fixed_axis<12,7,1,1> >& outs);
int output_layer(hls::stream<ap_fixed_axis<12,7,1,1> >& ins, output_type& output,
out_affine_type dot2[NUMBER_OF_OUTPUT_LAYER]);
// Runs the complete network on one input frame.
//
// The eight layer functions declared above are called in sequence under
// #pragma HLS DATAFLOW; each stage passes its result to the next one through
// a local hls::stream.
//
// ins    : input pixel stream
// output : passed on to output_layer(), which receives the classification result
// dot2   : passed on to output_layer() together with `output`
// Always returns 0; the return values of the layer functions are ignored.
int all_layers(hls::stream<ap_axiu<32,1,1,1> >& ins, output_type& output,
out_affine_type dot2[NUMBER_OF_OUTPUT_LAYER]){
// The interface pragmas and stream depths below are disabled (commented out).
//#pragma HLS INTERFACE s_axilite port=output
//#pragma HLS INTERFACE s_axilite port=dot2
//#pragma HLS ARRAY_PARTITION variable=dot2 complete dim=1
//#pragma HLS INTERFACE s_axilite port=return
//#pragma HLS INTERFACE axis register both port=ins
#pragma HLS DATAFLOW
// One stream per connection between two consecutive stages.
hls::stream<ap_fixed_axis<9,1,1,1> > outs_input_layer;
//#pragma HLS STREAM variable=outs_input_layer depth=560 dim=1
hls::stream<ap_fixed_axis<16,6,2,1> > outs_conv_layer;
//#pragma HLS STREAM variable=outs_conv_layer depth=312 dim=1
hls::stream<ap_fixed_axis<16,6,2,1> > outs_relu_conv1;
//#pragma HLS STREAM variable=outs_relu depth=312 dim=1
hls::stream<ap_fixed_axis<16,6,2,1> > outs_max_pooling;
//#pragma HLS STREAM variable=outs_max_pooling depth=78 dim=1
hls::stream<ap_fixed_axis<19,7,1,1> > outs_affine_layer1;
//#pragma HLS STREAM variable=outs_affine_layer1 depth=100 dim=1
hls::stream<ap_fixed_axis<19,7,1,1> > outs_relu_affine1;
//#pragma HLS STREAM variable=outs_relu_affine1 depth=100 dim=1
hls::stream<ap_fixed_axis<12,7,1,1> > outs_affine_layer2;
//#pragma HLS STREAM variable=outs_affine_layer2 depth=3 dim=1
// Stage chain: input -> conv -> ReLU -> max pooling -> affine1 -> ReLU -> affine2 -> output
input_layer(ins, outs_input_layer);
conv_layer1(outs_input_layer, outs_conv_layer);
relu_conv1(outs_conv_layer, outs_relu_conv1);
max_pooling(outs_relu_conv1, outs_max_pooling);
affine_layer1(outs_max_pooling, outs_affine_layer1);
relu_affine1(outs_affine_layer1, outs_relu_affine1);
affine_layer2(outs_relu_affine1, outs_affine_layer2);
output_layer(outs_affine_layer2, output, dot2);
return(0);
}
extern "C" {
// Memory-mapped wrapper around all_layers().
//
// inm    : input frame, read as x_size*y_size 32-bit words in row-major order
// output : receives the classification result as one 32-bit word
// dot2   : receives NUMBER_OF_OUTPUT_LAYER scores, each multiplied by 256
// x_size : number of words per row
// y_size : number of rows
void all_layers_dnn(volatile uint32_t *inm, volatile uint32_t *output,
volatile int32_t *dot2, int32_t x_size, int32_t y_size){
// inm and output share the gmem bundle, dot2 uses gmem2; the scalar
// arguments and the block-level control go to the "control" AXI4-Lite bundle.
#pragma HLS INTERFACE m_axi port=inm offset = slave bundle = gmem
#pragma HLS INTERFACE m_axi port=output offset = slave bundle = gmem
#pragma HLS INTERFACE m_axi port=dot2 offset = slave bundle = gmem2
#pragma HLS INTERFACE s_axilite port = x_size bundle = control
#pragma HLS INTERFACE s_axilite port = y_size bundle = control
#pragma HLS INTERFACE s_axilite port = return bundle = control
#pragma HLS DATAFLOW
hls::stream<ap_axiu<32, 1, 1, 1> > ins;
ap_axiu<32,1,1,1> element;
out_affine_type dot2_o[NUMBER_OF_OUTPUT_LAYER];
output_type output_o;
// Convert the frame in memory into the stream that all_layers() consumes.
Loop_y: for(int y=0; y<y_size; y++){
#pragma HLS LOOP_TRIPCOUNT min=10 max=10 avg=10
Loop_x: for(int x=0; x<x_size; x++){
#pragma HLS LOOP_TRIPCOUNT min=56 max=56 avg=56
#pragma HLS PIPELINE II=1
element.data = ap_uint<32>(inm[x_size*y+x]);
if(x==0 && y==0) // first data element of the frame: set user
element.user = 1;
else
element.user = 0;
if(x==(x_size-1)) // end of a row: set last
element.last = 1;
else
element.last = 0;
ins << element;
}
}
all_layers(ins, output_o, dot2_o);
*output = uint32_t(output_o);
// Write the scores back as integers: each value is widened to
// ap_fixed<20,7> and multiplied by 256 before the conversion to int32_t.
Loop_dot2: for(int i=0; i<NUMBER_OF_OUTPUT_LAYER; i++){
#pragma HLS PIPELINE II=1
ap_fixed<20,7,AP_TRN,AP_WRAP> dot2_t = dot2_o[i];
dot2[i] = int32_t(dot2_t*256); // 8 bits shift left
}
}
}
// all_layers_template_axim_tb.cpp
// 2018/05/12 by marsee
// 2019/12/28: VitisのRTLカーネルととして使用するためにall_layers_dnnを追加
//
#include <stdio.h>
#include <stdlib.h>
#include <time.h>
#include <string.h>
#include <ap_int.h>
#include <hls_stream.h>
#include <iostream>
#include <fstream>
#include <iomanip>
#include <math.h>
#include <ap_axi_sdata.h>
#include <hls_video.h>
#include <vector>
#include <stdint.h>
#include "layer_general.h"
#include "all_layers_template.h"
#include "curve_data_0_100.h"
//#include "curve_data_2500_2600.h"
//#include "curve_data_5000_5100.h"
#define ALL_DATA_NUM 300
#define NUM_OF_KERNELS 2
#define COULMN_PIXELS 56
#define ROW_PIXELS 10
#define ALL_PIXELS 560
#define NUM_OF_OUTPUT 3
#define NUM_ITERATIONS 300 // C Simulation
//#define NUM_ITERATIONS 2 // C/RTL CoSimulation
void all_layers_dnn(volatile uint32_t *inm, volatile uint32_t *output,
volatile int32_t *dot2, int32_t x_size, int32_t y_size);
int all_layers_soft(hls::stream<ap_axiu<32,1,1,1> >& ins, output_type& output,
float dot2[NUMBER_OF_OUTPUT_LAYER]);
// C simulation test bench: for each of NUM_ITERATIONS samples in t_train_256
// it runs all_layers_dnn (the function under test) and all_layers_soft,
// compares both classifications with the label taken from t_test, and prints
// the mismatch counts. Always returns 0: mismatches are only counted.
int main(){
    using namespace std;
    // Only the software model takes a stream; all_layers_dnn reads pixel[].
    // A second stream that was filled but never read left data behind after
    // every run (the "contains leftover data" warning), so it was removed.
    hls::stream<ap_axiu<32,1,1,1> > ins_soft;
    output_type output_soft;
    uint32_t output;
    float dot2_soft[NUMBER_OF_OUTPUT_LAYER];
    ap_axiu<32,1,1,1> pix;
    int hw_err_cnt = 0;
    int sw_err_cnt = 0;
    vector<uint32_t> pixel(ROW_PIXELS * COULMN_PIXELS);
    vector<int32_t> dot2(NUMBER_OF_OUTPUT_LAYER);

    for(int i=0; i<NUM_ITERATIONS; i++){
        // Prepare the input data
        for(int y=0; y<ROW_PIXELS; y++){
            for(int x=0; x<COULMN_PIXELS; x++){
                // Feed one frame of data to pixel[] and ins_soft
                pix.data = ap_uint<32>(t_train_256[i][y*COULMN_PIXELS+x]);
                pixel[y*COULMN_PIXELS+x] = uint32_t(t_train_256[i][y*COULMN_PIXELS+x]);
                if (x==0 && y==0) // assert TUSER on the first data element
                    pix.user = 1;
                else
                    pix.user = 0;
                if (x == COULMN_PIXELS-1) // assert TLAST at the end of each row
                    pix.last = 1;
                else
                    pix.last = 0;
                ins_soft << pix;
            }
        }

        all_layers_dnn(pixel.data(), &output, dot2.data(), COULMN_PIXELS, ROW_PIXELS);
        all_layers_soft(ins_soft, output_soft, dot2_soft);

        // The expected class is the index of the 1.0 entry in t_test[i]
        int t_test_num = 0;
        for(int m=0; m<NUMBER_OF_OUTPUT_LAYER; m++){
            if(t_test[i][m] == 1.0f){
                t_test_num = m;
                break;
            }
        }

        // Compare the hardware and the software result with the expected class
        if(int(output) != t_test_num){
            cout << "hw_error: i = " << i << " output = " << int(output) << " t_test_num = " << t_test_num << endl;
            hw_err_cnt++;
            //return(1);
        }
        if(int(output_soft) != t_test_num){
            cout << "sw_error: i = "<< i << " output_soft = " << int(output_soft) << " t_test_num" " = " << t_test_num << endl;
            sw_err_cnt++;
            //return(1);
        }
        // On any mismatch dump the raw scores; the hardware values are scaled by 256
        if(int(output) != t_test_num || int(output_soft) != t_test_num){
            for(int j=0; j<NUMBER_OF_OUTPUT_LAYER; j++){
                cout << "dot2[" << j << "] = " << fixed << setprecision(8) << float(dot2[j])/float(256.0) << " dot2_soft[" << j << "] = " << dot2_soft[j] << endl;
            }
            cout << endl;
        }
    }
    cout << "hw_err_cnt = " << hw_err_cnt << " sw_err_cnt = " << sw_err_cnt << endl;
    return(0);
}
// square_cubed_host.cpp
// 2020/01/06 by marsee
//
// Vitis-Tutorials/docs/mixing-c-rtl-kernels/reference-files/src/host/host_step1.cpp のコードを引用します
// https://github.com/Xilinx/Vitis-Tutorials/blob/master/docs/mixing-c-rtl-kernels/reference-files/src/host/host_step1.cpp
#define CL_HPP_CL_1_2_DEFAULT_BUILD
#define CL_HPP_TARGET_OPENCL_VERSION 120
#define CL_HPP_MINIMUM_OPENCL_VERSION 120
#define CL_HPP_ENABLE_PROGRAM_CONSTRUCTION_FROM_ARRAY_COMPATIBILITY 1
#define CL_USE_DEPRECATED_OPENCL_1_2_APIS
#include <vector>
#include <CL/cl2.hpp>
#include <iostream>
#include <fstream>
#include <CL/cl_ext_xilinx.h>
#include <unistd.h>
#include <limits.h>
#include <sys/stat.h>
// printf-style format for reporting a CPU/device result mismatch; it takes
// three %d arguments (index, CPU result, device result).
// NOTE(review): not referenced by any code visible in this part of the file.
static const std::string error_message =
"Error: Result mismatch:\n"
"i = %d CPU result = %d Device result = %d\n";
//Some Library functions to be used.
/// Minimal stateless allocator that returns 4096-byte aligned storage.
///
/// It is used as the allocator of the std::vector objects whose data()
/// pointers are handed to cl::Buffer together with CL_MEM_USE_HOST_PTR.
template <typename T>
struct aligned_allocator
{
    using value_type = T;

    aligned_allocator() noexcept = default;

    /// Converting constructor, needed when a container rebinds the allocator.
    template <typename U>
    aligned_allocator(const aligned_allocator<U>&) noexcept {}

    /// Allocates uninitialised storage for `num` objects of type T.
    /// @throws std::bad_alloc if the byte count overflows or posix_memalign fails.
    T* allocate(std::size_t num)
    {
        // Reject requests whose size in bytes would wrap around std::size_t.
        if (num > static_cast<std::size_t>(-1) / sizeof(T))
            throw std::bad_alloc();
        void* ptr = nullptr;
        if (posix_memalign(&ptr,4096,num*sizeof(T)))
            throw std::bad_alloc();
        return reinterpret_cast<T*>(ptr);
    }

    /// Releases storage obtained from allocate(); the element count is not needed.
    void deallocate(T* p, std::size_t /*num*/) noexcept
    {
        free(p);
    }
};

/// The allocator holds no state, so any two instances compare equal.
template <typename T, typename U>
bool operator==(const aligned_allocator<T>&, const aligned_allocator<U>&) noexcept
{
    return true;
}

template <typename T, typename U>
bool operator!=(const aligned_allocator<T>&, const aligned_allocator<U>&) noexcept
{
    return false;
}
// OCL_CHECK(error, call): executes `call`, then compares `error` with
// CL_SUCCESS. On mismatch it prints the file, the line, the stringified call
// and the error code, and terminates the process with exit(EXIT_FAILURE).
//
// NOTE: the expansion is not wrapped in a block or do { } while (0), and it
// cannot be: get_devices() passes a declaration
// (`std::string platformName = ...`) as `call` and uses that variable after
// the macro. The macro therefore expands to two statements and must not be
// used as the unbraced body of an if/else/loop.
#define OCL_CHECK(error,call) \
call; \
if (error != CL_SUCCESS) { \
printf("%s:%d Error calling " #call ", error code is: %d\n", \
__FILE__,__LINE__, error); \
exit(EXIT_FAILURE); \
}
namespace xcl {

/// Returns the CL_DEVICE_TYPE_ACCELERATOR devices of the first OpenCL platform
/// whose CL_PLATFORM_NAME equals `vendor_name`.
/// Terminates the process with exit(EXIT_FAILURE) if an OpenCL call fails or
/// no platform matches.
std::vector<cl::Device> get_devices(const std::string& vendor_name) {
    size_t i;
    cl_int err;
    std::vector<cl::Platform> platforms;
    OCL_CHECK(err, err = cl::Platform::get(&platforms));
    cl::Platform platform;
    for (i = 0 ; i < platforms.size(); i++){
        platform = platforms[i];
        // OCL_CHECK declares platformName in this scope; it is used right below.
        OCL_CHECK(err, std::string platformName = platform.getInfo<CL_PLATFORM_NAME>(&err));
        if (platformName == vendor_name){
            std::cout << "Found Platform" << std::endl;
            std::cout << "Platform Name: " << platformName.c_str() << std::endl;
            break;
        }
    }
    // The loop ran to completion without a break: no platform matched.
    if (i == platforms.size()) {
        std::cout << "Error: Failed to find Xilinx platform" << std::endl;
        exit(EXIT_FAILURE);
    }
    //Getting ACCELERATOR Devices and selecting 1st such device
    std::vector<cl::Device> devices;
    OCL_CHECK(err, err = platform.getDevices(CL_DEVICE_TYPE_ACCELERATOR, &devices));
    return devices;
}

/// Convenience wrapper: accelerator devices of the platform named "Xilinx".
std::vector<cl::Device> get_xil_devices() {
    return get_devices("Xilinx");
}

/// Reads the whole file `xclbin_file_name` into a buffer allocated with new[].
///
/// @param xclbin_file_name path of the file to load
/// @param nb               receives the file size in bytes
/// @return pointer to `nb` bytes; the caller owns the buffer (delete[]).
/// Terminates the process with exit(EXIT_FAILURE) if the file is not
/// readable, its size cannot be determined or represented in `nb`, or the
/// read is incomplete.
char* read_binary_file(const std::string &xclbin_file_name, unsigned &nb)
{
    std::cout << "INFO: Reading " << xclbin_file_name << std::endl;
    if(access(xclbin_file_name.c_str(), R_OK) != 0) {
        printf("ERROR: %s xclbin not available please build\n", xclbin_file_name.c_str());
        exit(EXIT_FAILURE);
    }
    //Loading XCL Bin into char buffer
    std::cout << "Loading: '" << xclbin_file_name.c_str() << "'\n";
    std::ifstream bin_file(xclbin_file_name.c_str(), std::ifstream::binary);
    bin_file.seekg (0, bin_file.end);
    const std::streamoff file_size = bin_file.tellg();
    nb = static_cast<unsigned>(file_size);
    // tellg() yields -1 on failure; converted to unsigned that would request a
    // buffer of several gigabytes. The round-trip comparison also rejects
    // files too large for `nb`.
    if (!bin_file || file_size < 0 || static_cast<std::streamoff>(nb) != file_size) {
        printf("ERROR: failed to determine the size of %s\n", xclbin_file_name.c_str());
        exit(EXIT_FAILURE);
    }
    bin_file.seekg (0, bin_file.beg);
    char *buf = new char [nb];
    bin_file.read(buf, nb);
    if (!bin_file) {
        delete[] buf;
        printf("ERROR: failed to read %s\n", xclbin_file_name.c_str());
        exit(EXIT_FAILURE);
    }
    return buf;
}

} // namespace xcl
// Vitis-Tutorials/docs/mixing-c-rtl-kernels/reference-files/src/host/host_step1.cpp のコードを引用終了
#define DATA_SIZE 10
// Vitis-Tutorials/docs/mixing-c-rtl-kernels/reference-files/src/host/host_step1.cpp のコードを自分用に変更して引用します
int main(int argc, char* argv[])
{
const char* xclbinFilename;
if (argc==2) {
xclbinFilename = argv[1];
std::cout <<"Using FPGA binary file specfied through the command line: " << xclbinFilename << std::endl;
}
else {
xclbinFilename = "../lap_filter_axim.xclbin";
std::cout << "No FPGA binary file specified through the command line, using:" << xclbinFilename <<std::endl;
}
std::vector<int32_t,aligned_allocator<int32_t>> in_data(DATA_SIZE);
std::vector<int32_t,aligned_allocator<int32_t>> square_data(DATA_SIZE);
std::vector<int32_t,aligned_allocator<int32_t>> cubed_data(DATA_SIZE);
size_t size_in_bytes = (DATA_SIZE) * sizeof(int32_t);
// input data
for(int i=0; i<DATA_SIZE; i++){
in_data[i] = i;
square_data[i] = 0;
}
std::vector<cl::Device> devices = xcl::get_xil_devices();
cl::Device device = devices[0];
devices.resize(1);
// Creating Context and Command Queue for selected device
cl::Context context(device);
cl::CommandQueue q(context, device, CL_QUEUE_PROFILING_ENABLE);
// Load xclbin
std::cout << "Loading: '" << xclbinFilename << "'\n";
std::ifstream bin_file(xclbinFilename, std::ifstream::binary);
bin_file.seekg (0, bin_file.end);
unsigned nb = bin_file.tellg();
bin_file.seekg (0, bin_file.beg);
char *buf = new char [nb];
bin_file.read(buf, nb);
// Creating Program from Binary File
cl::Program::Binaries bins;
bins.push_back({buf,nb});
cl::Program program(context, devices, bins);
// This call will get the kernel object from program. A kernel is an
// OpenCL function that is executed on the FPGA.
cl::Kernel krnl_squara_cubed(program,"square_cubed");
// These commands will allocate memory on the Device. The cl::Buffer objects can
// be used to reference the memory locations on the device.
cl::Buffer ind_buf(context, CL_MEM_USE_HOST_PTR | CL_MEM_READ_ONLY,
size_in_bytes, in_data.data());
cl::Buffer squared_buf(context, CL_MEM_USE_HOST_PTR | CL_MEM_READ_WRITE,
size_in_bytes, square_data.data());
cl::Buffer cubed_buf(context, CL_MEM_USE_HOST_PTR | CL_MEM_READ_WRITE,
size_in_bytes, cubed_data.data());
// Data will be transferred from system memory over PCIe to the FPGA on-board
// DDR memory.
q.enqueueMigrateMemObjects({ind_buf},0/* 0 means from host*/);
//set the kernel Arguments
krnl_squara_cubed.setArg(0,ind_buf);
krnl_squara_cubed.setArg(1,squared_buf);
krnl_squara_cubed.setArg(2,cubed_buf);
//Launch the Kernel
q.enqueueTask(krnl_squara_cubed);
// The result of the previous kernel execution will need to be retrieved in
// order to view the results. This call will transfer the data from FPGA to
// source_results vector
q.enqueueMigrateMemObjects({squared_buf, cubed_buf},CL_MIGRATE_MEM_OBJECT_HOST);
q.finish();
// Compare the results
int error = 0;
for(int i=0; i<DATA_SIZE; i++){
if(square_data[i] != i*i || cubed_data[i] != i*i*i){
std::cout << "Error: i = " << i << " i^2 = " << i*i << " square_data = " << int(square_data[i]) <<
" cubed_data = " << int(cubed_data[i]) << std::endl;
error = 1;
}else{
//std::cout << "Error: i = " << i << " i^2 = " << i*i << " square_data = " << int(square_data[i]) <<
//" cubed_data = " << int(cubed_data[i]) << std::endl;
}
}
std::cout << "TEST " << (error ? "FAILED" : "PASSED") << std::endl;
return (error ? EXIT_FAILURE : EXIT_SUCCESS);
}
// all_layers_template_host.cpp
// 2019/12/25 by marsee
//
// Vitis-Tutorials/docs/mixing-c-rtl-kernels/reference-files/src/host/host_step1.cpp のコードを引用します
// https://github.com/Xilinx/Vitis-Tutorials/blob/master/docs/mixing-c-rtl-kernels/reference-files/src/host/host_step1.cpp
#define CL_HPP_CL_1_2_DEFAULT_BUILD
#define CL_HPP_TARGET_OPENCL_VERSION 120
#define CL_HPP_MINIMUM_OPENCL_VERSION 120
#define CL_HPP_ENABLE_PROGRAM_CONSTRUCTION_FROM_ARRAY_COMPATIBILITY 1
#define CL_USE_DEPRECATED_OPENCL_1_2_APIS
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <vector>
#include <CL/cl2.hpp>
#include <iostream>
#include <fstream>
#include <CL/cl_ext_xilinx.h>
#include <unistd.h>
#include <limits.h>
#include <sys/stat.h>
#include <ap_int.h>
#include <hls_stream.h>
#include <ap_axi_sdata.h>
#include "layer_general.h"
#include "curve_data_0_100.h"
//#include "curve_data_2500_2600.h"
//#include "curve_data_5000_5100.h"
#define ALL_DATA_NUM 300
#define NUM_OF_KERNELS 2
#define COULMN_PIXELS 56
#define ROW_PIXELS 10
#define ALL_PIXELS 560
#define NUM_OF_OUTPUT 3
#define NUM_ITERATIONS 100 // C Simulation
//#define NUM_ITERATIONS 1 // C/RTL CoSimulation 2
typedef ap_uint<2> output_type;
typedef ap_fixed<12,7,AP_TRN,AP_WRAP> out_affine_type;
void all_layers_dnn(volatile uint32_t *inm, volatile uint32_t *output,
volatile int32_t *dot2, int32_t x_size, int32_t y_size);
int all_layers_soft(hls::stream<ap_axiu<32,1,1,1> >& ins, output_type& output,
float dot2[NUM_OF_OUTPUT]);
// printf-style format for reporting a CPU/device result mismatch; it takes
// three %d arguments (index, CPU result, device result).
// NOTE(review): not referenced by any code visible in this part of the file.
static const std::string error_message =
"Error: Result mismatch:\n"
"i = %d CPU result = %d Device result = %d\n";
//Some Library functions to be used.
/// Minimal stateless allocator that returns 4096-byte aligned storage.
///
/// It is used as the allocator of the std::vector objects whose data()
/// pointers are handed to cl::Buffer together with CL_MEM_USE_HOST_PTR.
template <typename T>
struct aligned_allocator
{
    using value_type = T;

    aligned_allocator() noexcept = default;

    /// Converting constructor, needed when a container rebinds the allocator.
    template <typename U>
    aligned_allocator(const aligned_allocator<U>&) noexcept {}

    /// Allocates uninitialised storage for `num` objects of type T.
    /// @throws std::bad_alloc if the byte count overflows or posix_memalign fails.
    T* allocate(std::size_t num)
    {
        // Reject requests whose size in bytes would wrap around std::size_t.
        if (num > static_cast<std::size_t>(-1) / sizeof(T))
            throw std::bad_alloc();
        void* ptr = nullptr;
        if (posix_memalign(&ptr,4096,num*sizeof(T)))
            throw std::bad_alloc();
        return reinterpret_cast<T*>(ptr);
    }

    /// Releases storage obtained from allocate(); the element count is not needed.
    void deallocate(T* p, std::size_t /*num*/) noexcept
    {
        free(p);
    }
};

/// The allocator holds no state, so any two instances compare equal.
template <typename T, typename U>
bool operator==(const aligned_allocator<T>&, const aligned_allocator<U>&) noexcept
{
    return true;
}

template <typename T, typename U>
bool operator!=(const aligned_allocator<T>&, const aligned_allocator<U>&) noexcept
{
    return false;
}
// OCL_CHECK(error, call): executes `call`, then compares `error` with
// CL_SUCCESS. On mismatch it prints the file, the line, the stringified call
// and the error code, and terminates the process with exit(EXIT_FAILURE).
//
// NOTE: the expansion is not wrapped in a block or do { } while (0), and it
// cannot be: get_devices() passes a declaration
// (`std::string platformName = ...`) as `call` and uses that variable after
// the macro. The macro therefore expands to two statements and must not be
// used as the unbraced body of an if/else/loop.
#define OCL_CHECK(error,call) \
call; \
if (error != CL_SUCCESS) { \
printf("%s:%d Error calling " #call ", error code is: %d\n", \
__FILE__,__LINE__, error); \
exit(EXIT_FAILURE); \
}
namespace xcl {

/// Returns the CL_DEVICE_TYPE_ACCELERATOR devices of the first OpenCL platform
/// whose CL_PLATFORM_NAME equals `vendor_name`.
/// Terminates the process with exit(EXIT_FAILURE) if an OpenCL call fails or
/// no platform matches.
std::vector<cl::Device> get_devices(const std::string& vendor_name) {
    size_t i;
    cl_int err;
    std::vector<cl::Platform> platforms;
    OCL_CHECK(err, err = cl::Platform::get(&platforms));
    cl::Platform platform;
    for (i = 0 ; i < platforms.size(); i++){
        platform = platforms[i];
        // OCL_CHECK declares platformName in this scope; it is used right below.
        OCL_CHECK(err, std::string platformName = platform.getInfo<CL_PLATFORM_NAME>(&err));
        if (platformName == vendor_name){
            std::cout << "Found Platform" << std::endl;
            std::cout << "Platform Name: " << platformName.c_str() << std::endl;
            break;
        }
    }
    // The loop ran to completion without a break: no platform matched.
    if (i == platforms.size()) {
        std::cout << "Error: Failed to find Xilinx platform" << std::endl;
        exit(EXIT_FAILURE);
    }
    //Getting ACCELERATOR Devices and selecting 1st such device
    std::vector<cl::Device> devices;
    OCL_CHECK(err, err = platform.getDevices(CL_DEVICE_TYPE_ACCELERATOR, &devices));
    return devices;
}

/// Convenience wrapper: accelerator devices of the platform named "Xilinx".
std::vector<cl::Device> get_xil_devices() {
    return get_devices("Xilinx");
}

/// Reads the whole file `xclbin_file_name` into a buffer allocated with new[].
///
/// @param xclbin_file_name path of the file to load
/// @param nb               receives the file size in bytes
/// @return pointer to `nb` bytes; the caller owns the buffer (delete[]).
/// Terminates the process with exit(EXIT_FAILURE) if the file is not
/// readable, its size cannot be determined or represented in `nb`, or the
/// read is incomplete.
char* read_binary_file(const std::string &xclbin_file_name, unsigned &nb)
{
    std::cout << "INFO: Reading " << xclbin_file_name << std::endl;
    if(access(xclbin_file_name.c_str(), R_OK) != 0) {
        printf("ERROR: %s xclbin not available please build\n", xclbin_file_name.c_str());
        exit(EXIT_FAILURE);
    }
    //Loading XCL Bin into char buffer
    std::cout << "Loading: '" << xclbin_file_name.c_str() << "'\n";
    std::ifstream bin_file(xclbin_file_name.c_str(), std::ifstream::binary);
    bin_file.seekg (0, bin_file.end);
    const std::streamoff file_size = bin_file.tellg();
    nb = static_cast<unsigned>(file_size);
    // tellg() yields -1 on failure; converted to unsigned that would request a
    // buffer of several gigabytes. The round-trip comparison also rejects
    // files too large for `nb`.
    if (!bin_file || file_size < 0 || static_cast<std::streamoff>(nb) != file_size) {
        printf("ERROR: failed to determine the size of %s\n", xclbin_file_name.c_str());
        exit(EXIT_FAILURE);
    }
    bin_file.seekg (0, bin_file.beg);
    char *buf = new char [nb];
    bin_file.read(buf, nb);
    if (!bin_file) {
        delete[] buf;
        printf("ERROR: failed to read %s\n", xclbin_file_name.c_str());
        exit(EXIT_FAILURE);
    }
    return buf;
}

} // namespace xcl
int main(int argc, char* argv[]){
hls::stream<ap_axiu<32,1,1,1> > ins_soft;
output_type output_soft;
float dot2_soft[NUM_OF_OUTPUT];
ap_axiu<32,1,1,1> pix;
int hw_err_cnt = 0;
int sw_err_cnt = 0;
const char* xclbinFilename;
if (argc==2) {
xclbinFilename = argv[1];
std::cout <<"Using FPGA binary file specfied through the command line: " << xclbinFilename << std::endl;
}
else {
xclbinFilename = "../lap_filter_axim.xclbin";
std::cout << "No FPGA binary file specified through the command line, using:" << xclbinFilename <<std::endl;
}
// t_train256[][]を入れるメモリをアロケート
std::vector<int32_t,aligned_allocator<int32_t>> pixel(ROW_PIXELS*COULMN_PIXELS);
size_t pixel_in_bytes = (ROW_PIXELS*COULMN_PIXELS) * sizeof(int32_t);
std::vector<uint32_t,aligned_allocator<uint32_t>> output(1);
size_t output_in_bytes = sizeof(uint32_t);
std::vector<int32_t,aligned_allocator<int32_t>> dot2(NUM_OF_OUTPUT);
size_t dot2_in_bytes = (NUM_OF_OUTPUT * sizeof(int32_t));
std::vector<cl::Device> devices = xcl::get_xil_devices();
cl::Device device = devices[0];
devices.resize(1);
for(int y=0; y<ROW_PIXELS; y++){
for(int x=0; x<COULMN_PIXELS; x++){
// 1 画面分のデータを ins、ins_soft に入力する
pix.data = ap_uint<32>(t_train_256[0][y*COULMN_PIXELS+x]);
if (x==0 && y==0) // 最初のデータの時に TUSER を 1 にする
pix.user = 1;
else
pix.user = 0;
if (x == COULMN_PIXELS-1) // 行の最後でTLASTをアサートする
pix.last = 1;
else
pix.last = 0;
ins_soft << pix;
pixel[y*COULMN_PIXELS+x] = uint32_t(t_train_256[0][y*COULMN_PIXELS+x]);
}
}
// Creating Context and Command Queue for selected device
cl::Context context(device);
cl::CommandQueue q(context, device, CL_QUEUE_PROFILING_ENABLE);
// Load xclbin
std::cout << "Loading: '" << xclbinFilename << "'\n";
std::ifstream bin_file(xclbinFilename, std::ifstream::binary);
bin_file.seekg (0, bin_file.end);
unsigned nb = bin_file.tellg();
bin_file.seekg (0, bin_file.beg);
char *buf = new char [nb];
bin_file.read(buf, nb);
// Creating Program from Binary File
cl::Program::Binaries bins;
bins.push_back({buf,nb});
cl::Program program(context, devices, bins);
// This call will get the kernel object from program. A kernel is an
// OpenCL function that is executed on the FPGA.
cl::Kernel krnl_all_layers_dnn(program,"all_layers_dnn");
// These commands will allocate memory on the Device. The cl::Buffer objects can
// be used to reference the memory locations on the device.
cl::Buffer pixel_buf(context, CL_MEM_USE_HOST_PTR | CL_MEM_READ_ONLY,
pixel_in_bytes, pixel.data());
cl::Buffer output_buf(context, CL_MEM_USE_HOST_PTR | CL_MEM_READ_WRITE,
output_in_bytes, output.data());
cl::Buffer dot2_buf(context, CL_MEM_USE_HOST_PTR | CL_MEM_READ_ONLY,
dot2_in_bytes, dot2.data());
// Data will be transferred from system memory over PCIe to the FPGA on-board
// DDR memory.
q.enqueueMigrateMemObjects({pixel_buf},0/* 0 means from host*/);
//set the kernel Arguments
krnl_all_layers_dnn.setArg(0,pixel_buf);
krnl_all_layers_dnn.setArg(1,output_buf);
krnl_all_layers_dnn.setArg(2,dot2_buf);
krnl_all_layers_dnn.setArg(3,COULMN_PIXELS);
krnl_all_layers_dnn.setArg(4,ROW_PIXELS);
cl::Event event;
uint64_t dnn_start, dnn_end;
//Launch the Kernel
q.enqueueTask(krnl_all_layers_dnn, NULL, &event);
//q.enqueueTask(krnl_all_layers_dnn);
// The result of the previous kernel execution will need to be retrieved in
// order to view the results. This call will transfer the data from FPGA to
// source_results vector
q.enqueueMigrateMemObjects({output_buf, dot2_buf},CL_MIGRATE_MEM_OBJECT_HOST);
q.finish();
// 時間計測
event.getProfilingInfo<uint64_t>(CL_PROFILING_COMMAND_START, &dnn_start);
event.getProfilingInfo<uint64_t>(CL_PROFILING_COMMAND_END, &dnn_end);
auto dnn_time = dnn_end - dnn_start;
printf("all_layers_dnn : %lu ns\n", dnn_time);
all_layers_soft(ins_soft, output_soft, dot2_soft);
int t_test_num = 0;
for(int m=0; m<NUM_OF_OUTPUT; m++){
if(t_test[0][m] == 1.0f){
t_test_num = m;
break;
}
}
// out と out_soft を比較する
/* cout << "output" << " = " << int(output) << " output_soft = " << int(output_soft) << endl;
for(int j=0; j<NUM_OF_OUTPUT; j++){
cout << "dot2[" << j << "] = " << float(dot2[j]) << " dot2_soft[" << j << "] = " << dot2_soft[j] << endl;
} */
int i = 0;
if(int(output[0]) != t_test_num){
std::cout << "hw_error: i = " << i << " output = " << int(output[0]) << " t_test_num = " << t_test_num << std::endl;
hw_err_cnt++;
//return(1);
}
if(int(output_soft) != t_test_num){
std::cout << "sw_error: i = "<< i << " output_soft = " << int(output_soft) << " t_test_num" " = " << t_test_num << std::endl;
sw_err_cnt++;
//return(1);
}
//if(int(output[0]) != t_test_num || int(output_soft) != t_test_num){
for(int j=0; j<NUM_OF_OUTPUT; j++){
std::cout << "dot2[" << j << "] = " << std::fixed << std::setprecision(8) << float(dot2[j])/float(256.0) << " dot2_soft[" << j << "] = " << dot2_soft[j] << std::endl;
}
std::cout << std::endl;
//}
std::cout << "hw_err_cnt = " << hw_err_cnt << " sw_err_cnt = " << sw_err_cnt << std::endl;
return(0);
}
も入っていた。m_axi_addr64=true
を元に戻した。extern "C" { }
が設定されていた。config_sdx -target xocc
// square_cubed.cpp
// 2020/01/05 by marsee
//
#include <stdint.h>
//extern "C" {
// HLS kernel: for each of the 10 words in `in`, stores the word squared into
// `square` and the word cubed into `cubed` at the same index.
// All three pointers are mapped to m_axi ports on the gmem bundle; the block
// control interface is s_axilite.
void square_cubed(volatile int32_t *in, volatile int32_t *square, volatile int32_t *cubed){
#pragma HLS INTERFACE m_axi depth=10 port=cubed offset=slave bundle=gmem
#pragma HLS INTERFACE m_axi depth=10 port=square offset=slave bundle=gmem
#pragma HLS INTERFACE m_axi depth=10 port=in offset=slave bundle=gmem
#pragma HLS INTERFACE s_axilite port=return bundle=control
    for(int idx=0; idx<10; idx++){
#pragma HLS PIPELINE II=1
        // `in` is volatile, so fetch the element exactly once per iteration.
        const int32_t value = in[idx];
        const int32_t value_sq = value * value;
        square[idx] = value_sq;
        cubed[idx] = value_sq * value;
    }
}
//}
// square_cubed_tb.cpp
// 2020/01/06 by marsee
//
#include <iostream>
#include <stdint.h>
void square_cubed(volatile int32_t *in, volatile int32_t *square, volatile int32_t *cubed);
// Test bench for square_cubed(): feeds the values 0..9 through the kernel and
// prints each input next to the corresponding square[] and cubed[] entries.
int main(){
    int32_t data[10] = {0, 1, 2, 3, 4, 5, 6, 7, 8, 9};
    // Both result arrays start out holding 0..9 before the kernel call.
    int32_t square[10] = {0, 1, 2, 3, 4, 5, 6, 7, 8, 9};
    int32_t cubed[10] = {0, 1, 2, 3, 4, 5, 6, 7, 8, 9};

    square_cubed(data, square, cubed);

    // One report line per element.
    for(int idx=0; idx<10; idx++){
        std::cout << "data[" << idx << "]= " << data[idx];
        std::cout << ", square[" << idx << "]= " << square[idx];
        std::cout << ", cubed[" << idx << "]= " << cubed[idx] << std::endl;
    }
}
とq.enqueueMigrateMemObjects({output_buf, dot2_buf},CL_MIGRATE_MEM_OBJECT_HOST);
の 2 つやってみたが、やはり output と dot2[ ] の値が 0 だ。q.enqueueMigrateMemObjects({output_buf},CL_MIGRATE_MEM_OBJECT_HOST);
q.enqueueMigrateMemObjects({dot2_buf},CL_MIGRATE_MEM_OBJECT_HOST);
// lap_filter_axis_dma_tb.c
// BMPデータをハードウェアとソフトウェアで、ラプラシアン・フィルタを掛けて、それを比較する
// m_axi offset=slave version
// 2019/12/31 by marsee
//
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <stdint.h>
#include "bmp_header.h"
int32_t laplacian_fil_soft(int32_t x0y0, int32_t x1y0, int32_t x2y0, int32_t x0y1, int32_t x1y1, int32_t x2y1, int32_t x0y2, int32_t x1y2, int32_t x2y2);
int32_t conv_rgb2y_soft(int32_t rgb);
void lap_filter_axis_dma(volatile int32_t *inm, volatile int32_t *outm, int32_t x_size, int32_t y_size); // hardware
void laplacian_filter_soft(int32_t *cam_fb, int32_t *lap_fb, long width, long height); // software
// Test bench for lap_filter_axis_dma().
// Reads test.bmp, runs the hardware function and the software reference
// Laplacian filter over the same pixels, compares the two results word by
// word and, when they match, writes the hardware result to temp_lap.bmp.
// Returns 0 on success and 1 on the first HW/SW mismatch; exits with status 1
// on any file, header, or allocation error.
int main()
{
    int32_t *s, *h;
    long x, y;
    BITMAPFILEHEADER bmpfhr; // BMP file header (for read)
    BITMAPINFOHEADER bmpihr; // BMP info header (for read)
    FILE *fbmpr, *fbmpw;
    int32_t *rd_bmp, *hw_lapd, *sw_lapd;
    int32_t blue, green, red;
    int32_t pix;
    size_t hdr_items;
    size_t pixels;

    if ((fbmpr = fopen("test.bmp", "rb")) == NULL){ // open test.bmp
        fprintf(stderr, "Can't open test.bmp by binary read mode\n");
        exit(1);
    }

    // Read the BMP headers. The file header is read member by member; the
    // info header is read as a whole. Every read must deliver its item,
    // otherwise the width/height used below would be uninitialised.
    hdr_items  = fread(&bmpfhr.bfType, sizeof(uint16_t), 1, fbmpr);
    hdr_items += fread(&bmpfhr.bfSize, sizeof(uint32_t), 1, fbmpr);
    hdr_items += fread(&bmpfhr.bfReserved1, sizeof(uint16_t), 1, fbmpr);
    hdr_items += fread(&bmpfhr.bfReserved2, sizeof(uint16_t), 1, fbmpr);
    hdr_items += fread(&bmpfhr.bfOffBits, sizeof(uint32_t), 1, fbmpr);
    hdr_items += fread(&bmpihr, sizeof(BITMAPINFOHEADER), 1, fbmpr);
    if (hdr_items != 6){
        fprintf(stderr, "Can't read the BMP header of test.bmp\n");
        fclose(fbmpr);
        exit(1);
    }

    // The loops and buffer sizes below only make sense for positive dimensions.
    if (bmpihr.biWidth <= 0 || bmpihr.biHeight <= 0){
        fprintf(stderr, "Unsupported BMP dimensions in test.bmp\n");
        fclose(fbmpr);
        exit(1);
    }
    pixels = (size_t)bmpihr.biWidth * (size_t)bmpihr.biHeight;

    // Allocate the pixel buffers (one 32-bit word per pixel)
    if ((rd_bmp =(int32_t *)malloc(sizeof(int32_t) * pixels)) == NULL){
        fprintf(stderr, "Can't allocate rd_bmp memory\n");
        exit(1);
    }
    if ((hw_lapd =(int32_t *)malloc(sizeof(int32_t) * pixels)) == NULL){
        fprintf(stderr, "Can't allocate hw_lapd memory\n");
        exit(1);
    }
    if ((sw_lapd =(int32_t *)malloc(sizeof(int32_t) * pixels)) == NULL){
        fprintf(stderr, "Can't allocate sw_lapd memory\n");
        exit(1);
    }

    // Store the BMP pixels into rd_bmp; the row order has to be reversed.
    // NOTE(review): pixels are read back to back with no row padding skipped
    // and bfOffBits is not used to seek - presumably test.bmp is a 24-bit BMP
    // whose rows need no padding; confirm before using other images.
    for (y=0; y<bmpihr.biHeight; y++){
        for (x=0; x<bmpihr.biWidth; x++){
            blue = fgetc(fbmpr);
            green = fgetc(fbmpr);
            red = fgetc(fbmpr);
            rd_bmp[((bmpihr.biHeight-1)-y)*bmpihr.biWidth+x] = (blue & 0xff) | ((green & 0xff)<<8) | ((red & 0xff)<<16);
        }
    }
    fclose(fbmpr);

    lap_filter_axis_dma((volatile int32_t *)rd_bmp, (volatile int32_t *)hw_lapd, (int32_t)bmpihr.biWidth, (int32_t)bmpihr.biHeight); // hardware Laplacian filter
    laplacian_filter_soft(rd_bmp, sw_lapd, bmpihr.biWidth, bmpihr.biHeight); // software Laplacian filter

    // Compare the hardware and software Laplacian filter results
    for (y=0, h=hw_lapd, s=sw_lapd; y<bmpihr.biHeight; y++){
        for (x=0; x<bmpihr.biWidth; x++){
            if (*h != *s){
                printf("ERROR HW and SW results mismatch x = %ld, y = %ld, HW = %x, SW = %x\n", x, y, *h, *s);
                // Release the buffers on the failure path as well.
                free(rd_bmp);
                free(hw_lapd);
                free(sw_lapd);
                return(1);
            } else {
                h++;
                s++;
            }
        }
    }
    printf("Success HW and SW results match\n");

    // Write the hardware Laplacian filter result to temp_lap.bmp
    if ((fbmpw=fopen("temp_lap.bmp", "wb")) == NULL){
        fprintf(stderr, "Can't open temp_lap.bmp by binary write mode\n");
        exit(1);
    }
    // Write the BMP headers (same values that were read from test.bmp)
    fwrite(&bmpfhr.bfType, sizeof(uint16_t), 1, fbmpw);
    fwrite(&bmpfhr.bfSize, sizeof(uint32_t), 1, fbmpw);
    fwrite(&bmpfhr.bfReserved1, sizeof(uint16_t), 1, fbmpw);
    fwrite(&bmpfhr.bfReserved2, sizeof(uint16_t), 1, fbmpw);
    fwrite(&bmpfhr.bfOffBits, sizeof(uint32_t), 1, fbmpw);
    fwrite(&bmpihr, sizeof(BITMAPINFOHEADER), 1, fbmpw);

    // Write the RGB data, again in reversed row order
    for (y=0; y<bmpihr.biHeight; y++){
        for (x=0; x<bmpihr.biWidth; x++){
            pix = hw_lapd[((bmpihr.biHeight-1)-y)*bmpihr.biWidth+x];
            blue = pix & 0xff;
            green = (pix >> 8) & 0xff;
            red = (pix >> 16) & 0xff;
            fputc(blue, fbmpw);
            fputc(green, fbmpw);
            fputc(red, fbmpw);
        }
    }
    fclose(fbmpw);
    free(rd_bmp);
    free(hw_lapd);
    free(sw_lapd);
    return(0);
}
// Software reference model of the Laplacian filter.
//   cam_fb : input frame, width*height packed RGB words, row-major
//   lap_fb : output frame of the same size
//   width, height : frame dimensions in pixels
// Every input pixel is converted to luminance with conv_rgb2y_soft() and kept
// in a three-line ring buffer. For y >= 2 and x >= 2 the 3x3 window covering
// columns x-2..x and rows y-2..y is passed to laplacian_fil_soft(), and the
// result is replicated into bits 0-7, 8-15 and 16-23 of lap_fb[y*width+x].
// All other output pixels (the first two rows and columns) are written as 0.
// Does nothing when width or height is not positive; exits with status 1 if a
// line buffer cannot be allocated.
void laplacian_filter_soft(int32_t *cam_fb, int32_t *lap_fb, long width, long height)
{
    int32_t *line_buf[3];   // ring buffer holding the three most recent luminance lines
    long x, y;              // same type as width/height, so the loop bounds are compared without conversion
    int i;
    int32_t lap_fil_val;
    int fl, sl, tl;

    if (width <= 0 || height <= 0)
        return;

    // Allocate the three line buffers
    for (i=0; i<3; i++){
        if ((line_buf[i]=(int32_t *)malloc(sizeof(int32_t) * (size_t)width)) == NULL){
            fprintf(stderr, "Can't allocate line_buf[%d]\n", i);
            exit(1);
        }
    }

    // Convert RGB to Y (luminance) only, then apply the Laplacian filter
    for (y=0; y<height; y++){
        // The role of each ring-buffer slot depends only on y:
        // y%3=0 -> 1,2,0   y%3=1 -> 2,0,1   y%3=2 -> 0,1,2
        fl = (int)((y+1)%3); // first (oldest) line, row y-2
        sl = (int)((y+2)%3); // second line, row y-1
        tl = (int)(y%3);     // third line, the row being filled now

        for (x=0; x<width; x++){
            line_buf[tl][x] = conv_rgb2y_soft(cam_fb[y*width+x]);

            // Write the Laplacian filter data
            if (y<2 || x<2){
                // Not enough history for a full 3x3 window yet
                lap_fb[(y*width)+x] = 0;
            } else {
                lap_fil_val = laplacian_fil_soft( line_buf[fl][x-2], line_buf[fl][x-1], line_buf[fl][x],
                                                  line_buf[sl][x-2], line_buf[sl][x-1], line_buf[sl][x],
                                                  line_buf[tl][x-2], line_buf[tl][x-1], line_buf[tl][x]);
                lap_fb[(y*width)+x] = (lap_fil_val<<16)+(lap_fil_val<<8)+lap_fil_val ;
            }
        }
    }

    for (i=0; i<3; i++)
        free(line_buf[i]);
}
// RGB to Y conversion
// The RGB format is {8'd0, R(8bits), G(8bits), B(8bits)}, 1 pixel = 32 bits
// Converts to the luminance signal Y only, using Y = 0.299R + 0.587G + 0.114B
// Based on "YUV format and YUV<->RGB conversion": http://vision.kuee.kyoto-u.ac.jp/~hiroaki/firewire/yuv.html
// 2013/09/27 : dropped float, everything is int now
int32_t conv_rgb2y_soft(int32_t rgb){
    const int32_t blue  = rgb & 0xff;
    const int32_t green = (rgb >> 8) & 0xff;
    const int32_t red   = (rgb >> 16) & 0xff;

    // The weights are 0.299/0.587/0.114 multiplied by 256;
    // shifting right by 8 divides the weighted sum by 256 again.
    return (77*red + 150*green + 29*blue) >> 8;
}
// Laplacian filter
// x0y0 x1y0 x2y0 -1 -1 -1
// x0y1 x1y1 x2y1 -1 8 -1
// x0y2 x1y2 x2y2 -1 -1 -1
// The weighted sum is clamped to the range 0..255 before it is returned.
int32_t laplacian_fil_soft(int32_t x0y0, int32_t x1y0, int32_t x2y0, int32_t x0y1, int32_t x1y1, int32_t x2y1, int32_t x0y2, int32_t x1y2, int32_t x2y2)
{
    // Accumulate the kernel one row at a time.
    const int32_t top_row    = -x0y0 - x1y0 - x2y0;
    const int32_t middle_row = -x0y1 + 8*x1y1 - x2y1;
    const int32_t bottom_row = -x0y2 - x1y2 - x2y2;
    const int32_t sum = top_row + middle_row + bottom_row;

    if (sum < 0)
        return(0);
    if (sum > 255)
        return(255);
    return(sum);
}
config_interface -m_axi_addr64
config_sdx -target xocc
/home/masaaki/Vivado_HLS/Ultra96/test/lap_filter_axis_dma/solution1/csim/build/../../../lap_filter_axis_dma_tb.cpp:67: `lap_filter_axis_dma(int volatile*, int volatile*, int, int)' に対する定義されていない参照です
collect2: エラー: ld はステータス 1 で終了しました
#pragma HLS INTERFACE m_axi depth=480000 port=outm offset=slave bundle=gmem
#pragma HLS INTERFACE m_axi depth=480000 port=inm offset=slave bundle=gmem
1. Vivado HLS プロジェクトを作成して、Solution メニューからSolution Settings... のGeneral でconfig_interface -m_axi_addr64、 config_sdx -target xocc に設定する。
2. カーネル・アプリケーションのソースコードで、 extern "C" { } をコメントアウトして、C シミュレーション、C コードの合成、C/RTL 協調シミュレーションを行う。
3. C/RTL 協調シミュレーションはエラーになるが、波形は確認できる。
4. extern "C" { } を戻して、C コードの合成をして、Export RTL を行って、”.xo”ファイルを生成する。
日 | 月 | 火 | 水 | 木 | 金 | 土 |
---|---|---|---|---|---|---|
- | - | - | 1 | 2 | 3 | 4 |
5 | 6 | 7 | 8 | 9 | 10 | 11 |
12 | 13 | 14 | 15 | 16 | 17 | 18 |
19 | 20 | 21 | 22 | 23 | 24 | 25 |
26 | 27 | 28 | 29 | 30 | 31 | - |