#pragma SDS data data_mover(in_A:AXIDMA_SG, in_B:AXIDMA_SIMPLE, out_C:AXIFIFO)
を追加する。#pragma SDS data sys_port(in_A:ACP, in_B:AFI)
だそうだ。[Place 30-575] Sub-optimal placement for a clock-capable IO pin and MMCM pair. If this sub optimal condition is acceptable for this design, you may use the CLOCK_DEDICATED_ROUTE constraint in the .xdc file to demote this message to a WARNING. However, the use of this override is highly discouraged. These examples can be used directly in the .xdc file to override this clock rule.
< set_property CLOCK_DEDICATED_ROUTE BACKBONE [get_nets dvi2vga_i/dvi2rgb_0/U0/TMDS_ClockingX/CLK_IN_hdmi_clk] >
[Place 30-149] Unroutable Placement! A MMCM / (BUFIO/BUFR) component pair is not placed in a routable site pair. The MMCM component can use the dedicated path between the MMCM and the (BUFIO/BUFR) if both are placed in the same clock region or if they are placed in horizontally adjacent clock regions. If this sub optimal condition is acceptable for this design, you may use the CLOCK_DEDICATED_ROUTE constraint in the .xdc file to demote this message to a WARNING. However, the use of this override is highly discouraged. These examples can be used directly in the .xdc file to override this clock rule.
< set_property CLOCK_DEDICATED_ROUTE FALSE [get_nets dvi2vga_i/dvi2rgb_0/U0/TMDS_ClockingX/CLK_OUT_5x_hdmi_clk] >
[Place 30-512] Clock region assignment has failed. Clock buffer 'dvi2vga_i/dvi2rgb_0/U0/TMDS_ClockingX/DVI_ClkGenerator' (MMCME2_ADV) is placed at site MMCME2_ADV_X0Y0 in CLOCKREGION_X1Y0. Its loads need to be placed in the area enclosed by clock regions CLOCKREGION_X1Y0 and CLOCKREGION_X1Y0. One of its loads 'dvi2vga_i/dvi2rgb_0/U0/TMDS_ClockingX/SerialClkBuffer' (BUFIO) is placed in site BUFIO_X0Y5 in CLOCKREGION_X1Y1 which is outside the permissible area.
set_property PACKAGE_PIN P20 [get_ports {vga_pBlue[0]}]
set_property PACKAGE_PIN M20 [get_ports {vga_pBlue[1]}]
set_property PACKAGE_PIN K19 [get_ports {vga_pBlue[2]}]
set_property PACKAGE_PIN J18 [get_ports {vga_pBlue[3]}]
set_property PACKAGE_PIN G19 [get_ports {vga_pBlue[4]}]
set_property IOSTANDARD LVCMOS33 [get_ports {vga_pBlue[4]}]
set_property IOSTANDARD LVCMOS33 [get_ports {vga_pBlue[3]}]
set_property IOSTANDARD LVCMOS33 [get_ports {vga_pBlue[2]}]
set_property IOSTANDARD LVCMOS33 [get_ports {vga_pBlue[1]}]
set_property IOSTANDARD LVCMOS33 [get_ports {vga_pBlue[0]}]
set_property IOSTANDARD LVCMOS33 [get_ports {vga_pGreen[5]}]
set_property IOSTANDARD LVCMOS33 [get_ports {vga_pGreen[4]}]
set_property IOSTANDARD LVCMOS33 [get_ports {vga_pGreen[3]}]
set_property IOSTANDARD LVCMOS33 [get_ports {vga_pGreen[2]}]
set_property IOSTANDARD LVCMOS33 [get_ports {vga_pGreen[1]}]
set_property IOSTANDARD LVCMOS33 [get_ports {vga_pGreen[0]}]
set_property PACKAGE_PIN H18 [get_ports {vga_pGreen[0]}]
set_property PACKAGE_PIN N20 [get_ports {vga_pGreen[1]}]
set_property PACKAGE_PIN L19 [get_ports {vga_pGreen[2]}]
set_property PACKAGE_PIN J19 [get_ports {vga_pGreen[3]}]
set_property PACKAGE_PIN H20 [get_ports {vga_pGreen[4]}]
set_property PACKAGE_PIN F20 [get_ports {vga_pGreen[5]}]
set_property IOSTANDARD LVCMOS33 [get_ports {vga_pRed[4]}]
set_property IOSTANDARD LVCMOS33 [get_ports {vga_pRed[3]}]
set_property IOSTANDARD LVCMOS33 [get_ports {vga_pRed[2]}]
set_property IOSTANDARD LVCMOS33 [get_ports {vga_pRed[1]}]
set_property IOSTANDARD LVCMOS33 [get_ports {vga_pRed[0]}]
set_property PACKAGE_PIN M19 [get_ports {vga_pRed[0]}]
set_property PACKAGE_PIN L20 [get_ports {vga_pRed[1]}]
set_property PACKAGE_PIN J20 [get_ports {vga_pRed[2]}]
set_property PACKAGE_PIN G20 [get_ports {vga_pRed[3]}]
set_property PACKAGE_PIN F19 [get_ports {vga_pRed[4]}]
set_property PACKAGE_PIN H16 [get_ports TMDS_Clk_p]
set_property PACKAGE_PIN D19 [get_ports {TMDS_Data_p[0]}]
set_property PACKAGE_PIN C20 [get_ports {TMDS_Data_p[1]}]
set_property PACKAGE_PIN B19 [get_ports {TMDS_Data_p[2]}]
set_property PACKAGE_PIN G18 [get_ports ddc_sda_io]
set_property PACKAGE_PIN G17 [get_ports ddc_scl_io]
set_property PACKAGE_PIN P19 [get_ports vga_pHSync]
set_property IOSTANDARD LVCMOS33 [get_ports vga_pHSync]
set_property IOSTANDARD LVCMOS33 [get_ports vga_pVSync]
set_property PACKAGE_PIN R19 [get_ports vga_pVSync]
set_property PACKAGE_PIN L16 [get_ports clk125]
set_property PACKAGE_PIN R18 [get_ports reset]
set_property IOSTANDARD LVCMOS33 [get_ports reset]
set_property IOSTANDARD LVCMOS33 [get_ports clk125]
set_property IOSTANDARD LVCMOS33 [get_ports ddc_scl_io]
set_property IOSTANDARD LVCMOS33 [get_ports ddc_sda_io]
set_property CLOCK_DEDICATED_ROUTE BACKBONE [get_nets dvi2vga_i/dvi2rgb_0/U0/TMDS_ClockingX/CLK_IN_hdmi_clk]
set_property CLOCK_DEDICATED_ROUTE FALSE [get_nets dvi2vga_i/dvi2rgb_0/U0/TMDS_ClockingX/CLK_OUT_5x_hdmi_clk]
gcc laplacian_filter5.c -o laplacian_filter5
gcc -O1 laplacian_filter5.c -o laplacian_filter5_O1
gcc -O2 laplacian_filter5.c -o laplacian_filter5_O2
gcc -O3 laplacian_filter5.c -o laplacian_filter5_O3
gcc -Os laplacian_filter5.c -o laplacian_filter5_Os
gcc -mcpu=cortex-a9 -mfpu=neon -ftree-vectorize -mvectorize-with-neon-quad -ffast-math laplacian_filter5.c -o laplacian_filter5_n
gcc -mcpu=cortex-a9 -mfpu=neon -ftree-vectorize -mvectorize-with-neon-quad -ffast-math -O1 laplacian_filter5.c -o laplacian_filter5_n1
gcc -mcpu=cortex-a9 -mfpu=neon -ftree-vectorize -mvectorize-with-neon-quad -ffast-math -O2 laplacian_filter5.c -o laplacian_filter5_n2
gcc -mcpu=cortex-a9 -mfpu=neon -ftree-vectorize -mvectorize-with-neon-quad -ffast-math -O3 laplacian_filter5.c -o laplacian_filter5_n3
gcc -mcpu=cortex-a9 -mfpu=neon -ftree-vectorize -mvectorize-with-neon-quad -ffast-math -Os laplacian_filter5.c -o laplacian_filter5_ns
./laplacian_filter5
./laplacian_filter5_O1
./laplacian_filter5_O2
./laplacian_filter5_O3
./laplacian_filter5_Os
./laplacian_filter5_n
./laplacian_filter5_n1
./laplacian_filter5_n2
./laplacian_filter5_n3
./laplacian_filter5_ns
objdump -S -d laplacian_filter5 | grep "vmov" -c
objdump -S -d laplacian_filter5_O1 | grep "vmov" -c
objdump -S -d laplacian_filter5_O2 | grep "vmov" -c
objdump -S -d laplacian_filter5_O3 | grep "vmov" -c
objdump -S -d laplacian_filter5_Os | grep "vmov" -c
objdump -S -d laplacian_filter5_n | grep "vmov" -c
objdump -S -d laplacian_filter5_n1 | grep "vmov" -c
objdump -S -d laplacian_filter5_n2 | grep "vmov" -c
objdump -S -d laplacian_filter5_n3 | grep "vmov" -c
objdump -S -d laplacian_filter5_ns | grep "vmov" -c
gcc -fopenmp laplacian_filter5.c -o laplacian_filter5_mp
gcc -fopenmp -O1 laplacian_filter5.c -o laplacian_filter5_mpO1
gcc -fopenmp -O2 laplacian_filter5.c -o laplacian_filter5_mpO2
gcc -fopenmp -O3 laplacian_filter5.c -o laplacian_filter5_mpO3
gcc -fopenmp -Os laplacian_filter5.c -o laplacian_filter5_mpOs
gcc -fopenmp -mcpu=cortex-a9 -mfpu=neon -ftree-vectorize -mvectorize-with-neon-quad -ffast-math laplacian_filter5.c -o laplacian_filter5_mpn
gcc -fopenmp -mcpu=cortex-a9 -mfpu=neon -ftree-vectorize -mvectorize-with-neon-quad -ffast-math -O1 laplacian_filter5.c -o laplacian_filter5_mpn1
gcc -fopenmp -mcpu=cortex-a9 -mfpu=neon -ftree-vectorize -mvectorize-with-neon-quad -ffast-math -O2 laplacian_filter5.c -o laplacian_filter5_mpn2
gcc -fopenmp -mcpu=cortex-a9 -mfpu=neon -ftree-vectorize -mvectorize-with-neon-quad -ffast-math -O3 laplacian_filter5.c -o laplacian_filter5_mpn3
gcc -fopenmp -mcpu=cortex-a9 -mfpu=neon -ftree-vectorize -mvectorize-with-neon-quad -ffast-math -Os laplacian_filter5.c -o laplacian_filter5_mpns
./laplacian_filter5_mp
./laplacian_filter5_mpO1
./laplacian_filter5_mpO2
./laplacian_filter5_mpO3
./laplacian_filter5_mpOs
./laplacian_filter5_mpn
./laplacian_filter5_mpn1
./laplacian_filter5_mpn2
./laplacian_filter5_mpn3
./laplacian_filter5_mpns
objdump -S -d laplacian_filter5_mp | grep "vmov" -c
objdump -S -d laplacian_filter5_mpO1 | grep "vmov" -c
objdump -S -d laplacian_filter5_mpO2 | grep "vmov" -c
objdump -S -d laplacian_filter5_mpO3 | grep "vmov" -c
objdump -S -d laplacian_filter5_mpOs | grep "vmov" -c
objdump -S -d laplacian_filter5_mpn | grep "vmov" -c
objdump -S -d laplacian_filter5_mpn1 | grep "vmov" -c
objdump -S -d laplacian_filter5_mpn2 | grep "vmov" -c
objdump -S -d laplacian_filter5_mpn3 | grep "vmov" -c
objdump -S -d laplacian_filter5_mpns | grep "vmov" -c
を置いた。#ifdef _OPENMP
#pragma omp parallel for private(lap_fil_val, a, b, cam_fb_addr, lap_fb_addr)
#endif
を実行した。gcc -fopenmp laplacian_filter3.c -o laplacian_filter3_mp
gcc -fopenmp -O1 laplacian_filter3.c -o laplacian_filter3_mpO1
gcc -fopenmp -O2 laplacian_filter3.c -o laplacian_filter3_mpO2
gcc -fopenmp -O3 laplacian_filter3.c -o laplacian_filter3_mpO3
gcc -fopenmp -Os laplacian_filter3.c -o laplacian_filter3_mpOs
gcc -fopenmp -mcpu=cortex-a9 -mfpu=neon -ftree-vectorize -mvectorize-with-neon-quad -ffast-math laplacian_filter3.c -o laplacian_filter3_mpn
gcc -fopenmp -mcpu=cortex-a9 -mfpu=neon -ftree-vectorize -mvectorize-with-neon-quad -ffast-math -O1 laplacian_filter3.c -o laplacian_filter3_mpn1
gcc -fopenmp -mcpu=cortex-a9 -mfpu=neon -ftree-vectorize -mvectorize-with-neon-quad -ffast-math -O2 laplacian_filter3.c -o laplacian_filter3_mpn2
gcc -fopenmp -mcpu=cortex-a9 -mfpu=neon -ftree-vectorize -mvectorize-with-neon-quad -ffast-math -O3 laplacian_filter3.c -o laplacian_filter3_mpn3
gcc -fopenmp -mcpu=cortex-a9 -mfpu=neon -ftree-vectorize -mvectorize-with-neon-quad -ffast-math -Os laplacian_filter3.c -o laplacian_filter3_mpns
./laplacian_filter3_mp
./laplacian_filter3_mpO1
./laplacian_filter3_mpO2
./laplacian_filter3_mpO3
./laplacian_filter3_mpOs
./laplacian_filter3_mpn
./laplacian_filter3_mpn1
./laplacian_filter3_mpn2
./laplacian_filter3_mpn3
./laplacian_filter3_mpns
objdump -S -d laplacian_filter3_mp | grep "vmov" -c
objdump -S -d laplacian_filter3_mpO1 | grep "vmov" -c
objdump -S -d laplacian_filter3_mpO2 | grep "vmov" -c
objdump -S -d laplacian_filter3_mpO3 | grep "vmov" -c
objdump -S -d laplacian_filter3_mpOs | grep "vmov" -c
objdump -S -d laplacian_filter3_mpn | grep "vmov" -c
objdump -S -d laplacian_filter3_mpn1 | grep "vmov" -c
objdump -S -d laplacian_filter3_mpn2 | grep "vmov" -c
objdump -S -d laplacian_filter3_mpn3 | grep "vmov" -c
objdump -S -d laplacian_filter3_mpns | grep "vmov" -c
を置いた。#ifdef _OPENMP
#pragma omp parallel for private(lap_fil_val, current, next, prev)
#endif
を実行した。gcc -fopenmp laplacian_filter4.c -o laplacian_filter4_mp
gcc -fopenmp -O1 laplacian_filter4.c -o laplacian_filter4_mpO1
gcc -fopenmp -O2 laplacian_filter4.c -o laplacian_filter4_mpO2
gcc -fopenmp -O3 laplacian_filter4.c -o laplacian_filter4_mpO3
gcc -fopenmp -Os laplacian_filter4.c -o laplacian_filter4_mpOs
gcc -fopenmp -mcpu=cortex-a9 -mfpu=neon -ftree-vectorize -mvectorize-with-neon-quad -ffast-math laplacian_filter4.c -o laplacian_filter4_mpn
gcc -fopenmp -mcpu=cortex-a9 -mfpu=neon -ftree-vectorize -mvectorize-with-neon-quad -ffast-math -O1 laplacian_filter4.c -o laplacian_filter4_mpn1
gcc -fopenmp -mcpu=cortex-a9 -mfpu=neon -ftree-vectorize -mvectorize-with-neon-quad -ffast-math -O2 laplacian_filter4.c -o laplacian_filter4_mpn2
gcc -fopenmp -mcpu=cortex-a9 -mfpu=neon -ftree-vectorize -mvectorize-with-neon-quad -ffast-math -O3 laplacian_filter4.c -o laplacian_filter4_mpn3
gcc -fopenmp -mcpu=cortex-a9 -mfpu=neon -ftree-vectorize -mvectorize-with-neon-quad -ffast-math -Os laplacian_filter4.c -o laplacian_filter4_mpns
./laplacian_filter4_mp
./laplacian_filter4_mpO1
./laplacian_filter4_mpO2
./laplacian_filter4_mpO3
./laplacian_filter4_mpOs
./laplacian_filter4_mpn
./laplacian_filter4_mpn1
./laplacian_filter4_mpn2
./laplacian_filter4_mpn3
./laplacian_filter4_mpns
objdump -S -d laplacian_filter4_mp | grep "vmov" -c
objdump -S -d laplacian_filter4_mpO1 | grep "vmov" -c
objdump -S -d laplacian_filter4_mpO2 | grep "vmov" -c
objdump -S -d laplacian_filter4_mpO3 | grep "vmov" -c
objdump -S -d laplacian_filter4_mpOs | grep "vmov" -c
objdump -S -d laplacian_filter4_mpn | grep "vmov" -c
objdump -S -d laplacian_filter4_mpn1 | grep "vmov" -c
objdump -S -d laplacian_filter4_mpn2 | grep "vmov" -c
objdump -S -d laplacian_filter4_mpn3 | grep "vmov" -c
objdump -S -d laplacian_filter4_mpns | grep "vmov" -c
を置いた。#ifdef _OPENMP
#pragma omp parallel for private(lap_fil_val, fl, sl, tl, a, b)
#endif
を実行した。gcc -fopenmp laplacian_filter.c -o laplacian_filter1_mp
gcc -fopenmp -O1 laplacian_filter.c -o laplacian_filter1_mpO1
gcc -fopenmp -O2 laplacian_filter.c -o laplacian_filter1_mpO2
gcc -fopenmp -O3 laplacian_filter.c -o laplacian_filter1_mpO3
gcc -fopenmp -Os laplacian_filter.c -o laplacian_filter1_mpOs
gcc -fopenmp -mcpu=cortex-a9 -mfpu=neon -ftree-vectorize -mvectorize-with-neon-quad -ffast-math laplacian_filter.c -o laplacian_filter1_mpn
gcc -fopenmp -mcpu=cortex-a9 -mfpu=neon -ftree-vectorize -mvectorize-with-neon-quad -ffast-math -O1 laplacian_filter.c -o laplacian_filter1_mpn1
gcc -fopenmp -mcpu=cortex-a9 -mfpu=neon -ftree-vectorize -mvectorize-with-neon-quad -ffast-math -O2 laplacian_filter.c -o laplacian_filter1_mpn2
gcc -fopenmp -mcpu=cortex-a9 -mfpu=neon -ftree-vectorize -mvectorize-with-neon-quad -ffast-math -O3 laplacian_filter.c -o laplacian_filter1_mpn3
gcc -fopenmp -mcpu=cortex-a9 -mfpu=neon -ftree-vectorize -mvectorize-with-neon-quad -ffast-math -Os laplacian_filter.c -o laplacian_filter1_mpns
./laplacian_filter1_mp
./laplacian_filter1_mpO1
./laplacian_filter1_mpO2
./laplacian_filter1_mpO3
./laplacian_filter1_mpOs
./laplacian_filter1_mpn
./laplacian_filter1_mpn1
./laplacian_filter1_mpn2
./laplacian_filter1_mpn3
./laplacian_filter1_mpns
objdump -S -d laplacian_filter1_mp | grep "vmov" -c
objdump -S -d laplacian_filter1_mpO1 | grep "vmov" -c
objdump -S -d laplacian_filter1_mpO2 | grep "vmov" -c
objdump -S -d laplacian_filter1_mpO3 | grep "vmov" -c
objdump -S -d laplacian_filter1_mpOs | grep "vmov" -c
objdump -S -d laplacian_filter1_mpn | grep "vmov" -c
objdump -S -d laplacian_filter1_mpn1 | grep "vmov" -c
objdump -S -d laplacian_filter1_mpn2 | grep "vmov" -c
objdump -S -d laplacian_filter1_mpn3 | grep "vmov" -c
objdump -S -d laplacian_filter1_mpns | grep "vmov" -c
を置いた。#ifdef _OPENMP
#pragma omp parallel for private(lap_fil_val, a, b, cam_fb_addr, lap_fb_addr)
#endif
を実行した。。gcc -fopenmp laplacian_filter2.c -o laplacian_filter2_mp
gcc -fopenmp -O1 laplacian_filter2.c -o laplacian_filter2_mpO1
gcc -fopenmp -O2 laplacian_filter2.c -o laplacian_filter2_mpO2
gcc -fopenmp -O3 laplacian_filter2.c -o laplacian_filter2_mpO3
gcc -fopenmp -Os laplacian_filter2.c -o laplacian_filter2_mpOs
gcc -fopenmp -mcpu=cortex-a9 -mfpu=neon -ftree-vectorize -mvectorize-with-neon-quad -ffast-math laplacian_filter2.c -o laplacian_filter2_mpn
gcc -fopenmp -mcpu=cortex-a9 -mfpu=neon -ftree-vectorize -mvectorize-with-neon-quad -ffast-math -O1 laplacian_filter2.c -o laplacian_filter2_mpn1
gcc -fopenmp -mcpu=cortex-a9 -mfpu=neon -ftree-vectorize -mvectorize-with-neon-quad -ffast-math -O2 laplacian_filter2.c -o laplacian_filter2_mpn2
gcc -fopenmp -mcpu=cortex-a9 -mfpu=neon -ftree-vectorize -mvectorize-with-neon-quad -ffast-math -O3 laplacian_filter2.c -o laplacian_filter2_mpn3
gcc -fopenmp -mcpu=cortex-a9 -mfpu=neon -ftree-vectorize -mvectorize-with-neon-quad -ffast-math -Os laplacian_filter2.c -o laplacian_filter2_mpns
./laplacian_filter2_mp
./laplacian_filter2_mpO1
./laplacian_filter2_mpO2
./laplacian_filter2_mpO3
./laplacian_filter2_mpOs
./laplacian_filter2_mpn
./laplacian_filter2_mpn1
./laplacian_filter2_mpn2
./laplacian_filter2_mpn3
./laplacian_filter2_mpns
objdump -S -d laplacian_filter2_mp | grep "vmov" -c
objdump -S -d laplacian_filter2_mpO1 | grep "vmov" -c
objdump -S -d laplacian_filter2_mpO2 | grep "vmov" -c
objdump -S -d laplacian_filter2_mpO3 | grep "vmov" -c
objdump -S -d laplacian_filter2_mpOs | grep "vmov" -c
objdump -S -d laplacian_filter2_mpn | grep "vmov" -c
objdump -S -d laplacian_filter2_mpn1 | grep "vmov" -c
objdump -S -d laplacian_filter2_mpn2 | grep "vmov" -c
objdump -S -d laplacian_filter2_mpn3 | grep "vmov" -c
objdump -S -d laplacian_filter2_mpns | grep "vmov" -c
clang -Ofast laplacian_filter.c -o laplacian_filter1_Of
clang -Ofast laplacian_filter2.c -o laplacian_filter2_Of
clang -Ofast laplacian_filter3.c -o laplacian_filter3_Of
clang -Ofast laplacian_filter4.c -o laplacian_filter4_Of
clang -mcpu=cortex-a9 -mfpu=neon -mfloat-abi=hard -Ofast laplacian_filter.c -o laplacian_filter1_nf
clang -mcpu=cortex-a9 -mfpu=neon -mfloat-abi=hard -Ofast laplacian_filter2.c -o laplacian_filter2_nf
clang -mcpu=cortex-a9 -mfpu=neon -mfloat-abi=hard -Ofast laplacian_filter3.c -o laplacian_filter3_nf
clang -mcpu=cortex-a9 -mfpu=neon -mfloat-abi=hard -Ofast laplacian_filter4.c -o laplacian_filter4_nf
./laplacian_filter1_Of
./laplacian_filter2_Of
./laplacian_filter3_Of
./laplacian_filter4_Of
./laplacian_filter1_nf
./laplacian_filter2_nf
./laplacian_filter3_nf
./laplacian_filter4_nf
実行結果を下に示す。自動ベクトル化オプションを付けてコンパイルした実行形式ファイルについては、以下のコマンドで、clang laplacian_filter.c -o laplacian_filter1
clang -O1 laplacian_filter.c -o laplacian_filter1_O1
clang -O2 laplacian_filter.c -o laplacian_filter1_O2
clang -O3 laplacian_filter.c -o laplacian_filter1_O3
clang -Os laplacian_filter.c -o laplacian_filter1_Os
clang -mcpu=cortex-a9 -mfpu=neon -mfloat-abi=hard -O3 laplacian_filter.c -o laplacian_filter1_n3
NEONの命令の vmov があるかないかでベクトル化されているかどうかを調査した。objdump -S -d laplacian_filter1_n3 | grep "vmov"
自動ベクトル化オプションを付けてコンパイルした実行形式ファイルについては、以下のコマンドで、clang laplacian_filter2.c -o laplacian_filter2
clang -O1 laplacian_filter2.c -o laplacian_filter2_O1
clang -O2 laplacian_filter2.c -o laplacian_filter2_O2
clang -O3 laplacian_filter2.c -o laplacian_filter2_O3
clang -Os laplacian_filter2.c -o laplacian_filter2_Os
clang -mcpu=cortex-a9 -mfpu=neon -mfloat-abi=hard -O3 laplacian_filter2.c -o laplacian_filter2_n3
NEONの命令の vmov があるかないかでベクトル化されているかどうかを調査した。objdump -S -d laplacian_filter2_n3 | grep "vmov"
自動ベクトル化オプションを付けてコンパイルした実行形式ファイルについては、以下のコマンドで、clang laplacian_filter3.c -o laplacian_filter3
clang -O1 laplacian_filter3.c -o laplacian_filter3_O1
clang -O2 laplacian_filter3.c -o laplacian_filter3_O2
clang -O3 laplacian_filter3.c -o laplacian_filter3_O3
clang -Os laplacian_filter3.c -o laplacian_filter3_Os
clang -mcpu=cortex-a9 -mfpu=neon -mfloat-abi=hard -O3 laplacian_filter3.c -o laplacian_filter3_n3
NEONの命令の vmov があるかないかでベクトル化されているかどうかを調査した。objdump -S -d laplacian_filter3_n3 | grep "vmov"
自動ベクトル化オプションを付けてコンパイルした実行形式ファイルについては、以下のコマンドで、clang laplacian_filter4.c -o laplacian_filter4
clang -O1 laplacian_filter4.c -o laplacian_filter4_O1
clang -O2 laplacian_filter4.c -o laplacian_filter4_O2
clang -O3 laplacian_filter4.c -o laplacian_filter4_O3
clang -Os laplacian_filter4.c -o laplacian_filter4_Os
clang -mcpu=cortex-a9 -mfpu=neon -mfloat-abi=hard -O3 laplacian_filter4.c -o laplacian_filter4_n3
NEONの命令の vmov があるかないかでベクトル化されているかどうかを調査した。objdump -S -d laplacian_filter4_n3 | grep "vmov"
error: unable to open output file '/tmp/laplacian_filter-79ee4c.s': 'Error
opening output file '/tmp/laplacian_filter-79ee4c.s': Permission denied'
laplacian_filter3: file format elf32-littlearm
Disassembly of section .init:
00008428 <_init>:
8428: e92d4008 push {r3, lr}
842c: eb0001a8 bl 8ad4 <call_weak_fn>
8430: e8bd8008 pop {r3, pc}
Disassembly of section .plt:
00008434 <.plt>:
8434: e52de004 push {lr} ; (str lr, [sp, #-4]!)
8438: e59fe004 ldr lr, [pc, #4] ; 8444 <_init+0x1c>
843c: e08fe00e add lr, pc, lr
8440: e5bef008 ldr pc, [lr, #8]!
8444: 00008bbc .word 0x00008bbc
8448: e28fc600 add ip, pc, #0, 12
844c: e28cca08 add ip, ip, #8, 20 ; 0x8000
8450: e5bcfbbc ldr pc, [ip, #3004]! ; 0xbbc
8454: e28fc600 add ip, pc, #0, 12
8458: e28cca08 add ip, ip, #8, 20 ; 0x8000
845c: e5bcfbb4 ldr pc, [ip, #2996]! ; 0xbb4
8460: e28fc600 add ip, pc, #0, 12
8464: e28cca08 add ip, ip, #8, 20 ; 0x8000
8468: e5bcfbac ldr pc, [ip, #2988]! ; 0xbac
846c: e28fc600 add ip, pc, #0, 12
8470: e28cca08 add ip, ip, #8, 20 ; 0x8000
8474: e5bcfba4 ldr pc, [ip, #2980]! ; 0xba4
8478: e28fc600 add ip, pc, #0, 12
847c: e28cca08 add ip, ip, #8, 20 ; 0x8000
8480: e5bcfb9c ldr pc, [ip, #2972]! ; 0xb9c
8484: e28fc600 add ip, pc, #0, 12
8488: e28cca08 add ip, ip, #8, 20 ; 0x8000
848c: e5bcfb94 ldr pc, [ip, #2964]! ; 0xb94
8490: e28fc600 add ip, pc, #0, 12
8494: e28cca08 add ip, ip, #8, 20 ; 0x8000
8498: e5bcfb8c ldr pc, [ip, #2956]! ; 0xb8c
849c: e28fc600 add ip, pc, #0, 12
84a0: e28cca08 add ip, ip, #8, 20 ; 0x8000
84a4: e5bcfb84 ldr pc, [ip, #2948]! ; 0xb84
84a8: e28fc600 add ip, pc, #0, 12
84ac: e28cca08 add ip, ip, #8, 20 ; 0x8000
84b0: e5bcfb7c ldr pc, [ip, #2940]! ; 0xb7c
84b4: e28fc600 add ip, pc, #0, 12
84b8: e28cca08 add ip, ip, #8, 20 ; 0x8000
84bc: e5bcfb74 ldr pc, [ip, #2932]! ; 0xb74
84c0: e28fc600 add ip, pc, #0, 12
84c4: e28cca08 add ip, ip, #8, 20 ; 0x8000
84c8: e5bcfb6c ldr pc, [ip, #2924]! ; 0xb6c
84cc: e28fc600 add ip, pc, #0, 12
84d0: e28cca08 add ip, ip, #8, 20 ; 0x8000
84d4: e5bcfb64 ldr pc, [ip, #2916]! ; 0xb64
Disassembly of section .text:
000084d8 <main>:
84d8: e92d 4ff0 stmdb sp!, {r4, r5, r6, r7, r8, r9, sl, fp, lr}
84dc: f648 4044 movw r0, #35908 ; 0x8c44
84e0: ed2d 8b04 vpush {d8-d9}
84e4: f5ad 5d49 sub.w sp, sp, #12864 ; 0x3240
84e8: b083 sub sp, #12
84ea: 2102 movs r1, #2
84ec: f2c0 0000 movt r0, #0
84f0: f7ff efce blx 8490 <_init+0x68>
84f4: 2800 cmp r0, #0
84f6: bfdf itttt le
84f8: f241 0348 movwle r3, #4168 ; 0x1048
84fc: f648 4050 movwle r0, #35920 ; 0x8c50
8500: f2c0 0301 movtle r3, #1
8504: f2c0 0000 movtle r0, #0
8508: f340 812d ble.w 8766 <main+0x28e>
850c: 2400 movs r4, #0
850e: f04f 7180 mov.w r1, #16777216 ; 0x1000000
8512: e88d 0011 stmia.w sp, {r0, r4}
8516: 2203 movs r2, #3
8518: 4620 mov r0, r4
851a: 2301 movs r3, #1
851c: f7ff efc4 blx 84a8 <_init+0x80>
8520: 900a str r0, [sp, #40] ; 0x28
8522: 2800 cmp r0, #0
8524: f000 80e4 beq.w 86f0 <main+0x218>
8528: f500 0500 add.w r5, r0, #8388608 ; 0x800000
852c: f500 0620 add.w r6, r0, #10485760 ; 0xa00000
8530: 08b6 lsrs r6, r6, #2
8532: 08ad lsrs r5, r5, #2
8534: ff87 805f vmov.i32 q4, #255 ; 0x000000ff
8538: a80e add r0, sp, #56 ; 0x38
853a: 4621 mov r1, r4
853c: f7ff ef96 blx 846c <_init+0x44>
8540: 1bab subs r3, r5, r6
8542: f04f 0902 mov.w r9, #2
8546: 009b lsls r3, r3, #2
8548: ea4f 0c86 mov.w ip, r6, lsl #2
854c: 00ad lsls r5, r5, #2
854e: f503 6348 add.w r3, r3, #3200 ; 0xc80
8552: 9307 str r3, [sp, #28]
8554: f10d 0b48 add.w fp, sp, #72 ; 0x48
8558: 464b mov r3, r9
855a: 46a0 mov r8, r4
855c: 4626 mov r6, r4
855e: 2701 movs r7, #1
8560: f240 3a1f movw sl, #799 ; 0x31f
8564: 46a1 mov r9, r4
8566: 950b str r5, [sp, #44] ; 0x2c
8568: 9907 ldr r1, [sp, #28]
856a: f240 2057 movw r0, #599 ; 0x257
856e: f44f 6248 mov.w r2, #3200 ; 0xc80
8572: f1b8 0f00 cmp.w r8, #0
8576: bf18 it ne
8578: 4580 cmpne r8, r0
857a: fb02 f203 mul.w r2, r2, r3
857e: f44f 6048 mov.w r0, #3200 ; 0xc80
8582: 4461 add r1, ip
8584: bf08 it eq
8586: 2301 moveq r3, #1
8588: 9106 str r1, [sp, #24]
858a: f60d 41c8 addw r1, sp, #3272 ; 0xcc8
858e: fb00 1606 mla r6, r0, r6, r1
8592: f102 0504 add.w r5, r2, #4
8596: bf18 it ne
8598: 2300 movne r3, #0
859a: 440a add r2, r1
859c: 2400 movs r4, #0
859e: 3e04 subs r6, #4
85a0: 440d add r5, r1
85a2: 9208 str r2, [sp, #32]
85a4: fb00 1707 mla r7, r0, r7, r1
85a8: f102 0020 add.w r0, r2, #32
85ac: 9005 str r0, [sp, #20]
85ae: 2200 movs r2, #0
85b0: bb43 cbnz r3, 8604 <main+0x12c>
85b2: 2c00 cmp r4, #0
85b4: bf18 it ne
85b6: 4554 cmpne r4, sl
85b8: bf08 it eq
85ba: 461a moveq r2, r3
85bc: d022 beq.n 8604 <main+0x12c>
85be: 2c01 cmp r4, #1
85c0: f000 80da beq.w 8778 <main+0x2a0>
85c4: 6832 ldr r2, [r6, #0]
85c6: 6871 ldr r1, [r6, #4]
85c8: 68b0 ldr r0, [r6, #8]
85ca: f1c2 0e00 rsb lr, r2, #0
85ce: f857 2c04 ldr.w r2, [r7, #-4]
85d2: ebc1 0e0e rsb lr, r1, lr
85d6: 6839 ldr r1, [r7, #0]
85d8: ebc0 0e0e rsb lr, r0, lr
85dc: ebc2 0e0e rsb lr, r2, lr
85e0: 687a ldr r2, [r7, #4]
85e2: eb0e 0ec1 add.w lr, lr, r1, lsl #3
85e6: e915 0003 ldmdb r5, {r0, r1}
85ea: ebc2 0e0e rsb lr, r2, lr
85ee: 682a ldr r2, [r5, #0]
85f0: ebc0 000e rsb r0, r0, lr
85f4: 1a41 subs r1, r0, r1
85f6: 1a89 subs r1, r1, r2
85f8: f381 0108 usat r1, #8, r1
85fc: 020a lsls r2, r1, #8
85fe: eb02 4201 add.w r2, r2, r1, lsl #16
8602: 440a add r2, r1
8604: f84b 2024 str.w r2, [fp, r4, lsl #2]
8608: 3401 adds r4, #1
860a: f5b4 7f48 cmp.w r4, #800 ; 0x320
860e: f106 0604 add.w r6, r6, #4
8612: f107 0704 add.w r7, r7, #4
8616: f105 0504 add.w r5, r5, #4
861a: d1c8 bne.n 85ae <main+0xd6>
861c: f50c 6448 add.w r4, ip, #3200 ; 0xc80
8620: f109 0901 add.w r9, r9, #1
8624: 4660 mov r0, ip
8626: 4659 mov r1, fp
8628: f44f 6248 mov.w r2, #3200 ; 0xc80
862c: f894 f000 pld [r4]
8630: f7ff ef0a blx 8448 <_init+0x20>
8634: f1b9 0f03 cmp.w r9, #3
8638: dd0c ble.n 8654 <main+0x17c>
863a: f240 2057 movw r0, #599 ; 0x257
863e: 4580 cmp r8, r0
8640: d01a beq.n 8678 <main+0x1a0>
8642: f04f 0901 mov.w r9, #1
8646: 2302 movs r3, #2
8648: 2701 movs r7, #1
864a: 2600 movs r6, #0
864c: f108 0801 add.w r8, r8, #1
8650: 46a4 mov ip, r4
8652: e789 b.n 8568 <main+0x90>
8654: f240 2357 movw r3, #599 ; 0x257
8658: 4598 cmp r8, r3
865a: d00d beq.n 8678 <main+0x1a0>
865c: f1b9 0f02 cmp.w r9, #2
8660: d006 beq.n 8670 <main+0x198>
8662: f1b9 0f03 cmp.w r9, #3
8666: d1ee bne.n 8646 <main+0x16e>
8668: 2301 movs r3, #1
866a: 2700 movs r7, #0
866c: 2602 movs r6, #2
866e: e7ed b.n 864c <main+0x174>
8670: 2300 movs r3, #0
8672: 464f mov r7, r9
8674: 2601 movs r6, #1
8676: e7e9 b.n 864c <main+0x174>
8678: 2100 movs r1, #0
867a: a810 add r0, sp, #64 ; 0x40
867c: f7ff eef6 blx 846c <_init+0x44>
8680: f04f 7180 mov.w r1, #16777216 ; 0x1000000
8684: 980a ldr r0, [sp, #40] ; 0x28
8686: f7ff ef1c blx 84c0 <_init+0x98>
868a: f648 4084 movw r0, #35972 ; 0x8c84
868e: 2102 movs r1, #2
8690: f2c0 0000 movt r0, #0
8694: f7ff eefc blx 8490 <_init+0x68>
8698: 1e03 subs r3, r0, #0
869a: dd5c ble.n 8756 <main+0x27e>
869c: 2000 movs r0, #0
869e: 9300 str r3, [sp, #0]
86a0: 9001 str r0, [sp, #4]
86a2: f44f 3180 mov.w r1, #65536 ; 0x10000
86a6: 2203 movs r2, #3
86a8: 2301 movs r3, #1
86aa: f7ff eefe blx 84a8 <_init+0x80>
86ae: 2800 cmp r0, #0
86b0: d040 beq.n 8734 <main+0x25c>
86b2: f04f 52c1 mov.w r2, #404750336 ; 0x18200000
86b6: f44f 3180 mov.w r1, #65536 ; 0x10000
86ba: 6002 str r2, [r0, #0]
86bc: f7ff ef00 blx 84c0 <_init+0x98>
86c0: 9b0f ldr r3, [sp, #60] ; 0x3c
86c2: 9811 ldr r0, [sp, #68] ; 0x44
86c4: 9c10 ldr r4, [sp, #64] ; 0x40
86c6: 4298 cmp r0, r3
86c8: db23 blt.n 8712 <main+0x23a>
86ca: f85b 2c10 ldr.w r2, [fp, #-16]
86ce: f648 41c4 movw r1, #36036 ; 0x8cc4
86d2: 1ac3 subs r3, r0, r3
86d4: f2c0 0100 movt r1, #0
86d8: 2001 movs r0, #1
86da: 1aa2 subs r2, r4, r2
86dc: f7ff eeea blx 84b4 <_init+0x8c>
86e0: 2000 movs r0, #0
86e2: f50d 5d49 add.w sp, sp, #12864 ; 0x3240
86e6: b003 add sp, #12
86e8: ecbd 8b04 vpop {d8-d9}
86ec: e8bd 8ff0 ldmia.w sp!, {r4, r5, r6, r7, r8, r9, sl, fp, pc}
86f0: f241 0348 movw r3, #4168 ; 0x1048
86f4: f648 4068 movw r0, #35944 ; 0x8c68
86f8: f2c0 0301 movt r3, #1
86fc: f2c0 0000 movt r0, #0
8700: 681b ldr r3, [r3, #0]
8702: 2101 movs r1, #1
8704: 2218 movs r2, #24
8706: f7ff eeac blx 8460 <_init+0x38>
870a: f04f 30ff mov.w r0, #4294967295
870e: f7ff eec6 blx 849c <_init+0x74>
8712: f85b 2c10 ldr.w r2, [fp, #-16]
8716: f500 2074 add.w r0, r0, #999424 ; 0xf4000
871a: f500 7010 add.w r0, r0, #576 ; 0x240
871e: f648 41c4 movw r1, #36036 ; 0x8cc4
8722: 1ac3 subs r3, r0, r3
8724: f2c0 0100 movt r1, #0
8728: 1aa2 subs r2, r4, r2
872a: 2001 movs r0, #1
872c: 3a01 subs r2, #1
872e: f7ff eec2 blx 84b4 <_init+0x8c>
8732: e7d5 b.n 86e0 <main+0x208>
8734: f241 0348 movw r3, #4168 ; 0x1048
8738: f648 40a8 movw r0, #36008 ; 0x8ca8
873c: f2c0 0301 movt r3, #1
8740: f2c0 0000 movt r0, #0
8744: 681b ldr r3, [r3, #0]
8746: 2101 movs r1, #1
8748: 221a movs r2, #26
874a: f7ff ee8a blx 8460 <_init+0x38>
874e: f04f 30ff mov.w r0, #4294967295
8752: f7ff eea4 blx 849c <_init+0x74>
8756: f241 0348 movw r3, #4168 ; 0x1048
875a: f648 4090 movw r0, #35984 ; 0x8c90
875e: f2c0 0301 movt r3, #1
8762: f2c0 0000 movt r0, #0
8766: 681b ldr r3, [r3, #0]
8768: 2101 movs r1, #1
876a: 2215 movs r2, #21
876c: f7ff ee78 blx 8460 <_init+0x38>
8770: f04f 30ff mov.w r0, #4294967295
8774: f7ff ee92 blx 849c <_init+0x74>
8778: f1b8 0f01 cmp.w r8, #1
877c: f000 80be beq.w 88fc <main+0x424>
8780: 9906 ldr r1, [sp, #24]
8782: f44f 6248 mov.w r2, #3200 ; 0xc80
8786: 9808 ldr r0, [sp, #32]
8788: 9302 str r3, [sp, #8]
878a: f8cd c00c str.w ip, [sp, #12]
878e: f7ff ee5c blx 8448 <_init+0x20>
8792: 9a05 ldr r2, [sp, #20]
8794: 9b02 ldr r3, [sp, #8]
8796: f8dd c00c ldr.w ip, [sp, #12]
879a: 4611 mov r1, r2
879c: 4618 mov r0, r3
879e: ed52 0b08 vldr d16, [r2, #-32] ; 0xffffffe0
87a2: ed52 1b06 vldr d17, [r2, #-24] ; 0xffffffe8
87a6: eff0 2070 vshr.s32 q9, q8, #16
87aa: f892 f0a0 pld [r2, #160] ; 0xa0
87ae: 3002 adds r0, #2
87b0: eff8 4070 vshr.s32 q10, q8, #8
87b4: 28c6 cmp r0, #198 ; 0xc6
87b6: 4696 mov lr, r2
87b8: 9109 str r1, [sp, #36] ; 0x24
87ba: ef42 21d8 vand q9, q9, q4
87be: f102 0220 add.w r2, r2, #32
87c2: ef44 41d8 vand q10, q10, q4
87c6: f101 0120 add.w r1, r1, #32
87ca: ef40 01d8 vand q8, q8, q4
87ce: efe2 a572 vshl.s32 q13, q9, #2
87d2: efe2 6574 vshl.s32 q11, q10, #2
87d6: ef6a a8e2 vadd.i32 q13, q13, q9
87da: ef66 48e4 vadd.i32 q10, q11, q10
87de: efe3 6570 vshl.s32 q11, q8, #3
87e2: efe1 a57a vshl.s32 q13, q13, #1
87e6: efe4 8574 vshl.s32 q12, q10, #4
87ea: ef6a 28e2 vadd.i32 q9, q13, q9
87ee: ff66 68e0 vsub.i32 q11, q11, q8
87f2: ff68 48e4 vsub.i32 q10, q12, q10
87f6: efe3 8572 vshl.s32 q12, q9, #3
87fa: efe1 4574 vshl.s32 q10, q10, #1
87fe: efe2 6576 vshl.s32 q11, q11, #2
8802: ff68 28e2 vsub.i32 q9, q12, q9
8806: ef66 08e0 vadd.i32 q8, q11, q8
880a: ef62 28e4 vadd.i32 q9, q9, q10
880e: ef62 08e0 vadd.i32 q8, q9, q8
8812: eff8 0070 vshr.s32 q8, q8, #8
8816: ed41 0b10 vstr d16, [r1, #-64] ; 0xffffffc0
881a: ed41 1b0e vstr d17, [r1, #-56] ; 0xffffffc8
881e: ed52 0b0c vldr d16, [r2, #-48] ; 0xffffffd0
8822: ed52 1b0a vldr d17, [r2, #-40] ; 0xffffffd8
8826: eff0 2070 vshr.s32 q9, q8, #16
882a: eff8 4070 vshr.s32 q10, q8, #8
882e: ef42 21d8 vand q9, q9, q4
8832: ef44 41d8 vand q10, q10, q4
8836: ef40 01d8 vand q8, q8, q4
883a: efe2 a572 vshl.s32 q13, q9, #2
883e: efe2 6574 vshl.s32 q11, q10, #2
8842: ef6a a8e2 vadd.i32 q13, q13, q9
8846: ef66 48e4 vadd.i32 q10, q11, q10
884a: efe3 6570 vshl.s32 q11, q8, #3
884e: efe1 a57a vshl.s32 q13, q13, #1
8852: efe4 8574 vshl.s32 q12, q10, #4
8856: ef6a 28e2 vadd.i32 q9, q13, q9
885a: ff66 68e0 vsub.i32 q11, q11, q8
885e: ff68 48e4 vsub.i32 q10, q12, q10
8862: efe3 8572 vshl.s32 q12, q9, #3
8866: efe1 4574 vshl.s32 q10, q10, #1
886a: efe2 6576 vshl.s32 q11, q11, #2
886e: ff68 28e2 vsub.i32 q9, q12, q9
8872: ef66 08e0 vadd.i32 q8, q11, q8
8876: ef62 28e4 vadd.i32 q9, q9, q10
887a: ef62 08e0 vadd.i32 q8, q9, q8
887e: eff8 0070 vshr.s32 q8, q8, #8
8882: ed41 0b0c vstr d16, [r1, #-48] ; 0xffffffd0
8886: ed41 1b0a vstr d17, [r1, #-40] ; 0xffffffd8
888a: d188 bne.n 879e <main+0x2c6>
888c: 9a09 ldr r2, [sp, #36] ; 0x24
888e: e000 b.n 8892 <main+0x3ba>
8890: 20c7 movs r0, #199 ; 0xc7
8892: f96e 0add vld1.64 {d16-d17}, [lr :64]!
8896: eff0 2070 vshr.s32 q9, q8, #16
889a: 28c6 cmp r0, #198 ; 0xc6
889c: eff8 4070 vshr.s32 q10, q8, #8
88a0: ef42 21d8 vand q9, q9, q4
88a4: ef44 41d8 vand q10, q10, q4
88a8: ef40 01d8 vand q8, q8, q4
88ac: efe2 a572 vshl.s32 q13, q9, #2
88b0: efe2 6574 vshl.s32 q11, q10, #2
88b4: ef6a a8e2 vadd.i32 q13, q13, q9
88b8: ef66 48e4 vadd.i32 q10, q11, q10
88bc: efe3 6570 vshl.s32 q11, q8, #3
88c0: efe1 a57a vshl.s32 q13, q13, #1
88c4: efe4 8574 vshl.s32 q12, q10, #4
88c8: ef6a 28e2 vadd.i32 q9, q13, q9
88cc: ff66 68e0 vsub.i32 q11, q11, q8
88d0: ff68 48e4 vsub.i32 q10, q12, q10
88d4: efe3 8572 vshl.s32 q12, q9, #3
88d8: efe1 4574 vshl.s32 q10, q10, #1
88dc: efe2 6576 vshl.s32 q11, q11, #2
88e0: ff68 28e2 vsub.i32 q9, q12, q9
88e4: ef66 08e0 vadd.i32 q8, q11, q8
88e8: ef62 28e4 vadd.i32 q9, q9, q10
88ec: ef62 08e0 vadd.i32 q8, q9, q8
88f0: eff8 0070 vshr.s32 q8, q8, #8
88f4: f942 0add vst1.64 {d16-d17}, [r2 :64]!
88f8: d0ca beq.n 8890 <main+0x3b8>
88fa: e663 b.n 85c4 <main+0xec>
88fc: 980b ldr r0, [sp, #44] ; 0x2c
88fe: f50d 5149 add.w r1, sp, #12864 ; 0x3240
8902: 3128 adds r1, #40 ; 0x28
8904: f60d 42e8 addw r2, sp, #3304 ; 0xce8
8908: 9104 str r1, [sp, #16]
890a: 940c str r4, [sp, #48] ; 0x30
890c: 4601 mov r1, r0
890e: 930d str r3, [sp, #52] ; 0x34
8910: 4664 mov r4, ip
8912: 4613 mov r3, r2
8914: 9009 str r0, [sp, #36] ; 0x24
8916: f1a3 0020 sub.w r0, r3, #32
891a: f44f 6248 mov.w r2, #3200 ; 0xc80
891e: 9302 str r3, [sp, #8]
8920: f7ff ed92 blx 8448 <_init+0x20>
8924: 9b02 ldr r3, [sp, #8]
8926: 2000 movs r0, #0
8928: 461a mov r2, r3
892a: 4619 mov r1, r3
892c: ed52 0b08 vldr d16, [r2, #-32] ; 0xffffffe0
8930: ed52 1b06 vldr d17, [r2, #-24] ; 0xffffffe8
8934: eff0 2070 vshr.s32 q9, q8, #16
8938: f892 f0a0 pld [r2, #160] ; 0xa0
893c: 3002 adds r0, #2
893e: eff8 4070 vshr.s32 q10, q8, #8
8942: 28c6 cmp r0, #198 ; 0xc6
8944: 4696 mov lr, r2
8946: 468c mov ip, r1
8948: ef42 21d8 vand q9, q9, q4
894c: f102 0220 add.w r2, r2, #32
8950: ef44 41d8 vand q10, q10, q4
8954: f101 0120 add.w r1, r1, #32
8958: ef40 01d8 vand q8, q8, q4
895c: efe2 a572 vshl.s32 q13, q9, #2
8960: efe2 6574 vshl.s32 q11, q10, #2
8964: ef6a a8e2 vadd.i32 q13, q13, q9
8968: ef66 48e4 vadd.i32 q10, q11, q10
896c: efe3 6570 vshl.s32 q11, q8, #3
8970: efe1 a57a vshl.s32 q13, q13, #1
8974: efe4 8574 vshl.s32 q12, q10, #4
8978: ef6a 28e2 vadd.i32 q9, q13, q9
897c: ff66 68e0 vsub.i32 q11, q11, q8
8980: ff68 48e4 vsub.i32 q10, q12, q10
8984: efe3 8572 vshl.s32 q12, q9, #3
8988: efe1 4574 vshl.s32 q10, q10, #1
898c: efe2 6576 vshl.s32 q11, q11, #2
8990: ff68 28e2 vsub.i32 q9, q12, q9
8994: ef66 08e0 vadd.i32 q8, q11, q8
8998: ef62 28e4 vadd.i32 q9, q9, q10
899c: ef62 08e0 vadd.i32 q8, q9, q8
89a0: eff8 0070 vshr.s32 q8, q8, #8
89a4: ed41 0b10 vstr d16, [r1, #-64] ; 0xffffffc0
89a8: ed41 1b0e vstr d17, [r1, #-56] ; 0xffffffc8
89ac: ed52 0b0c vldr d16, [r2, #-48] ; 0xffffffd0
89b0: ed52 1b0a vldr d17, [r2, #-40] ; 0xffffffd8
89b4: eff0 2070 vshr.s32 q9, q8, #16
89b8: eff8 4070 vshr.s32 q10, q8, #8
89bc: ef42 21d8 vand q9, q9, q4
89c0: ef44 41d8 vand q10, q10, q4
89c4: ef40 01d8 vand q8, q8, q4
89c8: efe2 a572 vshl.s32 q13, q9, #2
89cc: efe2 6574 vshl.s32 q11, q10, #2
89d0: ef6a a8e2 vadd.i32 q13, q13, q9
89d4: ef66 48e4 vadd.i32 q10, q11, q10
89d8: efe3 6570 vshl.s32 q11, q8, #3
89dc: efe1 a57a vshl.s32 q13, q13, #1
89e0: efe4 8574 vshl.s32 q12, q10, #4
89e4: ef6a 28e2 vadd.i32 q9, q13, q9
89e8: ff66 68e0 vsub.i32 q11, q11, q8
89ec: ff68 48e4 vsub.i32 q10, q12, q10
89f0: efe3 8572 vshl.s32 q12, q9, #3
89f4: efe1 4574 vshl.s32 q10, q10, #1
89f8: efe2 6576 vshl.s32 q11, q11, #2
89fc: ff68 28e2 vsub.i32 q9, q12, q9
8a00: ef66 08e0 vadd.i32 q8, q11, q8
8a04: ef62 28e4 vadd.i32 q9, q9, q10
8a08: ef62 08e0 vadd.i32 q8, q9, q8
8a0c: eff8 0070 vshr.s32 q8, q8, #8
8a10: ed41 0b0c vstr d16, [r1, #-48] ; 0xffffffd0
8a14: ed41 1b0a vstr d17, [r1, #-40] ; 0xffffffd8
8a18: d188 bne.n 892c <main+0x454>
8a1a: 4662 mov r2, ip
8a1c: f96e 0add vld1.64 {d16-d17}, [lr :64]!
8a20: eff0 2070 vshr.s32 q9, q8, #16
8a24: 28c6 cmp r0, #198 ; 0xc6
8a26: eff8 4070 vshr.s32 q10, q8, #8
8a2a: ef42 21d8 vand q9, q9, q4
8a2e: ef44 41d8 vand q10, q10, q4
8a32: ef40 01d8 vand q8, q8, q4
8a36: efe2 a572 vshl.s32 q13, q9, #2
8a3a: efe2 6574 vshl.s32 q11, q10, #2
8a3e: ef6a a8e2 vadd.i32 q13, q13, q9
8a42: ef66 48e4 vadd.i32 q10, q11, q10
8a46: efe3 6570 vshl.s32 q11, q8, #3
8a4a: efe1 a57a vshl.s32 q13, q13, #1
8a4e: efe4 8574 vshl.s32 q12, q10, #4
8a52: ef6a 28e2 vadd.i32 q9, q13, q9
8a56: ff66 68e0 vsub.i32 q11, q11, q8
8a5a: ff68 48e4 vsub.i32 q10, q12, q10
8a5e: efe3 8572 vshl.s32 q12, q9, #3
8a62: efe1 4574 vshl.s32 q10, q10, #1
8a66: efe2 6576 vshl.s32 q11, q11, #2
8a6a: ff68 28e2 vsub.i32 q9, q12, q9
8a6e: ef66 08e0 vadd.i32 q8, q11, q8
8a72: ef62 28e4 vadd.i32 q9, q9, q10
8a76: ef62 08e0 vadd.i32 q8, q9, q8
8a7a: eff8 0070 vshr.s32 q8, q8, #8
8a7e: f942 0add vst1.64 {d16-d17}, [r2 :64]!
8a82: d101 bne.n 8a88 <main+0x5b0>
8a84: 20c7 movs r0, #199 ; 0xc7
8a86: e7c9 b.n 8a1c <main+0x544>
8a88: 9a04 ldr r2, [sp, #16]
8a8a: f503 6348 add.w r3, r3, #3200 ; 0xc80
8a8e: 9909 ldr r1, [sp, #36] ; 0x24
8a90: 4293 cmp r3, r2
8a92: f501 6148 add.w r1, r1, #3200 ; 0xc80
8a96: 9109 str r1, [sp, #36] ; 0x24
8a98: f47f af3d bne.w 8916 <main+0x43e>
8a9c: 46a4 mov ip, r4
8a9e: 9b0d ldr r3, [sp, #52] ; 0x34
8aa0: 9c0c ldr r4, [sp, #48] ; 0x30
8aa2: e58f b.n 85c4 <main+0xec>
00008aa4 <_start>:
8aa4: f04f 0b00 mov.w fp, #0
8aa8: f04f 0e00 mov.w lr, #0
8aac: bc02 pop {r1}
8aae: 466a mov r2, sp
8ab0: b404 push {r2}
8ab2: b401 push {r0}
8ab4: f8df c010 ldr.w ip, [pc, #16] ; 8ac8 <_start+0x24>
8ab8: f84d cd04 str.w ip, [sp, #-4]!
8abc: 4803 ldr r0, [pc, #12] ; (8acc <_start+0x28>)
8abe: 4b04 ldr r3, [pc, #16] ; (8ad0 <_start+0x2c>)
8ac0: f7ff ecda blx 8478 <_init+0x50>
8ac4: f7ff ed02 blx 84cc <_init+0xa4>
8ac8: 00008c35 .word 0x00008c35
8acc: 000084d9 .word 0x000084d9
8ad0: 00008bf5 .word 0x00008bf5
00008ad4 <call_weak_fn>:
8ad4: e59f3014 ldr r3, [pc, #20] ; 8af0 <call_weak_fn+0x1c>
8ad8: e59f2014 ldr r2, [pc, #20] ; 8af4 <call_weak_fn+0x20>
8adc: e08f3003 add r3, pc, r3
8ae0: e7932002 ldr r2, [r3, r2]
8ae4: e3520000 cmp r2, #0
8ae8: 012fff1e bxeq lr
8aec: eafffe64 b 8484 <_init+0x5c>
8af0: 0000851c .word 0x0000851c
8af4: 0000003c .word 0x0000003c
00008af8 <deregister_tm_clones>:
8af8: 4b07 ldr r3, [pc, #28] ; (8b18 <deregister_tm_clones+0x20>)
8afa: f241 0048 movw r0, #4168 ; 0x1048
8afe: f2c0 0001 movt r0, #1
8b02: 1a1b subs r3, r3, r0
8b04: 2b06 cmp r3, #6
8b06: d800 bhi.n 8b0a <deregister_tm_clones+0x12>
8b08: 4770 bx lr
8b0a: f240 0300 movw r3, #0
8b0e: f2c0 0300 movt r3, #0
8b12: 2b00 cmp r3, #0
8b14: d0f8 beq.n 8b08 <deregister_tm_clones+0x10>
8b16: 4718 bx r3
8b18: 0001104b .word 0x0001104b
00008b1c <register_tm_clones>:
8b1c: f241 0348 movw r3, #4168 ; 0x1048
8b20: f241 0048 movw r0, #4168 ; 0x1048
8b24: f2c0 0301 movt r3, #1
8b28: f2c0 0001 movt r0, #1
8b2c: 1a1b subs r3, r3, r0
8b2e: 109b asrs r3, r3, #2
8b30: eb03 73d3 add.w r3, r3, r3, lsr #31
8b34: 1059 asrs r1, r3, #1
8b36: d100 bne.n 8b3a <register_tm_clones+0x1e>
8b38: 4770 bx lr
8b3a: f240 0200 movw r2, #0
8b3e: f2c0 0200 movt r2, #0
8b42: 2a00 cmp r2, #0
8b44: d0f8 beq.n 8b38 <register_tm_clones+0x1c>
8b46: 4710 bx r2
00008b48 <__do_global_dtors_aux>:
8b48: b510 push {r4, lr}
8b4a: f241 044c movw r4, #4172 ; 0x104c
8b4e: f2c0 0401 movt r4, #1
8b52: 7823 ldrb r3, [r4, #0]
8b54: b91b cbnz r3, 8b5e <__do_global_dtors_aux+0x16>
8b56: f7ff ffcf bl 8af8 <deregister_tm_clones>
8b5a: 2301 movs r3, #1
8b5c: 7023 strb r3, [r4, #0]
8b5e: bd10 pop {r4, pc}
00008b60 <frame_dummy>:
8b60: f640 7014 movw r0, #3860 ; 0xf14
8b64: f2c0 0001 movt r0, #1
8b68: b508 push {r3, lr}
8b6a: 6803 ldr r3, [r0, #0]
8b6c: b12b cbz r3, 8b7a <frame_dummy+0x1a>
8b6e: f240 0300 movw r3, #0
8b72: f2c0 0300 movt r3, #0
8b76: b103 cbz r3, 8b7a <frame_dummy+0x1a>
8b78: 4798 blx r3
8b7a: e8bd 4008 ldmia.w sp!, {r3, lr}
8b7e: e7cd b.n 8b1c <register_tm_clones>
00008b80 <conv_rgb2y>:
8b80: 2196 movs r1, #150 ; 0x96
8b82: f3c0 2207 ubfx r2, r0, #8, #8
8b86: fb01 f202 mul.w r2, r1, r2
8b8a: b2c3 uxtb r3, r0
8b8c: 214d movs r1, #77 ; 0x4d
8b8e: f3c0 4007 ubfx r0, r0, #16, #8
8b92: b410 push {r4}
8b94: ebc3 04c3 rsb r4, r3, r3, lsl #3
8b98: fb01 2000 mla r0, r1, r0, r2
8b9c: eb03 0384 add.w r3, r3, r4, lsl #2
8ba0: f85d 4b04 ldr.w r4, [sp], #4
8ba4: 4418 add r0, r3
8ba6: 1200 asrs r0, r0, #8
8ba8: 4770 bx lr
8baa: bf00 nop
00008bac <laplacian_fil>:
8bac: b470 push {r4, r5, r6}
8bae: 4240 negs r0, r0
8bb0: 9c03 ldr r4, [sp, #12]
8bb2: 1a41 subs r1, r0, r1
8bb4: 1a8a subs r2, r1, r2
8bb6: 9d04 ldr r5, [sp, #16]
8bb8: 1ad3 subs r3, r2, r3
8bba: 9e05 ldr r6, [sp, #20]
8bbc: eb03 04c4 add.w r4, r3, r4, lsl #3
8bc0: 9906 ldr r1, [sp, #24]
8bc2: 9a07 ldr r2, [sp, #28]
8bc4: 1b65 subs r5, r4, r5
8bc6: 1bae subs r6, r5, r6
8bc8: 1a71 subs r1, r6, r1
8bca: bc70 pop {r4, r5, r6}
8bcc: 1a88 subs r0, r1, r2
8bce: f380 0008 usat r0, #8, r0
8bd2: 4770 bx lr
00008bd4 <chkhex>:
8bd4: b510 push {r4, lr}
8bd6: 4604 mov r4, r0
8bd8: 7800 ldrb r0, [r0, #0]
8bda: b918 cbnz r0, 8be4 <chkhex+0x10>
8bdc: e007 b.n 8bee <chkhex+0x1a>
8bde: f814 0f01 ldrb.w r0, [r4, #1]!
8be2: b120 cbz r0, 8bee <chkhex+0x1a>
8be4: f7ff ec36 blx 8454 <_init+0x2c>
8be8: 2800 cmp r0, #0
8bea: d1f8 bne.n 8bde <chkhex+0xa>
8bec: bd10 pop {r4, pc}
8bee: 2001 movs r0, #1
8bf0: bd10 pop {r4, pc}
8bf2: bf00 nop
00008bf4 <__libc_csu_init>:
8bf4: e92d 43f8 stmdb sp!, {r3, r4, r5, r6, r7, r8, r9, lr}
8bf8: 4607 mov r7, r0
8bfa: 4e0c ldr r6, [pc, #48] ; (8c2c <__libc_csu_init+0x38>)
8bfc: 4688 mov r8, r1
8bfe: 4d0c ldr r5, [pc, #48] ; (8c30 <__libc_csu_init+0x3c>)
8c00: 4691 mov r9, r2
8c02: 447e add r6, pc
8c04: f7ff ec10 blx 8428 <_init>
8c08: 447d add r5, pc
8c0a: 1b76 subs r6, r6, r5
8c0c: 10b6 asrs r6, r6, #2
8c0e: d00a beq.n 8c26 <__libc_csu_init+0x32>
8c10: 3d04 subs r5, #4
8c12: 2400 movs r4, #0
8c14: 3401 adds r4, #1
8c16: f855 3f04 ldr.w r3, [r5, #4]!
8c1a: 4638 mov r0, r7
8c1c: 4641 mov r1, r8
8c1e: 464a mov r2, r9
8c20: 4798 blx r3
8c22: 42b4 cmp r4, r6
8c24: d1f6 bne.n 8c14 <__libc_csu_init+0x20>
8c26: e8bd 83f8 ldmia.w sp!, {r3, r4, r5, r6, r7, r8, r9, pc}
8c2a: bf00 nop
8c2c: 0000830a .word 0x0000830a
8c30: 00008300 .word 0x00008300
00008c34 <__libc_csu_fini>:
8c34: 4770 bx lr
8c36: bf00 nop
Disassembly of section .fini:
00008c38 <_fini>:
8c38: e92d4008 push {r3, lr}
8c3c: e8bd8008 pop {r3, pc}
// laplacian_filter.c
// RGBをYに変換後にラプラシアンフィルタを掛ける。
// ピクセルのフォーマットは、{8'd0, R(8bits), G(8bits), B(8bits)}, 1pixel = 32bits
// 2013/09/16
// 2014/12/04 : ZYBO用Ubuntu Linux のUIO用に変更
#include <stdio.h>
#include <stdlib.h>
#include <unistd.h>
#include <assert.h>
#include <sys/mman.h>
#include <fcntl.h>
#define HORIZONTAL_PIXEL_WIDTH 800
#define VERTICAL_PIXEL_WIDTH 600
#define ALL_PIXEL_VALUE (HORIZONTAL_PIXEL_WIDTH*VERTICAL_PIXEL_WIDTH)
#define CMA_START_ADDRESS 0x17800000
#define VIDEO_BUFFER_START_ADDRESS 0x18000000 // Limit 0x18800000, 800*600*4 = 2MBytes * 2
#define LAPLACIAN_FILTER_ADDRESS 0x18200000 // 800*600*4 = 0x1d4c00
int laplacian_fil(int x0y0, int x1y0, int x2y0, int x0y1, int x1y1, int x2y1, int x0y2, int x1y2, int x2y2);
int conv_rgb2y(int rgb);
int chkhex(char *str);
int main()
{
volatile unsigned int *fb_addr, *next_frame_addr;
int lap_fil_val;
int x, y;
struct timeval start_time, temp1, temp2, end_time;
unsigned int line_buf[3][HORIZONTAL_PIXEL_WIDTH];
int a, b;
int fl, sl, tl;
int fd0, fd3;
volatile unsigned *bmdc_axi_lites;
volatile unsigned int *frame_buffer;
// gettimeofday(&start_time, NULL); // プログラム起動時の時刻を記録
// frame_buffer にマップする
fd3 = open("/dev/uio3", O_RDWR); // Frame Buffer
if (fd3 < 1){
fprintf(stderr, "/dev/uio3 open error\n");
exit(-1);
}
frame_buffer = (volatile unsigned *)mmap(NULL, 0x1000000, PROT_READ|PROT_WRITE, MAP_SHARED, fd3, 0);
if (!frame_buffer){
fprintf(stderr, "frame_buffer mmap error\n");
exit(-1);
}
fb_addr = (volatile unsigned int *)((unsigned int)frame_buffer + (unsigned int)(VIDEO_BUFFER_START_ADDRESS-CMA_START_ADDRESS));
// ラプラシアンフィルタの結果を入れておくフレーム・バッファ
next_frame_addr = (volatile unsigned int *)((unsigned int)frame_buffer + (unsigned int)(LAPLACIAN_FILTER_ADDRESS-CMA_START_ADDRESS));
gettimeofday(&start_time, NULL);
// RGB値をY(輝度成分)のみに変換し、ラプラシアンフィルタを掛けた。
for (y=0; y<VERTICAL_PIXEL_WIDTH; y++){
for (x=0; x<HORIZONTAL_PIXEL_WIDTH; x++){
if (y==0 || y==VERTICAL_PIXEL_WIDTH-1){ // 縦の境界の時の値は0とする
lap_fil_val = 0;
}else if (x==0 || x==HORIZONTAL_PIXEL_WIDTH-1){ // 横の境界の時も値は0とする
lap_fil_val = 0;
}else{
if (y == 1 && x == 1){ // 最初のラインの最初のピクセルでは2ライン分の画素を読み出す
for (a=0; a<2; a++){ // 2ライン分
for (b=0; b<HORIZONTAL_PIXEL_WIDTH; b++){ // ライン
line_buf[a][b] = fb_addr[(a*HORIZONTAL_PIXEL_WIDTH)+b];
line_buf[a][b] = conv_rgb2y(line_buf[a][b]);
}
}
}
if (x == 1) { // ラインの最初なので、2つのピクセルを読み込む
for (b=0; b<2; b++){ // ライン
line_buf[(y+1)%3][b] = fb_addr[((y+1)*HORIZONTAL_PIXEL_WIDTH)+b];
// (y+1)%3 は、使用済みのラインがに読み込む、y=2 の時 line[0], y=3の時 line[1], y=4の時 line[2]
line_buf[(y+1)%3][b] = conv_rgb2y(line_buf[(y+1)%3][b]);
}
}
// 1つのピクセルを読み込みながらラプラシアン・フィルタを実行する
line_buf[(y+1)%3][x+1] = fb_addr[((y+1)*HORIZONTAL_PIXEL_WIDTH)+(x+1)];
// (y+1)%3 は、使用済みのラインがに読み込む、y=2 の時 line[0], y=3の時 line[1], y=4の時 line[2]
line_buf[(y+1)%3][x+1] = conv_rgb2y(line_buf[(y+1)%3][x+1]);
fl = (y-1)%3; // 最初のライン, y=1 012, y=2 120, y=3 201, y=4 012
sl = y%3; // 2番めのライン
tl = (y+1)%3; // 3番目のライン
lap_fil_val = laplacian_fil(line_buf[fl][x-1], line_buf[fl][x], line_buf[fl][x+1], line_buf[sl][x-1], line_buf[sl][x], line_buf[sl][x+1], line_buf[tl][x-1], line_buf[tl][x], line_buf[tl][x+1]);
}
// ラプラシアンフィルタ・データの書き込み
next_frame_addr[(y*HORIZONTAL_PIXEL_WIDTH)+x] = (lap_fil_val<<16)+(lap_fil_val<<8)+lap_fil_val ;
// printf("x = %d y = %d", x, y);
}
}
gettimeofday(&end_time, NULL);
munmap((void *)frame_buffer, 0x1000000);
// ラプラシアンフィルタ表示画面に切り替え
// Bitmap Display Controller AXI4 Lite Slave (UIO0)
fd0 = open("/dev/uio0", O_RDWR); // bitmap_display_controller axi4 lite
if (fd0 < 1){
fprintf(stderr, "/dev/uio0 open error\n");
exit(-1);
}
bmdc_axi_lites = (volatile unsigned *)mmap(NULL, 0x10000, PROT_READ|PROT_WRITE, MAP_SHARED, fd0, 0);
if (!bmdc_axi_lites){
fprintf(stderr, "bmdc_axi_lites mmap error\n");
exit(-1);
}
bmdc_axi_lites[0] = (unsigned int)LAPLACIAN_FILTER_ADDRESS; // Bitmap Display Controller start (ラプラシアンフィルタ表示画面のアドレス)
munmap((void *)bmdc_axi_lites, 0x10000);
//gettimeofday(&end_time, NULL);
if (end_time.tv_usec < start_time.tv_usec) {
printf("total time = %ld.%06ld sec\n", end_time.tv_sec - start_time.tv_sec - 1, 1000000 + end_time.tv_usec - start_time.tv_usec);
}
else {
printf("total time = %ld.%06ld sec\n", end_time.tv_sec - start_time.tv_sec, end_time.tv_usec - start_time.tv_usec);
}
return(0);
}
// RGBからYへの変換
// RGBのフォーマットは、{8'd0, R(8bits), G(8bits), B(8bits)}, 1pixel = 32bits
// 輝度信号Yのみに変換する。変換式は、Y = 0.299R + 0.587G + 0.114B
// "YUVフォーマット及び YUV<->RGB変換"を参考にした。http://vision.kuee.kyoto-u.ac.jp/~hiroaki/firewire/yuv.html
// 2013/09/27 : float を止めて、すべてint にした
int conv_rgb2y(int rgb){
int r, g, b, y_f;
int y;
b = rgb & 0xff;
g = (rgb>>8) & 0xff;
r = (rgb>>16) & 0xff;
y_f = 77*r + 150*g + 29*b; //y_f = 0.299*r + 0.587*g + 0.114*b;の係数に256倍した
y = y_f >> 8; // 256で割る
return(y);
}
// ラプラシアンフィルタ
// x0y0 x1y0 x2y0 -1 -1 -1
// x0y1 x1y1 x2y1 -1 8 -1
// x0y2 x1y2 x2y2 -1 -1 -1
int laplacian_fil(int x0y0, int x1y0, int x2y0, int x0y1, int x1y1, int x2y1, int x0y2, int x1y2, int x2y2)
{
int y;
y = -x0y0 -x1y0 -x2y0 -x0y1 +8*x1y1 -x2y1 -x0y2 -x1y2 -x2y2;
if (y<0)
y = 0;
else if (y>255)
y = 255;
return(y);
}
// 文字列が16進数かを調べる
int chkhex(char *str){
while (*str != '\0'){
if (!isxdigit(*str))
return 0;
str++;
}
return 1;
}
// laplacian_filter4.c
// RGBをYに変換後にラプラシアンフィルタを掛ける。
// ピクセルのフォーマットは、{8'd0, R(8bits), G(8bits), B(8bits)}, 1pixel = 32bits
// 2013/09/16
// 2014/12/04 : ZYBO用Ubuntu Linux のUIO用に変更
// Vivado HLS 2014.4 のプロジェクト ZYBO/lap_filter_axim_tu2_2014_4を使用したソースコードと同じものを使用する。これは、Vivado HLSで最速のCソースコードだ
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <unistd.h>
#include <assert.h>
#include <sys/mman.h>
#include <fcntl.h>
#define HORIZONTAL_PIXEL_WIDTH 800
#define VERTICAL_PIXEL_WIDTH 600
#define ALL_PIXEL_VALUE (HORIZONTAL_PIXEL_WIDTH*VERTICAL_PIXEL_WIDTH)
#define CMA_START_ADDRESS 0x17800000
#define VIDEO_BUFFER_START_ADDRESS 0x18000000 // Limit 0x18800000, 800*600*4 = 2MBytes * 2
#define LAPLACIAN_FILTER_ADDRESS 0x18200000 // 800*600*4 = 0x1d4c00
int laplacian_fil(int x0y0, int x1y0, int x2y0, int x0y1, int x1y1, int x2y1, int x0y2, int x1y2, int x2y2);
int conv_rgb2y(int rgb);
int chkhex(char *str);
void filter_line(unsigned int* lap_buf, unsigned int* fl, unsigned int* sl, unsigned int* tl){
int lap_fil_val;
int prev[3],current[3],next[3]; // 0->1ライン目, 1->2ライン目, 2->3ライン目, prev->1pixel前, current->現在, next->次pixel
int x;
next[0] = conv_rgb2y(fl[0]);
next[1] = conv_rgb2y(sl[0]);
next[2] = conv_rgb2y(tl[0]);
for (x = 0; x < HORIZONTAL_PIXEL_WIDTH; x++){
if (x == 0 || x == HORIZONTAL_PIXEL_WIDTH-1){
lap_fil_val = 0;
current[0] = next[0];
next[0] = conv_rgb2y(fl[1]);
current[1] = next[1];
next[1] = conv_rgb2y(sl[1]);
current[2] = next[2];
next[2] = conv_rgb2y(tl[1]);
}else{
prev[0] = current[0];
current[0] = next[0];
next[0] = conv_rgb2y(fl[x+1]);
prev[1] = current[1];
current[1] = next[1];
next[1] = conv_rgb2y(sl[x+1]);
prev[2] = current[2];
current[2] = next[2];
next[2] = conv_rgb2y(tl[x+1]);
lap_fil_val = laplacian_fil(prev[0], current[0], next[0],
prev[1], current[1], next[1],
prev[2], current[2], next[2]);
}
lap_buf[x] = (lap_fil_val<<16)+(lap_fil_val<<8)+lap_fil_val; // RGB同じ値を入れる
}
}
int main()
{
volatile unsigned int *cam_fb = 0;
volatile unsigned int *lap_fb = 0;
volatile unsigned int *cam_addr;
volatile unsigned int *lap_addr;
int lap_fil_val;
int x, y;
struct timeval start_time, temp1, temp2, end_time;
unsigned int line_buf[3][HORIZONTAL_PIXEL_WIDTH];
int a, b;
int fl, sl, tl;
int fd0, fd3;
unsigned int offset_cam_addr, offset_lap_addr;
unsigned int lap_buf[HORIZONTAL_PIXEL_WIDTH];
volatile unsigned int *cam_fb_addr, *lap_fb_addr;
int line_sel;
volatile unsigned int *bmdc_axi_lites;
volatile unsigned int *frame_buffer;
// gettimeofday(&start_time, NULL); // プログラム起動時の時刻を記録
// frame_buffer にマップする
fd3 = open("/dev/uio3", O_RDWR); // Frame Buffer
if (fd3 < 1){
fprintf(stderr, "/dev/uio3 open error\n");
exit(-1);
}
frame_buffer = (volatile unsigned int *)mmap(NULL, 0x1000000, PROT_READ|PROT_WRITE, MAP_SHARED, fd3, 0);
if (!frame_buffer){
fprintf(stderr, "frame_buffer mmap error\n");
exit(-1);
}
cam_addr = (volatile unsigned int *)((unsigned int)frame_buffer + (unsigned int)(VIDEO_BUFFER_START_ADDRESS-CMA_START_ADDRESS));
// ラプラシアンフィルタの結果を入れておくフレーム・バッファ
lap_addr = (volatile unsigned int *)((unsigned int)frame_buffer + (unsigned int)(LAPLACIAN_FILTER_ADDRESS-CMA_START_ADDRESS));
offset_cam_addr = (volatile unsigned int)((unsigned int)cam_addr/sizeof(int));
offset_lap_addr = (volatile unsigned int)((unsigned int)lap_addr/sizeof(int));
gettimeofday(&start_time, NULL);
// ラプラシアンフィルタ処理開始
// RGB値をY(輝度成分)のみに変換し、ラプラシアンフィルタを掛けた。
for (y=1, line_sel=0; y<VERTICAL_PIXEL_WIDTH-1; y++){
// 最初のライン, y=1 012, y=2 120, y=3 201, y=4 012
switch(line_sel){
case 1 :
fl = 0; sl = 1; tl = 2;
break;
case 2 :
fl = 1; sl = 2; tl = 0;
break;
case 3 :
fl = 2; sl = 0; tl = 1;
break;
default :
fl = 0; sl = 1; tl = 2;
}
if (y == 1){
for (a=0; a<3; a++){
// 3ライン分
cam_fb_addr = (int*)(cam_fb+offset_cam_addr+(a*(HORIZONTAL_PIXEL_WIDTH)));
memcpy(line_buf[a], (unsigned int*)cam_fb_addr, HORIZONTAL_PIXEL_WIDTH*sizeof(int));
}
}else{ // 最初のラインではないので、1ラインだけ読み込む。すでに他の2ラインは読み込まれている
cam_fb_addr = (int*)(cam_fb+offset_cam_addr+((y+1)*(HORIZONTAL_PIXEL_WIDTH)));
memcpy(line_buf[tl], (unsigned int*)cam_fb_addr, HORIZONTAL_PIXEL_WIDTH*sizeof(int));
}
filter_line(lap_buf, line_buf[fl], line_buf[sl], line_buf[tl]);
lap_fb_addr = (int *)(lap_fb+offset_lap_addr+(y*(HORIZONTAL_PIXEL_WIDTH)));
memcpy((unsigned int*)lap_fb_addr, (unsigned int*)lap_buf, HORIZONTAL_PIXEL_WIDTH*sizeof(int));
line_sel++;
if (line_sel > 3){
line_sel = 1;
}
}
// 最初と最後のラインは0にする
for (x = 0; x < HORIZONTAL_PIXEL_WIDTH; x++)
lap_buf[x] = 0;
lap_fb_addr = (int *)(lap_fb+offset_lap_addr+(0*(HORIZONTAL_PIXEL_WIDTH)));
memcpy((unsigned int*)lap_fb_addr, (unsigned int*)lap_buf, HORIZONTAL_PIXEL_WIDTH*sizeof(int));
lap_fb_addr = (int *)(lap_fb+offset_lap_addr+(VERTICAL_PIXEL_WIDTH-1)*HORIZONTAL_PIXEL_WIDTH);
memcpy((unsigned int*)lap_fb_addr, (unsigned int*)lap_buf, HORIZONTAL_PIXEL_WIDTH*sizeof(int));
// ラプラシアンフィルタ処理終了
gettimeofday(&end_time, NULL);
munmap((void *)frame_buffer, 0x1000000);
// ラプラシアンフィルタ表示画面に切り替え
// Bitmap Display Controller AXI4 Lite Slave (UIO0)
fd0 = open("/dev/uio0", O_RDWR); // bitmap_display_controller axi4 lite
if (fd0 < 1){
fprintf(stderr, "/dev/uio0 open error\n");
exit(-1);
}
bmdc_axi_lites = (volatile unsigned *)mmap(NULL, 0x10000, PROT_READ|PROT_WRITE, MAP_SHARED, fd0, 0);
if (!bmdc_axi_lites){
fprintf(stderr, "bmdc_axi_lites mmap error\n");
exit(-1);
}
bmdc_axi_lites[0] = (unsigned int)LAPLACIAN_FILTER_ADDRESS; // Bitmap Display Controller start (ラプラシアンフィルタ表示画面のアドレス)
munmap((void *)bmdc_axi_lites, 0x10000);
//gettimeofday(&end_time, NULL);
if (end_time.tv_usec < start_time.tv_usec) {
printf("total time = %ld.%06ld sec\n", end_time.tv_sec - start_time.tv_sec - 1, 1000000 + end_time.tv_usec - start_time.tv_usec);
}
else {
printf("total time = %ld.%06ld sec\n", end_time.tv_sec - start_time.tv_sec, end_time.tv_usec - start_time.tv_usec);
}
return(0);
}
// RGBからYへの変換
// RGBのフォーマットは、{8'd0, R(8bits), G(8bits), B(8bits)}, 1pixel = 32bits
// 輝度信号Yのみに変換する。変換式は、Y = 0.299R + 0.587G + 0.114B
// "YUVフォーマット及び YUV<->RGB変換"を参考にした。http://vision.kuee.kyoto-u.ac.jp/~hiroaki/firewire/yuv.html
// 2013/09/27 : float を止めて、すべてint にした
int conv_rgb2y(int rgb){
int r, g, b, y_f;
int y;
b = rgb & 0xff;
g = (rgb>>8) & 0xff;
r = (rgb>>16) & 0xff;
y_f = 77*r + 150*g + 29*b; //y_f = 0.299*r + 0.587*g + 0.114*b;の係数に256倍した
y = y_f >> 8; // 256で割る
return(y);
}
// ラプラシアンフィルタ
// x0y0 x1y0 x2y0 -1 -1 -1
// x0y1 x1y1 x2y1 -1 8 -1
// x0y2 x1y2 x2y2 -1 -1 -1
int laplacian_fil(int x0y0, int x1y0, int x2y0, int x0y1, int x1y1, int x2y1, int x0y2, int x1y2, int x2y2)
{
int y;
y = -x0y0 -x1y0 -x2y0 -x0y1 +8*x1y1 -x2y1 -x0y2 -x1y2 -x2y2;
if (y<0)
y = 0;
else if (y>255)
y = 255;
return(y);
}
// 文字列が16進数かを調べる
int chkhex(char *str){
while (*str != '\0'){
if (!isxdigit(*str))
return 0;
str++;
}
return 1;
}
となった。これが 600 ライン分となるといくつになるかを計算した。これは、455.655 us - 185.865 us = 269.79 us
となった。269.79 us * 600 = 161.874 ms
// laplacian_filter.c
// lap_filter_axim()
#include <stdio.h>
#include <string.h>
#define HORIZONTAL_PIXEL_WIDTH 800
#define VERTICAL_PIXEL_WIDTH 600
#define ALL_PIXEL_VALUE (HORIZONTAL_PIXEL_WIDTH*VERTICAL_PIXEL_WIDTH)
int laplacian_fil(int x0y0, int x1y0, int x2y0, int x0y1, int x1y1, int x2y1, int x0y2, int x1y2, int x2y2);
int conv_rgb2y(int rgb);
int lap_filter_axim(int cam_addr, int lap_addr, volatile int *cam_fb, volatile int *lap_fb)
{
#pragma HLS INTERFACE s_axilite port=cam_addr bundle=BUS_AXI4LS
#pragma HLS INTERFACE s_axilite port=lap_addr bundle=BUS_AXI4LS
#pragma HLS INTERFACE s_axilite port=return bundle=BUS_AXI4LS
#pragma HLS INTERFACE ap_none port=cam_addr
#pragma HLS INTERFACE ap_none port=lap_addr
#pragma HLS INTERFACE m_axi port=cam_fb depth=1920
#pragma HLS INTERFACE m_axi port=lap_fb depth=1920
unsigned int line_buf[3][HORIZONTAL_PIXEL_WIDTH];
int x, y;
int lap_fil_val;
int a, b;
int fl, sl, tl;
int *cam_fb_addr, *lap_fb_addr;
cam_fb_addr = (int *)(cam_fb+(cam_addr/sizeof(int)));
lap_fb_addr = (int *)(lap_fb+(lap_addr/sizeof(int)));
// RGB値をY(輝度成分)のみに変換し、ラプラシアンフィルタを掛けた。
for (y=0; y<VERTICAL_PIXEL_WIDTH; y++){
for (x=0; x<HORIZONTAL_PIXEL_WIDTH; x++){
if (y==0 || y==VERTICAL_PIXEL_WIDTH-1){ // 縦の境界の時の値は0とする
lap_fil_val = 0;
}else if (x==0 || x==HORIZONTAL_PIXEL_WIDTH-1){ // 横の境界の時も値は0とする
lap_fil_val = 0;
}else{
if (y == 1 && x == 1){ // 最初のラインの最初のピクセルでは2ライン分の画素を読み出す
for (a=0; a<2; a++){ // 2ライン分
for (b=0; b<HORIZONTAL_PIXEL_WIDTH; b++){ // ライン
line_buf[a][b] = cam_fb_addr[(a*HORIZONTAL_PIXEL_WIDTH)+b];
line_buf[a][b] = conv_rgb2y(line_buf[a][b]);
}
}
}
if (x == 1) { // ラインの最初なので、2つのピクセルを読み込む
for (b=0; b<2; b++){ // ライン
line_buf[(y+1)%3][b] = cam_fb_addr[((y+1)*HORIZONTAL_PIXEL_WIDTH)+b];
// (y+1)%3 は、使用済みのラインがに読み込む、y=2 の時 line[0], y=3の時 line[1], y=4の時 line[2]
line_buf[(y+1)%3][b] = conv_rgb2y(line_buf[(y+1)%3][b]);
}
}
// 1つのピクセルを読み込みながらラプラシアン・フィルタを実行する
line_buf[(y+1)%3][x+1] = cam_fb_addr[((y+1)*HORIZONTAL_PIXEL_WIDTH)+(x+1)];
// (y+1)%3 は、使用済みのラインがに読み込む、y=2 の時 line[0], y=3の時 line[1], y=4の時 line[2]
line_buf[(y+1)%3][x+1] = conv_rgb2y(line_buf[(y+1)%3][x+1]);
fl = (y-1)%3; // 最初のライン, y=1 012, y=2 120, y=3 201, y=4 012
sl = y%3; // 2番めのライン
tl = (y+1)%3; // 3番目のライン
lap_fil_val = laplacian_fil(line_buf[fl][x-1], line_buf[fl][x], line_buf[fl][x+1], line_buf[sl][x-1], line_buf[sl][x], line_buf[sl][x+1], line_buf[tl][x-1], line_buf[tl][x], line_buf[tl][x+1]);
}
// ラプラシアンフィルタ・データの書き込み
lap_fb_addr[(y*HORIZONTAL_PIXEL_WIDTH)+x] = (lap_fil_val<<16)+(lap_fil_val<<8)+lap_fil_val ;
// printf("x = %d y = %d", x, y);
}
}
return(1);
}
// RGBからYへの変換
// RGBのフォーマットは、{8'd0, R(8bits), G(8bits), B(8bits)}, 1pixel = 32bits
// 輝度信号Yのみに変換する。変換式は、Y = 0.299R + 0.587G + 0.114B
// "YUVフォーマット及び YUV<->RGB変換"を参考にした。http://vision.kuee.kyoto-u.ac.jp/~hiroaki/firewire/yuv.html
// 2013/09/27 : float を止めて、すべてint にした
int conv_rgb2y(int rgb){
int r, g, b, y_f;
int y;
b = rgb & 0xff;
g = (rgb>>8) & 0xff;
r = (rgb>>16) & 0xff;
y_f = 77*r + 150*g + 29*b; //y_f = 0.299*r + 0.587*g + 0.114*b;の係数に256倍した
y = y_f >> 8; // 256で割る
return(y);
}
// ラプラシアンフィルタ
// x0y0 x1y0 x2y0 -1 -1 -1
// x0y1 x1y1 x2y1 -1 8 -1
// x0y2 x1y2 x2y2 -1 -1 -1
int laplacian_fil(int x0y0, int x1y0, int x2y0, int x0y1, int x1y1, int x2y1, int x0y2, int x1y2, int x2y2)
{
int y;
y = -x0y0 -x1y0 -x2y0 -x0y1 +8*x1y1 -x2y1 -x0y2 -x1y2 -x2y2;
if (y<0)
y = 0;
else if (y>255)
y = 255;
return(y);
}
日 | 月 | 火 | 水 | 木 | 金 | 土 |
---|---|---|---|---|---|---|
- | - | - | 1 | 2 | 3 | 4 |
5 | 6 | 7 | 8 | 9 | 10 | 11 |
12 | 13 | 14 | 15 | 16 | 17 | 18 |
19 | 20 | 21 | 22 | 23 | 24 | 25 |
26 | 27 | 28 | 29 | 30 | 31 | - |