User Profile

TanmayKhabia

New Contributor

2 Posts

View All Badges

User Widgets

Contributions

Re: Error during fpga compilation on stratix10
Sorry for the late reply. The design I was trying to run is attached below. #include <CL/sycl.hpp> #include <cmath> #include <cstdint> #include <cstdio> #include <cstdlib> #include <cstring> #define STB_IMAGE_WRITE_IMPLEMENTATION #include "stb_image_write.h" #define STB_IMAGE_IMPLEMENTATION #include "stb_image.h" #include <CL/sycl/INTEL/fpga_extensions.hpp> #include <chrono> #include <fstream> #include <iostream> // using namespace std; using namespace sycl; float luminance(uint8_t r, uint8_t g, uint8_t b) { float r_lin = static_cast<float>(r) / 255; float g_lin = static_cast<float>(g) / 255; float b_lin = static_cast<float>(b) / 255; // Perceptual luminance (CIE 1931) return 0.2126f * r_lin + 0.7152 * g_lin + 0.0722 * b_lin; } class grayscale; class dxxgenerate; class sumgenerate; class cornerd; int main() { // auto property_list = sycl::property_list{sycl::property::queue::enable_profiling()}; int channels; int width; int height; float thresh = 100; uint8_t *image = stbi_load("./1.jpg", &width, &height, &channels, 3); auto start = std::chrono::high_resolution_clock::now(); { #if defined(FPGA_EMULATOR) INTEL::fpga_emulator_selector device_selector; #elif defined(CPU_HOST) host_selector device_selector; #else INTEL::fpga_selector device_selector; #endif buffer<uint8_t, 1> image_buffer{image, width * height * channels}; buffer<float, 1> greyscale_buffer{width * height}; queue queue(device_selector); queue.submit([&greyscale_buffer, &image_buffer, width, height](handler &h) { // A discard_write is a write access that doesn't need to preserve existing // memory contents auto data = greyscale_buffer.get_access<access::mode::discard_write>(h); auto image_data = image_buffer.get_access<access::mode::read>(h); h.parallel_for<class grayscale>(range<1>(width * height), [image_data, data](id<1> idx) { int offset = 3 * idx[0]; data[idx[0]] = luminance(image_data[offset], image_data[offset + 1], image_data[offset + 2]); }); }); buffer<float, 1> dx{width * height}; buffer<float, 1> dy{width * height}; buffer<float, 1> sxx{width * height}; buffer<float, 1> syy{width * height}; buffer<float, 1> sxy{width * height}; uint8_t *out = reinterpret_cast<uint8_t *>( malloc_shared(width * height, queue)); { buffer<float, 1> dy_tmp{width * height}; queue.submit([&greyscale_buffer, &dy_tmp, width, height]( handler &h) { //h.depends_on(gray); auto data = greyscale_buffer.get_access<access::mode::read>(h); auto out = dy_tmp.get_access<access::mode::discard_write>(h); // Create a scratch buffer for the intermediate computation h.parallel_for(range<2>(width, height), [data, width, out](id<2> idx) { // Convolve horizontally int offset = idx[1] * width + idx[0]; float left = idx[0] == 0 ? 0 : data[offset - 1]; float right = idx[0] == width - 1 ? 0 : data[offset + 1]; float center = data[offset]; out[offset] = left + 2 * center + right; }); }); queue.submit([&dy, &dy_tmp, width, height](handler &h) { auto data = dy_tmp.get_access<access::mode::read>(h); auto out = dy.get_access<access::mode::discard_write>(h); h.parallel_for( range<2>(width, height), [data, width, height, out](id<2> idx) { // Convolve vertically int offset = idx[1] * width + idx[0]; float up = idx[1] == 0 ? 0 : data[offset - width]; float down = idx[1] == height - 1 ? 0 : data[offset + width]; out[offset] = up - down; }); }); } { buffer<float, 1> dx_tmp{width * height}; // Extract a 3x1 window around (x, y) and compute the dot product // between the window and the kernel [1, 0, -1] queue.submit([&greyscale_buffer, &dx_tmp, width, height](handler &h) { //h.depends_on(gray); auto data = greyscale_buffer.get_access<access::mode::read>(h); auto out = dx_tmp.get_access<access::mode::discard_write>(h); h.parallel_for(range<2>(width, height), [data, width, out](id<2> idx) { int offset = idx[1] * width + idx[0]; float left = idx[0] == 0 ? 0 : data[offset - 1]; float right = idx[0] == width - 1 ? 0 : data[offset + 1]; out[offset] = left - right; }); }); // Extract a 1x3 window around (x, y) and compute the dot product // between the window and the kernel [1, 2, 1] queue.submit([&dx, &dx_tmp, width, height](handler &h) { auto data = dx_tmp.get_access<access::mode::read>(h); auto out = dx.get_access<access::mode::discard_write>(h); h.parallel_for(range<2>(width, height), [data, width, height, out](id<2> idx) { // Convolve vertically int offset = idx[1] * width + idx[0]; float up = idx[1] == 0 ? 0 : data[offset - width]; float down = idx[1] == height - 1 ? 0 : data[offset + width]; float center = data[offset]; out[offset] = up + 2 * center + down; }); }); } { buffer<float, 1> dxx{width * height}; buffer<float, 1> dyy{width * height}; buffer<float, 1> dxy{width * height}; queue.submit([&dxx, &dxy, &dyy, &dx, &dy, width, height](handler &h) { auto ixx = dxx.get_access<access::mode::discard_write>(h); auto ixy = dxy.get_access<access::mode::discard_write>(h); auto iyy = dyy.get_access<access::mode::discard_write>(h); auto ix = dx.get_access<access::mode::read>(h); auto iy = dy.get_access<access::mode::read>(h); h.parallel_for<class dxxgenerate>(range<2>(width, height), [ixx, ixy, iyy, ix, iy, width](id<2> i) { int offset = i[1] * width + i[0]; ixx[offset] = ix[offset] * ix[offset]; iyy[offset] = iy[offset] * iy[offset]; ixy[offset] = ix[offset] * iy[offset]; }); }); queue.submit([&dxx, &dxy, &dyy, &sxx, &sxy, &syy, width, height](handler &h) { auto ixx = dxx.get_access<access::mode::read>(h); auto ixy = dxy.get_access<access::mode::read>(h); auto iyy = dyy.get_access<access::mode::read>(h); auto sixx = sxx.get_access<access::mode::write>(h); auto sixy = sxy.get_access<access::mode::write>(h); auto siyy = syy.get_access<access::mode::write>(h); // assuming kernal is of size 3 h.parallel_for<class sumgenerate>(range<1>(width * height), [ixx, ixy, iyy, sixx, sixy, siyy, width, height](id<1> ind) { int i = ind[0]; sixx[i] = 0; sixy[i] = 0; siyy[i] = 0; for (int k = -1; k< 2 && i/width +k <height ; k++) { for (int j = -1; j< 2 && i%width + j< width; j++) { if (i%width + j < 0 || i/width +k < 0 ||i + j + k * width < 0 || i + j + k * width >= width * height){ continue ; } sixx[i] += ixx[i + j + k * width]; sixy[i] += ixy[i + j + k * width]; siyy[i] += iyy[i + j + k * width]; } } }); }); } queue.submit([&sxx, &sxy, &syy, width, height, out, thresh](handler &h) { auto sixx = sxx.get_access<access::mode::read>(h); auto sixy = sxy.get_access<access::mode::read>(h); auto siyy = syy.get_access<access::mode::read>(h); h.parallel_for<class cornerd>(range<2>(width, height), [sixx, sixy, siyy, width, thresh, out](id<2> idx) { int offset = idx[1] * width + idx[0]; out[offset] = sixx[offset] * siyy[offset] - sixy[offset] * sixy[offset] - 0.04 * (sixx[offset] + siyy[offset]) * (sixx[offset] + siyy[offset]) > thresh ? 255:0; }); }); queue.wait(); std::cout << "Time taken " << std::chrono::duration_cast<std::chrono::milliseconds>(std::chrono::high_resolution_clock::now() - start).count() << "\n"; stbi_write_png("./corners.png", width, height, 1, out, width); stbi_image_free(image); sycl::free(out, queue); } } I have tested this code in emulation mode for and errors and it is working fine but when I try to compile the FPGA executable that is generated is not running. I compiled it using batch mode using the earlier mentioned. The executable was run on Stratix 10 - OneAPI, OpenVINO env. The error is given below terminate called after throwing an instance of 'cl::sycl::runtime_error' what(): Native API failed. Native API returns: -50 (CL_INVALID_ARG_VALUE) -50 (CL_INVALID_ARG_VALUE)
4 years ago Place Acceleration
2.1KViews
0likes
0Comments
Error during fpga compilation on stratix10
I was trying to compile my design on fpga. It was working on the emulation mode but during the FPGA compilation of the dpcpp using batch mode It was showing this error. ``` /usr/sbin/kill-illegit-procs: line 6: /etc/kill-illegit-procs.cfg: No such file or directory /usr/sbin/kill-illegit-procs: line 108: /dev/shm/kip-procs-1623879899.tmp: No such file or directory /usr/sbin/kill-illegit-procs: line 108: /dev/shm/kip-procs-1623879899.tmp: No such file or directory /usr/sbin/kill-illegit-procs: line 108: /dev/shm/kip-procs-1623879959.tmp: No such file or directory /usr/sbin/kill-illegit-procs: line 108: /dev/shm/kip-procs-1623880019.tmp: No such file or directory /usr/sbin/kill-illegit-procs: line 108: /dev/shm/kip-procs-1623880079.tmp: No such file or directory ``` To compile the code I used qsub -l nodes=1:stratix10:ppn=2 -d . batch.sh -l walltime=23:10:40 where batch.sh contains #!/bin/bash echo "Running hardware compilation" dpcpp -fintelfpga -c harris.cpp -o harris.o -DFPGA dpcpp -fintelfpga harris.o -o harris.fpga -Xshardware echo "Compilation finished "
4 years ago Place Acceleration
2.1KViews
0likes
6Comments