Forum Discussion
Hi @TanmayKhabia,
Thank you for posting in Intel community forum, hope all is well and apologies for the delayed in response.
Can you share with us which design example that you tried to compile on?
And for the error mention, which environment are you compiling on?
Hope to hear from you soon.
Best Wishes
BB
- TanmayKhabia4 years ago
New Contributor
Sorry for the late reply.
The design I was trying to run is attached below.
#include <CL/sycl.hpp>
#include <cmath>
#include <cstdint>
#include <cstdio>
#include <cstdlib>
#include <cstring>#define STB_IMAGE_WRITE_IMPLEMENTATION
#include "stb_image_write.h"#define STB_IMAGE_IMPLEMENTATION
#include "stb_image.h"#include <CL/sycl/INTEL/fpga_extensions.hpp>
#include <chrono>
#include <fstream>
#include <iostream>// using namespace std;
using namespace sycl;float luminance(uint8_t r, uint8_t g, uint8_t b)
{
float r_lin = static_cast<float>(r) / 255;
float g_lin = static_cast<float>(g) / 255;
float b_lin = static_cast<float>(b) / 255;// Perceptual luminance (CIE 1931)
return 0.2126f * r_lin + 0.7152 * g_lin + 0.0722 * b_lin;
}class grayscale;
class dxxgenerate;
class sumgenerate;
class cornerd;
int main()
{
// auto property_list = sycl::property_list{sycl::property::queue::enable_profiling()};
int channels;
int width;
int height;
float thresh = 100;
uint8_t *image = stbi_load("./1.jpg", &width, &height, &channels, 3);
auto start = std::chrono::high_resolution_clock::now();
{
#if defined(FPGA_EMULATOR)
INTEL::fpga_emulator_selector device_selector;
#elif defined(CPU_HOST)
host_selector device_selector;
#else
INTEL::fpga_selector device_selector;
#endif
buffer<uint8_t, 1> image_buffer{image, width * height * channels};
buffer<float, 1> greyscale_buffer{width * height};
queue queue(device_selector);
queue.submit([&greyscale_buffer, &image_buffer, width, height](handler &h)
{
// A discard_write is a write access that doesn't need to preserve existing
// memory contents
auto data = greyscale_buffer.get_access<access::mode::discard_write>(h);
auto image_data = image_buffer.get_access<access::mode::read>(h);h.parallel_for<class grayscale>(range<1>(width * height),
[image_data, data](id<1> idx)
{
int offset = 3 * idx[0];
data[idx[0]] = luminance(image_data[offset],
image_data[offset + 1],
image_data[offset + 2]);
});
});
buffer<float, 1> dx{width * height};
buffer<float, 1> dy{width * height};buffer<float, 1> sxx{width * height};
buffer<float, 1> syy{width * height};
buffer<float, 1> sxy{width * height};
uint8_t *out = reinterpret_cast<uint8_t *>(
malloc_shared(width * height, queue));{
buffer<float, 1> dy_tmp{width * height};queue.submit([&greyscale_buffer, &dy_tmp, width, height](
handler &h)
{
//h.depends_on(gray);
auto data = greyscale_buffer.get_access<access::mode::read>(h);
auto out = dy_tmp.get_access<access::mode::discard_write>(h);// Create a scratch buffer for the intermediate computation
h.parallel_for(range<2>(width, height),
[data, width, out](id<2> idx)
{
// Convolve horizontally
int offset = idx[1] * width + idx[0];
float left = idx[0] == 0 ? 0 : data[offset - 1];
float right = idx[0] == width - 1 ? 0 : data[offset + 1];
float center = data[offset];
out[offset] = left + 2 * center + right;
});
});queue.submit([&dy, &dy_tmp, width, height](handler &h)
{
auto data = dy_tmp.get_access<access::mode::read>(h);
auto out = dy.get_access<access::mode::discard_write>(h);
h.parallel_for(
range<2>(width, height),
[data, width, height, out](id<2> idx)
{
// Convolve vertically
int offset = idx[1] * width + idx[0];
float up = idx[1] == 0 ? 0 : data[offset - width];
float down = idx[1] == height - 1 ? 0 : data[offset + width];
out[offset] = up - down;
});
});
}{
buffer<float, 1> dx_tmp{width * height};// Extract a 3x1 window around (x, y) and compute the dot product
// between the window and the kernel [1, 0, -1]
queue.submit([&greyscale_buffer, &dx_tmp, width, height](handler &h)
{
//h.depends_on(gray);
auto data = greyscale_buffer.get_access<access::mode::read>(h);
auto out = dx_tmp.get_access<access::mode::discard_write>(h);h.parallel_for(range<2>(width, height),
[data, width, out](id<2> idx)
{
int offset = idx[1] * width + idx[0];
float left = idx[0] == 0 ? 0 : data[offset - 1];
float right = idx[0] == width - 1 ? 0 : data[offset + 1];
out[offset] = left - right;
});
});// Extract a 1x3 window around (x, y) and compute the dot product
// between the window and the kernel [1, 2, 1]
queue.submit([&dx, &dx_tmp, width, height](handler &h)
{
auto data = dx_tmp.get_access<access::mode::read>(h);
auto out = dx.get_access<access::mode::discard_write>(h);
h.parallel_for(range<2>(width, height),
[data, width, height, out](id<2> idx)
{
// Convolve vertically
int offset = idx[1] * width + idx[0];
float up = idx[1] == 0 ? 0 : data[offset - width];
float down = idx[1] == height - 1 ? 0 : data[offset + width];
float center = data[offset];
out[offset] = up + 2 * center + down;
});
});
}{
buffer<float, 1> dxx{width * height};
buffer<float, 1> dyy{width * height};
buffer<float, 1> dxy{width * height};
queue.submit([&dxx, &dxy, &dyy, &dx, &dy, width, height](handler &h)
{
auto ixx = dxx.get_access<access::mode::discard_write>(h);
auto ixy = dxy.get_access<access::mode::discard_write>(h);
auto iyy = dyy.get_access<access::mode::discard_write>(h);
auto ix = dx.get_access<access::mode::read>(h);
auto iy = dy.get_access<access::mode::read>(h);
h.parallel_for<class dxxgenerate>(range<2>(width, height),
[ixx, ixy, iyy, ix, iy, width](id<2> i)
{
int offset = i[1] * width + i[0];
ixx[offset] = ix[offset] * ix[offset];
iyy[offset] = iy[offset] * iy[offset];
ixy[offset] = ix[offset] * iy[offset];
});
});queue.submit([&dxx, &dxy, &dyy, &sxx, &sxy, &syy, width, height](handler &h)
{
auto ixx = dxx.get_access<access::mode::read>(h);
auto ixy = dxy.get_access<access::mode::read>(h);
auto iyy = dyy.get_access<access::mode::read>(h);
auto sixx = sxx.get_access<access::mode::write>(h);
auto sixy = sxy.get_access<access::mode::write>(h);
auto siyy = syy.get_access<access::mode::write>(h);
// assuming kernal is of size 3
h.parallel_for<class sumgenerate>(range<1>(width * height),
[ixx, ixy, iyy, sixx, sixy, siyy, width, height](id<1> ind)
{
int i = ind[0];
sixx[i] = 0;
sixy[i] = 0;
siyy[i] = 0;for (int k = -1; k< 2 && i/width +k <height ; k++)
{
for (int j = -1; j< 2 && i%width + j< width; j++)
{
if (i%width + j < 0 || i/width +k < 0 ||i + j + k * width < 0 || i + j + k * width >= width * height){
continue ;
}
sixx[i] += ixx[i + j + k * width];
sixy[i] += ixy[i + j + k * width];
siyy[i] += iyy[i + j + k * width];
}
}
});
});
}queue.submit([&sxx, &sxy, &syy, width, height, out, thresh](handler &h)
{
auto sixx = sxx.get_access<access::mode::read>(h);
auto sixy = sxy.get_access<access::mode::read>(h);
auto siyy = syy.get_access<access::mode::read>(h);h.parallel_for<class cornerd>(range<2>(width, height),
[sixx, sixy, siyy, width, thresh, out](id<2> idx)
{
int offset = idx[1] * width + idx[0];
out[offset] = sixx[offset] * siyy[offset] - sixy[offset] * sixy[offset] - 0.04 * (sixx[offset] + siyy[offset]) * (sixx[offset] + siyy[offset]) > thresh ? 255:0;
});
});
queue.wait();
std::cout << "Time taken " << std::chrono::duration_cast<std::chrono::milliseconds>(std::chrono::high_resolution_clock::now() - start).count() << "\n";
stbi_write_png("./corners.png", width, height, 1, out, width);stbi_image_free(image);
sycl::free(out, queue);
}
}I have tested this code in emulation mode for and errors and it is working fine but when I try to compile the FPGA executable that is generated is not running. I compiled it using batch mode using the earlier mentioned. The executable was run on Stratix 10 - OneAPI, OpenVINO env.
The error is given below
terminate called after throwing an instance of 'cl::sycl::runtime_error'
what(): Native API failed. Native API returns: -50 (CL_INVALID_ARG_VALUE) -50 (CL_INVALID_ARG_VALUE)