Sorry for the late reply.
The design I was trying to run is attached below.
#include <CL/sycl.hpp>
#include <cmath>
#include <cstdint>
#include <cstdio>
#include <cstdlib>
#include <cstring>
#define STB_IMAGE_WRITE_IMPLEMENTATION
#include "stb_image_write.h"
#define STB_IMAGE_IMPLEMENTATION
#include "stb_image.h"
#include <CL/sycl/INTEL/fpga_extensions.hpp>
#include <chrono>
#include <fstream>
#include <iostream>
// using namespace std;
using namespace sycl;
float luminance(uint8_t r, uint8_t g, uint8_t b)
{
float r_lin = static_cast<float>(r) / 255;
float g_lin = static_cast<float>(g) / 255;
float b_lin = static_cast<float>(b) / 255;
// Perceptual luminance (CIE 1931)
return 0.2126f * r_lin + 0.7152 * g_lin + 0.0722 * b_lin;
}
class grayscale;
class dxxgenerate;
class sumgenerate;
class cornerd;
int main()
{
// auto property_list = sycl::property_list{sycl::property::queue::enable_profiling()};
int channels;
int width;
int height;
float thresh = 100;
uint8_t *image = stbi_load("./1.jpg", &width, &height, &channels, 3);
auto start = std::chrono::high_resolution_clock::now();
{
#if defined(FPGA_EMULATOR)
INTEL::fpga_emulator_selector device_selector;
#elif defined(CPU_HOST)
host_selector device_selector;
#else
INTEL::fpga_selector device_selector;
#endif
buffer<uint8_t, 1> image_buffer{image, width * height * channels};
buffer<float, 1> greyscale_buffer{width * height};
queue queue(device_selector);
queue.submit([&greyscale_buffer, &image_buffer, width, height](handler &h)
{
// A discard_write is a write access that doesn't need to preserve existing
// memory contents
auto data = greyscale_buffer.get_access<access::mode::discard_write>(h);
auto image_data = image_buffer.get_access<access::mode::read>(h);
h.parallel_for<class grayscale>(range<1>(width * height),
[image_data, data](id<1> idx)
{
int offset = 3 * idx[0];
data[idx[0]] = luminance(image_data[offset],
image_data[offset + 1],
image_data[offset + 2]);
});
});
buffer<float, 1> dx{width * height};
buffer<float, 1> dy{width * height};
buffer<float, 1> sxx{width * height};
buffer<float, 1> syy{width * height};
buffer<float, 1> sxy{width * height};
uint8_t *out = reinterpret_cast<uint8_t *>(
malloc_shared(width * height, queue));
{
buffer<float, 1> dy_tmp{width * height};
queue.submit([&greyscale_buffer, &dy_tmp, width, height](
handler &h)
{
//h.depends_on(gray);
auto data = greyscale_buffer.get_access<access::mode::read>(h);
auto out = dy_tmp.get_access<access::mode::discard_write>(h);
// Create a scratch buffer for the intermediate computation
h.parallel_for(range<2>(width, height),
[data, width, out](id<2> idx)
{
// Convolve horizontally
int offset = idx[1] * width + idx[0];
float left = idx[0] == 0 ? 0 : data[offset - 1];
float right = idx[0] == width - 1 ? 0 : data[offset + 1];
float center = data[offset];
out[offset] = left + 2 * center + right;
});
});
queue.submit([&dy, &dy_tmp, width, height](handler &h)
{
auto data = dy_tmp.get_access<access::mode::read>(h);
auto out = dy.get_access<access::mode::discard_write>(h);
h.parallel_for(
range<2>(width, height),
[data, width, height, out](id<2> idx)
{
// Convolve vertically
int offset = idx[1] * width + idx[0];
float up = idx[1] == 0 ? 0 : data[offset - width];
float down = idx[1] == height - 1 ? 0 : data[offset + width];
out[offset] = up - down;
});
});
}
{
buffer<float, 1> dx_tmp{width * height};
// Extract a 3x1 window around (x, y) and compute the dot product
// between the window and the kernel [1, 0, -1]
queue.submit([&greyscale_buffer, &dx_tmp, width, height](handler &h)
{
//h.depends_on(gray);
auto data = greyscale_buffer.get_access<access::mode::read>(h);
auto out = dx_tmp.get_access<access::mode::discard_write>(h);
h.parallel_for(range<2>(width, height),
[data, width, out](id<2> idx)
{
int offset = idx[1] * width + idx[0];
float left = idx[0] == 0 ? 0 : data[offset - 1];
float right = idx[0] == width - 1 ? 0 : data[offset + 1];
out[offset] = left - right;
});
});
// Extract a 1x3 window around (x, y) and compute the dot product
// between the window and the kernel [1, 2, 1]
queue.submit([&dx, &dx_tmp, width, height](handler &h)
{
auto data = dx_tmp.get_access<access::mode::read>(h);
auto out = dx.get_access<access::mode::discard_write>(h);
h.parallel_for(range<2>(width, height),
[data, width, height, out](id<2> idx)
{
// Convolve vertically
int offset = idx[1] * width + idx[0];
float up = idx[1] == 0 ? 0 : data[offset - width];
float down = idx[1] == height - 1 ? 0 : data[offset + width];
float center = data[offset];
out[offset] = up + 2 * center + down;
});
});
}
{
buffer<float, 1> dxx{width * height};
buffer<float, 1> dyy{width * height};
buffer<float, 1> dxy{width * height};
queue.submit([&dxx, &dxy, &dyy, &dx, &dy, width, height](handler &h)
{
auto ixx = dxx.get_access<access::mode::discard_write>(h);
auto ixy = dxy.get_access<access::mode::discard_write>(h);
auto iyy = dyy.get_access<access::mode::discard_write>(h);
auto ix = dx.get_access<access::mode::read>(h);
auto iy = dy.get_access<access::mode::read>(h);
h.parallel_for<class dxxgenerate>(range<2>(width, height),
[ixx, ixy, iyy, ix, iy, width](id<2> i)
{
int offset = i[1] * width + i[0];
ixx[offset] = ix[offset] * ix[offset];
iyy[offset] = iy[offset] * iy[offset];
ixy[offset] = ix[offset] * iy[offset];
});
});
queue.submit([&dxx, &dxy, &dyy, &sxx, &sxy, &syy, width, height](handler &h)
{
auto ixx = dxx.get_access<access::mode::read>(h);
auto ixy = dxy.get_access<access::mode::read>(h);
auto iyy = dyy.get_access<access::mode::read>(h);
auto sixx = sxx.get_access<access::mode::write>(h);
auto sixy = sxy.get_access<access::mode::write>(h);
auto siyy = syy.get_access<access::mode::write>(h);
// assuming kernal is of size 3
h.parallel_for<class sumgenerate>(range<1>(width * height),
[ixx, ixy, iyy, sixx, sixy, siyy, width, height](id<1> ind)
{
int i = ind[0];
sixx[i] = 0;
sixy[i] = 0;
siyy[i] = 0;
for (int k = -1; k< 2 && i/width +k <height ; k++)
{
for (int j = -1; j< 2 && i%width + j< width; j++)
{
if (i%width + j < 0 || i/width +k < 0 ||i + j + k * width < 0 || i + j + k * width >= width * height){
continue ;
}
sixx[i] += ixx[i + j + k * width];
sixy[i] += ixy[i + j + k * width];
siyy[i] += iyy[i + j + k * width];
}
}
});
});
}
queue.submit([&sxx, &sxy, &syy, width, height, out, thresh](handler &h)
{
auto sixx = sxx.get_access<access::mode::read>(h);
auto sixy = sxy.get_access<access::mode::read>(h);
auto siyy = syy.get_access<access::mode::read>(h);
h.parallel_for<class cornerd>(range<2>(width, height),
[sixx, sixy, siyy, width, thresh, out](id<2> idx)
{
int offset = idx[1] * width + idx[0];
out[offset] = sixx[offset] * siyy[offset] - sixy[offset] * sixy[offset] - 0.04 * (sixx[offset] + siyy[offset]) * (sixx[offset] + siyy[offset]) > thresh ? 255:0;
});
});
queue.wait();
std::cout << "Time taken " << std::chrono::duration_cast<std::chrono::milliseconds>(std::chrono::high_resolution_clock::now() - start).count() << "\n";
stbi_write_png("./corners.png", width, height, 1, out, width);
stbi_image_free(image);
sycl::free(out, queue);
}
}
I have tested this code in emulation mode for and errors and it is working fine but when I try to compile the FPGA executable that is generated is not running. I compiled it using batch mode using the earlier mentioned. The executable was run on Stratix 10 - OneAPI, OpenVINO env.
The error is given below
terminate called after throwing an instance of 'cl::sycl::runtime_error'
what(): Native API failed. Native API returns: -50 (CL_INVALID_ARG_VALUE) -50 (CL_INVALID_ARG_VALUE)