Hey BB,
thanks for your help and sorry for the late response, I did not came to work on this for the last week.
I made you the smallest little example of my setup that creates this error, it does not really something useful, just some snippets of the DPCT-translated and slightly modified raytracing benchmark of the Altis GPU Becnhmark.
You need the following files in some directory:
#include <CL/sycl.hpp>
#include <dpct/dpct.hpp>
#include <dpct/rng_utils.hpp>
#include <oneapi/dpl/random>
#include <oneapi/mkl.hpp>
#include <oneapi/mkl/rng/device.hpp>
#include <sycl/ext/intel/fpga_extensions.hpp>
#include "vec3.h"
#include "sphere.h"
#include "camera.h"
#include "random_gen.h"
namespace fpga_tools {
namespace detail {
// Autorun implementation
template <bool run_forever, typename KernelID>
struct Autorun_impl {
// Constructor with a kernel name
template <typename DeviceSelector, typename KernelFunctor>
Autorun_impl(DeviceSelector device_selector, KernelFunctor kernel) {
// static asserts to ensure KernelFunctor is callable
static_assert(std::is_invocable_r_v<void, KernelFunctor>,
"KernelFunctor must be callable with no arguments");
// create the device queue
sycl::queue q{device_selector};
// submit the user's kernel
if constexpr (run_forever) {
if constexpr (std::is_same_v<KernelID, void>) {
// AutorunForever, kernel name not given
q.single_task([=] {
while (1) {
kernel();
}
});
} else {
// AutorunForever, kernel name given
q.single_task<KernelID>([=] {
while (1) {
kernel();
}
});
}
} else {
// run the kernel as-is, if the user wanted it to run forever they
// will write their own explicit while-loop
if constexpr (std::is_same_v<KernelID, void>) {
// Autorun, kernel name not given
q.single_task(kernel);
} else {
// Autorun, kernel name given
q.single_task<KernelID>(kernel);
}
}
}
};
} // namespace detail
// Autorun
template <typename KernelID = void>
using Autorun = detail::Autorun_impl<false, KernelID>;
// AutorunForever
template <typename KernelID = void>
using AutorunForever = detail::Autorun_impl<true, KernelID>;
} // namespace fpga_tools
#if defined(FPGA_EMULATOR)
sycl::ext::intel::fpga_emulator_selector ds;
#else
sycl::ext::intel::fpga_selector ds;
#endif
class random_generator_kernel_id;
struct rnd_generator {
void operator()() const {
auto rand_state = dpct::rng::device::rng_generator<
oneapi::mkl::rng::device::philox4x32x10<4>>(1984, { 0, 0 * 8 });
while (1)
{
sycl::ext::intel::pipe<rnd_out_pipe_id, float, 8>::write(
rand_state
.generate<oneapi::mkl::rng::device::uniform<float>,
1>());
}
}
};
fpga_tools::Autorun<random_generator_kernel_id> ar_rnd_gen{ds, rnd_generator{}};
int
main(int argc, char *argv[])
{
sycl::queue q{ds};
const sycl::range<3> blocks(1, 4, 4);
const sycl::range<3> threads(1, 16, 16);
const size_t fb_size = 3840 * 2160 * sizeof(vec3);
sycl::buffer<vec3> h_fb { sycl::range(fb_size) };
sycl::event render_event = q.submit([&](sycl::handler &cgh) {
sycl::accessor a_fb { h_fb, cgh, sycl::write_only, sycl::no_init };
cgh.parallel_for<class sample_kernel_id>(
sycl::nd_range<3>(blocks * threads, threads),
[=](sycl::nd_item<3> item_ct1) {
int i = item_ct1.get_local_id(2)
+ item_ct1.get_group(2) * item_ct1.get_local_range(2);
int j = item_ct1.get_local_id(1)
+ item_ct1.get_group(1) * item_ct1.get_local_range(1);
if ((i >= 3840) || (j >= 2160))
return;
int pixel_index = j * 3840 + i;
vec3 col(0, 0, 0);
vec3 lookfrom(13.0f, 2.0f, 3.0f);
vec3 lookat(0.0f, 0.0f, 0.0f);
float dist_to_focus = 10.0f;
(lookfrom - lookat).length();
float aperture = 0.1f;
camera cam(lookfrom,
lookat,
vec3(0.0f, 1.0f, 0.0f),
30.0f,
float(3840) / float(2160),
aperture,
dist_to_focus);
for (int s = 0; s < 10; s++)
{
float u = float(i + sycl::ext::intel::pipe<rnd_out_pipe_id, float, 8>::read()) / float(3840);
float v = float(j + sycl::ext::intel::pipe<rnd_out_pipe_id, float, 8>::read()) / float(2160);
ray r = cam.get_ray(u, v);
}
col /= float(10);
col[0] = sycl::sqrt(col[0]);
col[1] = sycl::sqrt(col[1]);
col[2] = sycl::sqrt(col[2]);
a_fb[pixel_index] = col;
});
});
render_event.wait();
const float elapsed = render_event.get_profiling_info<
sycl::info::event_profiling::command_end>()
- render_event.get_profiling_info<
sycl::info::event_profiling::command_start>();
std::cout << "Test elapsed: " << elapsed << std::endl;
return 0;
}
(test.cpp)
////////////////////////////////////////////////////////////////////////////////////////////////////
// file: altis\src\cuda\level2\raytracing\camera.h
//
// summary: Declares the camera class
//
// origin: Ray tracing(https://github.com/ssangx/raytracing.cuda)
////////////////////////////////////////////////////////////////////////////////////////////////////
#ifndef CAMERAH
#define CAMERAH
#include <CL/sycl.hpp>
#include <dpct/dpct.hpp>
#include <oneapi/mkl.hpp>
#include <oneapi/mkl/rng/device.hpp>
#include <dpct/rng_utils.hpp>
#include "ray.h"
#include "random_gen.h"
////////////////////////////////////////////////////////////////////////////////////////////////////
/// <summary> Random in unit disk. </summary>
///
/// <remarks> Ed, 5/20/2020. </remarks>
///
/// <param name="local_rand_state"> [in,out] If non-null, state of the local random. </param>
///
/// <returns> A vec3. </returns>
////////////////////////////////////////////////////////////////////////////////////////////////////
/*
DPCT1032:1298: A different random number generator is used. You may need to
adjust the code.
*/
vec3 random_in_unit_disk() {
vec3 p;
do {
/*
DPCT1084:1299: The function call has multiple migration results in
different template instantiations that could not be unified. You may
need to adjust the code.
*/
p = 2.0f * vec3(sycl::ext::intel::pipe<rnd_out_pipe_id, float, 8>::read(),
sycl::ext::intel::pipe<rnd_out_pipe_id, float, 8>::read(),
0) -
vec3(1, 1, 0);
} while (dot(p,p) >= 1.0f);
return p;
}
////////////////////////////////////////////////////////////////////////////////////////////////////
/// <summary> A camera. </summary>
///
/// <remarks> Ed, 5/20/2020. </remarks>
////////////////////////////////////////////////////////////////////////////////////////////////////
class camera {
public:
camera(vec3 lookfrom, vec3 lookat, vec3 vup, float vfov, float aspect, float aperture, float focus_dist) { // vfov is top to bottom in degrees
lens_radius = aperture / 2.0f;
float theta = vfov*((float)M_PI)/180.0f;
float half_height = sycl::tan(theta / 2.0f);
float half_width = aspect * half_height;
origin = lookfrom;
/*
DPCT1084:1300: The function call has multiple migration results in
different template instantiations that could not be unified. You may
need to adjust the code.
*/
w = unit_vector(lookfrom - lookat);
/*
DPCT1084:1301: The function call has multiple migration results in
different template instantiations that could not be unified. You may
need to adjust the code.
*/
u = unit_vector(cross(vup, w));
/*
DPCT1084:1302: The function call has multiple migration results in
different template instantiations that could not be unified. You may
need to adjust the code.
*/
v = cross(w, u);
/*
DPCT1084:1303: The function call has multiple migration results in
different template instantiations that could not be unified. You may
need to adjust the code.
*/
lower_left_corner = origin - half_width * focus_dist * u -
half_height * focus_dist * v - focus_dist * w;
/*
DPCT1084:1304: The function call has multiple migration results in
different template instantiations that could not be unified. You may
need to adjust the code.
*/
horizontal = 2.0f * half_width * focus_dist * u;
/*
DPCT1084:1305: The function call has multiple migration results in
different template instantiations that could not be unified. You may
need to adjust the code.
*/
vertical = 2.0f * half_height * focus_dist * v;
}
////////////////////////////////////////////////////////////////////////////////////////////////////
/// <summary> Gets a ray. </summary>
///
/// <remarks> Ed, 5/20/2020. </remarks>
///
/// <param name="s"> A float to process. </param>
/// <param name="t"> A float to process. </param>
/// <param name="local_rand_state"> [in,out] If non-null, state of the local random. </param>
///
/// <returns> The ray. </returns>
////////////////////////////////////////////////////////////////////////////////////////////////////
/*
DPCT1032:1306: A different random number generator is used. You may need to
adjust the code.
*/
ray
get_ray(float s, float t) {
/*
DPCT1084:1307: The function call has multiple migration results in
different template instantiations that could not be unified. You may
need to adjust the code.
*/
vec3 rd = lens_radius * random_in_unit_disk();
/*
DPCT1084:1308: The function call has multiple migration results in
different template instantiations that could not be unified. You may
need to adjust the code.
*/
vec3 offset = u * rd.x() + v * rd.y();
/*
DPCT1084:1309: The function call has multiple migration results in
different template instantiations that could not be unified. You may
need to adjust the code.
*/
return ray(origin + offset, lower_left_corner + s * horizontal +
t * vertical - origin - offset);
}
/// <summary> The origin. </summary>
vec3 origin;
/// <summary> The lower left corner. </summary>
vec3 lower_left_corner;
/// <summary> The horizontal. </summary>
vec3 horizontal;
/// <summary> The vertical. </summary>
vec3 vertical;
////////////////////////////////////////////////////////////////////////////////////////////////////
/// <summary> Gets the w. </summary>
///
/// <value> The w. </value>
////////////////////////////////////////////////////////////////////////////////////////////////////
vec3 u, v, w;
/// <summary> The lens radius. </summary>
float lens_radius;
};
#endif
(camera.h)
#pragma once
class rnd_out_pipe_id;
(random_gen.h)
#ifndef RAYH
#define RAYH
#include <CL/sycl.hpp>
#include <dpct/dpct.hpp>
#include "vec3.h"
class ray
{
public:
ray() {}
ray(const vec3& a, const vec3& b) { A = a; B = b; }
vec3 origin() const { return A; }
vec3 direction() const { return B; }
/*
DPCT1084:1293: The function call has multiple migration results in
different template instantiations that could not be unified. You may
need to adjust the code.
*/
vec3 point_at_parameter(float t) const { return A + t * B; }
/// <summary> A vec3 to process. </summary>
vec3 A;
/// <summary> A vec3 to process. </summary>
vec3 B;
};
#endif
(ray.h)
#ifndef SPHEREH
#define SPHEREH
#include <CL/sycl.hpp>
#include <dpct/dpct.hpp>
#include "ray.h"
class sphere {
public:
sphere() {}
sphere(vec3 cen, float r) : center(cen), radius(r) {};
SYCL_EXTERNAL bool hit(const ray& r, float tmin, float tmax) const;
vec3 center;
float radius;
};
bool sphere::hit(const ray& r, float t_min, float t_max) const {
vec3 oc = r.origin() - center;
float a = dot(r.direction(), r.direction());
float b = dot(oc, r.direction());
float c = dot(oc, oc) - radius*radius;
float discriminant = b*b - a*c;
if (discriminant > 0) {
float temp = (-b - sycl::sqrt(discriminant)) / a;
if (temp < t_max && temp > t_min) {
return true;
}
temp = (-b + sycl::sqrt(discriminant)) / a;
if (temp < t_max && temp > t_min) {
return true;
}
}
return false;
}
#endif
(sphere.h)
#ifndef VEC3H
#define VEC3H
#include <CL/sycl.hpp>
#include <dpct/dpct.hpp>
#include <math.h>
#include <stdlib.h>
#include <iostream>
class vec3 {
public:
vec3() {}
vec3(float e0, float e1, float e2) { e[0] = e0; e[1] = e1; e[2] = e2; }
inline float x() const { return e[0]; }
inline float y() const { return e[1]; }
inline float z() const { return e[2]; }
inline float r() const { return e[0]; }
inline float g() const { return e[1]; }
inline float b() const { return e[2]; }
inline const vec3& operator+() const { return *this; }
inline vec3 operator-() const { return vec3(-e[0], -e[1], -e[2]); }
inline float operator[](int i) const { return e[i]; }
inline float& operator[](int i) { return e[i]; };
inline vec3& operator+=(const vec3 &v2);
inline vec3& operator-=(const vec3 &v2);
inline vec3& operator*=(const vec3 &v2);
inline vec3& operator/=(const vec3 &v2);
inline vec3& operator*=(const float t);
inline vec3& operator/=(const float t);
inline float length() const {
return sycl::sqrt(e[0] * e[0] + e[1] * e[1] + e[2] * e[2]);
}
inline float squared_length() const { return e[0]*e[0] + e[1]*e[1] + e[2]*e[2]; }
inline void make_unit_vector();
float e[3];
};
inline std::istream& operator>>(std::istream &is, vec3 &t) {
is >> t.e[0] >> t.e[1] >> t.e[2];
return is;
}
inline std::ostream& operator<<(std::ostream &os, const vec3 &t) {
os << t.e[0] << " " << t.e[1] << " " << t.e[2];
return os;
}
inline void vec3::make_unit_vector() {
float k = 1.0 / sycl::sqrt(e[0] * e[0] + e[1] * e[1] + e[2] * e[2]);
e[0] *= k; e[1] *= k; e[2] *= k;
}
inline vec3 operator+(const vec3 &v1, const vec3 &v2) {
return vec3(v1.e[0] + v2.e[0], v1.e[1] + v2.e[1], v1.e[2] + v2.e[2]);
}
inline vec3 operator-(const vec3 &v1, const vec3 &v2) {
return vec3(v1.e[0] - v2.e[0], v1.e[1] - v2.e[1], v1.e[2] - v2.e[2]);
}
inline vec3 operator*(const vec3 &v1, const vec3 &v2) {
return vec3(v1.e[0] * v2.e[0], v1.e[1] * v2.e[1], v1.e[2] * v2.e[2]);
}
inline vec3 operator/(const vec3 &v1, const vec3 &v2) {
return vec3(v1.e[0] / v2.e[0], v1.e[1] / v2.e[1], v1.e[2] / v2.e[2]);
}
inline vec3 operator*(float t, const vec3 &v) {
return vec3(t*v.e[0], t*v.e[1], t*v.e[2]);
}
inline vec3 operator/(vec3 v, float t) {
return vec3(v.e[0]/t, v.e[1]/t, v.e[2]/t);
}
inline vec3 operator*(const vec3 &v, float t) {
return vec3(t*v.e[0], t*v.e[1], t*v.e[2]);
}
inline float dot(const vec3 &v1, const vec3 &v2) {
return v1.e[0] *v2.e[0] + v1.e[1] *v2.e[1] + v1.e[2] *v2.e[2];
}
inline vec3 cross(const vec3 &v1, const vec3 &v2) {
return vec3( (v1.e[1]*v2.e[2] - v1.e[2]*v2.e[1]),
(-(v1.e[0]*v2.e[2] - v1.e[2]*v2.e[0])),
(v1.e[0]*v2.e[1] - v1.e[1]*v2.e[0]));
}
inline vec3& vec3::operator+=(const vec3 &v){
e[0] += v.e[0];
e[1] += v.e[1];
e[2] += v.e[2];
return *this;
}
inline vec3& vec3::operator*=(const vec3 &v){
e[0] *= v.e[0];
e[1] *= v.e[1];
e[2] *= v.e[2];
return *this;
}
inline vec3& vec3::operator/=(const vec3 &v){
e[0] /= v.e[0];
e[1] /= v.e[1];
e[2] /= v.e[2];
return *this;
}
inline vec3& vec3::operator-=(const vec3& v) {
e[0] -= v.e[0];
e[1] -= v.e[1];
e[2] -= v.e[2];
return *this;
}
inline vec3& vec3::operator*=(const float t) {
e[0] *= t;
e[1] *= t;
e[2] *= t;
return *this;
}
inline vec3& vec3::operator/=(const float t) {
float k = 1.0/t;
e[0] *= k;
e[1] *= k;
e[2] *= k;
return *this;
}
inline vec3 unit_vector(vec3 v) {
/*
DPCT1084:1292: The function call has multiple migration results in different
template instantiations that could not be unified. You may need to adjust
the code.
*/
return v / v.length();
}
#endif
(vec3.h)
Compiling the project with
dpcpp -fsycl -fintelfpga -o test.cpp.o -c test.cpp
And linking it with
dpcpp -fsycl -fintelfpga -Xshardware -Xstarget="/opt/intel/oneapi/intel_a10gx_pac:pac_a10" test.cpp.o -o test.fpga
Results in the errors I stated above:
test.cpp:150: Compiler Warning: Pipe ordering required barrier insertion in for.body.i of pipe_ZTSZZ4mainENKUlRN2cl4sycl7handlerEE_clES2_E16sample_kernel_id, but kernel may hang as a result
camera.h:47: Compiler Warning: Pipe ordering required barrier insertion in do.body.i.i.i of pipe_ZTSZZ4mainENKUlRN2cl4sycl7handlerEE_clES2_E16sample_kernel_id, but kernel may hang as a result
test.cpp:150: Compiler Warning: Pipe ordering required barrier insertion in _ZN6camera7get_rayEff.exit.i of pipe_ZTSZZ4mainENKUlRN2cl4sycl7handlerEE_clES2_E16sample_kernel_id, but kernel may hang as a result
A smaller example did not lead to the errors, but I hope it is still not to time-consuming to look into it.
Thanks for your help in advance,
Christoph