Forum Discussion
Christoph9
New Contributor
3 years agoHey BB,
sorry for the late answer, I did not came to work on this the last week!
In the attachment a "minimal" example where this errors occurs. It does nothing productive and consists of codesnippets from the DPCT-translated and slightly modified raytracing-benchmark from the Altis GPU Benchmark Suite.
One file I could not attach, so test.cpp is here:
#include <CL/sycl.hpp> #include <dpct/dpct.hpp> #include <dpct/rng_utils.hpp> #include <oneapi/dpl/random> #include <oneapi/mkl.hpp> #include <oneapi/mkl/rng/device.hpp> #include <sycl/ext/intel/fpga_extensions.hpp> #include "vec3.h" #include "sphere.h" #include "camera.h" #include "random_gen.h" namespace fpga_tools { namespace detail { // Autorun implementation template <bool run_forever, typename KernelID> struct Autorun_impl { // Constructor with a kernel name template <typename DeviceSelector, typename KernelFunctor> Autorun_impl(DeviceSelector device_selector, KernelFunctor kernel) { // static asserts to ensure KernelFunctor is callable static_assert(std::is_invocable_r_v<void, KernelFunctor>, "KernelFunctor must be callable with no arguments"); // create the device queue sycl::queue q{device_selector}; // submit the user's kernel if constexpr (run_forever) { if constexpr (std::is_same_v<KernelID, void>) { // AutorunForever, kernel name not given q.single_task([=] { while (1) { kernel(); } }); } else { // AutorunForever, kernel name given q.single_task<KernelID>([=] { while (1) { kernel(); } }); } } else { // run the kernel as-is, if the user wanted it to run forever they // will write their own explicit while-loop if constexpr (std::is_same_v<KernelID, void>) { // Autorun, kernel name not given q.single_task(kernel); } else { // Autorun, kernel name given q.single_task<KernelID>(kernel); } } } }; } // namespace detail // Autorun template <typename KernelID = void> using Autorun = detail::Autorun_impl<false, KernelID>; // AutorunForever template <typename KernelID = void> using AutorunForever = detail::Autorun_impl<true, KernelID>; } // namespace fpga_tools #if defined(FPGA_EMULATOR) sycl::ext::intel::fpga_emulator_selector ds; #else sycl::ext::intel::fpga_selector ds; #endif class random_generator_kernel_id; struct rnd_generator { void operator()() const { auto rand_state = dpct::rng::device::rng_generator< oneapi::mkl::rng::device::philox4x32x10<4>>(1984, { 0, 0 * 8 }); while (1) { sycl::ext::intel::pipe<rnd_out_pipe_id, float, 8>::write( rand_state .generate<oneapi::mkl::rng::device::uniform<float>, 1>()); } } }; fpga_tools::Autorun<random_generator_kernel_id> ar_rnd_gen{ds, rnd_generator{}}; int main(int argc, char *argv[]) { sycl::queue q{ds}; const sycl::range<3> blocks(1, 4, 4); const sycl::range<3> threads(1, 16, 16); const size_t fb_size = 3840 * 2160 * sizeof(vec3); sycl::buffer<vec3> h_fb { sycl::range(fb_size) }; sycl::event render_event = q.submit([&](sycl::handler &cgh) { sycl::accessor a_fb { h_fb, cgh, sycl::write_only, sycl::no_init }; cgh.parallel_for<class sample_kernel_id>( sycl::nd_range<3>(blocks * threads, threads), [=](sycl::nd_item<3> item_ct1) { int i = item_ct1.get_local_id(2) + item_ct1.get_group(2) * item_ct1.get_local_range(2); int j = item_ct1.get_local_id(1) + item_ct1.get_group(1) * item_ct1.get_local_range(1); if ((i >= 3840) || (j >= 2160)) return; int pixel_index = j * 3840 + i; vec3 col(0, 0, 0); vec3 lookfrom(13.0f, 2.0f, 3.0f); vec3 lookat(0.0f, 0.0f, 0.0f); float dist_to_focus = 10.0f; (lookfrom - lookat).length(); float aperture = 0.1f; camera cam(lookfrom, lookat, vec3(0.0f, 1.0f, 0.0f), 30.0f, float(3840) / float(2160), aperture, dist_to_focus); for (int s = 0; s < 10; s++) { float u = float(i + sycl::ext::intel::pipe<rnd_out_pipe_id, float, 8>::read()) / float(3840); float v = float(j + sycl::ext::intel::pipe<rnd_out_pipe_id, float, 8>::read()) / float(2160); ray r = cam.get_ray(u, v); } col /= float(10); col[0] = sycl::sqrt(col[0]); col[1] = sycl::sqrt(col[1]); col[2] = sycl::sqrt(col[2]); a_fb[pixel_index] = col; }); }); render_event.wait(); const float elapsed = render_event.get_profiling_info< sycl::info::event_profiling::command_end>() - render_event.get_profiling_info< sycl::info::event_profiling::command_start>(); std::cout << "Test elapsed: " << elapsed << std::endl; return 0; }
Compiling them via
dpcpp -fsycl -fintelfpga -o test.cpp.o -c test.cpp
And linking it results in my above stated errors:
dpcpp -fsycl -fintelfpga -Xshardware -Xstarget="/opt/intel/oneapi/intel_a10gx_pac:pac_a10" test.cpp.o -o test.fpga test.cpp:134: Compiler Warning: Pipe ordering required barrier insertion in for.body.i of pipe_ZTSZZ4mainENKUlRN2cl4sycl7handlerEE_clES2_E16sample_kernel_id, but kernel may hang as a result camera.h:47: Compiler Warning: Pipe ordering required barrier insertion in do.body.i.i.i of pipe_ZTSZZ4mainENKUlRN2cl4sycl7handlerEE_clES2_E16sample_kernel_id, but kernel may hang as a result test.cpp:134: Compiler Warning: Pipe ordering required barrier insertion in _ZN6camera7get_rayEff.exit.i of pipe_ZTSZZ4mainENKUlRN2cl4sycl7handlerEE_clES2_E16sample_kernel_id, but kernel may hang as a result
A smaller example sadly did not produce the erros, I hope this is not too much code too look into fast. As you see I did not use the explicit loop-pipeling for ND-Range kernels, therefore the disable-looppipelining attribute should not do anything here.
Thanks in advance and best regards,
Christoph