Non-Ordered Pipes for Random-Number-Generation

New Contributor

3 years ago

Hey BB,

sorry for the late answer, I did not came to work on this the last week!

In the attachment a "minimal" example where this errors occurs. It does nothing productive and consists of codesnippets from the DPCT-translated and slightly modified raytracing-benchmark from the Altis GPU Benchmark Suite.

One file I could not attach, so test.cpp is here:

#include <CL/sycl.hpp>

#include <dpct/dpct.hpp>
#include <dpct/rng_utils.hpp>
#include <oneapi/dpl/random>
#include <oneapi/mkl.hpp>
#include <oneapi/mkl/rng/device.hpp>
#include <sycl/ext/intel/fpga_extensions.hpp>

#include "vec3.h"
#include "sphere.h"
#include "camera.h"
#include "random_gen.h"

namespace fpga_tools {

namespace detail {
// Autorun implementation
template <bool run_forever, typename KernelID>
struct Autorun_impl {
  // Constructor with a kernel name
  template <typename DeviceSelector, typename KernelFunctor>
  Autorun_impl(DeviceSelector device_selector, KernelFunctor kernel) {
    // static asserts to ensure KernelFunctor is callable
    static_assert(std::is_invocable_r_v<void, KernelFunctor>,
                  "KernelFunctor must be callable with no arguments");

    // create the device queue
    sycl::queue q{device_selector};

    // submit the user's kernel
    if constexpr (run_forever) {
      if constexpr (std::is_same_v<KernelID, void>) {
        // AutorunForever, kernel name not given
        q.single_task([=] {
          while (1) {
            kernel();
          }
        });
      } else {
        // AutorunForever, kernel name given
        q.single_task<KernelID>([=] {
          while (1) {
            kernel();
          }
        });
      }
    } else {
      // run the kernel as-is, if the user wanted it to run forever they
      // will write their own explicit while-loop
      if constexpr (std::is_same_v<KernelID, void>) {
        // Autorun, kernel name not given
        q.single_task(kernel);
      } else {
        // Autorun, kernel name given
        q.single_task<KernelID>(kernel);
      }
    }
  }
};
}  // namespace detail

// Autorun
template <typename KernelID = void>
using Autorun = detail::Autorun_impl<false, KernelID>;

// AutorunForever
template <typename KernelID = void>
using AutorunForever = detail::Autorun_impl<true, KernelID>;
}  // namespace fpga_tools

#if defined(FPGA_EMULATOR)
sycl::ext::intel::fpga_emulator_selector ds;
#else
sycl::ext::intel::fpga_selector ds;
#endif

class random_generator_kernel_id;

struct rnd_generator {
  void operator()() const {
    auto rand_state = dpct::rng::device::rng_generator<
        oneapi::mkl::rng::device::philox4x32x10<4>>(1984, { 0, 0 * 8 });
    while (1) 
    {
        sycl::ext::intel::pipe<rnd_out_pipe_id, float, 8>::write(
            rand_state
                .generate<oneapi::mkl::rng::device::uniform<float>,
                            1>());
    }
  }
};

fpga_tools::Autorun<random_generator_kernel_id> ar_rnd_gen{ds, rnd_generator{}};

int 
main(int argc, char *argv[])
{
    sycl::queue q{ds};

    const sycl::range<3> blocks(1, 4, 4);
    const sycl::range<3> threads(1, 16, 16);

    const size_t       fb_size = 3840 * 2160 * sizeof(vec3);
    sycl::buffer<vec3> h_fb { sycl::range(fb_size) };

    sycl::event render_event = q.submit([&](sycl::handler &cgh) {
        sycl::accessor a_fb { h_fb, cgh, sycl::write_only, sycl::no_init };

        cgh.parallel_for<class sample_kernel_id>(
            sycl::nd_range<3>(blocks * threads, threads),
            [=](sycl::nd_item<3> item_ct1) {
                int i = item_ct1.get_local_id(2)
                        + item_ct1.get_group(2) * item_ct1.get_local_range(2);
                int j = item_ct1.get_local_id(1)
                        + item_ct1.get_group(1) * item_ct1.get_local_range(1);
                if ((i >= 3840) || (j >= 2160))
                    return;
                int pixel_index = j * 3840 + i;

                vec3   col(0, 0, 0);
                vec3  lookfrom(13.0f, 2.0f, 3.0f);
                vec3  lookat(0.0f, 0.0f, 0.0f);
                float dist_to_focus = 10.0f;
                (lookfrom - lookat).length();
                float aperture = 0.1f;
                camera cam(lookfrom,
                            lookat,
                            vec3(0.0f, 1.0f, 0.0f),
                            30.0f,
                            float(3840) / float(2160),
                            aperture,
                            dist_to_focus);
                for (int s = 0; s < 10; s++)
                {
                    float u = float(i + sycl::ext::intel::pipe<rnd_out_pipe_id, float, 8>::read()) / float(3840);
                    float v = float(j + sycl::ext::intel::pipe<rnd_out_pipe_id, float, 8>::read()) / float(2160);
                    ray   r = cam.get_ray(u, v);
                }

                col /= float(10);
                col[0]          = sycl::sqrt(col[0]);
                col[1]          = sycl::sqrt(col[1]);
                col[2]          = sycl::sqrt(col[2]);
                a_fb[pixel_index] = col;
            });
    });
    render_event.wait();
    const float elapsed = render_event.get_profiling_info<
                                sycl::info::event_profiling::command_end>()
                            - render_event.get_profiling_info<
                                sycl::info::event_profiling::command_start>();

    std::cout << "Test elapsed: " << elapsed << std::endl;

    return 0;
}

Compiling them via

dpcpp -fsycl -fintelfpga -o test.cpp.o -c test.cpp

And linking it results in my above stated errors:

dpcpp -fsycl -fintelfpga -Xshardware -Xstarget="/opt/intel/oneapi/intel_a10gx_pac:pac_a10" test.cpp.o -o test.fpga
test.cpp:134: Compiler Warning: Pipe ordering required barrier insertion in for.body.i of pipe_ZTSZZ4mainENKUlRN2cl4sycl7handlerEE_clES2_E16sample_kernel_id, but kernel may hang as a result
camera.h:47: Compiler Warning: Pipe ordering required barrier insertion in do.body.i.i.i of pipe_ZTSZZ4mainENKUlRN2cl4sycl7handlerEE_clES2_E16sample_kernel_id, but kernel may hang as a result
test.cpp:134: Compiler Warning: Pipe ordering required barrier insertion in _ZN6camera7get_rayEff.exit.i of pipe_ZTSZZ4mainENKUlRN2cl4sycl7handlerEE_clES2_E16sample_kernel_id, but kernel may hang as a result

A smaller example sadly did not produce the erros, I hope this is not too much code too look into fast. As you see I did not use the explicit loop-pipeling for ND-Range kernels, therefore the disable-looppipelining attribute should not do anything here.

Thanks in advance and best regards,
Christoph

Forum Discussion

Non-Ordered Pipes for Random-Number-Generation

Recent Discussions

Agilex 7 I-Series "aocl diagnose acl0" error following OFS

AI Suite System Throughput Issue

HLS Compiler 24.1 error - aocl-clang.exe - dll entry point not found

How Do I get the License for HLS?

Deprecation Notice for FPGA Support Package for oneAPI DPC++/C++. What is the alternative?