Unable to coalesce kernel execution to memory reads in Arria 10 FPGA using opencl 17.1 bsp
Hi,
I have a benchmarking program based on the Intel example for coalesced read and kernel execution.
I know this example was added to the manual after version 17.1, but at its core it is just basic OpenCL event and kernel queue usage.
As seen in @buffers_timeline.PNG, the first execution does get coalesced (the kernel execution and the buffer read happen at the same time), but none of the subsequent reads are coalesced. The 3rd execution of the kernel somehow appears to be waiting for the second buffer.
Any ideas?
I only need a way of getting data out. Is this a problem with version 17.1? As you can see, the data generation kernel "data_in" gets stalled (I suppose because the "daq" kernel is slowing it down).
@BW_occ_stall.PNG
So I coded the following example host program:
@hostcode.PNG
...
// Host-side DAQ sequence (excerpt): enqueues kernel[1] once on kernelQ2[0], then
// repeatedly enqueues kernel[0] on kernelQ[0], pairing every kernel[0] run with a
// non-blocking read on readQ[0] that is chained to it through events.
// NOTE(review): the return values of the clEnqueue*/clFlush/clWaitForEvents calls
// below are not checked (only clSetKernelArg is), so a failed enqueue would go
// unnoticed here.
// NOTE(review): both event arrays hold 100 entries and are indexed up to
// num_iterations-1 — assumes 1 <= num_iterations <= 100; TODO confirm.
// NOTE(review): no clReleaseEvent call is visible in this excerpt.
cl_event kernelEvent[100], readEvent[100];
// One-dimensional NDRange containing a single work-item.
const size_t global_work_size[1] = {1};
// Enqueue kernel[1] on its own queue with no wait list and no completion event.
// Presumably this is the "data_in" generator kernel — TODO confirm.
clEnqueueNDRangeKernel(kernelQ2[0], kernel[1], 1,NULL,global_work_size,NULL, 0, NULL, NULL);
// Timing starts after kernel[1] has been enqueued; the printf calls below fall
// inside the measured interval.
const double start_time = getCurrentTimestamp();
//start daq loop. DAQ_ITERATIONS buffers in host to be filled by the device
printf("\n Starting DAQ %d iterations... \n", num_iterations);
// Iterations 0 and 1 are issued by hand: each kernel[0] run has an empty wait list,
// and each read waits on the event of its own kernel run. CL_FALSE makes the reads
// non-blocking, so these calls return without waiting for the data.
// NOTE(review): no clSetKernelArg is visible between these two launches, so which
// device buffer each run writes depends on arguments set outside this excerpt —
// verify that run 1 really targets d_output_buff[1].
clEnqueueNDRangeKernel(kernelQ[0], kernel[0], 1,NULL,global_work_size,NULL, 0, NULL, &kernelEvent[0]);
clEnqueueReadBuffer(readQ[0], d_output_buff[0], CL_FALSE, 0, d_buffer_num_elem*sizeof(cl_short8), h_output_buf[0], 1, &kernelEvent[0], &readEvent[0]);
clEnqueueNDRangeKernel(kernelQ[0], kernel[0], 1,NULL,global_work_size,NULL, 0, NULL, &kernelEvent[1]);
clEnqueueReadBuffer(readQ[0], d_output_buff[1], CL_FALSE, 0, d_buffer_num_elem*sizeof(cl_short8), h_output_buf[1], 1, &kernelEvent[1], &readEvent[1]);
// clFlush(readQ[0]);
// Issue everything enqueued so far on both queues before entering the loop.
clFlush(kernelQ[0]);
clFlush(readQ[0]);
// Remaining iterations: kernel run i waits on the read issued two iterations
// earlier (readEvent[i-2]); read i waits on kernel run i (kernelEvent[i]).
for (int i=2; i<num_iterations; i++) {
printf("\nIteration %d, buffer %d: \n", i, i%2);
// NOTE(review): here i%2 is the kernel *argument index*, while the device buffer
// is selected with d_output_buff[i]. The printf above labels the buffer as i%2,
// so this may have been meant as a fixed argument index with d_output_buff[i%2] —
// confirm the kernel signature and that d_output_buff has at least
// num_iterations entries.
status = clSetKernelArg(kernel[0], i%2, sizeof(cl_mem), &d_output_buff[i]);
checkError(status, "Failed to set argument %d", i%2);
clEnqueueNDRangeKernel(kernelQ[0], kernel[0], 1,NULL,global_work_size,NULL, 1, &readEvent[i-2], &kernelEvent[i]);
// clFlush(kernelQ[0]);
// Non-blocking read of d_output_buff[i] into host buffer i, gated on kernel run i.
clEnqueueReadBuffer(readQ[0], d_output_buff[i], CL_FALSE, 0, d_buffer_num_elem*sizeof(cl_short8), h_output_buf[i], 1, &kernelEvent[i], &readEvent[i]);
// clFlush(readQ[0]);
}
// The queues are flushed only once, after every loop iteration has been enqueued
// (the per-iteration flushes above are commented out).
clFlush(kernelQ[0]);
clFlush(readQ[0]);
// Block until the last read has completed. Whether that also guarantees every
// earlier kernel run and read has finished depends on how the queues were created
// (in-order vs. out-of-order) — TODO confirm.
clWaitForEvents(1,&readEvent[num_iterations-1]);
const double end_time = getCurrentTimestamp();
const double total_time = end_time - start_time;
...