can single work items kernels run in parallel on same device

Honored Contributor

8 years ago

I've tried everything I can think of but I still cannot get my kernel tasks to run in parallel.

The kernels are only doing a simple sum of numbers from "start" to "finish". Other than their result, there is no other memory

being used. Sharing memory shouldn't be a problem preventing them from running together.

Each kernel is created in its own queue.

No waiting via clFinish() is being done. I get an event from each kernel and use it to determine when that kernel is complete

and print out the time. I can see from that output that the kernels are not running in parallel.

Because they are not running concurrently, even though they should be able to, there must be something being shared that is preventing them.

I don't know what though.

Can anyone take a look at the kernel code and C-code and tell me what is keeping them from running concurrently?

Here's the kernel code:

// sumN1 - single work-item kernel: adds start, start+step, start+2*step, ...
// for every term that is <= stop, and stores the total in z[0].
//
//   start  first term of the series
//   stop   inclusive upper bound of the series
//   step   increment between terms; must be > 0, otherwise the loop never
//          terminates when start <= stop
//   z      output buffer in global memory; only element 0 is written
//
// If start > stop the loop body never runs and z[0] is set to 0.0.
__kernel void sumN1(const double start,
                    const double stop,
                    const double step,
                 __global double *z) {
  // running total of the series
  double sum = 0.0;
  for (double i=start; i<=stop; i+=step) {
    sum += i;
  }
  // Write through the pointer. Assigning to `z` itself (a double to a pointer)
  // does not compile and would never reach the output buffer.
  z[0] = sum;
} // sumN1
... eight identical kernels ('cept the kernel name) - in the same .cl file
// sumN8 - single work-item kernel: adds start, start+step, start+2*step, ...
// for every term that is <= stop, and stores the total in z[0].
//
//   start  first term of the series
//   stop   inclusive upper bound of the series
//   step   increment between terms; must be > 0, otherwise the loop never
//          terminates when start <= stop
//   z      output buffer in global memory; only element 0 is written
//
// If start > stop the loop body never runs and z[0] is set to 0.0.
__kernel void sumN8(const double start,
                    const double stop,
                    const double step,
                 __global double *z) {
  // running total of the series
  double sum = 0.0;
  for (double i=start; i<=stop; i+=step) {
    sum += i;
  }
  // Write through the pointer. Assigning to `z` itself (a double to a pointer)
  // does not compile and would never reach the output buffer.
  z[0] = sum;
} // sumN8

Here's "most" of the host C code:


////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
//
// main - sumN
//
// - sumN num 
//
int main(int argc, char *argv) {
      cl_device_id  accDeviceID;                //device ID of first acceleator device
    cl_platform_id  accPlatformID;              //platform ID of platform w/first accelerator device
            cl_mem  answer;                  //array to collect results
       std::string  binary_file;                //name of OpenCL program
        cl_context  context;                    //OpenCL context for this application
    cl_device_type  deviceType;                 //type of a device (CPU, GPU, ACCELERATOR)
          cl_event  event;                      //wait event synchronization handle used by OpenCL API
              bool  foundintel = false;         //indicates that Intel FPGA card was found
              char  info_text;   //value of some returned text information
              bool  isACC = false;              //flag to remember that an accelerator has been found
         cl_kernel  kernel;                  //OpenCL kernal for this applicaiton
               int  kerns = 0;                  //number of kernels to use
           cl_uint  numDevices;                 //number of OpenCL computing devices for a platform
           cl_uint  numPlatforms;               //number of OpenCL platforms (typically 1)
            double  number = 0.0;               //number to compute sum to
        cl_program  program;                    //OpenCL program for this application
  cl_command_queue  queue;                   //OpenCL command queue for this application
            double  result = { 0.0 };        //result of the summation computation
            size_t  size;                       //size of returned information from OpenCL API
            double  start = 1.0;                //start of summing
            cl_int  status;                     //return code used by OpenCL API
            double  step  = 1.0;                //step  of summing
            double  stop  = 1.0;                //end   of summing
            cl_int  task_done;                  //info from event query
          cl_event  task_event;              //events from tasks
.... some code omitted here that handled input args and platform, device setup
  ////////////////////////////////////////
  // OpenCL context
  context = clCreateContext(NULL, 1, &accDeviceID, NULL, NULL, &status);
  exitOnFail(status, "create context");
  ////////////////////////////////////////
  // OpenCL command queue
  for ( int kz=0; kz<kerns; kz++) {
    queue = clCreateCommandQueue(context, accDeviceID, 0, &status);
    exitOnFail(status, "create command queue");
  }
  ////////////////////////////////////////
  // Create the program for all device. Use the first device as the
  // representative device (assuming all device are of the same type).
  binary_file = getBoardBinaryFile("sumN", accDeviceID);
  program = createProgramFromBinary(context, binary_file.c_str(), &accDeviceID, 1);
  ////////////////////////////////////////
  // Build the program that was just created.
  status = clBuildProgram(program, 0, NULL, "", NULL, NULL);
  exitOnFail(status, "Failed to build program");
  const double start_time = getCurrentTimestamp();
  ////////////////////////////////////////
  // create the kernel
  // Create the kernel - name passed in here must match kernel name in the
  // original CL file, that was compiled into an AOCX file using the AOC tool
  char kernel_name = "sumNx";  // Kernel name, as defined in the CL file
  for ( int kz=0; kz<kerns; kz++) {
    sprintf(kernel_name, "sumN%d", kz+1);  // generate the Kernel name, as defined in the CL file
    kernel = clCreateKernel(program, kernel_name, &status);
    exitOnFail(status, "Failed to create kernel");
    // Set the kernel argument (argument 0)
    status = clSetKernelArg(kernel, 0, sizeof(cl_double), &start);
    exitOnFail(status, "Failed to set kernel arg 0");
    // Set the kernel argument (argument 1)
    status = clSetKernelArg(kernel, 1, sizeof(cl_double), &stop);
    exitOnFail(status, "Failed to set kernel arg 0");
    // Set the kernel argument (argument 2)
    status = clSetKernelArg(kernel, 2, sizeof(cl_double), &step);
    exitOnFail(status, "Failed to set kernel arg 0");
    // last OpenCL argument: memory buffer object for result
    answer = clCreateBuffer(context, CL_MEM_READ_WRITE | CL_MEM_USE_HOST_PTR, sizeof(cl_double), &result, &status);
    exitOnFail(status, "create buffer for answer");
    // set 4th argument
    status = clSetKernelArg(kernel, 3, sizeof(cl_mem), &answer);
    exitOnFail(status, "set kernel argument answer");
  }
  // Launch the kernels
  for ( int kz=0; kz<kerns; kz++) {
    status = clEnqueueTask(queue, kernel, 0, NULL, &task_event);
    exitOnFail(status, "Failed to launch kernel");
  }
  int total_done = 0;
  int its_done = { 0 };
  while (total_done < kerns) {
    for ( int kz=0; kz<kerns; kz++) {
      if ( its_done == 0 ) {
        status = clGetEventInfo(task_event, CL_EVENT_COMMAND_EXECUTION_STATUS, sizeof(cl_int), &task_done, NULL);
        if (task_done == CL_COMPLETE) {
          printf("Task:%d complete (%0.3f ms)\n", kz, (getCurrentTimestamp() - start_time) * 1.0e3 );
          total_done++;
          its_done = 1;
        } else {
          //printf("Task:%d incomplete\n", kz);
        }
      } // if kz task not done
    } // foreach task event
  } // wait for kernels to complete
//// Wait for command queue to complete pending events
//for ( int kz=0; kz<kerns; kz++) {
//  status = clFinish(queue);
//  exitOnFail(status, "Failed to finish");
//}
  const double end_time = getCurrentTimestamp();
  // Wall-clock time taken.
  printf("\nTime: %0.3f ms (%0.3f ms / kernel)\n", (end_time - start_time) * 1e3, (end_time - start_time) * 1e3 / (double)kerns );
  for ( int kz=0; kz<kerns; kz++) {
    printf("Sum 0-%f (step %f) = %f\n", number, step, result);
  }
  // Free the resources allocated
  cleanup();
  if(kernel) {
    for ( int kz=0; kz<kerns; kz++) {
      clReleaseKernel(kernel);
    }
  }
  if(program) {
    clReleaseProgram(program);
  }
  if(queue) {
    for ( int kz=0; kz<kerns; kz++) {
      clReleaseCommandQueue(queue);
    }
  }
  if(context) {
    clReleaseContext(context);
  }
  exit(0);
} // main

This is the output from a run using four kernels.

There is no parallelism. I've tried using the profiler and it clearly shows that each kernel

runs, one after the other.

$ bin/host 100000 4

Reprogramming device [0] with handle 1

Task:0 complete (3.600 ms)

Task:1 complete (7.096 ms)

Task:2 complete (10.583 ms)

Task:3 complete (14.066 ms)

Time: 14.069 ms (3.517 ms / kernel)

Sum 0-100000.000000 (step 1.000000) = 5000050000.000000

Forum Discussion

can single work items kernels run in parallel on same device

Recent Discussions

Failed to run ip-setup-simulation:

Compile option not saved (reversed to default)

How to fix Error(23782): Failed to find an expected report

SSLC Login Issue – "You need to enroll" loop after OTP verification

Free Licence for Max+PlusII