Forum Discussion
Altera_Forum
Honored Contributor
8 years ago
I've tried everything I can think of, but I still cannot get my kernel tasks to run in parallel.
The kernels are only doing a simple sum of numbers from "start" to "finish". Other than their result, there is no other memory being used. Sharing memory shouldn't be a problem preventing them from running together. Each kernel is created in its own queue. No waiting via cl_Finish() is being done. I get an event from each kernel and use it to determine when that kernel is complete and print out the time. I can see from that output that they kernels are not running in parallel. Because then are not running concurrently, but they can, there must be something being shared that is preventing them. I don't know what though. Can anyone take a look at the kernel code and C-code and tell me what is keeping them from running concurrently? Here's the kernel code:
/*
 * sumN1 - sums the arithmetic sequence start, start+step, start+2*step, ...
 *
 * Every term that is <= stop is added, and the total is stored in z[0].
 *
 *   start : first term of the sequence
 *   stop  : inclusive upper bound on the terms
 *   step  : increment between terms; the loop does not terminate if
 *           step <= 0 while start <= stop
 *   z     : global-memory output buffer; only element 0 is written
 */
__kernel void sumN1(const double start,
                    const double stop,
                    const double step,
                    __global double *z) {
    // The work-item index is not needed: the kernel does not partition the range.
    // Accumulate in a private variable so global memory is written only once.
    double sum = 0.0;
    for (double i = start; i <= stop; i += step) {
        sum += i;
    }
    // Store THROUGH the pointer. "z = sum" would try to assign a double to the
    // pointer itself and never write the result to the output buffer.
    z[0] = sum;
} // sumN1
... eight identical kernels in total (identical except for the kernel name), all in the same .cl file ...
/*
 * sumN8 - sums the arithmetic sequence start, start+step, start+2*step, ...
 *
 * Every term that is <= stop is added, and the total is stored in z[0].
 *
 *   start : first term of the sequence
 *   stop  : inclusive upper bound on the terms
 *   step  : increment between terms; the loop does not terminate if
 *           step <= 0 while start <= stop
 *   z     : global-memory output buffer; only element 0 is written
 */
__kernel void sumN8(const double start,
                    const double stop,
                    const double step,
                    __global double *z) {
    // The work-item index is not needed: the kernel does not partition the range.
    // Accumulate in a private variable so global memory is written only once.
    double sum = 0.0;
    for (double i = start; i <= stop; i += step) {
        sum += i;
    }
    // Store THROUGH the pointer. "z = sum" would try to assign a double to the
    // pointer itself and never write the result to the output buffer.
    z[0] = sum;
} // sumN8
Here's "most" of the host C code:
////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
//
// main - sumN
//
// - sumN num
//
int main(int argc, char *argv) {
cl_device_id accDeviceID; //device ID of first acceleator device
cl_platform_id accPlatformID; //platform ID of platform w/first accelerator device
cl_mem answer; //array to collect results
std::string binary_file; //name of OpenCL program
cl_context context; //OpenCL context for this application
cl_device_type deviceType; //type of a device (CPU, GPU, ACCELERATOR)
cl_event event; //wait event synchronization handle used by OpenCL API
bool foundintel = false; //indicates that Intel FPGA card was found
char info_text; //value of some returned text information
bool isACC = false; //flag to remember that an accelerator has been found
cl_kernel kernel; //OpenCL kernal for this applicaiton
int kerns = 0; //number of kernels to use
cl_uint numDevices; //number of OpenCL computing devices for a platform
cl_uint numPlatforms; //number of OpenCL platforms (typically 1)
double number = 0.0; //number to compute sum to
cl_program program; //OpenCL program for this application
cl_command_queue queue; //OpenCL command queue for this application
double result = { 0.0 }; //result of the summation computation
size_t size; //size of returned information from OpenCL API
double start = 1.0; //start of summing
cl_int status; //return code used by OpenCL API
double step = 1.0; //step of summing
double stop = 1.0; //end of summing
cl_int task_done; //info from event query
cl_event task_event; //events from tasks
.... some code omitted here that handled input args and platform, device setup
////////////////////////////////////////
// OpenCL context
context = clCreateContext(NULL, 1, &accDeviceID, NULL, NULL, &status);
exitOnFail(status, "create context");
////////////////////////////////////////
// OpenCL command queue
for ( int kz=0; kz<kerns; kz++) {
queue = clCreateCommandQueue(context, accDeviceID, 0, &status);
exitOnFail(status, "create command queue");
}
////////////////////////////////////////
// Create the program for all device. Use the first device as the
// representative device (assuming all device are of the same type).
binary_file = getBoardBinaryFile("sumN", accDeviceID);
program = createProgramFromBinary(context, binary_file.c_str(), &accDeviceID, 1);
////////////////////////////////////////
// Build the program that was just created.
status = clBuildProgram(program, 0, NULL, "", NULL, NULL);
exitOnFail(status, "Failed to build program");
const double start_time = getCurrentTimestamp();
////////////////////////////////////////
// create the kernel
// Create the kernel - name passed in here must match kernel name in the
// original CL file, that was compiled into an AOCX file using the AOC tool
char kernel_name = "sumNx"; // Kernel name, as defined in the CL file
for ( int kz=0; kz<kerns; kz++) {
sprintf(kernel_name, "sumN%d", kz+1); // generate the Kernel name, as defined in the CL file
kernel = clCreateKernel(program, kernel_name, &status);
exitOnFail(status, "Failed to create kernel");
// Set the kernel argument (argument 0)
status = clSetKernelArg(kernel, 0, sizeof(cl_double), &start);
exitOnFail(status, "Failed to set kernel arg 0");
// Set the kernel argument (argument 1)
status = clSetKernelArg(kernel, 1, sizeof(cl_double), &stop);
exitOnFail(status, "Failed to set kernel arg 0");
// Set the kernel argument (argument 2)
status = clSetKernelArg(kernel, 2, sizeof(cl_double), &step);
exitOnFail(status, "Failed to set kernel arg 0");
// last OpenCL argument: memory buffer object for result
answer = clCreateBuffer(context, CL_MEM_READ_WRITE | CL_MEM_USE_HOST_PTR, sizeof(cl_double), &result, &status);
exitOnFail(status, "create buffer for answer");
// set 4th argument
status = clSetKernelArg(kernel, 3, sizeof(cl_mem), &answer);
exitOnFail(status, "set kernel argument answer");
}
// Launch the kernels
for ( int kz=0; kz<kerns; kz++) {
status = clEnqueueTask(queue, kernel, 0, NULL, &task_event);
exitOnFail(status, "Failed to launch kernel");
}
int total_done = 0;
int its_done = { 0 };
while (total_done < kerns) {
for ( int kz=0; kz<kerns; kz++) {
if ( its_done == 0 ) {
status = clGetEventInfo(task_event, CL_EVENT_COMMAND_EXECUTION_STATUS, sizeof(cl_int), &task_done, NULL);
if (task_done == CL_COMPLETE) {
printf("Task:%d complete (%0.3f ms)\n", kz, (getCurrentTimestamp() - start_time) * 1.0e3 );
total_done++;
its_done = 1;
} else {
//printf("Task:%d incomplete\n", kz);
}
} // if kz task not done
} // foreach task event
} // wait for kernels to complete
//// Wait for command queue to complete pending events
//for ( int kz=0; kz<kerns; kz++) {
// status = clFinish(queue);
// exitOnFail(status, "Failed to finish");
//}
const double end_time = getCurrentTimestamp();
// Wall-clock time taken.
printf("\nTime: %0.3f ms (%0.3f ms / kernel)\n", (end_time - start_time) * 1e3, (end_time - start_time) * 1e3 / (double)kerns );
for ( int kz=0; kz<kerns; kz++) {
printf("Sum 0-%f (step %f) = %f\n", number, step, result);
}
// Free the resources allocated
cleanup();
if(kernel) {
for ( int kz=0; kz<kerns; kz++) {
clReleaseKernel(kernel);
}
}
if(program) {
clReleaseProgram(program);
}
if(queue) {
for ( int kz=0; kz<kerns; kz++) {
clReleaseCommandQueue(queue);
}
}
if(context) {
clReleaseContext(context);
}
exit(0);
} // main
This is the output from a run using four kernels. There is no parallelism: I've tried using the profiler, and it clearly shows that the kernels run one after the other.
$ bin/host 100000 4
Reprogramming device [0] with handle 1
Task:0 complete (3.600 ms)
Task:1 complete (7.096 ms)
Task:2 complete (10.583 ms)
Task:3 complete (14.066 ms)
Time: 14.069 ms (3.517 ms / kernel)
Sum 0-100000.000000 (step 1.000000) = 5000050000.000000
Sum 0-100000.000000 (step 1.000000) = 5000050000.000000
Sum 0-100000.000000 (step 1.000000) = 5000050000.000000
Sum 0-100000.000000 (step 1.000000) = 5000050000.000000