Altera_Forum
Honored Contributor
8 years ago
Memory leak using emulator
Hi
I'm working on converting a CUDA program to OpenCL so it can run on an FPGA; right now I'm using the emulator since I don't have the device yet. I wrote an OpenCL kernel that does some simple computation on the image passed from the GPU, and for some reason the memory usage increases dramatically for each pixel it computes, and then it overflows at the third frame. The error messages are: Context callback: Could not allocate a buffer in host memory Context callback: Could not map host buffers to device ERROR: CL_OUT_OF_HOST_MEMORY I did release the buffer after each frame and free the host memory as well, but the memory still accumulates. Launching kernel part (runs for each frame):////////////////////////////////////////////////////////////////////////////////////////////////////////
cl_int status;
cufftComplex* h_afPadScnOut;
h_afPadScnOut = (cufftComplex *)malloc(giScnMemSzCmplx);
CUDA_SAFE_CALL(cudaMemcpy(h_afPadScnOut, gd_afPadScnOut, giScnMemSzCmplx, cudaMemcpyDeviceToHost));// copy memory to host
cl_mem cl_d_afPadScnOut = clCreateBuffer(context, CL_MEM_USE_HOST_PTR, giScnMemSzCmplx, h_afPadScnOut, NULL);
cl_event* write_event = (cl_event *)malloc(sizeof(cl_event));
status = clEnqueueWriteBuffer(queue, cl_d_afPadScnOut, CL_TRUE, 0, giScnMemSzCmplx, h_afPadScnOut, 0, NULL, write_event);// write into CL buffer
checkError(status, "Failed to write buffer cl_gd_afPadScnOut");
// Set the kernel arguments
status = clSetKernelArg(kthLaw_kernel, 0, sizeof(cl_mem), (void*)&cl_d_afPadScnOut);
checkError(status, "Failed to set kernel arg 0");
status = clSetKernelArg(kthLaw_kernel, 1, sizeof(cl_int), (void*)&giScnSz);
checkError(status, "Failed to set kernel arg 1");
printf("\nKernel initialization is complete.\n");
printf("Launching the kernel...\n\n");
// Configure work set over which the kernel will execute
size_t wgSize = { 256, 1, 1 };
size_t gSize = { 307200, 1, 1 };
// Launch the kernel
status = clEnqueueNDRangeKernel(queue, kthLaw_kernel, 1, NULL, gSize, wgSize, 1, write_event, NULL);
checkError(status, "Failed to launch kernel");
clReleaseEvent(*write_event);
//Read back data
status = clEnqueueReadBuffer(queue, cl_d_afPadScnOut, CL_TRUE, 0, giScnMemSzCmplx, h_afPadScnOut, 0, NULL, NULL);
checkError(status, "Failed to read buffer cl_gd_afPadScnOut");
//Free CL buffer
status = clReleaseMemObject(cl_d_afPadScnOut);
checkError(status, "Failed to release buffer");
// Wait for command queue to complete pending events
status = clFinish(queue);
checkError(status, "Failed to finish");
printf("\nKernel execution is complete.\n");
// Free the resources allocated
//AOCLcleanup();
CUDA_SAFE_CALL(cudaMemcpy(gd_afPadScnOut, h_afPadScnOut, giScnMemSzCmplx, cudaMemcpyHostToDevice));
free(h_afPadScnOut);
//////////////////////////////////////////////////////////////////////////////////////////////////////// Kernel:
// Applies the k-th law nonlinearity in place to one complex sample per
// work-item: the magnitude is raised to the power 0.1 and the phase is kept.
//   d_afPadScn : complex samples as float2 (x and y are treated as the real and
//                imaginary parts), read and overwritten in place
//   dataN      : number of samples; work-items with a global id >= dataN do nothing
__kernel void kthLaw(__global float2* d_afPadScn, int dataN)
{
    int iIndx = (int)get_global_id(0);
    if (iIndx < dataN)
    {
        //afVals(:) = (abs(afVals(:)).^k) .* (cos(angle(afVals(:))) + sin(angle(afVals(:)))*i);
        // Each work-item loads its own element (the subscript is required; the
        // bare pointer cannot be assigned to a float2).
        float2 cDat = d_afPadScn[iIndx];
        // OpenCL C math built-ins are overloaded (sqrt/pow/cos/sin), and the
        // float literal keeps the whole computation in single precision.
        float fNewAbsDat = pow(sqrt(cDat.x * cDat.x + cDat.y * cDat.y), 0.1f);
        float fAngDat = atan2(cDat.y, cDat.x);
        cDat.x = fNewAbsDat * cos(fAngDat);
        cDat.y = fNewAbsDat * sin(fAngDat);
        d_afPadScn[iIndx] = cDat;
    }
} Also I saw the memory increasing from the task manager, is there a way to print out the memory usage form the kernel? Any advice will be appreciated. -------------------------update------------------------- Well I read some material from Altera and they said executing large number of parallel kernels is not feasible on FPGA, instead we should use pipeline design. So I wrote the kernel in serial and the memory problem was no more and emulator runs faster! I guess I haven't think it through properly, the emulator emulates the behavior of a FPGA where the kernels are actual hardware, of cause it can't be freed in runtime...