__attribute__((reqd_work_group_size(NUM_THREAD, 1, 1)))
__kernel void vectorAdd(__global const float * restrict x,
__global const float * restrict y,
int numElements) //numElements = globalSize = 1024000
{
// get index of the work item
int index = get_global_id(0);
int lid = get_local_id(0);
if (index >= numElements) {
return;
}
float sum ;
sum = x[index] + y[index];
write_channel_altera(read , sum);
}
__kernel void sum(__global float * restrict c) //Task
{
float sum_add , temp = 0.0;
c[0] = 0.0 ;
int cnt = 0;
for(cnt = 0 ; cnt < globalSize ; cnt++){
sum_add = read_channel_altera(read );
temp += sum_add;
}
*c = temp;
}
It seems that the code above is not efficient . In my case (DE5-NET) , it needs ~40ms to finish the kernel , and at step ( c[0] = temp ) it needs 8 clock stall (use -g to report) .
"""""""""""""""""""""
Successive iterations launched every 8 cycles due to:
Data dependency on variable temp
Largest Critical Path Contributor:
96%: Fadd Operation
"""""""""""""""""'"""
But the reduction example (altera provides) is doing the same thing , and says that this method is more efficient.
I don't understand what's the problem about this code .
And I want to ask how many private memory per workitem could use ?
Thanks.