I've tried it: the loops are pipelined with II=1, but this comes at the expense of a lower Fmax (as the report states).
The new code looks like this:
// Producer kernel: streams N float values (the constant 2.0f) into
// channel_float, processing W elements per outer iteration. The inner
// loop is fully unrolled so the outer loop can pipeline at II=1.
//
// N: total number of elements to send over the channel.
// W: vectorization width (compile-time macro defined elsewhere).
__kernel void producer(const int N)
{
    // Ceiling division: exactly ceil(N/W) tiles. The previous formula,
    // (N/W)+1, ran one fully-idle extra outer iteration whenever W
    // divided N evenly (the boundary guard masked every write in it).
    const int outer_loop_limit = (N + W - 1) / W;
    for (int i = 0; i < outer_loop_limit; i++)
    {
        const int it = i * W;   // index of the first element of this tile
        #pragma unroll
        for (int j = 0; j < W; j++)
        {
            if (it + j < N) // guard the last (possibly partial) tile
                write_channel_intel(channel_float, (float)(2.0));
        }
    }
}
// Consumer kernel: reads N floats from channel_float in tiles of W,
// accumulates their sum, and stores the final result into res[0].
//
// N:   total number of elements to read from the channel (must match
//      what the producer sends, or the kernels deadlock).
// res: output buffer; res[0] receives the total sum.
__kernel void consumer(const int N, __global float* res)
{
    // Ceiling division avoids the redundant, fully-masked extra outer
    // iteration that (N/W)+1 produced when W divides N exactly.
    const int outer_loop_limit = (N + W - 1) / W;
    float acc_o = 0;
    float x[W];   // per-tile staging buffer (removed unused mult[W])

    for (int i = 0; i < outer_loop_limit; i++)
    {
        const int it = i * W;   // index of the first element of this tile
        float acc_i = 0;        // per-tile partial sum; declared here to
                                // keep the reduction dependency tile-local

        // Read up to W elements; the guard handles the final partial tile.
        #pragma unroll
        for (int j = 0; j < W; j++)
        {
            if (it + j < N)
                x[j] = read_channel_intel(channel_float);
        }

        // Reduce the tile (fully unrolled adder tree after unrolling).
        #pragma unroll
        for (int j = 0; j < W; j++)
        {
            if (it + j < N)
                acc_i += x[j];
        }
        acc_o += acc_i;
    }

    // BUG FIX: the original kernel never wrote the result back, so the
    // entire reduction was dead code that the compiler could legally
    // optimize away (which also distorts any area/Fmax report).
    res[0] = acc_o;
}
The report states that a bottleneck limiting Fmax occurs at line 34, caused by data dependencies — it points at the comparisons in the consumer's two loop guards and at the computation of the outer_loop_limit constant.