The Fmax bottleneck only happens if one of the channel loops is not unrolled. I commented out both conditions in the channel loops and left the one in the compute loop, and all loops were pipelined with an II of one without any Fmax bottleneck. This is the code:
// Data-path width: number of elements moved per unrolled inner-loop
// iteration, and the channel's FIFO depth.
#define W 32
// Enable the Intel FPGA channels extension. Note: a #pragma directive
// takes no trailing semicolon — the stray ';' was extra pragma tokens,
// which some compilers reject or warn about.
#pragma OPENCL EXTENSION cl_intel_channels : enable
// Single W-deep FIFO connecting the producer kernel to the consumer.
channel float channel_float __attribute__((depth(W)));
// Streams the constant 2.0f into channel_float in W-wide batches.
// Emits (N/W + 1) * W values in total — i.e. padded past N — so the
// unrolled inner loop needs no data-dependent guard around the channel
// write (the guard is what caused the Fmax bottleneck). The consumer
// must read exactly the same padded count.
__kernel void producer(const int N)
{
    // N/W + 1 batches: always at least one batch of padding past N.
    const int batches = N / W + 1;

    for (int batch = 0; batch < batches; batch++)
    {
        // Base index of this batch; unused now that the boundary check
        // is commented out, but kept for symmetry with the consumer.
        const int base = batch * W;

        #pragma unroll
        for (int lane = 0; lane < W; lane++)
        {
            //if(base+lane<N) //handle boundary conditions
            // Unconditional write: padding elements beyond N are sent
            // too and discarded by the consumer's accumulation guard.
            write_channel_intel(channel_float, (float)(2.0));
        }
    }
}
// Drains the producer's padded stream and reduces it to a single sum,
// written to *res. Reads exactly (N/W + 1) * W values — matching the
// producer's padded write count — but only the first N contribute to
// the result; the accumulation guard discards the padding.
//
// Fix vs. original: removed the unused array `mult[W]` (dead
// declaration that only wasted FPGA resources) and the redundant
// (int) cast on an already-integer expression.
__kernel void consumer(const int N, __global float* res)
{
    // Same padded batch count as the producer, so channel reads and
    // writes stay balanced.
    const int outer_loop_limit = N / W + 1;
    float acc_o = 0, acc_i = 0;
    float x[W];

    for (int i = 0; i < outer_loop_limit; i++)
    {
        const int it = i * W;

        // Unconditional channel reads keep this unrolled loop free of
        // data-dependent control flow (the source of the Fmax issue).
        #pragma unroll
        for (int j = 0; j < W; j++)
        {
            //if(it+j<N) //handle boundary conditions
            x[j] = read_channel_intel(channel_float);
        }

        // Per-batch partial reduction; the guard here (not on the
        // channel ops) excludes padding elements past N.
        acc_i = 0;
        #pragma unroll
        for (int j = 0; j < W; j++)
        {
            if (it + j < N)
                acc_i += x[j];
        }
        acc_o += acc_i;
    }

    *res = acc_o;
}
Regarding the problem with portability, I would personally recommend against having multiple reads/writes from/to channels. In fact, up until v17.1 (or maybe v17.0), doing so was not even allowed. If you want to completely avoid extra channel reads/writes while having pipelineable loops, you might be able to achieve this by using W channels instead of one channel that is unrolled W times. Something like this:
// Per-lane channel reads: channel_float is now an array of W channels,
// one per unrolled lane, so the boundary condition guards W independent
// channels instead of serializing access to a single shared one.
#pragma unroll
for (int j = 0; j < W; j++)
{
    if (it + j < N)
    {
        // Fix: the original snippet was missing the trailing semicolon.
        x[j] = read_channel_intel(channel_float[j]);
    }
}
In this case the compiler should not fail to unroll the loop due to conditional execution anymore, since now you have W channels instead of one.