I am currently using the OpenCL compiler of version 17.1 from the acceleration stack for development of arria10 GX
and here is my code
#pragma OPENCL EXTENSION cl_intel_channels : enable
channel uint4 c0[31];
uint compare(uint4 A,uint4 B,uint4 C);
uint8 merger(uint4 A, uint4 B);
void swap(uint* A,uint*B);
uint4 read(uint channel_id);
void write(uint channel_id,uint4 write_out_data);
__kernel void read_in_kernel(__global uint* input,__global const uint *restrict lengthG){ //work size=8
uint4 A,B;
__const uint len=lengthG[0];
__const uint id=get_global_id(0);
__const uint offset_0=2*id,offset_1=2*id+1;
__const uint arr_idx_0=offset_0*len,arr_idx_1=offset_1*len;
for(int i=0;i<len;i+=4){
A=(uint4) (input[arr_idx_0+i],input[arr_idx_0+i+1],input[arr_idx_0+i+2],input[arr_idx_0+i+3]);
B=(uint4) (input[arr_idx_1+i],input[arr_idx_1+i+1],input[arr_idx_1+i+2],input[arr_idx_1+i+3]);
write(offset_0,A);//write_channel_intel(c0[offset_0],A);
write(offset_1,B);//write_channel_intel(c0[offset_1],B);
}
}
__kernel void merge_tree(__global const uint *restrict lengthG) {
__private uint4 A,B,regist;
__private uint8 merge_result=(uint8) (0,0,0,0,0,0,0,0);
__private uint counter_a=0,counter_b=0;
__private uint largest=2;
__const uint id=get_global_id(0);
__const uint length=lengthG[0];
__const uint dim=get_global_size(0);
__const uint index_a=2*id+32-dim*4;
__const uint index_b=2*id+33-dim*4;
__const uint index_out=id+32-dim*2;
A=read(index_a);//read_channel_intel(c0[index_a]);
B=read(index_b);//read_channel_intel(c0[index_b]);
counter_a+=4;
counter_b+=4;
merge_result=merger(A,B);
write(index_out,merge_result.lo);//write_channel_intel(c0[index_out],merge_result.lo);
regist=merge_result.hi;
while(counter_a<length&&counter_b<length){
if(largest==1){
A=read(index_a);//read_channel_intel(c0[index_a]);
}
else if(largest==0){
B=read(index_b);//read_channel_intel(c0[index_b]);
}
else if(largest==2){
A=read(index_a);//read_channel_intel(c0[index_a]);
B=read(index_b);//read_channel_intel(c0[index_b]);
}
largest=compare(A,B,regist);
if(largest==0){
merge_result=merger(B,regist);
counter_b+=4;}
else if(largest==1){
merge_result=merger(A,regist);
counter_a+=4;}
else if(largest==2){
merge_result=merger(A,B);
counter_a+=4;
counter_b+=4;}
write(index_out,merge_result.lo);//write_channel_intel(c0[index_out],merge_result.lo);
regist=merge_result.hi;
}
if(counter_a==(length)&&counter_b!=(length))
{ // flush others
merge_result=merger(B,regist);
counter_b+=4;
regist=merge_result.hi;
write(index_out,merge_result.lo);//write_channel_intel(c0[index_out],merge_result.lo);
while(counter_b<length){
B=read(index_b);//read_channel_intel(c0[index_b]);
counter_b+=4;
merge_result=merger(B,regist);
regist=merge_result.hi;
write(index_out,merge_result.lo);//write_channel_intel(c0[index_out],merge_result.lo);
}
}
else if(counter_a!=(length)&&counter_b==(length)){
merge_result=merger(A,regist);
regist=merge_result.hi;
write(index_out,merge_result.lo);//write_channel_intel(c0[index_out],merge_result.lo);
counter_a+=4;
while(counter_a<length){
A=read(index_a);//read_channel_intel(c0[index_a]);
counter_a+=4;
merge_result=merger(A,regist);
regist=merge_result.hi;
write(index_out,merge_result.lo);//write_channel_intel(c0[index_out],merge_result.lo);
}
}
write(index_out,merge_result.lo);//write_channel_intel(c0[index_out],regist);
}
__kernel void write_back( __global uint* sorted_array) {
int index=0;
uint4 A;
for(int i=0;i<4096;++i){
A=read(30);//read_channel_intel(c0[30]);
sorted_array[index++]=A.x;
sorted_array[index++]=A.y;
sorted_array[index++]=A.z;
sorted_array[index++]=A.w;
}
}
uint compare(uint4 A,uint4 B,uint4 C){
if (A.x>=B.x&&A.x>=C.x) return 0;
else if(B.x>=A.x&&B.x>=C.x) return 1;
else if(C.x>=A.x&&C.x>=B.x) return 2;
return 0;
}
uint8 merger(uint4 A, uint4 B){
uint end=0;
uint AB[8]={A.x,A.y,A.z,A.w,B.x,B.y,B.z,B.w};
for(int i=0;i<4;i++){
if(AB[i]>AB[7-i]) swap(&AB[i],&AB[7-i]);
}
for(int i=0;i<8;i+=4){
for(int j=0;j<2;j++)
if(AB[i+j]>AB[i+j+2]) swap(&AB[i+j],&AB[i+j+2]);
}
for(int i=0;i<8;i+=2){
if(AB[i]>AB[i+1]) swap(&AB[i],&AB[i+1]);
}
uint8 merge_result=(uint8) (AB[0],AB[1],AB[2],AB[3],AB[4],AB[5],AB[6],AB[7]);
return merge_result;
}
void swap(uint* A,uint*B){
uint temp;
temp=*A;
*A=*B;
*B=temp;}
uint4 read(uint channel_id){
uint4 data;
switch(channel_id){
case 0: data=read_channel_intel(c0[0]); break;
case 1: data=read_channel_intel(c0[1]); break;
case 2: data=read_channel_intel(c0[2]); break;
case 3: data=read_channel_intel(c0[3]); break;
case 4: data=read_channel_intel(c0[4]); break;
case 5: data=read_channel_intel(c0[5]); break;
case 6: data=read_channel_intel(c0[6]); break;
case 7: data=read_channel_intel(c0[7]); break;
case 8: data=read_channel_intel(c0[8]); break;
case 9: data=read_channel_intel(c0[9]); break;
case 10: data=read_channel_intel(c0[10]); break;
case 11: data=read_channel_intel(c0[11]); break;
case 12: data=read_channel_intel(c0[12]); break;
case 13: data=read_channel_intel(c0[13]); break;
case 14: data=read_channel_intel(c0[14]); break;
case 15: data=read_channel_intel(c0[15]); break;
case 16: data=read_channel_intel(c0[16]); break;
case 17: data=read_channel_intel(c0[17]); break;
case 18: data=read_channel_intel(c0[18]); break;
case 19: data=read_channel_intel(c0[19]); break;
case 20: data=read_channel_intel(c0[20]); break;
case 21: data=read_channel_intel(c0[21]); break;
case 22: data=read_channel_intel(c0[22]); break;
case 23: data=read_channel_intel(c0[23]); break;
case 24: data=read_channel_intel(c0[24]); break;
case 25: data=read_channel_intel(c0[25]); break;
case 26: data=read_channel_intel(c0[26]); break;
case 27: data=read_channel_intel(c0[27]); break;
case 28: data=read_channel_intel(c0[28]); break;
case 29: data=read_channel_intel(c0[29]); break;
case 30: data=read_channel_intel(c0[30]); break;
}
return data;
}
void write(uint channel_id,uint4 write_out_data){
switch(channel_id){
case 0: write_channel_intel(c0[0],write_out_data);break;
case 1: write_channel_intel(c0[1],write_out_data);break;
case 2: write_channel_intel(c0[2],write_out_data);break;
case 3: write_channel_intel(c0[3],write_out_data);break;
case 4: write_channel_intel(c0[4],write_out_data);break;
case 5: write_channel_intel(c0[5],write_out_data);break;
case 6: write_channel_intel(c0[6],write_out_data); break;
case 7: write_channel_intel(c0[7],write_out_data); break;
case 8: write_channel_intel(c0[8],write_out_data); break;
case 9: write_channel_intel(c0[9],write_out_data); break;
case 10: write_channel_intel(c0[10],write_out_data); break;
case 11: write_channel_intel(c0[11],write_out_data); break;
case 12: write_channel_intel(c0[12],write_out_data); break;
case 13: write_channel_intel(c0[13],write_out_data); break;
case 14: write_channel_intel(c0[14],write_out_data); break;
case 15: write_channel_intel(c0[15],write_out_data); break;
case 16: write_channel_intel(c0[16],write_out_data); break;
case 17: write_channel_intel(c0[17],write_out_data); break;
case 18: write_channel_intel(c0[18],write_out_data); break;
case 19: write_channel_intel(c0[19],write_out_data); break;
case 20: write_channel_intel(c0[20],write_out_data); break;
case 21: write_channel_intel(c0[21],write_out_data); break;
case 22: write_channel_intel(c0[22],write_out_data); break;
case 23: write_channel_intel(c0[23],write_out_data); break;
case 24: write_channel_intel(c0[24],write_out_data); break;
case 25: write_channel_intel(c0[25],write_out_data); break;
case 26: write_channel_intel(c0[26],write_out_data); break;
case 27: write_channel_intel(c0[27],write_out_data); break;
case 28: write_channel_intel(c0[28],write_out_data); break;
case 29: write_channel_intel(c0[29],write_out_data); break;
case 30: write_channel_intel(c0[30],write_out_data); break;
}
}