Forum Discussion
mvemp
New Contributor
// DIM_3: number of elements fetched/multiplied together per group (vector width).
// DIM_4: number of parallel output lanes.
#define DIM_3 2
#define DIM_4 8

// Intel FPGA OpenCL kernel-to-kernel FIFO channels.
#pragma OPENCL EXTENSION cl_intel_channels : enable

typedef char QTYPE; // data / weight element type
typedef int HTYPE;  // accumulator type (wider than QTYPE to hold sums)

// A group of DIM_3 elements read together from global memory.
typedef struct {
QTYPE data[DIM_3];
} group_data;

// One group_data per lane: the unit streamed through data_ch / weight_ch.
typedef struct {
group_data lane[DIM_4];
} group_vec;

// One scalar result per lane: the unit streamed through out_ch.
typedef struct {
QTYPE lane[DIM_4];
} group_ch;

// depth(0): zero-buffer channels -- writer and reader must rendezvous,
// so producer and consumer kernels run in lockstep.
channel group_vec data_ch __attribute__((depth(0)));
channel group_vec weight_ch __attribute__((depth(0)));
channel group_ch out_ch __attribute__((depth(0)));
// Autorun-style single-work-item producer: streams the input feature map
// from global memory and broadcasts each group of DIM_3 elements to all
// DIM_4 lanes of data_ch.
// NOTE(review): the matching consumer (conv_wino) reads data_ch the same
// total number of times (39*39 * 4096/DIM_3), so the channels balance.
__kernel
__attribute__((task))
__attribute__((max_global_work_dim(0)))
void fetch_data(
__global group_data *restrict bottom
)
{
    const unsigned int trip_count = 39 * 39 * 4096 / (DIM_3);
    for (unsigned int idx = 0; idx < trip_count; idx++) {
        group_data fetched = bottom[idx];
        group_vec broadcast;
        #pragma unroll
        for (unsigned char lane = 0; lane < DIM_4; lane++) {
            broadcast.lane[lane] = fetched;
        }
        write_channel_intel(data_ch, broadcast);
    }
}
// Single-work-item producer: streams weight vectors (one group_vec per
// iteration, DIM_4 lanes x DIM_3 elements) into weight_ch.
// NOTE(review): the pointer is declared volatile here while fetch_data's is
// not -- presumably to bypass the load cache; confirm this asymmetry is
// intentional.
__kernel
__attribute__((task))
__attribute__((max_global_work_dim(0)))
void fetch_weights(
__global volatile group_vec *restrict weights
)
{
    const unsigned int trip_count = 39 * 39 * 4096 / (DIM_3);
    for (unsigned int idx = 0; idx < trip_count; idx++) {
        group_vec w = weights[idx];
        write_channel_intel(weight_ch, w);
    }
}
// Consumer kernel: for each of the 39*39 outputs, accumulates a dot product
// over 4096/DIM_3 channel reads into DIM_4 lanes, stores the result into a
// 169-entry ring buffer, then applies a Winograd-style inverse transform and
// writes 4 result groups to out_ch.
//
// Fixes vs. original:
//  - array_index was read before ever being initialized (UB); now starts at 0.
//  - convout.lane[] was accumulated with += from an uninitialized value (UB);
//    now zeroed before each output's accumulation.
__kernel
__attribute__((task))
__attribute__((max_global_work_dim(0)))
void conv_wino(
)
{
group_vec data_vec;
group_vec weight_vec;
group_ch convout;
HTYPE conv_out[169][DIM_4]; // ring buffer of the last 169 per-lane results
group_ch inv_wino_out[4];
uint array_index = 0; // BUGFIX: was uninitialized before first read
for(uint output = 0; output < 39 * 39; output++) {
// BUGFIX: reset the per-output accumulators; the original started the
// += chain from an indeterminate value.
#pragma unroll
for(uint i = 0; i < DIM_4; i++) {
convout.lane[i] = 0;
}
for(unsigned int win_itm_xyz=0; win_itm_xyz< 4096/DIM_3; win_itm_xyz++){
data_vec = read_channel_intel(data_ch);
weight_vec = read_channel_intel(weight_ch);
#pragma unroll
for(uint i = 0; i < DIM_4; i++) {
#pragma unroll
for(uint j = 0; j< DIM_3; j++) {
convout.lane[i] += data_vec.lane[i].data[j] * weight_vec.lane[i].data[j];
}
}
}
#pragma unroll
for(unsigned char ll_t=0; ll_t<DIM_4; ll_t++){
conv_out[array_index][ll_t] = convout.lane[ll_t];
}
if (array_index == 169 - 1){
array_index = 0;
}
else
array_index++;
}
// Inverse transform over fixed positions of the ring buffer.
// NOTE(review): inv_wino_out lanes are QTYPE (char) but the sums are HTYPE
// (int) -- this narrows silently; confirm the dynamic range fits in a char.
#pragma unroll
for(unsigned char ll_t=0; ll_t<DIM_4; ll_t++){
inv_wino_out[0].lane[ll_t] = conv_out[0][ll_t] + conv_out[1][ll_t] + conv_out[2][ll_t] + conv_out[1][ll_t] + conv_out[5][ll_t] + conv_out[9][ll_t] + conv_out[2][ll_t] + conv_out[6][ll_t] + conv_out[10][ll_t];
inv_wino_out[1].lane[ll_t] = conv_out[0][ll_t] + conv_out[5][ll_t] + conv_out[9][ll_t] - conv_out[2][ll_t] - conv_out[6][ll_t] - conv_out[10][ll_t] - conv_out[3][ll_t] - conv_out[7][ll_t] - conv_out[11][ll_t];
// NOTE(review): conv_out[157] is far outside the 0..17 index range used by
// every other term -- looks like a typo (17?); left unchanged pending
// confirmation. It is also why the compiler must keep all 169 rows live.
inv_wino_out[2].lane[ll_t] = conv_out[4][ll_t] + conv_out[9][ll_t] - conv_out[12][ll_t] + conv_out[5][ll_t] - conv_out[157][ll_t] - conv_out[13][ll_t] + conv_out[6][ll_t] - conv_out[10][ll_t] - conv_out[14][ll_t];
inv_wino_out[3].lane[ll_t] = conv_out[5][ll_t] - conv_out[16][ll_t] - conv_out[13][ll_t] - conv_out[6][ll_t] + conv_out[10][ll_t] + conv_out[14][ll_t] - conv_out[7][ll_t] + conv_out[11][ll_t] + conv_out[15][ll_t];
}
for(unsigned char ll_t=0; ll_t<4; ll_t++)
{
write_channel_intel(out_ch, inv_wino_out[ll_t]);
}
}
// Store Data to Global Memory
// Consumer kernel: drains the 4 transformed result groups from out_ch and
// writes them to consecutive slots of `top`.
// Cleanup vs. original: removed three locals (array_index, index_z_item,
// index_z_group) that were declared but never used.
__kernel
__attribute__((task))
__attribute__((max_global_work_dim(0)))
void WriteBack(
__global group_ch *restrict top
)
{
group_ch output;
// Must read exactly as many groups as conv_wino writes (4), or the
// zero-depth channel would deadlock.
for(uint dd = 0; dd< 4; dd++){
output = read_channel_intel(out_ch);
top[dd] = output;
}
}
In the code snippet above, the area report shows that 103 RAM blocks are allocated to the local memory `conv_out`. However, when I check the area analysis by source, no source line is annotated with those 103 RAM blocks. Where does this RAM usage come from?