Forum Discussion
mvemp
New Contributor
// DIM_3: number of elements fetched/multiplied together per group (vector width).
// DIM_4: number of parallel output lanes.
#define DIM_3 2
#define DIM_4 8

// Intel FPGA OpenCL kernel-to-kernel FIFO channels.
#pragma OPENCL EXTENSION cl_intel_channels : enable

typedef char QTYPE; // data / weight element type
typedef int HTYPE;  // accumulator type (wider than QTYPE to hold sums)

// A group of DIM_3 elements read together from global memory.
typedef struct {
QTYPE data[DIM_3];
} group_data;

// One group_data per lane: the unit streamed through data_ch / weight_ch.
typedef struct {
group_data lane[DIM_4];
} group_vec;

// One scalar result per lane: the unit streamed through out_ch.
typedef struct {
QTYPE lane[DIM_4];
} group_ch;

// depth(0): zero-buffer channels -- writer and reader must rendezvous,
// so producer and consumer kernels run in lockstep.
channel group_vec data_ch __attribute__((depth(0)));
channel group_vec weight_ch __attribute__((depth(0)));
channel group_ch out_ch __attribute__((depth(0)));
// Autorun-style single-work-item producer: streams the input feature map
// from global memory and broadcasts each group of DIM_3 elements to all
// DIM_4 lanes of data_ch.
// NOTE(review): the matching consumer (conv_wino) reads data_ch the same
// total number of times (39*39 * 4096/DIM_3), so the channels balance.
__kernel
__attribute__((task))
__attribute__((max_global_work_dim(0)))
void fetch_data(
__global group_data *restrict bottom
)
{
    const unsigned int trip_count = 39 * 39 * 4096 / (DIM_3);
    for (unsigned int idx = 0; idx < trip_count; idx++) {
        group_data fetched = bottom[idx];
        group_vec broadcast;
        #pragma unroll
        for (unsigned char lane = 0; lane < DIM_4; lane++) {
            broadcast.lane[lane] = fetched;
        }
        write_channel_intel(data_ch, broadcast);
    }
}
// Single-work-item producer: streams weight vectors (one group_vec per
// iteration, DIM_4 lanes x DIM_3 elements) into weight_ch.
// NOTE(review): the pointer is declared volatile here while fetch_data's is
// not -- presumably to bypass the load cache; confirm this asymmetry is
// intentional.
__kernel
__attribute__((task))
__attribute__((max_global_work_dim(0)))
void fetch_weights(
__global volatile group_vec *restrict weights
)
{
    const unsigned int trip_count = 39 * 39 * 4096 / (DIM_3);
    for (unsigned int idx = 0; idx < trip_count; idx++) {
        group_vec w = weights[idx];
        write_channel_intel(weight_ch, w);
    }
}
// Consumer kernel: for each of the 39*39 outputs, accumulates a dot product
// over 4096/DIM_3 channel reads into DIM_4 lanes, stores the result into a
// 169-entry ring buffer, then applies a Winograd-style inverse transform and
// writes 4 result groups to out_ch.
//
// Fixes vs. original:
//  - array_index was read before ever being initialized (UB); now starts at 0.
//  - convout.lane[] was accumulated with += from an uninitialized value (UB);
//    now zeroed before each output's accumulation.
__kernel
__attribute__((task))
__attribute__((max_global_work_dim(0)))
void conv_wino(
)
{
group_vec data_vec;
group_vec weight_vec;
group_ch convout;
HTYPE conv_out[169][DIM_4]; // ring buffer of the last 169 per-lane results
group_ch inv_wino_out[4];
uint array_index = 0; // BUGFIX: was uninitialized before first read
for(uint output = 0; output < 39 * 39; output++) {
// BUGFIX: reset the per-output accumulators; the original started the
// += chain from an indeterminate value.
#pragma unroll
for(uint i = 0; i < DIM_4; i++) {
convout.lane[i] = 0;
}
for(unsigned int win_itm_xyz=0; win_itm_xyz< 4096/DIM_3; win_itm_xyz++){
data_vec = read_channel_intel(data_ch);
weight_vec = read_channel_intel(weight_ch);
#pragma unroll
for(uint i = 0; i < DIM_4; i++) {
#pragma unroll
for(uint j = 0; j< DIM_3; j++) {
convout.lane[i] += data_vec.lane[i].data[j] * weight_vec.lane[i].data[j];
}
}
}
#pragma unroll
for(unsigned char ll_t=0; ll_t<DIM_4; ll_t++){
conv_out[array_index][ll_t] = convout.lane[ll_t];
}
if (array_index == 169 - 1){
array_index = 0;
}
else
array_index++;
}
// Inverse transform over fixed positions of the ring buffer.
// NOTE(review): inv_wino_out lanes are QTYPE (char) but the sums are HTYPE
// (int) -- this narrows silently; confirm the dynamic range fits in a char.
#pragma unroll
for(unsigned char ll_t=0; ll_t<DIM_4; ll_t++){
inv_wino_out[0].lane[ll_t] = conv_out[0][ll_t] + conv_out[1][ll_t] + conv_out[2][ll_t] + conv_out[1][ll_t] + conv_out[5][ll_t] + conv_out[9][ll_t] + conv_out[2][ll_t] + conv_out[6][ll_t] + conv_out[10][ll_t];
inv_wino_out[1].lane[ll_t] = conv_out[0][ll_t] + conv_out[5][ll_t] + conv_out[9][ll_t] - conv_out[2][ll_t] - conv_out[6][ll_t] - conv_out[10][ll_t] - conv_out[3][ll_t] - conv_out[7][ll_t] - conv_out[11][ll_t];
// NOTE(review): conv_out[157] is far outside the 0..17 index range used by
// every other term -- looks like a typo (17?); left unchanged pending
// confirmation. It is also why the compiler must keep all 169 rows live.
inv_wino_out[2].lane[ll_t] = conv_out[4][ll_t] + conv_out[9][ll_t] - conv_out[12][ll_t] + conv_out[5][ll_t] - conv_out[157][ll_t] - conv_out[13][ll_t] + conv_out[6][ll_t] - conv_out[10][ll_t] - conv_out[14][ll_t];
inv_wino_out[3].lane[ll_t] = conv_out[5][ll_t] - conv_out[16][ll_t] - conv_out[13][ll_t] - conv_out[6][ll_t] + conv_out[10][ll_t] + conv_out[14][ll_t] - conv_out[7][ll_t] + conv_out[11][ll_t] + conv_out[15][ll_t];
}
for(unsigned char ll_t=0; ll_t<4; ll_t++)
{
write_channel_intel(out_ch, inv_wino_out[ll_t]);
}
}
// Store Data to Global Memory
// Consumer kernel: drains the 4 transformed result groups from out_ch and
// writes them to consecutive slots of `top`.
// Cleanup vs. original: removed three locals (array_index, index_z_item,
// index_z_group) that were declared but never used.
__kernel
__attribute__((task))
__attribute__((max_global_work_dim(0)))
void WriteBack(
__global group_ch *restrict top
)
{
group_ch output;
// Must read exactly as many groups as conv_wino writes (4), or the
// zero-depth channel would deadlock.
for(uint dd = 0; dd< 4; dd++){
output = read_channel_intel(out_ch);
top[dd] = output;
}
}
In the code snippet above, the area report shows that 103 RAM blocks are allocated to the local memory `conv_out`. However, when I check the area analysis by source, no source line is annotated with those 103 RAM blocks. Where does this RAM usage come from?