Forum Discussion
Altera_Forum
Honored Contributor
8 years ago
=========================================================================
# define TILE_HEIGHT 8 # define TILE_WIDTH 4 # define CVEC 2 typedef struct { float vector[CVEC]; } vec; kernel some_kernel() { ........ __local vec data[TILE_WIDTH]; //weight buffer is two dimensional because whole rows of weight are buffered for reuse __local vec weight[TILE_HEIGHT]; __local float output[TILE_HEIGHT * TILE_WIDTH]; __local float scale[TILE_HEIGHT]; __local float bias[TILE_HEIGHT]; for(uint t = 0; t < (input_dim3 / CVEC) * FILTER_UNROLL_SIZE; t++) { //load data to data buffer for(uint w = 0; w < TILE_WIDTH; w++) { data[w] = read_channel_altera(data_in_ch); } for(uint h = 0; h < TILE_HEIGHT; h++) { weight[h] = read_channel_altera(weight_in_ch); } //comput the matrix tile multiplication using the PE(mac) array # pragma unroll for(uint h = 0; h < TILE_HEIGHT; h++) { vec weight_temp = weight[h]; # pragma unroll for(uint w = 0; w < TILE_WIDTH; w++) { vec data_temp = data[w]; float last_sum; if (t == 0) last_sum = 0; else last_sum = output[h * TILE_WIDTH + w]; # pragma unroll for(uint vv = 0; vv < CVEC; vv++) { last_sum += data_temp.vector[vv] * weight_temp.vector[vv]; } output[h * TILE_WIDTH + w] = last_sum; } } } //declare output data to be enqueued in altara channel lane output_lane; //bias and scale for(uint w = 0; w < TILE_WIDTH; w++) { # pragma unroll 1 for(uint h = 0; h < TILE_HEIGHT; h++) { output_lane.lane_data[h] = output[h * TILE_WIDTH + w] * scale[h] + bias[h]; } write_channel_altera(output_ch, output_lane); } .......... } ============================================================================================ Below is the report about the loop controlled by t ============================================================================================ Block34: Maximum simultaneous execution: 16 threads Local memories are replicated to maximize throughput. See Area analysis of system for exact replication factor. 
===========================================================================================
Below is the report about local memory "data", "weight", "output"
===========================================================================================
conv.cl:135 (data) ALUTs: 0 FFs: 0 BRAMs: 8 DSPs: 0 Local memo...
conv.cl:135 (data): Local memory: Good but replicated. Requested size 32 bytes (rounded up to nearest power of 2), implemented size 96 bytes, replicated 3 times total, stall-free, 1 read and 1 write.
Additional information:
- Replicated 3 times to create private copies for simultaneous execution of 3 threads in the loop containing accesses to the array.

conv.cl:135 (weight) ALUTs: 0 FFs: 0 BRAMs: 16 DSPs: 0 Local memo...
conv.cl:137 (weight): Local memory: Good but replicated. Requested size 64 bytes (rounded up to nearest power of 2), implemented size 192 bytes, replicated 3 times total, stall-free, 1 read and 1 write.
Additional information:
- Replicated 3 times to create private copies for simultaneous execution of 3 threads in the loop containing accesses to the array.

conv.cl:139 (output) ALUTs: 1025 FFs: 8192 BRAMs: 64 DSPs: 0 Local memo...
conv.cl:139 (output): Local memory: Optimal. Requested size 128 bytes (rounded up to nearest power of 2), implemented size 128 bytes, stall-free, 3 reads and 1 write.
Additional information:
- Reducing accesses to exactly one read and one write for all on-chip memory systems may increase overall system performance.
=============================================================================================