Altera_Forum
Honored Contributor
8 years ago
Undesired BRAM replication
I am using a single work-item kernel to do matrix multiplication, and my BRAM usage has exploded (estimated at over 100% BRAM usage, while DSP usage is only 16%).
============================================================================================== #define MAT_A_ROWS 128 #define MAT_A_COLS 64 #define MAT_B_ROWS MAT_A_COLS #define MAT_B_COLS 128 #define BLOCK_SIZE 16 __kernel __attribute__((task)) void matrix_mult(__global float *restrict matA, __global float *restrict matB, __global float *restrict matC, ) { __local float __attribute__((num_banks(BLOCK_SIZE), bandwidth(1))) A[BLOCK_SIZE][BLOCK_SIZE]; __local float __attribute__((num_banks(BLOCK_SIZE), bandwidth(BLOCK_SIZE))) B[BLOCK_SIZE][BLOCK_SIZE]; __local C[BLOCK_SIZE][BLOCK_SIZE]; for (int k = 0; k < MAT_A_COLS / BLOCK_SIZE; k++) { for(int i = 0; i < BLOCK_SIZE; i++) { for(int j = 0; j < BLOCK_SIZE; j++) { A[j] = mata[.....];}
}
for(int i = 0; i < block_size; i++) {
for(int j = 0; j < block_size; j++) {
b[j] = matB[.....]; } } for(int i = 0; i < BLOCK_SIZE; i++){ for(int j = 0; j < BLOCK_SIZE; j++) { float running_sum = 0; for(int k = 0; k < BLOCK_SIZE; k ++) { running_sum += A[k] + b[k][j];
}
c[j] += running_sum; } } } ...... //write to C to matC } ====================================================== According to the report, there are 8 threads being pipelined from the loop "for(int k = 0; k < MAT_A_COLS / BLOCK_SIZE; k++)", and thus my A and B are replicated 7 times. Is it possible to prevent the memory from being replicated? Any advice would be greatly appreciated! Lancer Chiang