NSriv2
New Contributor
7 years agoVery poor II with double buffering
Hi,
I am trying to implement a kernel with double buffering, but because of the dependence between reads and writes on the two buffers, altera opencl compiler (aoc) is not able to pipeline the loop efficiently. Any suggestions on how this kind of code should be writen?
__attribute__((max_global_work_dim(0)))
__attribute__((autorun))
__attribute__((num_compute_units(SYS_NUM_COLS,1,1)))
__kernel void B_feeder (
) {
int x_id = get_compute_id(0);
DATA_TYPE B_buffer[2][SYS_NUM_ROWS][RAM_SIZE];
int count[SYS_NUM_ROWS];
int data_recv = 0;
bool done = false;
int first = true;
int r = 0;
int w = 1;
int wcount = 0;
bool blkdone = false;
// Serve requests from crossbar
bool rsuccess[SYS_NUM_ROWS];
bool wsuccess[SYS_NUM_ROWS];
bool busy[SYS_NUM_ROWS];
int ram_addr[SYS_NUM_ROWS];
int length[SYS_NUM_ROWS];
bool blks_done[SYS_NUM_ROWS];
#pragma unroll
for ( int sr = 0; sr < SYS_NUM_ROWS; sr++)
count[sr] = 0;
while (1) {
// Each request channel whose response channel is not busy, try to read a new request
#pragma unroll
for (int i = 0; i < SYS_NUM_ROWS; i++) {
if (!busy[i] && !first) {
// row addr from crossbar. Note that crossbar is responsible for sending the request at i = (row_addr % num_banks) port number only
int addr = read_channel_nb_intel (crossbar_to_B_feeder_channel[i][x_id], &rsuccess[i]);
if (rsuccess[i]) {
busy[i] = true; // ith port is busy to take further requests
ram_addr[i] = (addr / SYS_NUM_ROWS)*((JJ*J_VEC_SZ)/SYS_NUM_COLS); // addr of the requested data in ith RAM
length[i] = 0;
XPRINTF("B_feeder(%d): Got request for bank: %d, addr: %d (row: %d)\n", x_id, i, ram_addr[i], addr);
}
}
}
// Send the response to Crossbar
#pragma unroll
for (int i = 0; i < SYS_NUM_ROWS; i++) {
if (busy[i] && !first) {
DATA_TYPE x = B_buffer[r][i][ram_addr[i] + length[i]];
wsuccess[i] = write_channel_nb_intel(B_feeder_to_crossbar_channel[i][x_id], x);
if (wsuccess[i]) {
XPRINTF("B_feeder(%d): response for bank: %d, with data: %d (addr: %d length: %d)\n", x_id, i, x, ram_addr[i], length[i]);
length[i]++;
// send J/SYS_NUM_COLS number of data elements once a request is received and then unlock the port
if (length[i] == ((JJ*J_VEC_SZ)/SYS_NUM_COLS))
busy[i] = false;
}
}
}
// Is this block complete check from A_loader_side
#pragma unroll
for (int sr = 0; sr < SYS_NUM_ROWS; sr++) {
bool blkdone_succ;
bool _blkdone = read_channel_nb_intel(C_blkdone_channel[sr][x_id], &blkdone_succ);
if (blkdone_succ) {
blks_done[sr] = _blkdone;
XPRINTF("B_feeder(%d): blkdone for PE[%d][%d]\n", x_id, sr, x_id);
}
}
blkdone = true;
#pragma unroll
for (int sr = 0; sr < SYS_NUM_ROWS; sr++)
blkdone = blkdone && blks_done[sr];
if (blkdone) {
#pragma unroll
for (int sr = 0; sr < SYS_NUM_ROWS; sr++)
write_channel_intel (blkdone_ack_channel[sr][x_id], true);
}
bool succ = false;
B_vec x;
if (wcount != KK*JJ/SYS_NUM_COLS)
x = read_channel_nb_intel(B_loader_channel[x_id], &succ);
if (succ) {
int kk = wcount / (JJ/SYS_NUM_COLS);
int sr = (kk % SYS_NUM_ROWS);
XPRINTF("B_feeder(%d): read_channel B_loader data: %d, %d, written to bank: %d addr: %d\n", x_id, x.data[0], x.data[1], sr, count[sr]);
#pragma unroll
for ( int v = 0; v < J_VEC_SZ; v++)
B_buffer[w][sr][count[sr] + v] = x.data[v];
count[sr] += J_VEC_SZ;
wcount++;
}
if ((wcount == KK*JJ/SYS_NUM_COLS) && (blkdone || first)) {
DPRINTF("B_feeder(%d): swapping read and write buffers\n", x_id);
first = false;
wcount = 0;
int temp = r; r = w; w = temp;
blkdone = false;
#pragma unroll
for ( int sr = 0; sr < SYS_NUM_ROWS; sr++)
count[sr] = 0;
#pragma unroll
for (int i = 0; i < SYS_NUM_ROWS; i++)
busy[i] = 0;
#pragma unroll
for (int i = 0; i < SYS_NUM_ROWS; i++)
blks_done[i] = false;
}
}
}Here because of the RAW dependence between line 97 and line 54, the aoc compiler schedules the kernel with II = 64. Note that buffer accesses on line 97 and 54 are on different sides of the double buffer, so there is really no dependency between 97 and 54.
Any suggestions how to improve this?
Thanks,
Nitish