Very poor II with double buffering

Question

Hi,I am trying to implement a kernel with double buffering, but because of the dependence between reads and writes on the two buffers, altera opencl compiler (aoc) is not able to pipeline the loop efficiently. Any suggestions on how this kind of code should be writen?__attribute__((max_global_work_dim(0)))
__attribute__((autorun))
__attribute__((num_compute_units(SYS_NUM_COLS,1,1)))
 
__kernel void B_feeder (
) {
   int x_id = get_compute_id(0);
  
   DATA_TYPE B_buffer[2][SYS_NUM_ROWS][RAM_SIZE];
 
   int count[SYS_NUM_ROWS];
   int data_recv = 0;
   bool done = false;
   int first = true;
   int r = 0;
   int w = 1;
   int wcount = 0;
   bool blkdone = false;
 
   // Serve requests from crossbar
 
   bool rsuccess[SYS_NUM_ROWS];
   bool wsuccess[SYS_NUM_ROWS];
   bool busy[SYS_NUM_ROWS];
   int ram_addr[SYS_NUM_ROWS];
   int length[SYS_NUM_ROWS];
 
   bool blks_done[SYS_NUM_ROWS];
 
   #pragma unroll 
   for ( int sr = 0; sr &lt; SYS_NUM_ROWS; sr++)
     count[sr] = 0;
 
   while (1) {
     // Each request channel whose response channel is not busy, try to read a new request
     #pragma unroll
     for (int i = 0; i &lt; SYS_NUM_ROWS; i++) {
       if (!busy[i] &amp;&amp; !first) {
         // row addr from crossbar. Note that crossbar is responsible for sending the request at i = (row_addr % num_banks) port number only
         int addr = read_channel_nb_intel (crossbar_to_B_feeder_channel[i][x_id], &amp;rsuccess[i]);
         if (rsuccess[i]) {
           busy[i] = true; // ith port is busy to take further requests
           ram_addr[i] = (addr / SYS_NUM_ROWS)*((JJ*J_VEC_SZ)/SYS_NUM_COLS); // addr of the requested data in ith RAM
           length[i] = 0;
           XPRINTF("B_feeder(%d): Got request for bank: %d, addr: %d (row: %d)
", x_id, i, ram_addr[i], addr);
         }
       }
     }
 
     // Send the response to Crossbar
     #pragma unroll
     for (int i = 0; i &lt; SYS_NUM_ROWS; i++) {
       if (busy[i] &amp;&amp; !first) {
         DATA_TYPE x = B_buffer[r][i][ram_addr[i] + length[i]];
         wsuccess[i] = write_channel_nb_intel(B_feeder_to_crossbar_channel[i][x_id], x);
         if (wsuccess[i]) {
           XPRINTF("B_feeder(%d): response for bank: %d, with data: %d (addr: %d length: %d)
", x_id, i, x, ram_addr[i], length[i]);
           length[i]++;
           // send J/SYS_NUM_COLS number of data elements once a request is received and then unlock the port
           if (length[i] == ((JJ*J_VEC_SZ)/SYS_NUM_COLS))
             busy[i] = false;
         }
       }
     }
 
     // Is this block complete check from A_loader_side
     #pragma unroll
     for (int sr = 0; sr &lt; SYS_NUM_ROWS; sr++) {
       bool blkdone_succ;
       bool _blkdone = read_channel_nb_intel(C_blkdone_channel[sr][x_id], &amp;blkdone_succ);
       if (blkdone_succ) {
         blks_done[sr] = _blkdone;
         XPRINTF("B_feeder(%d): blkdone for PE[%d][%d]
", x_id, sr, x_id);
       }
     }
 
     blkdone = true;
     #pragma unroll
     for (int sr = 0; sr &lt; SYS_NUM_ROWS; sr++)
       blkdone = blkdone &amp;&amp; blks_done[sr];
 
     if (blkdone) {
       #pragma unroll
       for (int sr = 0; sr &lt; SYS_NUM_ROWS; sr++)
         write_channel_intel (blkdone_ack_channel[sr][x_id], true);
     }
     bool succ = false;
     B_vec x;
     if (wcount != KK*JJ/SYS_NUM_COLS)
       x = read_channel_nb_intel(B_loader_channel[x_id], &amp;succ);
     if (succ) {
       int kk = wcount / (JJ/SYS_NUM_COLS); 
       int sr = (kk % SYS_NUM_ROWS);
       XPRINTF("B_feeder(%d): read_channel B_loader data: %d, %d, written to bank: %d addr: %d
", x_id, x.data[0], x.data[1], sr, count[sr]);
       #pragma unroll
       for ( int v = 0; v &lt; J_VEC_SZ; v++)
         B_buffer[w][sr][count[sr] + v] = x.data[v];
       count[sr] += J_VEC_SZ;
       wcount++;
     }
 
     if ((wcount == KK*JJ/SYS_NUM_COLS) &amp;&amp; (blkdone || first)) {
       DPRINTF("B_feeder(%d): swapping read and write buffers
", x_id);
       first = false;
       wcount = 0;
       int temp = r; r = w; w = temp;
       blkdone = false;
 
       #pragma unroll 
       for ( int sr = 0; sr &lt; SYS_NUM_ROWS; sr++)
         count[sr] = 0;
 
       #pragma unroll
       for (int i = 0; i &lt; SYS_NUM_ROWS; i++)
         busy[i] = 0;
 
       #pragma unroll
       for (int i = 0; i &lt; SYS_NUM_ROWS; i++) 
         blks_done[i] = false;
     }
   }
}Here because of the RAW dependence between line 97 and line 54, the aoc compiler schedules the kernel with II = 64. Note that buffer accesses on line 97 and 54 are on different sides of the double buffer, so there is really no dependency between 97 and 54.Any suggestions how to improve this? Thanks,Nitish

hrz · Answer

Can you provide the appropriate compilation command for your kernel? There are multiple undefined constructs in it.

nsriv2 · Answer

Hi HRZ,

I have added the complete design here: https://drive.google.com/open?id=1ZH-SrxWn-ZrWDdo-63uMMDCI09OnnwV7 . Here is the command line to compile the design:

% aoc -g -v -high-effort -time time.out -time-passes -regtest_mode -fpc -fp-relaxed -report spdm3.cl -o spdm3.aocx

Thanks,

Nitish

hrz · Answer

You have conditional swapping of "w" and "r" on line 106; I am not sure if it is reasonable to expect the compiler to be able to pipeline the operation in this case. What I can recommend is physically separating your buffer into two (e.g. B_buffer_1 and B_buffer_2) and using conditional statements to choose which one to read from or write to (maybe with a flag that gets set or reset in the conditional statement at the end), rather than swapping the side of the buffer.

Forum Discussion

Very poor II with double buffering

3 Replies

Recent Discussions

AI Suite - Spatial IP outputs wrong value

AI Suite - Is it possible to simulate the AI IP?

AI Suite - Streaming from HPS to DLA IP

Agilex 7 I-Series "aocl diagnose acl0" error following OFS

AI Suite - Custom model in the FPGA building process