OpenCL FPGA: actual results differ from emulation results

Question

Platform: DE10-nano soc, Intel FPGA SDK for OpenCL 18.1
I am designing a matrix multiplication kernel similar to this one: https://cnugteren.github.io/tutorial/pages/page8.html 
It uses 3D work items to basically multiply many sets of two-matrix pairs and output the results. 
The emulation passes, while the actual design on-chip didn't. When running on FPGA, only the first few digits match with correct results. 
I am thinking maybe it has something to do with the way the emulator emulates multiple work items. But I add barriers whenever I load values to local memory.
Could anyone provide some insights on the difference between multiple work items implementation in emulation and actual design?

#include "config.h"

uint8_t gf_mu_x86(uint8_t a, uint8_t b) {
	uint8_t p = 0; /* the product of the multiplication */
    #pragma unroll
	for (int i=0;i&lt;8;i++){
            // if (!(a &amp;&amp; b)){
            //         break;
            //     }
            if (b &amp; 1) /* if b is odd, then add the corresponding a to p (final product = sum of all a's corresponding to odd b's) */
                p ^= a; /* since we're in GF(2^m), addition is an XOR */

if (a &amp; 0x80) /* GF modulo: if a &gt;= 128, then it will overflow when shifted left, so reduce */
                a = (a &lt;&lt; 1) ^ 0x11D; /* XOR with the primitive polynomial x^8 + x^4 + x^3 + x + 1 (0b1_0001_1011) – you can change it but it must be irreducible */
            else
                a &lt;&lt;= 1; /* equivalent to a*2 */
            b &gt;&gt;= 1; /* equivalent to b // 2 */
            
	}
	return p;
}

int address_interpretor(int x, int y, int offset, __global const uint8_t* restrict sample_idx){
    // use x to find index of required packet (file space) in sample_idx    
    uint8_t file_pkt_idx = sample_idx[offset+x];
    // calculate idx of required data in file space
    return file_pkt_idx*PKT_SIZE + y;
}

// Use 2D register blocking (further increase in work per thread)
__kernel 
// __attribute__((num_compute_units(CMP_UNIT)))
// __attribute__((max_work_group_size(256))) 
__attribute__((reqd_work_group_size(TSM/WPTM, TSN/WPTN, 1)))  // 8, 1, 1
void myGEMM6(
            __global const uint8_t* restrict A,
            __global const uint8_t* restrict B,
            __global uint8_t* restrict C,
            __global const uint8_t* restrict DEGREE_,
            __global const uint8_t* restrict sample_idx // cached
      
                      ) {

// Thread identifiers
    const int tidm = get_local_id(0); // Local row ID (max: TSM/WPTM == RTSM)
    const int tidn = get_local_id(1); // Local col ID (max: TSN/WPTN == RTSN)
    const int offsetM = TSM*get_group_id(0); // Work-group offset
    const int offsetN = TSN*get_group_id(1); // Work-group offset
    const int batch_id = get_global_id(2); // max: N_BATCH

// Local memory to fit a tile of A and B
    __local uint8_t Asub[TSK][TSM];
    __local uint8_t Bsub[TSN][TSK+2];
    __local uint8_t degrees[MAX_NUM_BATCH];
    // Allocate register space
    uint8_t Areg;
    uint8_t Breg[WPTN];
    uint8_t acc[WPTM][WPTN];
    int deg_offset = 0;
    uint8_t my_deg;

// Initialise the accumulation registers
    #pragma unroll
    for (int wm=0; wm&lt;WPTM; wm++) {
        #pragma unroll
        for (int wn=0; wn&lt;WPTN; wn++) {
            acc[wm][wn] = 0;
        }
    }
    
    // load degrees and calculate offsets                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                               
    if(tidm == 0 &amp;&amp; tidn == 0){
        #pragma unroll
        for(int i=0;i&lt;MAX_NUM_BATCH;i++){
            degrees[i] = DEGREE_[i];
        }
    }                                                                                     
    barrier(CLK_LOCAL_MEM_FENCE);                                                                                                                                                                                                                                      
    for(int i=0;i&lt;batch_id;i++){                                                                                             
        deg_offset += degrees[i];                                                                                            
    }
    my_deg = degrees[batch_id];                                                                                                                        
    
    // Loop over all tiles
    const int numTiles = my_deg/TSK;
    barrier(CLK_LOCAL_MEM_FENCE);
  
    for(int t=0;t&lt;numTiles;t++){

// Load one tile of A and B into local memory
        // #pragma unroll
        for (int la=0; la&lt;LPTA; la++) {
            int tid = tidn*RTSM + tidm;
            int id = la*RTSN*RTSM + tid;
            int row = MOD2(id,TSM);
            int col = DIV2(id,TSM);
            // float row_ = MOD2(id,TSM);
            // float col_ = DIV2(id,TSM);
            // printf("%f,%f
",row_,col_);
            int tiledIndex = TSK*t + col;
            int A_vec = address_interpretor(tiledIndex, offsetM + row, deg_offset,sample_idx);
            // Asub[col][row] = A[tiledIndex*PKT_SIZE + offsetM + row];
            Asub[col][row] = A[A_vec];
            Bsub[row][col]= B[tiledIndex*BATCH_SIZE + offsetN + row + deg_offset*BATCH_SIZE];
        }

// Synchronise to make sure the tile is loaded
        barrier(CLK_LOCAL_MEM_FENCE);

// Loop over the values of a single tile
        // #pragma unroll
        for (int k=0; k&lt;TSK; k++) {
            // Cache the values of Bsub in registers
            #pragma unroll
            for (int wn=0; wn&lt;WPTN; wn++) {
                int col = tidn + wn*RTSN;
                Breg[wn] = Bsub[col][k];
            }

// Perform the computation
            #pragma unroll
            for (int wm=0; wm&lt;WPTM; wm++) {
                int row = tidm + wm*RTSM;
                Areg = Asub[k][row];
                #pragma unroll
                for (int wn=0; wn&lt;WPTN; wn++) {
                    acc[wm][wn] ^= gf_mu_x86(Areg , Breg[wn]);
                }
            }
        }

// Synchronise before loading the next tile
        barrier(CLK_LOCAL_MEM_FENCE);
    }

// Store the final results in C
    // #pragma unroll
    for (int wm=0; wm&lt;WPTM; wm++) {
        int globalRow = offsetM + tidm + wm*RTSM;
        #pragma unroll
        for (int wn=0; wn&lt;WPTN; wn++) {
            int globalCol = offsetN + tidn + wn*RTSN; 
            C[globalCol*PKT_SIZE + globalRow + batch_id*PKT_SIZE*BATCH_SIZE] = acc[wm][wn];
          
        }
    }
    
}

jackgreen · Accepted Answer

I got this problem solved. The access of local memory B_sub is out of range when loading value to it. It works well when I increase the size of B_sub.
It's strange though the emulator didn't complain any run-time error.

Forum Discussion

OpenCL FPGA: actual results differ from emulation results

2 Replies

Recent Discussions

Agilex 7 I-Series "aocl diagnose acl0" error following OFS

AI Suite - core_hw.tcl error

How does the FPGA AI Suite utilize Agilex 5 DSP Blocks?

AI Suite - Why does the Sequential IP not take a model argument?

AI Suite - Spatial IP outputs wrong value