OpenCL FPGA: actual results differ from emulation results
Platform: DE10-nano soc, Intel FPGA SDK for OpenCL 18.1
I am designing a matrix multiplication kernel similar to this one: https://cnugteren.github.io/tutorial/pages/page8.html
It uses 3D work items to basically multiply many sets of two-matrix pairs and output the results.
The emulation passes, while the actual design on-chip didn't. When running on FPGA, only the first few digits match with correct results.
I am thinking maybe it has something to do with the way the emulator emulates multiple work items. But I add barriers whenever I load values to local memory.
Could anyone provide some insights on the difference between multiple work items implementation in emulation and actual design?
#include "config.h" uint8_t gf_mu_x86(uint8_t a, uint8_t b) { uint8_t p = 0; /* the product of the multiplication */ #pragma unroll for (int i=0;i<8;i++){ // if (!(a && b)){ // break; // } if (b & 1) /* if b is odd, then add the corresponding a to p (final product = sum of all a's corresponding to odd b's) */ p ^= a; /* since we're in GF(2^m), addition is an XOR */ if (a & 0x80) /* GF modulo: if a >= 128, then it will overflow when shifted left, so reduce */ a = (a << 1) ^ 0x11D; /* XOR with the primitive polynomial x^8 + x^4 + x^3 + x + 1 (0b1_0001_1011) – you can change it but it must be irreducible */ else a <<= 1; /* equivalent to a*2 */ b >>= 1; /* equivalent to b // 2 */ } return p; } int address_interpretor(int x, int y, int offset, __global const uint8_t* restrict sample_idx){ // use x to find index of required packet (file space) in sample_idx uint8_t file_pkt_idx = sample_idx[offset+x]; // calculate idx of required data in file space return file_pkt_idx*PKT_SIZE + y; } // Use 2D register blocking (further increase in work per thread) __kernel // __attribute__((num_compute_units(CMP_UNIT))) // __attribute__((max_work_group_size(256))) __attribute__((reqd_work_group_size(TSM/WPTM, TSN/WPTN, 1))) // 8, 1, 1 void myGEMM6( __global const uint8_t* restrict A, __global const uint8_t* restrict B, __global uint8_t* restrict C, __global const uint8_t* restrict DEGREE_, __global const uint8_t* restrict sample_idx // cached ) { // Thread identifiers const int tidm = get_local_id(0); // Local row ID (max: TSM/WPTM == RTSM) const int tidn = get_local_id(1); // Local col ID (max: TSN/WPTN == RTSN) const int offsetM = TSM*get_group_id(0); // Work-group offset const int offsetN = TSN*get_group_id(1); // Work-group offset const int batch_id = get_global_id(2); // max: N_BATCH // Local memory to fit a tile of A and B __local uint8_t Asub[TSK][TSM]; __local uint8_t Bsub[TSN][TSK+2]; __local uint8_t degrees[MAX_NUM_BATCH]; // Allocate register space uint8_t Areg; uint8_t Breg[WPTN]; uint8_t acc[WPTM][WPTN]; int deg_offset = 0; uint8_t my_deg; // Initialise the accumulation registers #pragma unroll for (int wm=0; wm<WPTM; wm++) { #pragma unroll for (int wn=0; wn<WPTN; wn++) { acc[wm][wn] = 0; } } // load degrees and calculate offsets if(tidm == 0 && tidn == 0){ #pragma unroll for(int i=0;i<MAX_NUM_BATCH;i++){ degrees[i] = DEGREE_[i]; } } barrier(CLK_LOCAL_MEM_FENCE); for(int i=0;i<batch_id;i++){ deg_offset += degrees[i]; } my_deg = degrees[batch_id]; // Loop over all tiles const int numTiles = my_deg/TSK; barrier(CLK_LOCAL_MEM_FENCE); for(int t=0;t<numTiles;t++){ // Load one tile of A and B into local memory // #pragma unroll for (int la=0; la<LPTA; la++) { int tid = tidn*RTSM + tidm; int id = la*RTSN*RTSM + tid; int row = MOD2(id,TSM); int col = DIV2(id,TSM); // float row_ = MOD2(id,TSM); // float col_ = DIV2(id,TSM); // printf("%f,%f\n",row_,col_); int tiledIndex = TSK*t + col; int A_vec = address_interpretor(tiledIndex, offsetM + row, deg_offset,sample_idx); // Asub[col][row] = A[tiledIndex*PKT_SIZE + offsetM + row]; Asub[col][row] = A[A_vec]; Bsub[row][col]= B[tiledIndex*BATCH_SIZE + offsetN + row + deg_offset*BATCH_SIZE]; } // Synchronise to make sure the tile is loaded barrier(CLK_LOCAL_MEM_FENCE); // Loop over the values of a single tile // #pragma unroll for (int k=0; k<TSK; k++) { // Cache the values of Bsub in registers #pragma unroll for (int wn=0; wn<WPTN; wn++) { int col = tidn + wn*RTSN; Breg[wn] = Bsub[col][k]; } // Perform the computation #pragma unroll for (int wm=0; wm<WPTM; wm++) { int row = tidm + wm*RTSM; Areg = Asub[k][row]; #pragma unroll for (int wn=0; wn<WPTN; wn++) { acc[wm][wn] ^= gf_mu_x86(Areg , Breg[wn]); } } } // Synchronise before loading the next tile barrier(CLK_LOCAL_MEM_FENCE); } // Store the final results in C // #pragma unroll for (int wm=0; wm<WPTM; wm++) { int globalRow = offsetM + tidm + wm*RTSM; #pragma unroll for (int wn=0; wn<WPTN; wn++) { int globalCol = offsetN + tidn + wn*RTSN; C[globalCol*PKT_SIZE + globalRow + batch_id*PKT_SIZE*BATCH_SIZE] = acc[wm][wn]; } } }
I got this problem solved. The access of local memory B_sub is out of range when loading value to it. It works well when I increase the size of B_sub.
It's strange though the emulator didn't complain any run-time error.