oneAPI FPGA hardware error
When I try to run the oneAPI samples on the DevCloud, I get the errors below. I could run them before, and I didn't make any changes.

quartus_sh: error while loading shared libraries: libncurses.so.5: cannot open shared object file: No such file or directory
Error: The patches required to compile for the target board (0.05dcp) is not installed for the following Quartus: /glob/development-tools/versions/oneapi/2021.3/inteloneapi/intelfpgadpcpp/2021.3.0/QuartusPrimePro/19.2/quartus/bin/quartus_sh
dpcpp: error: fpga compiler command failed with exit code 1 (use -v to see invocation)
make: *** [Makefile.fpga:30: vector-add-buffers.fpga] Error 1

OpenCL FPGA incremental compilation error
Hello, I'm trying to compile a kernel (.cl) file on the FPGA DevCloud (node s001-n137: Arria 10 1.2) using the -incremental option for the first time, but the compilation throws this error:

Error (23031): Evaluation of Tcl script a10_partial_reconfig/flow.tcl unsuccessful

The command line that I used for the compilation is:

aoc -profile -incremental input_file.cl -o output_file.aocx -board=pac_a10

Without the -incremental option the compilation succeeds. What could be causing this error for incremental compilation?

AOC Offline Compiler won't run quartus_fit command
I am trying to compile a simple CL kernel for my FPGA Cyclone V PCIe card. No matter what I do, the compiler always returns an error after synthesis completes:

Error (11720): Run Analysis and Synthesis (quartus_map) with top-level entity name "top" before running Fitter (quartus_fit)

Looking at the log, I see that the quartus_map command was issued to start the process, so I don't understand what the problem is. Thanks to all who can assist!

OpenCL FPGA: actual results differ from emulation results
Platform: DE10-Nano SoC, Intel FPGA SDK for OpenCL 18.1

I am designing a matrix multiplication kernel similar to this one: https://cnugteren.github.io/tutorial/pages/page8.html
It uses a 3D NDRange of work-items to multiply many matrix pairs and output the results. The emulation passes, but the actual design on-chip does not: when running on the FPGA, only the first few digits match the correct results. I suspect it has something to do with the way the emulator executes multiple work-items, but I add a barrier whenever I load values into local memory. Could anyone provide some insight into how multiple work-items behave differently in emulation versus the actual design?

#include "config.h"

uint8_t gf_mu_x86(uint8_t a, uint8_t b) {
    uint8_t p = 0; /* the product of the multiplication */
    #pragma unroll
    for (int i=0; i<8; i++) {
        // if (!(a && b)){
        //     break;
        // }
        if (b & 1)               /* if b is odd, then add the corresponding a to p (final product = sum of all a's corresponding to odd b's) */
            p ^= a;              /* since we're in GF(2^m), addition is an XOR */
        if (a & 0x80)            /* GF modulo: if a >= 128, then it will overflow when shifted left, so reduce */
            a = (a << 1) ^ 0x11D; /* XOR with the primitive polynomial x^8 + x^4 + x^3 + x + 1 (0b1_0001_1011) – you can change it but it must be irreducible */
        else
            a <<= 1;             /* equivalent to a*2 */
        b >>= 1;                 /* equivalent to b // 2 */
    }
    return p;
}

int address_interpretor(int x, int y, int offset, __global const uint8_t* restrict sample_idx) {
    // use x to find index of required packet (file space) in sample_idx
    uint8_t file_pkt_idx = sample_idx[offset + x];
    // calculate idx of required data in file space
    return file_pkt_idx*PKT_SIZE + y;
}

// Use 2D register blocking (further increase in work per thread)
__kernel
// __attribute__((num_compute_units(CMP_UNIT)))
// __attribute__((max_work_group_size(256)))
__attribute__((reqd_work_group_size(TSM/WPTM, TSN/WPTN, 1))) // 8, 1, 1
void myGEMM6(
    __global const uint8_t* restrict A,
    __global const uint8_t* restrict B,
    __global uint8_t* restrict C,
    __global const uint8_t* restrict DEGREE_,
    __global const uint8_t* restrict sample_idx // cached
) {
    // Thread identifiers
    const int tidm = get_local_id(0);        // Local row ID (max: TSM/WPTM == RTSM)
    const int tidn = get_local_id(1);        // Local col ID (max: TSN/WPTN == RTSN)
    const int offsetM = TSM*get_group_id(0); // Work-group offset
    const int offsetN = TSN*get_group_id(1); // Work-group offset
    const int batch_id = get_global_id(2);   // max: N_BATCH

    // Local memory to fit a tile of A and B
    __local uint8_t Asub[TSK][TSM];
    __local uint8_t Bsub[TSN][TSK+2];
    __local uint8_t degrees[MAX_NUM_BATCH];

    // Allocate register space
    uint8_t Areg;
    uint8_t Breg[WPTN];
    uint8_t acc[WPTM][WPTN];
    int deg_offset = 0;
    uint8_t my_deg;

    // Initialise the accumulation registers
    #pragma unroll
    for (int wm=0; wm<WPTM; wm++) {
        #pragma unroll
        for (int wn=0; wn<WPTN; wn++) {
            acc[wm][wn] = 0;
        }
    }

    // load degrees and calculate offsets
    if (tidm == 0 && tidn == 0) {
        #pragma unroll
        for (int i=0; i<MAX_NUM_BATCH; i++) {
            degrees[i] = DEGREE_[i];
        }
    }
    barrier(CLK_LOCAL_MEM_FENCE);
    for (int i=0; i<batch_id; i++) {
        deg_offset += degrees[i];
    }
    my_deg = degrees[batch_id];

    // Loop over all tiles
    const int numTiles = my_deg/TSK;
    barrier(CLK_LOCAL_MEM_FENCE);
    for (int t=0; t<numTiles; t++) {

        // Load one tile of A and B into local memory
        // #pragma unroll
        for (int la=0; la<LPTA; la++) {
            int tid = tidn*RTSM + tidm;
            int id = la*RTSN*RTSM + tid;
            int row = MOD2(id,TSM);
            int col = DIV2(id,TSM);
            // float row_ = MOD2(id,TSM);
            // float col_ = DIV2(id,TSM);
            // printf("%f,%f\n",row_,col_);
            int tiledIndex = TSK*t + col;
            int A_vec = address_interpretor(tiledIndex, offsetM + row, deg_offset, sample_idx);
            // Asub[col][row] = A[tiledIndex*PKT_SIZE + offsetM + row];
            Asub[col][row] = A[A_vec];
            Bsub[row][col] = B[tiledIndex*BATCH_SIZE + offsetN + row + deg_offset*BATCH_SIZE];
        }

        // Synchronise to make sure the tile is loaded
        barrier(CLK_LOCAL_MEM_FENCE);

        // Loop over the values of a single tile
        // #pragma unroll
        for (int k=0; k<TSK; k++) {

            // Cache the values of Bsub in registers
            #pragma unroll
            for (int wn=0; wn<WPTN; wn++) {
                int col = tidn + wn*RTSN;
                Breg[wn] = Bsub[col][k];
            }

            // Perform the computation
            #pragma unroll
            for (int wm=0; wm<WPTM; wm++) {
                int row = tidm + wm*RTSM;
                Areg = Asub[k][row];
                #pragma unroll
                for (int wn=0; wn<WPTN; wn++) {
                    acc[wm][wn] ^= gf_mu_x86(Areg, Breg[wn]);
                }
            }
        }

        // Synchronise before loading the next tile
        barrier(CLK_LOCAL_MEM_FENCE);
    }

    // Store the final results in C
    // #pragma unroll
    for (int wm=0; wm<WPTM; wm++) {
        int globalRow = offsetM + tidm + wm*RTSM;
        #pragma unroll
        for (int wn=0; wn<WPTN; wn++) {
            int globalCol = offsetN + tidn + wn*RTSN;
            C[globalCol*PKT_SIZE + globalRow + batch_id*PKT_SIZE*BATCH_SIZE] = acc[wm][wn];
        }
    }
}
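For illustration only, here is a minimal kernel (not taken from the code above; all names are hypothetical) showing the kind of local-memory hazard that can pass in emulation yet fail in hardware: if the emulator happens to execute the work-items of a group one after another in index order, a neighbour's slot is already written by the time it is read, while in the parallel, pipelined NDRange hardware the read can race ahead of the write.

// Hypothetical, minimal illustration -- not part of the kernel above.
// Each work-item writes its own local slot and then reads its left
// neighbour's slot without a barrier in between.
__kernel __attribute__((reqd_work_group_size(64, 1, 1)))
void neighbour_race(__global const int* restrict in,
                    __global int* restrict out)
{
    __local int scratch[64];
    const int lid = get_local_id(0);
    const int gid = get_global_id(0);

    scratch[lid] = in[gid];

    // A barrier(CLK_LOCAL_MEM_FENCE) is deliberately missing here.
    // If work-items run one at a time in increasing index order (as a
    // sequential emulator may do), scratch[lid - 1] already holds its
    // final value and the output looks correct. On the FPGA, where
    // work-items run concurrently, the neighbour's write may not have
    // completed yet and a stale value is read.
    const int left = (lid > 0) ? (lid - 1) : 0;
    out[gid] = scratch[left];
}

This is only one possible source of emulator/hardware divergence, not a diagnosis of the kernel above.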
Float 32 from IEEE-754 representation
I have an RTL module that converts a fixed-point number to IEEE-754. This RTL module is used as an OpenCL library and returns a 32-bit integer holding the sign, exponent, and mantissa. Is it possible, in OpenCL, to cast an integer that contains the IEEE-754 representation into a float32? How could I achieve this? Thanks.
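For the reinterpretation itself, the standard OpenCL as_float() built-in does this: it reinterprets the bits of a 32-bit integer as an IEEE-754 float without performing a numeric conversion. Below is a minimal sketch; the RTL library call name is a hypothetical placeholder for whatever the library actually exposes.

// Hypothetical placeholder for the RTL library function that returns the
// packed sign/exponent/mantissa as a 32-bit integer (name assumed).
uint fixed_to_ieee754_bits(int fixed_val);

__kernel void convert(__global const int* restrict in,
                      __global float* restrict out)
{
    const int gid = get_global_id(0);

    // 32-bit word holding the IEEE-754 bit pattern produced by the RTL block.
    const uint bits = fixed_to_ieee754_bits(in[gid]);

    // as_float() reinterprets the bit pattern; a plain (float) cast would
    // instead convert the integer *value* and give the wrong result.
    out[gid] = as_float(bits);
}

On the host side, the equivalent trick is copying the four bytes into a float (for example with memcpy), but inside the kernel as_float() is the idiomatic route.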