Obtaining different results while executing a kernel on GPU or FPGA

Question

Hello,  I have got another question concerning OpenCL.  My problem: When I execute the same kernel on GPU and on FPGA (pre-compiled binary), I get different results when I read the buffer afterwards.  Are there any device specific operations that can result in a different output?  The kernel code: # pragma OPENCL EXTENSION cl_amd_printf : enable
/* One output component written by the `update` kernel.
 * NOTE(review): the struct is declared packed and is accessed through a
 * __global pointer, so the host-side definition presumably has to use the
 * same packed layout on every device/compiler -- confirm against the host code. */
struct __attribute__ ((packed)) gm_component {
           float w;    /* weight; the kernel stores its "calculate weight" result here */
           float4 m;   /* mean; the kernel stores its "calculate mean" result here */
           float16 P;  /* covariance; filled from matrixMult4x4f, which reads a float16 as a 4x4 matrix with rows s0123, s4567, s89ab, scdef */
       };
/* One input component read by the `update` kernel.
 * NOTE(review): the struct is declared packed and is accessed through a
 * __global pointer, so the host-side definition presumably has to use the
 * same packed layout on every device/compiler -- confirm against the host code. */
struct __attribute__ ((packed)) gm_component_survive {
        float w;     /* weight; multiplied into the output weight by the kernel */
        float4 m;    /* mean; the kernel adds a correction to it to form the output mean */
        float16 P;   /* covariance; right-hand operand of the 4x4 product that forms the output covariance */
        float2 eta;  /* subtracted from the measurement Zk inside the kernel */
        float4 S;    /* treated by the kernel as a 2x2 matrix (s0 s1 / s2 s3) that it inverts via its determinant */
        float8 K;    /* read by the kernel as four float2 rows: lo.lo, lo.hi, hi.lo, hi.hi */
    };    
       
// 4x4 matrix product M*N.
// Both operands and the result are float16 values read as 4x4 matrices whose
// rows are s0123, s4567, s89ab and scdef.
// unit4 is multiplied into every sum through dot(); the kernel in this file
// passes (1,1,1,1), in which case dot(r*c, unit4) is the sum of the
// element-wise products of row r and column c.
float16 matrixMult4x4f(float16 M, float16 N, float4 unit4)
{
    // Rows of the left operand.
    const float4 row0 = M.s0123;
    const float4 row1 = M.s4567;
    const float4 row2 = M.s89ab;
    const float4 row3 = M.scdef;

    // Columns of the right operand (stride-4 swizzles).
    const float4 col0 = N.s048c;
    const float4 col1 = N.s159d;
    const float4 col2 = N.s26ae;
    const float4 col3 = N.s37bf;

    // Element (i, j) of the result combines row i of M with column j of N.
    return (float16)(
        dot(row0 * col0, unit4), dot(row0 * col1, unit4), dot(row0 * col2, unit4), dot(row0 * col3, unit4),
        dot(row1 * col0, unit4), dot(row1 * col1, unit4), dot(row1 * col2, unit4), dot(row1 * col3, unit4),
        dot(row2 * col0, unit4), dot(row2 * col1, unit4), dot(row2 * col2, unit4), dot(row2 * col3, unit4),
        dot(row3 * col0, unit4), dot(row3 * col1, unit4), dot(row3 * col2, unit4), dot(row3 * col3, unit4));
}
// OpenCL kernel: for one (measurement, component) pair, computes the weight,
// mean and covariance of one output component.
//
// Work-item mapping: get_global_id(0) selects the measurement (guarded by
// `size`), get_global_id(1) selects the component (guarded by
// `updateMixtureSize`). Work-items outside those bounds do nothing.
//
// Parameters:
//   predict_mixture   - input components (read only by this kernel)
//   Hk                - 2x4 matrix; Hk.lo is the first row, Hk.hi the second
//   Zk                - measurements, one float2 per measurement
//   update_mixture    - output components
//   size              - upper bound for the measurement index
//   pr_dk             - scalar factor multiplied into every output weight
//   updateMixtureSize - upper bound for the component index
//
// NOTE(review): the posted listing used `&update_mixture`, `&predict_mixture`
// and the bare pointer `Zk`, i.e. it never selected an element, so every
// work-item addressed the same (invalid) location. The element indexing below
// is an assumption derived from the bounds check -- confirm it against the
// host-side buffer layout:
//   predict_mixture[survive_index]
//   Zk[zk_index]
//   update_mixture[zk_index * updateMixtureSize + survive_index]
__kernel void
update(__global struct gm_component_survive * restrict predict_mixture,
       float8 Hk, __global float2 *Zk,
       __global struct gm_component * restrict update_mixture, int size, float pr_dk, int updateMixtureSize)
{
    const int zk_index = get_global_id(0);
    const int survive_index = get_global_id(1);

    // The NDRange may be padded beyond the real problem size.
    if (zk_index >= size || survive_index >= updateMixtureSize) {
        return;
    }

    __global const struct gm_component_survive *pm = &predict_mixture[survive_index];
    __global struct gm_component *um =
        &update_mixture[zk_index * updateMixtureSize + survive_index];
    const float2 z = Zk[zk_index];

    // All-ones vectors: dot(x * y, unitN) sums the element-wise products.
    const float2 unit2 = (float2)(1.f, 1.f);
    const float4 unit4 = (float4)(1.f, 1.f, 1.f, 1.f);

    // ---- Multivariate Gaussian evaluation ---------------------------------
    // Difference between the measurement and the component's eta.
    const float2 miu = z - pm->eta;

    // Inverse of the 2x2 matrix S via its adjugate and determinant.
    // No guard against a zero determinant (same as the original listing).
    const float4 S = pm->S;
    const float determinant = S.s3 * S.s0 - S.s2 * S.s1;
    const float4 inv_covariance = (float4)(S.s3, -S.s1, -S.s2, S.s0) / determinant;

    // miu^T * inv(S) * miu, accumulated as two partial terms that are summed
    // inside the exponent below.
    float2 quad = (float2)(dot(miu * inv_covariance.even, unit2),
                           dot(miu * inv_covariance.odd, unit2));
    quad = quad * miu;

    // Normalisation: sqrt((2*pi)^2 * det), with (2*pi)^2 ~= 39.4784.
    // The literal carries an `f` suffix so the expression stays in single
    // precision on every device instead of involving a double constant.
    const float denom = sqrt(39.4784f * determinant);

    // ---- Weight ------------------------------------------------------------
    // native_exp has implementation-defined accuracy, so this term may differ
    // between devices.
    um->w = pr_dk * pm->w * native_exp(-0.5f * dot(quad, unit2)) / denom;

    // ---- Mean --------------------------------------------------------------
    const float4 m = pm->m;
    const float2 projected = (float2)(dot(Hk.lo * m, unit4), dot(Hk.hi * m, unit4));
    const float2 residual = z - projected;

    // The four float2 rows of K.
    const float2 k0 = pm->K.lo.lo;
    const float2 k1 = pm->K.lo.hi;
    const float2 k2 = pm->K.hi.lo;
    const float2 k3 = pm->K.hi.hi;

    um->m = m + (float4)(dot(k0 * residual, unit2), dot(k1 * residual, unit2),
                         dot(k2 * residual, unit2), dot(k3 * residual, unit2));

    // ---- Covariance: (I - K*Hk) * P ------------------------------------------
    // The four float2 columns of Hk.
    const float2 h0 = Hk.s04;
    const float2 h1 = Hk.s15;
    const float2 h2 = Hk.s26;
    const float2 h3 = Hk.s37;

    const float16 i_minus_kh = (float16)(
        1 - dot(k0 * h0, unit2),    -dot(k0 * h1, unit2),    -dot(k0 * h2, unit2),    -dot(k0 * h3, unit2),
           -dot(k1 * h0, unit2), 1 - dot(k1 * h1, unit2),    -dot(k1 * h2, unit2),    -dot(k1 * h3, unit2),
           -dot(k2 * h0, unit2),    -dot(k2 * h1, unit2), 1 - dot(k2 * h2, unit2),    -dot(k2 * h3, unit2),
           -dot(k3 * h0, unit2),    -dot(k3 * h1, unit2),    -dot(k3 * h2, unit2), 1 - dot(k3 * h3, unit2));

    um->P = matrixMult4x4f(i_minus_kh, pm->P, unit4);
}
  Any hints will be appreciated.  Tobias

altera_forum · Answer

How different are the results? If the results are only different in the last few digits of the numbers, then it is probably caused by some rounding difference. Note that if you use --fpc or --fp-relaxed for FPGA compilation, output of floating-point operations will be different.

altera_forum · Answer

Thank you for your answer!

The outputs are very different. As an example, look at the following outputs obtained from the FPGA and the GPU:

FPGA:

w: 0

m: 0.1 51410 0.1 0

P: 8.25 13.5 0 0

-879.562 -1435.38 -0.0762125 -0.124711

0 0 8.25 13.5

0 0 13.5 26

w: 0

m: 0.1 56227.3 0.1 0

P: 8.25 13.5 0 0

-879.562 -1435.38 -0.0762125 -0.124711

0 0 8.25 13.5

0 0 13.5 26

w: 0

m: 0.1 95197.5 0.1 0

P: 8.25 13.5 0 0

-879.562 -1435.38 -0.0762125 -0.124711

0 0 8.25 13.5

0 0 13.5 26

w: 0

m: 0.1 85130.8 0.1 0

P: 8.25 13.5 0 0

-879.562 -1435.38 -0.0762125 -0.124711

0 0 8.25 13.5

0 0 13.5 26

w: 0

m: 0.1 68243.1 0.1 0

P: 8.25 13.5 0 0

-879.562 -1435.38 -0.0762125 -0.124711

0 0 8.25 13.5

0 0 13.5 26

w: 0

m: 0 0 0 0

P: 0 0 0 0

0 0 0 0

GPU:

w: 0

m: 4.48707 0 2.19607 0

P: 8.17379 13.3753 0 0

13.5 26 0 0

0 0 8.17379 13.3753

0 0 13.5 26

w: 0

m: 4.89815 0 2.33002 0

P: 8.17379 13.3753 0 0

13.5 26 0 0

0 0 8.17379 13.3753

0 0 13.5 26

w: 0

m: 8.22379 0 2.51016 0

P: 8.17379 13.3753 0 0

13.5 26 0 0

0 0 8.17379 13.3753

0 0 13.5 26

w: 0

m: 7.36467 0 3.0552 0

P: 8.17379 13.3753 0 0

13.5 26 0 0

0 0 8.17379 13.3753

0 0 13.5 26

w: 0

m: 5.92356 0 2.4224 0

P: 8.17379 13.3753 0 0

13.5 26 0 0

0 0 8.17379 13.3753

0 0 13.5 26

w: 0

m: 0 0 0 0

P: 0 0 0 0

0 0 0 0

altera_forum · Answer

Thank you for your answer!  The results are quite different. A sample is shown below:  
FPGA:
w: 	0
m: 	0.1	51410	0.1	0	
P: 	8.25	13.5	0	0	
	-879.562	-1435.38	-0.0762125	-0.124711	
	0	0	8.25	13.5	
	0	0	13.5	26	
	
w: 	0
m: 	0.1	56227.3	0.1	0	
P: 	8.25	13.5	0	0	
	-879.562	-1435.38	-0.0762125	-0.124711	
	0	0	8.25	13.5	
	0	0	13.5	26	
	
w: 	0
m: 	0.1	95197.5	0.1	0	
P: 	8.25	13.5	0	0	
	-879.562	-1435.38	-0.0762125	-0.124711	
	0	0	8.25	13.5	
	0	0	13.5	26	
	
w: 	0
m: 	0.1	85130.8	0.1	0	
P: 	8.25	13.5	0	0	
	-879.562	-1435.38	-0.0762125	-0.124711	
	0	0	8.25	13.5	
	0	0	13.5	26	
	
w: 	0
m: 	0.1	68243.1	0.1	0	
P: 	8.25	13.5	0	0	
	-879.562	-1435.38	-0.0762125	-0.124711	
	0	0	8.25	13.5	
	0	0	13.5	26	
	
w: 	0
m: 	0	0	0	0	
P: 	0	0	0	0	
	0	0	0	0	
	0	0	0	0	
	0	0	0	0
GPU:
w: 	0
m: 	4.48707	0	2.19607	0	
P: 	8.17379	13.3753	0	0	
	13.5	26	0	0	
	0	0	8.17379	13.3753	
	0	0	13.5	26	
	
w: 	0
m: 	4.89815	0	2.33002	0	
P: 	8.17379	13.3753	0	0	
	13.5	26	0	0	
	0	0	8.17379	13.3753	
	0	0	13.5	26	
	
w: 	0
m: 	8.22379	0	2.51016	0	
P: 	8.17379	13.3753	0	0	
	13.5	26	0	0	
	0	0	8.17379	13.3753	
	0	0	13.5	26	
	
w: 	0
m: 	7.36467	0	3.0552	0	
P: 	8.17379	13.3753	0	0	
	13.5	26	0	0	
	0	0	8.17379	13.3753	
	0	0	13.5	26	
	
w: 	0
m: 	5.92356	0	2.4224	0	
P: 	8.17379	13.3753	0	0	
	13.5	26	0	0	
	0	0	8.17379	13.3753	
	0	0	13.5	26	
	
w: 	0
m: 	0	0	0	0	
P: 	0	0	0	0	
	0	0	0	0	
	0	0	0	0	
	0	0	0	0

altera_forum · Answer

Thank you for your answer!  The outputs are in fact very different.

altera_forum · Answer

One example:  FPGA:  m: 	0.1	51410	0.1	0   GPU:  m: 	4.48707	0	2.19607	0

Forum Discussion

Obtaining different results while executing a kernel on GPU or FPGA

6 Replies

Recent Discussions

Free Agilex3 license is non-commercial?

Quartus Prime 25.1 installation issue

No access to the Self Service Licensing Center (SSLC)

Quartus 20.1std compilation fails for Quartus map - Device 10AS057K2F40I1SG

recovery timing issue