tde_m
Occasional Contributor
6 years agoHigh resource usage when using half precision
Hello,
I started to investigate the possibility to use half precision for some mathematical routines.
However I found a suspicious high resource utilization when compared with classical floating point version.
In the following code, there are three different implementation of a dot product, in which the computation is partially unrolled:
#pragma OPENCL EXTENSION cl_khr_fp16 : enable
#define W 64 //unrolling parameter
#define SHIFT_REG_SIZE 8
__kernel void float_dot(__global volatile const float * restrict x, __global volatile const float * restrict y, __global volatile float * restrict res, unsigned int N){
const int num_vect = N/W;
float acc = 0;
for(int i=0;i<num_vect;i++){
float acc_i = 0;
#pragma unroll
for(int j=0;j<W;j++){
float xx = x[i*W+j];
float yy = y[i*W+j];
acc_i += xx * yy;
}
acc +=acc_i;
}
*res=acc;
}
__kernel void half_dot(__global volatile const half * restrict x, __global volatile const half * restrict y, __global volatile half * restrict res, unsigned int N){
const int num_vect = N/W;
half acc = 0;
for(int i=0;i<num_vect;i++){
half acc_i = 0;
#pragma unroll
for(int j=0;j<W;j++){
half xx = x[i*W+j];
half yy = y[i*W+j];
acc_i += xx * yy;
}
acc +=acc_i;
}
*res=acc;
}
__kernel void half_dot_sr(__global volatile const half * restrict x, __global volatile const half * restrict y, __global volatile half * restrict res, unsigned int N){
const int num_vect = N/W;
half shift_reg[SHIFT_REG_SIZE+1]; //shift register
#pragma unroll
for(int i=0;i<SHIFT_REG_SIZE+1;i++)
shift_reg[i]=0;
half acc = 0;
for(int i=0;i<num_vect;i++){
half acc_i = 0;
#pragma unroll
for(int j=0;j<W;j++){
half xx = x[i*W+j];
half yy = y[i*W+j];
acc_i += xx * yy;
}
shift_reg[SHIFT_REG_SIZE] = shift_reg[0]+acc_i;
#pragma unroll
for(int j = 0; j < SHIFT_REG_SIZE; ++j)
shift_reg[j] = shift_reg[j + 1];
}
//reconstruct the result using the partial results in shift register
#pragma unroll
for(int i=0;i<SHIFT_REG_SIZE;i++)
acc+=shift_reg[i];
*res=acc;
}- float_dot: classical floating point version. It takes advantage of the single clock cycle accumulator, obtaining an II=1
- half_dot: simple version using half instead of float. In this case, single clock cycle accumulator is not used, resulting in an II=3
- half_dot_sr: a shift register has been introduced to mask loop carried dependency, obtaining an II=1
Apart from the single clock cycle accumulation, is there any good reason about the fact that half precision versions use much more logic resources compared to the floating point version (~10X LUTs and ~3x FFs)?
This has been evaluated using Quartus 19.1.0, targeting a Stratix 10.
Thanks for the help