Altera_Forum
Honored Contributor
7 years agoAchieve Low Latency in OpenCL implementation of a State Space Equation Solver
Dear All,
I am exploring OpenCL as an alternative to HDL to implement a simple accelerator, to solve state space equations of an Induction Motor, on an FPGA PCIe board. My aim is to have a low latency implementation, with a very modest bandwidth, but able to provide the results with possibly <50us latency (including memory transfers to the host, a Xeon CPU). I implemented my first kernel, following the Intel Programming and Best Practices guide.
typedef union{
float f;
float4 f4;
}float4_t;
typedef union{
float f;
float8 f8;
float4_t f4_t;
}float8_t;
typedef struct __attribute__((packed)){
t_float3 iabc;
t_float wr;
t_float Te;
}ker_out_t;
# define RANK 4
__attribute__((task))
__kernel void induction_machine(
__constant float4_t* restrict vabc,
__constant float* restrict theta,
__global ker_out_t* restrict ker_out,
__global float4_t* restrict AC,
__global float4_t* restrict BD
){
int tid = 0;
const float kpc = sqrt(2.0/3.0);
const float sqr2b2 = sqrt(2.0)/2.0;
const float pi2b3 = 2.0*pi/3.0;
const float sqrt3b2 = sqrt(3.0)/2.0;
const float f1b2 = 0.5;
float4_t KP;
float4_t iKP;
float4_t iabc={};
float Te;
float wr;
float4_t vdqo = {};
float4_t x;
float4_t u = {};
float4_t y = {};
float4_t xn = {};
float8_t ACx={};
float8_t BDu={};
float costh = cos(theta);
float sinth = sin(theta);
float costh_p_pi2b3 = -sqrt3b2*sinth -f1b2*costh; //cos(theta + pi2b3);
float costh_m_pi2b3 = sqrt3b2*sinth -f1b2*costh;//cos(theta - pi2b3);
float sinth_p_pi2b3 = sqrt3b2*costh -f1b2*sinth; //sin(theta + pi2b3);
float sinth_m_pi2b3 = -sqrt3b2*costh -f1b2*sinth; //sin(theta - pi2b3);
// park and inverse park coefficients
KP.f4 =(float4) (kpc*costh,kpc*costh_m_pi2b3,kpc*costh_p_pi2b3,0);
KP.f4 =(float4) (kpc*(-sinth),kpc*(-sinth_m_pi2b3),kpc*(-sinth_p_pi2b3),0);
KP.f4 = (float4) (kpc*sqr2b2,kpc*sqr2b2,kpc*sqr2b2,0);
iKP.f4 =(float4) (kpc*costh,-kpc*sinth,kpc*sqr2b2,0);
iKP.f4 =(float4) (kpc*costh_m_pi2b3,kpc*(-sinth_m_pi2b3),kpc*sqr2b2,0);
iKP.f4 =(float4) (kpc*costh_p_pi2b3,kpc*(-sinth_p_pi2b3),kpc*sqr2b2,0);
// park transform
for(int i=0;i<RANK;i++){
# pragma unroll
for(int j=0;j<RANK;j++)
vdqo.f+=KP->f*vabc->f;
}
u.f4.s01 = vdqo.f4.s01; //state space input
BDu.f8 = (float8) (0,0,0,0,0,0,0,0);
ACx.f8 = (float8) (0,0,0,0,0,0,0,0);
//state solver# pragma unroll
for(int i=0; i<2*RANK;i++){# pragma unroll
for(int j=0; j<RANK;j++){
ACx.f += AC.f*x.f;
BDu.f += BD.f*u.f;
}
}
# pragma unroll
for(int i=0;i<RANK;i++){
xn.f = x.f + h*(ACx.f + BDu.f);
y.f = ACx.f + BDu.f;
}
//torque and speed output
ker_out->Te = (3.0/2.0)*(P/2.0)*(x.f*y.f - x.f*y.f);
ker_out->wr = ker_out->wr + (P/(2*J))*(ker_out->Te - Td)*h;
//system update
AC.f = ker_out->wr-w;
AC.f = w-ker_out->wr;
//output currents inverse park transform
for(int i=0;i<RANK;i++){
# pragma unroll
for(int j=0;j<RANK;j++)
iabc.f+=iKP.f*y.f;
}
ker_out->iabc = iabc.f4.s012;
//state update
x = xn;
}
Profiling this code on an Arria10GX I noticed that it roughly takes 70us to execute. Do you think this result is reasonable? Is there a way to reduce this figure? Running the kernel several times inside a for loop in the host, executing using EnqueueTask, I also noticed in the output of aocl report that a lot of time is spent in between executions. Does that part represent the memory transfer ? A screenshot of the profiling timeline is in the attachments Any suggestion is appreciated. Thank you in advance, Peter https://alteraforum.com/forum/attachment.php?attachmentid=15477&stc=1