Altera_Forum
Honored Contributor
8 years ago
Why does this design not fit, and how do I reason about congestion?
Hi all,
I'm trying to optimize a simple program which computes a correlation matrix. To promote reuse/parallelism the design proceeds in blocks. However, I cannot get the design to route for larger block sizes (> 12), and I do not fully understand why it does not fit. Here is the code:

#define NR_RECEIVERS 576
#define NR_BASELINE 166176
#define NR_SAMPLES_PER_CHANNEL 1024
#define NR_CHANNELS 64
#define NR_POLARIZATIONS 2
#define COMPLEX 2
#define BLOCK_SIZE_X 8
#define BLOCK_SIZE (BLOCK_SIZE_X * BLOCK_SIZE_X)
#define NR_BLOCK_X (NR_RECEIVERS / BLOCK_SIZE_X)
#define NR_BLOCKS (NR_BLOCK_X * (NR_BLOCK_X + 1)) / 2
#define OUTSIZE NR_BLOCKS * BLOCK_SIZE
// 8-bit signed sample type (OpenCL C has no <stdint.h> available here).
typedef signed char int8_t;
// Complex sample stored as an OpenCL float2: .x = real, .y = imaginary.
typedef float2 fcomplex;
// NOTE(review): the trailing /**/ markers look like placeholders for
// swapping in alternative element types during experiments — confirm intent.
typedef fcomplex InputType/**/;
typedef fcomplex OutputType/**/;
// Multiply a by the complex conjugate of b, i.e. a * conj(b).
// Real part: ax*bx + ay*by; imaginary part: ay*bx - ax*by.
fcomplex __attribute__((__overloadable__,__always_inline__,const)) mulConj(fcomplex a, fcomplex b){
float re = a.x * b.x + a.y * b.y;
float im = a.y * b.x - a.x * b.y;
return (float2)(re, im);
}
// Single-work-item ("task") correlator kernel. Walks the triangular set of
// NR_BLOCKS receiver-pair tiles (each BLOCK_SIZE_X x BLOCK_SIZE_X) and
// accumulates a * conj(b) products over NR_SAMPLES_PER_CHANNEL time samples.
//
// NOTE(review): `sums`, `memx` and `memy` are declared as scalars yet are
// (re)assigned inside fully unrolled i/j loops, and every global load is the
// bare `(*input)` with no offset. The array subscripts (e.g. sums[i][j],
// memx[i], input[idx]) were almost certainly stripped by forum formatting —
// confirm against the original source; read the unrolled loops as operating
// on one register per (i,j) pair.
__kernel __attribute__((reqd_work_group_size(1,1,1))) __attribute__((max_global_work_dim(0)))
void Correlator(__global OutputType *restrict output, const __global volatile InputType *restrict input)
{
// Coordinates of the current tile within the triangular correlation matrix.
int blockx = 0;
int blocky = 0;
for( int baselineBlock = 0 ; baselineBlock < NR_BLOCKS; baselineBlock++) {
fcomplex sums;
// Zero the accumulator grid; full unroll maps each (i,j) to its own register.
# pragma unroll
for(int i = 0 ; i < BLOCK_SIZE_X ; i++){
# pragma unroll
for(int j = 0 ; j < BLOCK_SIZE_X ; j++){
sums = (fcomplex)0;
}
}
// Accumulate over all time samples for this tile.
for (int time = 0 ; time < NR_SAMPLES_PER_CHANNEL; time++){
fcomplex memx;
fcomplex memy;
// Load this sample's BLOCK_SIZE_X values for the y-side receivers.
# pragma unroll
for (int i = 0; i < BLOCK_SIZE_X ; i++){
memy = (*input);
}
// Load this sample's BLOCK_SIZE_X values for the x-side receivers.
# pragma unroll
for (int i = 0; i < BLOCK_SIZE_X ; i++){
memx = (*input);
}
// All-pairs multiply-conjugate-accumulate. Each loaded value fans out to
// BLOCK_SIZE_X multipliers — the high fan-out the author suspects is the
// source of routing congestion at large block sizes.
# pragma unroll
for(int i = 0; i < BLOCK_SIZE_X ; i++){
# pragma unroll
for(int j = 0; j < BLOCK_SIZE_X ; j++){
sums += mulConj( memx , memy);
}
}
}
// Write the finished tile of correlations to global memory.
# pragma unroll
for(int i = 0 ; i < BLOCK_SIZE_X ; i++){
# pragma unroll
for(int j = 0 ; j < BLOCK_SIZE_X ; j++){
(*output) = sums;
}
}
// Advance along the row; wrap to the next row of the triangle when the
// column index passes the diagonal.
blocky+=BLOCK_SIZE_X;
if(blocky > blockx){
blockx+=BLOCK_SIZE_X;
blocky = 0;
}
}
}
For block size 16, the compiler estimate (--report) suggests that it should fit, with estimates like this:
+--------------------------------------------------------------------+
; Estimated Resource Usage Summary ;
+----------------------------------------+---------------------------+
; Resource + Usage ;
+----------------------------------------+---------------------------+
; Logic utilization ; 40% ;
; ALUTs ; 11% ;
; Dedicated logic registers ; 28% ;
; Memory blocks ; 14% ;
; DSP blocks ; 67% ;
+----------------------------------------+---------------------------;
At large block sizes, the quartus_sh_compile log says that there is high congestion in the design. I assumed that this was due to the high amount of reuse in the design: every value in memx or memy is read 16 times for a block size of 16. To combat this effect, I redesigned the kernel in a systolic array-like design, where data is shifted through a grid in both the x and y directions and each value is only read once (instead of 16 times):

#define NR_RECEIVERS 576
#define NR_BASELINE 166176
#define NR_SAMPLES_PER_CHANNEL 1024
#define NR_CHANNELS 64
#define NR_POLARIZATIONS 2
#define COMPLEX 2
#define BLOCK_SIZE_X 16
#define BLOCK_SIZE_Y (BLOCK_SIZE_X / 2)
#define BLOCK_SIZE (BLOCK_SIZE_X * BLOCK_SIZE_X)
#define NR_BLOCK_X (NR_RECEIVERS / BLOCK_SIZE_X)
#define NR_BLOCKS (NR_BLOCK_X * (NR_BLOCK_X + 1)) / 2
#define OUTSIZE ((NR_RECEIVERS * (NR_RECEIVERS + 1)) / 2)
// 8-bit signed sample type (OpenCL C has no <stdint.h> available here).
typedef signed char int8_t;
// Complex sample stored as an OpenCL float2: .x = real, .y = imaginary.
typedef float2 fcomplex;
typedef fcomplex InputType/**/;
// Write non-contiguous:
// typedef fcomplex OutputType/**/;
// Write contiguous:
typedef fcomplex OutputType/**/;
// Complex multiply-by-conjugate: returns a * conj(b).
fcomplex __attribute__((__overloadable__,__always_inline__,const)) mulConj(fcomplex a, fcomplex b){
float2 result;
result.x = a.x * b.x + a.y * b.y;
result.y = a.y * b.x - a.x * b.y;
return result;
}
// Systolic-array variant of the correlator: instead of fanning each loaded
// value out to BLOCK_SIZE_X multipliers, samples are shifted through bufx
// (x direction) and bufy (y direction) delay lines so each value is read once
// per cycle. BLOCK_SIZE_Y is half of BLOCK_SIZE_X — presumably exploiting the
// Hermitian symmetry of the correlation matrix; confirm with the author.
//
// NOTE(review): as in the first listing, `sums`, `bufx` and `bufy` are
// declared as scalars but (re)assigned inside fully unrolled loops, and loads
// are the bare `(*input)`. The array subscripts (sums[i][j], bufx[i][j],
// input[idx]) were almost certainly stripped by forum formatting — the shift
// loops (e.g. `bufx = bufx;`) only make sense as `bufx[j][i] = bufx[j][i-1];`.
__kernel __attribute__((reqd_work_group_size(1,1,1))) __attribute__((max_global_work_dim(0)))
void Correlator(__global OutputType *restrict output, const __global volatile InputType *restrict input)
{
// Coordinates of the current tile within the triangular correlation matrix.
int blockx = 0;
int blocky = 0;
# pragma unroll 1
for( int baselineBlock = 0 ; baselineBlock < NR_BLOCKS; baselineBlock++) {
fcomplex sums;
// Zero the accumulator grid; full unroll maps each (i,j) to its own register.
# pragma unroll
for(int i = 0 ; i < BLOCK_SIZE_X ; i++){
# pragma unroll
for(int j = 0 ; j < BLOCK_SIZE_X ; j++){
sums = (fcomplex)0;
}
}
// x-direction shift register: 2*BLOCK_SIZE-1 stages per lane.
fcomplex bufx;
# pragma unroll
for(int i = 0 ; i < 2 * BLOCK_SIZE - 1 ; i++){
# pragma unroll
for(int j = 0 ; j < BLOCK_SIZE ; j++){
bufx = (fcomplex)0;
}
}
// y-direction shift register, same depth, transposed orientation.
fcomplex bufy;
# pragma unroll
for(int i = 0 ; i < BLOCK_SIZE ; i++){
# pragma unroll
for(int j = 0 ; j < 2 * BLOCK_SIZE - 1 ; j++){
bufy = (fcomplex)0;
}
}
// Outer time loop kept sequential so only the inner datapath is replicated.
# pragma unroll 1
for (int time = 0 ; time < NR_SAMPLES_PER_CHANNEL; time++){
// Advance the x shift register by one stage (highest index first).
# pragma unroll
for(int j = 0 ; j < BLOCK_SIZE ; j++){
# pragma unroll
for(int i = 2 * BLOCK_SIZE - 2 ; i >=1 ; i--){
bufx = bufx;
}
}
// Advance the y shift register by one stage.
# pragma unroll
for(int i = 0 ; i < BLOCK_SIZE ; i++){
# pragma unroll
for(int j = 2 * BLOCK_SIZE - 2 ; j >=1 ; j--){
bufy = bufy;
}
}
// Feed this cycle's fresh samples into stage 0 of each shift register.
# pragma unroll
for(int i = 0 ; i < BLOCK_SIZE ; i++){
bufx = (*input);
}
# pragma unroll
for(int i = 0 ; i < BLOCK_SIZE ; i++){
bufy = (*input);
}
// Multiply-conjugate-accumulate grid; the mulConj body is inlined here.
// Only BLOCK_SIZE_X x BLOCK_SIZE_Y cells — half the full tile.
# pragma unroll
for(int i = 0; i < BLOCK_SIZE_X ; i++){
# pragma unroll
for(int j = 0; j < BLOCK_SIZE_Y ; j++){
float2 a = bufx;
float2 b = bufy;
sums += (float2)(a.x * b.x + a.y * b.y, a.y * b.x - a.x * b.y);
}
}
}
// Write the finished tile of correlations to global memory.
# pragma unroll
for(int i = 0 ; i < BLOCK_SIZE_X ; i++){
# pragma unroll
for(int j = 0 ; j < BLOCK_SIZE_X ; j++){
(*output) = sums;
}
}
// Advance along the row; wrap to the next row of the triangle when the
// column index passes the diagonal.
blocky+=BLOCK_SIZE_X;
if(blocky > blockx){
blockx+=BLOCK_SIZE_X;
blocky = 0;
}
}
}
This gives estimates like before. However, this also does not fit for larger block sizes (>12), just like the earlier design. The log states: Cannot place the following 18 DSP cells -- a legal placement which satisfies all the DSP requirements could not be found..... I do not understand why. Why do these designs have high congestion, how can I reason about this and how can I make a larger block size fit? Is it simply unrealistic to use more than 50% of the DSP nodes? Any ideas appreciated!