Kernel Vectorization: branching is thread ID dependent ... cannot vectorize
I have the following code. In the kernel, the norm is read from `rowdata`, and `rowdata` depends on the thread ID.
In the `myfastSquaredDistance` function, the if/else statement depends on the norm, so when I compile, the compiler warns that it cannot vectorize the kernel. Does anyone know how to get around this? Thanks!
/*
 * Returns a squared-distance value for the pair (rowdata, rowcenter),
 * choosing between two evaluation paths according to an error-bound
 * estimate built from the two norms.
 *
 * rowdata, rowcenter  forwarded unchanged to mydot() / sqdist()
 * norm1, norm2        norms associated with the two vectors
 * n, y                forwarded unchanged to mydot() / sqdist()
 *
 * When the estimated bound is below 1e-6 the result is
 *     norm1^2 + norm2^2 - 2 * mydot(...)
 * otherwise (including when the bound is NaN) it is sqdist(...).
 *
 * NOTE(review): epsilon(), mydot() and sqdist() are defined elsewhere;
 * presumably machine epsilon, a dot product and a direct squared
 * distance respectively -- confirm against their definitions.
 */
double myfastSquaredDistance(__constant double *rowdata, __constant double *rowcenter, double norm1, double norm2, int n, int y)
{
    const double tolerance = 1e-6;
    const double norm_sq_sum = norm1 * norm1 + norm2 * norm2;
    const double norm_gap = norm1 - norm2;
    const double error_bound = 2.0 * epsilon() * norm_sq_sum / (norm_gap * norm_gap + epsilon());

    /* Negated comparison so a NaN bound also takes the sqdist() path. */
    if (!(error_bound < tolerance)) {
        return sqdist(rowdata, rowcenter, n, y);
    }

    return norm_sq_sum - 2.0 * mydot(rowdata, rowcenter, n, y);
}
/*
 * For the row of `data` selected by this work-item's global id, finds the
 * row of `center` with the smallest squared distance (as reported by
 * myfastSquaredDistance) and stores the outcome in `result`.
 *
 * num_vectors    not read by the kernel; work-items are NOT bounds-checked
 *                against it, so the launch size determines which rows are
 *                touched
 * vector_length  number of components per row; each row of `data` and
 *                `center` occupies vector_length + 1 doubles, the last one
 *                being read here as that row's norm
 * num_clusters   number of rows of `center` to compare against
 * data, center   input rows laid out as described above
 * result         two doubles per work-item:
 *                  result[2*y]     index of the selected center, or -1 when
 *                                  no center was selected (num_clusters <= 0
 *                                  or no distance compared below DBL_MAX,
 *                                  e.g. every distance was NaN)
 *                  result[2*y + 1] sqrt of the smallest squared distance
 */
__attribute__((num_simd_work_items(4)))
__attribute__((reqd_work_group_size(32,1,1)))
__kernel
void kmeansTest(
    int num_vectors,
    int vector_length,
    int num_clusters,
    __constant double* restrict data,
    __constant double* restrict center,
    __global double* restrict result
)
{
    const uint y = get_global_id(0);

    /* Initialised so that `result` never receives an indeterminate value
     * when the loop below selects nothing. */
    int bestindex = -1;

    /* Distances are doubles, so the starting bound must be DBL_MAX;
     * FLT_MAX would reject any valid distance above ~3.4e38. */
    double min_distance = DBL_MAX;

    /* The norm of a row is stored right after its vector_length components. */
    __constant double *rowdata = &data[y * (vector_length + 1)];
    const double norm1 = rowdata[vector_length];

    for (int i = 0; i < num_clusters; ++i) {
        __constant double *rowcenter = center + i * (vector_length + 1);
        const double norm2 = rowcenter[vector_length];

        /* The base `data` pointer and `y` are passed (not `rowdata`), as in
         * the original call; presumably the callee locates the row itself. */
        const double sq_distance =
            myfastSquaredDistance(data, rowcenter, norm1, norm2, vector_length, y);

        if (sq_distance < min_distance) {
            min_distance = sq_distance;
            bestindex = i;
        }
    }

    result[y * 2] = bestindex;
    result[y * 2 + 1] = sqrt(min_distance);
}