SChun19
New Contributor
5 years ago
Bidirectional OpenCL Channel Stalls (both at read and write instructions)
Hi, I am seeing stalls at both the read and the write instructions of a channel in a feed-forward design. I am not sure how to fix this, or even how to interpret it, because the two observations seem contradictory: a write stall suggests the channel is full, while a read stall suggests it is empty. I have attached the profiler output and pasted the code below. Thanks.
// Enable the Intel channels extension; the channel declarations and the
// read_channel_intel / write_channel_intel calls in this file rely on it.
#pragma OPENCL EXTENSION cl_intel_channels : enable
// Tile sizes shared by the structs and kernels in this file.
// Tc: step of the xx loops, length of input_features.input_buf, and the
//     inner dimension of outputs.output_buf.
#define Tc 4
// Tm: step of the ax1 loops, the outer dimension of outputs.output_buf, and
//     the number of loadBChannel items produced/consumed per rc step.
#define Tm 16
// Tn: step of the rc loops, length of filter_weights.weight_buf, and the
//     number of loadAChannel items produced/consumed per rc step.
#define Tn 8
// Payload type of loadAChannel: one row of Tc input values plus two control
// flags that travel with the data.
struct input_features {
// Tc input values for one channel item.
float input_buf[Tc];
// Set by loadA when the item belongs to the first rc step (rc == 0).
bool rc_zero;
// Set by loadA when the item belongs to the last rc step
// (rc + Tn >= rc_bound); monolithic emits an output tile when it sees this.
bool flush;
};
// Payload type of loadBChannel: Tn weight values for one channel item.
struct filter_weights {
// Tn weight values; monolithic copies them into one row of its weight_buf.
float weight_buf[Tn];
};
// Payload type of storeCChannel: one Tm x Tc tile of output values, sent as
// a single channel item.
struct outputs {
// Output tile indexed as [too][tcc] by monolithic and storeC.
float output_buf[Tm][Tc];
};
// Channels connecting the kernels; each is declared with depth(64).
// loadAChannel: written by loadA, read by monolithic.
channel struct input_features loadAChannel __attribute__((depth(64)));
// loadBChannel: written by loadB, read by monolithic.
channel struct filter_weights loadBChannel __attribute__((depth(64)));
// storeCChannel: written by monolithic (only when flush is set), read by storeC.
channel struct outputs storeCChannel __attribute__((depth(64)));
/*
 * loadA: producer for loadAChannel.
 *
 * For every (ax1, yy, xx, rc) tile step it writes Tn items to loadAChannel.
 * Each item carries Tc synthetic float values (generated from bit patterns,
 * no global memory is read) together with the rc_zero / flush flags of the
 * current rc step.
 *
 * Parameters:
 *   compute, input0, input1, T_clip, input2, input3 - global buffers; not
 *       accessed by this kernel.
 *   ax1_bound - upper bound of the ax1 loop (step Tm).
 *   yy_bound  - upper bound of the yy loop (step 1) and, as written, also of
 *               the xx loop (step Tc).
 *   xx_bound  - currently unused.
 *   rc_bound  - upper bound of the rc loop (step Tn).
 */
kernel void loadA(global float* restrict compute, global float* volatile restrict input0, global float* restrict input1, global float* restrict T_clip, global float* restrict input2, global float* restrict input3,
                  int ax1_bound, int yy_bound, int xx_bound, int rc_bound) {
    for (int ax1 = 0; ax1 < ax1_bound; ax1 += Tm) {
        for (int yy = 0; yy < yy_bound; yy++) {
            // NOTE(review): this loop is bounded by yy_bound and xx_bound is
            // never used. loadB and storeC do the same, so the item counts
            // match; change all three together if xx_bound was intended.
            for (int xx = 0; xx < yy_bound; xx += Tc) {
                for (int rc = 0; rc < rc_bound; rc += Tn) {
                    struct input_features i_local;
                    // Flags are constant for all Tn items of this rc step.
                    i_local.rc_zero = (rc == 0);
                    i_local.flush = (rc + Tn) >= rc_bound;
                    for (int tii = 0; tii < Tn; tii++) {
                        #pragma unroll
                        for (int tcc = 0; tcc < Tc; tcc++) {
                            // 0x3F800000 is the bit pattern of 1.0f; the
                            // offsets make each value distinct.
                            uint bits = (0x3F800000 + tcc) + ((rc + tii) & 0xFFFF);
                            // as_float reinterprets the bits without the
                            // pointer-cast type pun used previously.
                            i_local.input_buf[tcc] = as_float(bits);
                        }
                        // Blocking write: waits while the channel is full.
                        write_channel_intel(loadAChannel, i_local);
                    }
                }
            }
        }
    }
}
/*
 * loadB: producer for loadBChannel.
 *
 * For every (ax1, yy, xx, rc) tile step it writes Tm items to loadBChannel.
 * Each item carries Tn synthetic float values generated from bit patterns;
 * no global memory is read.
 *
 * Parameters:
 *   compute, input0, input1, T_clip, input2, input3 - global buffers; not
 *       accessed by this kernel.
 *   ax1_bound - upper bound of the ax1 loop (step Tm).
 *   yy_bound  - upper bound of the yy loop (step 1) and, as written, also of
 *               the xx loop (step Tc).
 *   xx_bound  - currently unused.
 *   rc_bound  - upper bound of the rc loop (step Tn).
 */
kernel void loadB(global float* restrict compute, global float* restrict input0, global float* volatile restrict input1, global float* restrict T_clip, global float* restrict input2, global float* restrict input3,
                  int ax1_bound, int yy_bound, int xx_bound, int rc_bound) {
    for (int ax1 = 0; ax1 < ax1_bound; ax1 += Tm) {
        for (int yy = 0; yy < yy_bound; yy++) {
            // NOTE(review): this loop is bounded by yy_bound and xx_bound is
            // never used. loadA and storeC do the same, so the item counts
            // match; change all three together if xx_bound was intended.
            for (int xx = 0; xx < yy_bound; xx += Tc) {
                for (int rc = 0; rc < rc_bound; rc += Tn) {
                    struct filter_weights w_local;
                    for (int too = 0; too < Tm; too++) { // ax1
                        #pragma unroll
                        for (int tii = 0; tii < Tn; tii++) { // rc
                            // 0x3F800000 is the bit pattern of 1.0f; the
                            // offsets make each value distinct.
                            uint bits = (0x3F800000 + too) + ((rc + tii) & 0xFFFF);
                            // as_float reinterprets the bits without the
                            // pointer-cast type pun used previously.
                            w_local.weight_buf[tii] = as_float(bits);
                        }
                        // Blocking write: waits while the channel is full.
                        write_channel_intel(loadBChannel, w_local);
                    }
                }
            }
        }
    }
}
// monolithic: argument-less autorun kernel that consumes loadAChannel and
// loadBChannel and, when the flush flag is set, produces one item on
// storeCChannel.
//
// Each pass of the endless loop does, in this order:
//   1. Tn blocking reads from loadAChannel into input_buf,
//   2. Tm blocking reads from loadBChannel into weight_buf,
//   3. (compute step, elided in this listing),
//   4. if flush is set, copy output_buf into a struct and write it to
//      storeCChannel.
// Steps 1 and 2 are separate sequential loops, so while one input channel is
// being read the other one is not.
// NOTE(review): that sequencing may be related to the stalls the profiler
// reports on both the channel writes and the channel reads — confirm against
// the profiler data.
__attribute__((max_global_work_dim(0)))
__attribute__((autorun))
kernel void monolithic() {
// Local tile buffers.
// NOTE(review): __attribute__((memory)) presumably forces a memory (rather
// than register) implementation — confirm against the Intel documentation.
// NOTE(review): output_buf is never written in the visible code; it is
// presumably filled by the elided compute step.
float __attribute__((memory)) output_buf[Tm][Tc];
float __attribute__((memory)) weight_buf[Tm][Tn];
float __attribute__((memory)) input_buf[Tn][Tc];
while (1) {
struct outputs out;
// Overwritten on every loadAChannel read; after the first loop they hold the
// flags of the last item read. resetsum is not read anywhere in the visible
// code.
bool resetsum, flush;
for (int tii = 0; tii < Tn; tii++) { // rc - input feature maps (C)
// Blocking read: waits while loadAChannel is empty.
struct input_features valA = read_channel_intel(loadAChannel);
resetsum = valA.rc_zero;
flush = valA.flush;
#pragma unroll
for (int tcc = 0; tcc < Tc; tcc++) { // xx - output columns (Q)
input_buf[tii][tcc] = valA.input_buf[tcc];
}
}
for (int too = 0; too < Tm; too++) { // ax1 - output features (K)
// Blocking read: waits while loadBChannel is empty.
struct filter_weights valB = read_channel_intel(loadBChannel);
#pragma unroll
for (int tii = 0; tii < Tn; tii++) { // rc - input feature maps (C)
weight_buf[too][tii] = valB.weight_buf[tii];
}
}
/* compute here */
// Emit the output tile only on the last rc step, as flagged by loadA.
if (flush) {
#pragma unroll
for (int too = 0; too < Tm; too++) { // ax1 - output features (K)
#pragma unroll
for (int tcc = 0; tcc < Tc; tcc++) { // xx - output columns (Q)
out.output_buf[too][tcc] = output_buf[too][tcc];
}
}
// Blocking write: waits while storeCChannel is full.
write_channel_intel(storeCChannel, out);
}
}
}
/*
 * storeC: consumer for storeCChannel.
 *
 * Reads one struct outputs item per (ax1, yy, xx) tile step, adds 1 to every
 * element of the received tile, and then discards it; nothing is written to
 * global memory in this version.
 *
 * Parameters:
 *   compute, input0, input1, T_clip, input2, input3 - global buffers; not
 *       accessed by this kernel.
 *   ax1_bound - upper bound of the output-feature loop (step Tm).
 *   yy_bound  - upper bound of the row loop (step 1) and, as written, also of
 *               the column loop (step Tc).
 *   xx_bound, rc_bound - currently unused.
 */
kernel void storeC(global float* restrict compute, global float* restrict input0, global float* restrict input1, global float* restrict T_clip, global float* restrict input2, global float* restrict input3,
                   int ax1_bound, int yy_bound, int xx_bound, int rc_bound) {
    for (int feat = 0; feat < ax1_bound; feat += Tm) {
        for (int row = 0; row < yy_bound; row++) {
            // NOTE(review): the column loop is bounded by yy_bound, matching
            // loadA and loadB; xx_bound is never used.
            for (int col = 0; col < yy_bound; col += Tc) {
                // Blocking read: waits while storeCChannel is empty.
                struct outputs tile = read_channel_intel(storeCChannel);

                #pragma unroll
                for (int k = 0; k < Tm; k++) {
                    #pragma unroll
                    for (int q = 0; q < Tc; q++) {
                        tile.output_buf[k][q] = tile.output_buf[k][q] + 1;
                    }
                }
            }
        }
    }
}