Error: Optimizer FAILED
Hi,
I am quite new to OpenCL.
I have downloaded the following code by Vladimir Antonenko (only a portion is shown here), which compresses an image to JPEG.
The program was originally meant to run on a GPU, but I intend to implement it on an Altera FPGA.
I ran it through the Altera OpenCL compiler for FPGA and encountered the following behavior:
When compiling for emulation, no problem is encountered; after compilation (including the host part) the program runs without any
problem and indeed compresses an image.
When compiling the code for the device (no emulation), the compiler stops with an "Optimizer FAILED" error.
The strange part is that no single line or segment causes the problem; only combinations of parts of the code do.
1: The whole function v4_rgb2yuv420() causes "Optimizer FAILED" and does not finish the first compilation stage.
2: If I comment out segment A, or only the 8 occurrences of the line "t = as_uint4(shuffle(as_uchar16(t), mask));"
in segment A (marked with "//####"), then the compiler finishes the first stage successfully (I don't wait for the whole compilation because it takes hours).
3: If the above 8 occurrences of the line are kept but segment B is commented out, then the compiler finishes the first stage successfully.
To summarize: if either segment A or segment B is commented out the compiler succeeds, and if both A and B are kept the optimizer fails.
The program and the resulting compiler output are attached here:
The code (only the portion that exhibits the problem):
================================================== ==
// jpeg.cl
/////////////////////////////////////////////////////////////////////////////
// Luma for eight pixels at once, in 16.16 fixed point:
//   Y = (19595*R + 38470*G + 7471*B + 32768) >> 16
// The three weights add up to 65536 (roughly 0.299 / 0.587 / 0.114) and the
// 32768 term is one half in that scale, so the shift rounds to nearest.
// Inputs are widened to 32-bit unsigned lanes before multiplying; the result
// is narrowed with a plain (non-saturating) convert_uchar8.
inline uchar8 RGB2Y2(const ushort8 R0, const ushort8 G0, const ushort8 B0)
{
    const uint8 red_term   = convert_uint8(R0) * 19595;
    const uint8 green_term = convert_uint8(G0) * 38470;
    const uint8 blue_term  = convert_uint8(B0) * 7471;
    const uint8 acc        = 32768 + red_term + green_term + blue_term;
    return convert_uchar8(acc >> 16);
}
/////////////////////////////////////////////////////////////////////////////
// Cb chroma for eight samples, in fixed point with an 18-bit shift:
//   Cb = (4*8421376 - 11058*R - 21709*G + 32767*B) >> 18
// 8421376 equals 128*65536 + 32768, i.e. an offset of 128 plus one half in
// 16.16 scale. The constant is multiplied by 4 and the shift is 18 instead of
// 16, which matches inputs that are four times larger than a single pixel
// value (the kernel v4_rgb2yuv420 in this file passes 2x2 block sums).
// The terms are accumulated in the same left-to-right order as the formula.
inline uchar8 RGB2Cb2(const short8 R0, const short8 G0, const short8 B0)
{
    const int8 red   = convert_int8(R0);
    const int8 green = convert_int8(G0);
    const int8 blue  = convert_int8(B0);

    int8 acc = 8421376*4 - 11058*red;
    acc = acc - 21709*green;
    acc = acc + 32767*blue;
    return convert_uchar8(acc >> 18);
}
/////////////////////////////////////////////////////////////////////////////
// Cr chroma for eight samples, in fixed point with an 18-bit shift:
//   Cr = (4*8421376 + 32767*R - 27438*G - 5329*B) >> 18
// 8421376 equals 128*65536 + 32768, i.e. an offset of 128 plus one half in
// 16.16 scale. The constant is multiplied by 4 and the shift is 18 instead of
// 16, which matches inputs that are four times larger than a single pixel
// value (the kernel v4_rgb2yuv420 in this file passes 2x2 block sums).
// The terms are accumulated in the same left-to-right order as the formula.
inline uchar8 RGB2Cr2(const short8 R0, const short8 G0, const short8 B0)
{
    const int8 red   = convert_int8(R0);
    const int8 green = convert_int8(G0);
    const int8 blue  = convert_int8(B0);

    int8 acc = 8421376*4 + 32767*red;
    acc = acc - 27438*green;
    acc = acc - 5329*blue;
    return convert_uchar8(acc >> 18);
}
/////////////////////////////////////////////////////////////////////////////
// Horizontal pair sum: adds each even-indexed lane of t to its odd-indexed
// neighbour, giving (s0+s1, s2+s3, s4+s5, s6+s7) as four 16-bit lanes.
inline ushort4 hsum2(const ushort8 t)
{
    const ushort4 left  = t.even;   // s0, s2, s4, s6
    const ushort4 right = t.odd;    // s1, s3, s5, s7
    return left + right;
}
/////////////////////////////////////////////////////////////////////////////
// BGR to YUV conversion + chroma subsampling 4:2:0
// Launch: 2D NDRange, dimension [SIZEX/16, SIZEY/2] (per the original note).
//
// Each work-item converts a patch of 16 pixels x 2 rows:
//   - it reads 4 groups of 12 bytes (4 packed 3-byte pixels each) per row,
//   - writes 2 x uchar8 of luma per row (4 stores in total),
//   - writes 1 x uchar8 of Cb and 1 x uchar8 of Cr built from 2x2 pixel sums.
//
// bgr: packed 3-byte pixels, addressed as 32-bit words through vload3.
//      The indexing implies rows of get_global_size(0)*16 pixels with no
//      padding between rows.
// yuv: output addressed in uchar8 units:
//      luma at ((y/8)*gsx + x)*16 + (y%8), and +8 for the right 8 pixels,
//           so every 16x8 pixel tile is two consecutive runs of 8 uchar8
//           (left 8x8 block, then right 8x8 block);
//      Cb starts at uchar8 index gsx*gsy*4, Cr at gsx*gsy*5, each indexed by
//           ((y/8)*gsx + x)*8 + (y%8) with y = get_global_id(1).
//      (gsx = get_global_size(0), gsy = get_global_size(1))
__kernel void v4_rgb2yuv420
(
__global const uint *bgr,
__global uchar8 *yuv
)
{
// Byte gather applied to 16 bytes holding 4 packed pixels (bytes 0..11 used):
// word 0 <- bytes 2,5,8,11, word 1 <- bytes 1,4,7,10, word 2 <- bytes 0,3,6,9.
// Word 3 is filled with copies of byte 0 and is never read afterwards.
const uchar16 mask = (uchar16)(2,5,8,11, 1,4,7,10, 0,3,6,9, 0,0,0,0);
const size_t x = get_global_id(0);
// First of the two image rows handled by this work-item.
size_t y = get_global_id(1)*2;
// Input index in units of one vload3 group (3 uints = 12 bytes = 4 pixels);
// a work-item owns 4 consecutive groups per row.
size_t i = (y*get_global_size(0) + x)*4;
// Output index of the left luma uchar8 for row y (see layout in the header).
size_t o = ((y/8)*get_global_size(0) + x)*16 + (y%8);
// Scratch storage indexed [channel][row][half]:
//   channel 0/1/2 = mask word 0/1/2 (named R/G/B in segment B),
//   row     0/1   = first / second image row,
//   half    0/1   = left / right 8 pixels of the 16-pixel span.
// Each uint2 first holds 8 channel bytes and is later reused for 4 pair sums.
uint2 rgb[3][2][2];
//======= start segment A
// Segment A: load both rows and split the interleaved bytes into per-channel
// words. Each brace block handles one group of 4 pixels: vload3 fetches the
// 3 words, a zero word pads to uint4, the shuffle de-interleaves the bytes.
// Pixels 0..3 of the first row -> low word of the left half.
{
uint4 t = (uint4)(vload3(i+0,bgr), 0);
t = as_uint4(shuffle(as_uchar16(t), mask)); //####
rgb[0][0][0].lo = t.s0;
rgb[1][0][0].lo = t.s1;
rgb[2][0][0].lo = t.s2;
}
// Pixels 4..7 of the first row -> high word of the left half.
{
uint4 t = (uint4)(vload3(i+1,bgr), 0);
t = as_uint4(shuffle(as_uchar16(t), mask)); //####
rgb[0][0][0].hi = t.s0;
rgb[1][0][0].hi = t.s1;
rgb[2][0][0].hi = t.s2;
}
// Pixels 8..11 of the first row -> low word of the right half.
{
uint4 t = (uint4)(vload3(i+2,bgr), 0);
t = as_uint4(shuffle(as_uchar16(t), mask)); //####
rgb[0][0][1].lo = t.s0;
rgb[1][0][1].lo = t.s1;
rgb[2][0][1].lo = t.s2;
}
// Pixels 12..15 of the first row -> high word of the right half.
{
uint4 t = (uint4)(vload3(i+3,bgr), 0);
t = as_uint4(shuffle(as_uchar16(t), mask)); //####
rgb[0][0][1].hi = t.s0;
rgb[1][0][1].hi = t.s1;
rgb[2][0][1].hi = t.s2;
}
// second line
// Advance to the second row and recompute the input group index for it.
y += 1;
i = (y*get_global_size(0) + x)*4;
// Pixels 0..3 of the second row -> low word of the left half.
{
uint4 t = (uint4)(vload3(i+0,bgr), 0);
t = as_uint4(shuffle(as_uchar16(t), mask)); //####
rgb[0][1][0].lo = t.s0;
rgb[1][1][0].lo = t.s1;
rgb[2][1][0].lo = t.s2;
}
// Pixels 4..7 of the second row -> high word of the left half.
{
uint4 t = (uint4)(vload3(i+1,bgr), 0);
t = as_uint4(shuffle(as_uchar16(t), mask)); //####
rgb[0][1][0].hi = t.s0;
rgb[1][1][0].hi = t.s1;
rgb[2][1][0].hi = t.s2;
}
// Pixels 8..11 of the second row -> low word of the right half.
{
uint4 t = (uint4)(vload3(i+2,bgr), 0);
t = as_uint4(shuffle(as_uchar16(t), mask)); //####
rgb[0][1][1].lo = t.s0;
rgb[1][1][1].lo = t.s1;
rgb[2][1][1].lo = t.s2;
}
// Pixels 12..15 of the second row -> high word of the right half.
{
uint4 t = (uint4)(vload3(i+3,bgr), 0);
t = as_uint4(shuffle(as_uchar16(t), mask)); //####
rgb[0][1][1].hi = t.s0;
rgb[1][1][1].hi = t.s1;
rgb[2][1][1].hi = t.s2;
}
//======= end segment A
//======= start segment B
// Segment B: for every (row, half) widen the 8 channel bytes to 16 bits,
// write the 8 luma values, and overwrite the scratch entry in place with the
// 4 horizontal pair sums (4 x ushort occupies the same 8 bytes as the uint2).
// First row, left 8 pixels.
{
ushort8 R = convert_ushort8(as_uchar8(rgb[0][0][0]));
rgb[0][0][0] = as_uint2(hsum2(R));
ushort8 G = convert_ushort8(as_uchar8(rgb[1][0][0]));
rgb[1][0][0] = as_uint2(hsum2(G));
ushort8 B = convert_ushort8(as_uchar8(rgb[2][0][0]));
rgb[2][0][0] = as_uint2(hsum2(B));
yuv[o] = RGB2Y2(R,G,B);
}
// First row, right 8 pixels (stored 8 uchar8 after the left half).
{
ushort8 R = convert_ushort8(as_uchar8(rgb[0][0][1]));
rgb[0][0][1] = as_uint2(hsum2(R));
ushort8 G = convert_ushort8(as_uchar8(rgb[1][0][1]));
rgb[1][0][1] = as_uint2(hsum2(G));
ushort8 B = convert_ushort8(as_uchar8(rgb[2][0][1]));
rgb[2][0][1] = as_uint2(hsum2(B));
yuv[o+8] = RGB2Y2(R,G,B);
}
// y now refers to the second row (incremented in segment A); recompute the
// luma output index for it.
o = ((y/8)*get_global_size(0) + x)*16 + (y%8);
// Second row, left 8 pixels.
{
ushort8 R = convert_ushort8(as_uchar8(rgb[0][1][0]));
rgb[0][1][0] = as_uint2(hsum2(R));
ushort8 G = convert_ushort8(as_uchar8(rgb[1][1][0]));
rgb[1][1][0] = as_uint2(hsum2(G));
ushort8 B = convert_ushort8(as_uchar8(rgb[2][1][0]));
rgb[2][1][0] = as_uint2(hsum2(B));
yuv[o] = RGB2Y2(R,G,B);
}
// Second row, right 8 pixels.
{
ushort8 R = convert_ushort8(as_uchar8(rgb[0][1][1]));
rgb[0][1][1] = as_uint2(hsum2(R));
ushort8 G = convert_ushort8(as_uchar8(rgb[1][1][1]));
rgb[1][1][1] = as_uint2(hsum2(G));
ushort8 B = convert_ushort8(as_uchar8(rgb[2][1][1]));
rgb[2][1][1] = as_uint2(hsum2(B));
yuv[o+8] = RGB2Y2(R,G,B);
}
// Chroma: one Cb and one Cr uchar8 (8 samples each) per work-item.
{
// Luma occupies gsx*gsy*4 uchar8 entries (4 stores per work-item); Cb follows,
// then Cr after a further gsx*gsy entries (1 store per work-item each).
const size_t offset_cb = get_global_size(1)*get_global_size(0)*4;
const size_t offset_cr = offset_cb + get_global_size(1)*get_global_size(0);
// Chroma has one row per work-item row, so index with the raw global id.
y = get_global_id(1);
o = ((y/8)*get_global_size(0) + x)*8 + (y%8);
// RGB addition of the pair in vertical direction
// Left-half and right-half pair sums are concatenated to 8 lanes per row and
// the two rows are added, so every lane holds the sum of a 2x2 pixel block.
short8 R = ((short8)(as_short4(rgb[0][0][0]),as_short4(rgb[0][0][1])))
+ ((short8)(as_short4(rgb[0][1][0]),as_short4(rgb[0][1][1])));
short8 G = ((short8)(as_short4(rgb[1][0][0]),as_short4(rgb[1][0][1])))
+ ((short8)(as_short4(rgb[1][1][0]),as_short4(rgb[1][1][1])));
short8 B = ((short8)(as_short4(rgb[2][0][0]),as_short4(rgb[2][0][1])))
+ ((short8)(as_short4(rgb[2][1][0]),as_short4(rgb[2][1][1])));
yuv[offset_cb+o] = RGB2Cb2(R,G,B);
yuv[offset_cr+o] = RGB2Cr2(R,G,B);
}
//======= end segment B
}
See the attachment for the compiler output.