This is on an Intrinsyc u820 board. I have a tombstone file, but not sure how to post that here. The OpenCL kernel in question:
/*OpenCL C*/
#pragma OPENCL FP_CONTRACT ON
float float_from_bits(unsigned int x) {return as_float(x);}
float nan_f32() { return NAN; }
float neg_inf_f32() { return -INFINITY; }
float inf_f32() { return INFINITY; }
#define sqrt_f32 sqrt
#define sin_f32 sin
#define cos_f32 cos
#define exp_f32 exp
#define log_f32 log
#define abs_f32 fabs
#define floor_f32 floor
#define ceil_f32 ceil
#define round_f32 round
#define trunc_f32 trunc
#define pow_f32 pow
#define asin_f32 asin
#define acos_f32 acos
#define tan_f32 tan
#define atan_f32 atan
#define atan2_f32 atan2
#define sinh_f32 sinh
#define asinh_f32 asinh
#define cosh_f32 cosh
#define acosh_f32 acosh
#define tanh_f32 tanh
#define atanh_f32 atanh
#define fast_inverse_f32 native_recip
#define fast_inverse_sqrt_f32 native_rsqrt
int halide_gpu_thread_barrier() {
barrier(CLK_LOCAL_MEM_FENCE);
return 0;
}
#define __address_space___shared __local
__kernel void _at_least_one_kernel(int x) { }
// Address spaces for kernel_maxblockglobal_s0_by___block_id_y_1
#define __address_space__input __global
#define __address_space__maxblockglobal __global
__kernel void kernel_maxblockglobal_s0_by___block_id_y_1(
const int _ht,
const int _maxblockglobal_bx_extent_realized_s,
const int _wd,
__address_space__input const float *restrict _input,
__address_space__maxblockglobal float *restrict _maxblockglobal,
__address_space___shared int16* __shared)
{
int _maxblockglobal_s0_by___block_id_y = get_group_id(1);
int _maxblockglobal_s0_bx___block_id_x = get_group_id(0);
int ___thread_id_x = get_local_id(0);
// produce maxthreadshared
{
float _maxthreadvec[4];
#define __address_space__maxthreadvec __private
// produce maxthreadvec
float4 _38 = float_from_bits(0 /* 0 */);
vstore4(_38,0, (__address_space__maxthreadvec float*)_maxthreadvec + 0);
for (int _maxthreadvec_s1_rItr__x = 0; _maxthreadvec_s1_rItr__x < 0 + 64; _maxthreadvec_s1_rItr__x++)
{
float4 _39 = vload4(0, (__address_space__maxthreadvec float*)_maxthreadvec + 0);
int _40 = _wd + -1793;
int _41 = _40 >> 8;
int _42 = min(_41, 0);
int _43 = _42 + _maxblockglobal_s0_bx___block_id_x;
int _44 = _43 * 256;
int _45 = _wd + -256;
int _46 = min(_44, _45);
int _47 = _maxthreadvec_s1_rItr__x * 4;
int _48 = _46 + _47;
int _49 = _maxblockglobal_s0_by___block_id_y * 256;
int _50 = _ht + -256;
int _51 = min(_49, _50);
int _52 = _51 + ___thread_id_x;
int _53 = _52 * _wd;
int _54 = _48 + _53;
float4 _55 = vload4(0, (__address_space__input float*)_input + _54);
float4 _56 = max(_39, _55);
vstore4(_56,0, (__address_space__maxthreadvec float*)_maxthreadvec + 0);
} // for _maxthreadvec_s1_rItr__x
// consume maxthreadvec
{
float _maxthread[1];
#define __address_space__maxthread __private
// produce maxthread
_maxthread[0] = float_from_bits(0 /* 0 */);
for (int _maxthread_s1_rThdVec__x = 0; _maxthread_s1_rThdVec__x < 0 + 4; _maxthread_s1_rThdVec__x++)
{
float _57 = _maxthread[0];
float _58 = _maxthreadvec[_maxthread_s1_rThdVec__x];
float _59 = max(_57, _58);
_maxthread[0] = _59;
} // for _maxthread_s1_rThdVec__x
#undef __address_space__maxthreadvec
// consume maxthread
float _60 = _maxthread[0];
((__address_space___shared float *)__shared)[___thread_id_x] = _60;
#undef __address_space__maxthread
} // alloc _maxthread
} // alloc _maxthreadvec
int _61 = halide_gpu_thread_barrier();
(void)_61;
// consume maxthreadshared
bool _62 = ___thread_id_x < 1;
if (_62)
{
{
float _maxblocklocal[1];
#define __address_space__maxblocklocal __private
// produce maxblocklocal
_maxblocklocal[0] = float_from_bits(0 /* 0 */);
{
float _maxblockvec[4];
#define __address_space__maxblockvec __private
// produce maxblockvec
float4 _63 = float_from_bits(0 /* 0 */);
vstore4(_63,0, (__address_space__maxblockvec float*)_maxblockvec + 0);
for (int _maxblockvec_s1_rThd__x = 0; _maxblockvec_s1_rThd__x < 0 + 64; _maxblockvec_s1_rThd__x++)
{
float4 _64 = vload4(0, (__address_space__maxblockvec float*)_maxblockvec + 0);
int _65 = _maxblockvec_s1_rThd__x * 4;
float4 _66 = vload4(0, (__address_space___shared float*)__shared + _65);
float4 _67 = max(_64, _66);
vstore4(_67,0, (__address_space__maxblockvec float*)_maxblockvec + 0);
} // for _maxblockvec_s1_rThd__x
// consume maxblockvec
for (int _maxblocklocal_s1_rBlkVec__x = 0; _maxblocklocal_s1_rBlkVec__x < 0 + 4; _maxblocklocal_s1_rBlkVec__x++)
{
float _68 = _maxblocklocal[0];
float _69 = _maxblockvec[_maxblocklocal_s1_rBlkVec__x];
float _70 = max(_68, _69);
_maxblocklocal[0] = _70;
} // for _maxblocklocal_s1_rBlkVec__x
#undef __address_space__maxblockvec
} // alloc _maxblockvec
// consume maxblocklocal
float _71 = _maxblocklocal[0];
int _72 = ___thread_id_x + _maxblockglobal_s0_bx___block_id_x;
int _73 = _maxblockglobal_bx_extent_realized_s + 8;
int _74 = _maxblockglobal_s0_by___block_id_y * _73;
int _75 = _72 + _74;
_maxblockglobal[_75] = _71;
#undef __address_space__maxblocklocal
} // alloc _maxblocklocal
} // if _62
} // kernel kernel_maxblockglobal_s0_by___block_id_y_1
#undef __address_space__input
#undef __address_space__maxblockglobal
I have the problem. Have you found the solution. Or it is the problem of the driver