Hi there, I've encountered a vertex shader problem on the Adreno 330, tested on a Nexus 5 and a Kindle Fire HDX.
My vertex shader is as follows:
uniform highp vec4 bone_dq[220];
// Loads palette entry i into the local variable `dq`:
// bone_dq[2*i] goes to dq.r and bone_dq[2*i+1] goes to dq.t.
// Expands to two statements, so it may only be used where a local `DQ dq` is in scope.
#define FETCH_DQ(i) dq.r = bone_dq[(i)*2]; dq.t = bone_dq[(i)*2+1];
// Blends the palette entries of the four bones in `bones`, weighted by `weights`.
// bones[0] is the reference: any other bone whose dq.r has a negative dot product
// with the reference's dq.r is subtracted instead of added.
void skin_vertex_weight4(inout vec3 v, inout vec3 tbn_quat, ivec4 bones, lowp vec4 weights)
{
    DQ dq;
    FETCH_DQ( bones[0] );
    vec4 r0 = dq.r;
    DQ finalDQ;
    finalDQ.r = weights[0] * dq.r;
    finalDQ.t = weights[0] * dq.t;
    for(int i = 1; i < 4; ++i)
    {
        FETCH_DQ( bones[i] );
        // Compare against 0.0: GLSL ES has no implicit int -> float conversion,
        // so comparing the float dot product with the int literal 0 does not compile.
        if( dot( r0, dq.r) < 0.0 )
        {
            // seems that it never entered branch
            finalDQ.r -= weights[i] * dq.r;
            finalDQ.t -= weights[i] * dq.t;
        }
        else
        {
            // Both parts must be accumulated here, mirroring the branch above.
            finalDQ.r += weights[i] * dq.r;
            finalDQ.t += weights[i] * dq.t;
        }
    }
    ...
}
I tried to remove the branch by computing the sign arithmetically (code below), but that doesn't work either.
Unrolling the for loop doesn't help either.
float shortPath = sign(sign(dot(r0, dq.r))+0.5);
finalDQ.r += shortPath*weights[i] * dq.r;
finalDQ.t += shortPath*weights[i] * dq.t;
BTW, the same code runs fine on other GPUs, e.g. ARM Mali Txx. And I don't see any extra notes about dynamic branching in the GLES 3.0 GLSL specification.
Is this a bug, or can you offer any help? Thanks
:)
Anyone help?
I can give you my APK & OBB file to test.
Hello.
I wrote a small sample app and copy-pasted your shader.
First I added the following after the #define FETCH_DQ line:
However the shader did not compile:
Then I changed the following line:
to this;
So I finally got the second reason why it wouldn't compile:
The dot product result was being tested with integer '0' instead of float '0.0'.
With this last change, the shader successfully compiled:
Could you try all the above changes and let us know if it compiles for you? Also, you should make sure to check shader compiler results in your app using "glGetShaderInfoLog" to output your compiler results before attempting to use the shader.
Hi Ayo,
Thanks very much for your patience. That code is an excerpt of my original code and has some typos.
My shader code is the same as what you corrected, except for the 0 => 0.0 part. I'll try it.
Actually, my code is converted from HLSL. I'll paste the original HLSL and the converted code next time if this problem remains unsolved.
Thanks again. :)
// Number of bones in the palette; each bone occupies two float4 entries of bone_dq.
#define BONE_PALETTE_SIZE 110
// One bone transform stored as a pair of float4 values.
// NOTE(review): presumably a dual quaternion, with r the real (rotation) part and
// t the dual (translation) part — confirm against dqmul/dqnormalize.
struct DQ
{
float4 r;
float4 t;
};
// Bone palette: fetchDQ() reads entry 2*i as DQ.r and entry 2*i+1 as DQ.t.
float4 bone_dq[BONE_PALETTE_SIZE*2] : BONE_PALETTE;
// Reads the palette entry for bone `index` from bone_dq.
// The r part lives at slot 2*index and the t part at slot 2*index+1.
DQ fetchDQ(int index)
{
    int base = index * 2;
    DQ result;
    result.r = bone_dq[base];
    result.t = bone_dq[base + 1];
    return result;
}
// Skins a vertex that is influenced by four bones.
// v        : position, transformed in place by the blended bone transform.
// tbn_quat : quaternion, multiplied in place (on the left) by the blended r part.
// bones    : four palette indices; bones[0] is the reference for sign selection.
// weights  : blend weight of each of the four bones.
void skin_vertex_tbn_weight4(inout float3 v, inout float4 tbn_quat, int4 bones, half4 weights)
{
    // The first bone seeds the accumulator and is the reference for the sign test.
    DQ pivot = fetchDQ(bones[0]);
    DQ blended;
    blended.r = weights[0] * pivot.r;
    blended.t = weights[0] * pivot.t;

    for (int k = 1; k < 4; ++k)
    {
        DQ cur = fetchDQ(bones[k]);
        // sign(sign(d) + 0.5) is -1 when d < 0 and +1 otherwise (including d == 0),
        // so a bone pointing away from the reference is accumulated with flipped sign.
        float d = dot(pivot.r, cur.r);
        float signedWeight = sign(sign(d) + 0.5) * weights[k];
        blended.r += signedWeight * cur.r;
        blended.t += signedWeight * cur.t;
    }

    blended = dqnormalize(blended);
    v = dqmul(blended, v);
    tbn_quat = qqmul(blended.r, tbn_quat);
}
The animation result is not correct on the Nexus 5 and the Kindle HDX 7'' and 8.9'', as if the "sign(sign(dot())+0.5)" operation or the "if" condition had no effect. The result is the same as what the code below produces on all devices (even on PC):
....
for(int i = 1; i < 4; ++i)
{
dq = fetchDQ(bones[i]);
finalDQ.r += weights[i] * dq.r;
finalDQ.t += weights[i] * dq.t;
} ...
Here's the converted (& optimized) GLSL full code:
#version 300 es
#if defined(ENABLE_VS)
// Bone palette: two vec4 entries per bone (main() reads index*2 and index*2+1).
uniform highp vec4 bone_dq[220];
// main() multiplies row vectors by these matrices (vector on the left).
uniform highp mat4 wvp_matrix;
uniform highp mat4 world_matrix;
in highp vec4 blade_position0;
// Remapped in main() via x*255/128 - 1 and then used as a quaternion.
in highp vec4 blade_normal0;
in mediump vec2 blade_texcoord0;
// Bone indices delivered as floats; main() converts them with ivec4().
in highp vec4 blade_blendindices0;
// Per-bone blend weights, one component per index above.
// NOTE(review): the name is spelled "blendwight"; presumably it must match the
// attribute name the application binds — confirm before renaming.
in mediump vec4 blade_blendwight0;
out mediump vec2 blade_varying_TEXCOORD0;
out highp vec4 blade_varying_TEXCOORD1;
out highp vec3 blade_varying_TEXCOORD2;
// Vertex shader entry point (fully unrolled, machine-generated form).
// Blends the bone_dq entries of the four bones referenced by blade_blendindices0,
// normalizes the blend, applies it to the position and to the quaternion decoded
// from blade_normal0, and writes gl_Position plus the three varyings.
void main ()
{
ivec4 tmpvar_1;
// Bone indices arrive as floats and are converted to integers here.
tmpvar_1 = ivec4(blade_blendindices0);
highp vec4 pos_2;
pos_2.w = blade_position0.w;
highp vec4 tmpvar_3;
// Remap each component: 0.0 -> -1.0, 1.0 -> 255/128 - 1 (just under 1.0).
tmpvar_3 = (((blade_normal0 * 255.0) / 128.0) - 1.0);
// tmpvar_4 / tmpvar_5 accumulate the blended r and t parts respectively.
highp vec4 tmpvar_4; highp vec4 tmpvar_5;
// Bone x is the reference: tmpvar_6 is its r part, used in every dot() below.
highp vec4 tmpvar_6; tmpvar_6 = bone_dq[(tmpvar_1.x * 2)];
tmpvar_4 = (blade_blendwight0.x * tmpvar_6);
tmpvar_5 = (blade_blendwight0.x * bone_dq[((tmpvar_1.x * 2) + 1)]);
// Bone y. sign(sign(d) + 0.5) evaluates to -1.0 when d < 0 and +1.0 otherwise.
highp vec4 tmpvar_7;
tmpvar_7 = bone_dq[(tmpvar_1.y * 2)];
highp float tmpvar_8;
tmpvar_8 = sign((sign( dot (tmpvar_6, tmpvar_7) ) + 0.5));
tmpvar_4 = (tmpvar_4 + ((tmpvar_8 * blade_blendwight0.y) * tmpvar_7));
tmpvar_5 = (tmpvar_5 + ((tmpvar_8 * blade_blendwight0.y) * bone_dq[( (tmpvar_1.y * 2) + 1)]));
// Bone z, same scheme.
highp vec4 tmpvar_9;
tmpvar_9 = bone_dq[(tmpvar_1.z * 2)];
highp float tmpvar_10;
tmpvar_10 = sign((sign( dot (tmpvar_6, tmpvar_9) ) + 0.5));
tmpvar_4 = (tmpvar_4 + ((tmpvar_10 * blade_blendwight0.z) * tmpvar_9));
tmpvar_5 = (tmpvar_5 + ((tmpvar_10 * blade_blendwight0.z) * bone_dq[( (tmpvar_1.z * 2) + 1)]));
// Bone w, same scheme.
highp vec4 tmpvar_11;
tmpvar_11 = bone_dq[(tmpvar_1.w * 2)];
highp float tmpvar_12;
tmpvar_12 = sign((sign( dot (tmpvar_6, tmpvar_11) ) + 0.5));
tmpvar_4 = (tmpvar_4 + ((tmpvar_12 * blade_blendwight0.w) * tmpvar_11));
tmpvar_5 = (tmpvar_5 + ((tmpvar_12 * blade_blendwight0.w) * bone_dq[( (tmpvar_1.w * 2) + 1)]));
// Normalize: both parts are divided by the length of the blended r part.
highp vec4 tmpvar_13;
highp vec4 tmpvar_14;
highp float tmpvar_15;
tmpvar_15 = sqrt(dot (tmpvar_4, tmpvar_4));
tmpvar_13 = (tmpvar_4 / tmpvar_15);
tmpvar_14 = (tmpvar_5 / tmpvar_15);
tmpvar_4 = tmpvar_13;
tmpvar_5 = tmpvar_14;
// tmpvar_16 = 2 * cross(tmpvar_13.xyz, position); a.yzx*b.zxy - a.zxy*b.yzx is cross(a, b).
highp vec3 tmpvar_16;
tmpvar_16 = (((tmpvar_13.yzx * blade_position0.zxy) - (tmpvar_13.zxy * blade_position0.yzx)) * 2.0);
// tmpvar_17 = quaternion product of tmpvar_13 and tmpvar_3 (xyz first, then w).
highp vec4 tmpvar_17;
tmpvar_17.xyz = (((tmpvar_13.w * tmpvar_3.xyz) + (tmpvar_3.w * tmpvar_13.xyz)) + ((tmpvar_13.yzx * tmpvar_3.zxy) - (tmpvar_13.zxy * tmpvar_3.yzx)));
tmpvar_17.w = ((tmpvar_13.w * tmpvar_3.w) - dot (tmpvar_13.xyz, tmpvar_3.xyz));
// Skinned position: the position rotated by tmpvar_13, plus a translation term
// built from tmpvar_13 and tmpvar_14.
pos_2.xyz = (((blade_position0.xyz + (tmpvar_16 * tmpvar_13.w) ) + ( (tmpvar_13.yzx * tmpvar_16.zxy) - (tmpvar_13.zxy * tmpvar_16.yzx) )) + (2.0 * ( ((tmpvar_13.w * tmpvar_14.xyz) - (tmpvar_14.w * tmpvar_13.xyz)) + ((tmpvar_13.yzx * tmpvar_14.zxy) - (tmpvar_13.zxy * tmpvar_14.yzx)) )));
// tmpvar_18 = 2 * cross(tmpvar_17.xyz, vec3(0, 0, 1)), written with pre-swizzled constants.
highp vec3 tmpvar_18;
tmpvar_18 = (((tmpvar_17.yzx * vec3(1.0, 0.0, 0.0)) - (tmpvar_17.zxy * vec3(0.0, 1.0, 0.0))) * 2.0);
// Upper-left 3x3 of world_matrix, applied to the direction written to TEXCOORD2.
highp mat3 tmpvar_19;
tmpvar_19[0u] = world_matrix[0u].xyz;
tmpvar_19[1u] = world_matrix[1u].xyz;
tmpvar_19[2u] = world_matrix[2u].xyz;
// Row-vector convention: the vector is on the left of the matrix.
gl_Position = (pos_2 * wvp_matrix);
blade_varying_TEXCOORD0 = blade_texcoord0;
blade_varying_TEXCOORD1 = (pos_2 * world_matrix);
// The +Z axis rotated by quaternion tmpvar_17, then multiplied by the 3x3 world matrix.
blade_varying_TEXCOORD2 = (((vec3(0.0, 0.0, 1.0) + (tmpvar_18 * tmpvar_17.w) ) + ( (tmpvar_17.yzx * tmpvar_18.zxy) - (tmpvar_17.zxy * tmpvar_18.yzx) )) * tmpvar_19);
}
#elif defined(ENABLE_FS)
// Loop bound in main(); it indexes the 8-element light arrays below, so values
// above 8 would read out of range.
uniform int light_count;
// main() computes the light direction as normalize(xyz - w * worldPos).
uniform highp vec4 light_vector[8];
uniform highp vec4 light_diffuse[8];
// Initial value of the accumulated light color in main().
uniform highp vec4 light_ambient;
uniform highp vec4 light_specular[8];
uniform highp vec4 eye_position;
uniform sampler2D diffuseMap;
in mediump vec2 blade_varying_TEXCOORD0;
// Used by main() as the world-space position.
in highp vec4 blade_varying_TEXCOORD1;
// Used by main() as the (unnormalized) world-space normal.
in highp vec3 blade_varying_TEXCOORD2;
layout(location=0) out highp vec4 outBladeColor0;
// Fragment shader entry point: samples diffuseMap, accumulates ambient plus
// per-light diffuse and specular terms, and outputs light * texture color.
void main () {
highp vec4 diffuse_1;
// The texture sample is taken at lowp and then widened to highp.
lowp vec4 tmpvar_2;
tmpvar_2 = texture (diffuseMap, blade_varying_TEXCOORD0);
diffuse_1 = tmpvar_2;
highp vec3 worldPos_3;
worldPos_3 = blade_varying_TEXCOORD1.xyz;
highp vec3 worldNormal_4;
worldNormal_4 = normalize(blade_varying_TEXCOORD2);
highp vec3 eye_dir_6;
highp vec4 light_7;
// Start from the ambient color; only .xyz is modified inside the loop.
light_7 = light_ambient;
eye_dir_6 = normalize((eye_position.xyz - blade_varying_TEXCOORD1.xyz));
for (int i_5 = 0; i_5 < light_count; i_5++)
{
// Direction to the light: with w == 0 this is normalize(xyz); with w == 1 it is
// the normalized vector from the surface point to xyz.
highp vec3 tmpvar_8;
tmpvar_8 = normalize((light_vector[i_5].xyz - (light_vector[i_5].w * worldPos_3)));
// N . L
highp float tmpvar_9;
tmpvar_9 = dot (worldNormal_4, tmpvar_8);
// Diffuse: max(0, N.L) * light_diffuse. Specular: max(0, N.H)^32 * light_specular with
// H = normalize(L + eye_dir); float(N.L >= 0.0) zeroes the specular term when N.L < 0.
light_7.xyz = (light_7.xyz + (( max (0.0, tmpvar_9) * light_diffuse[i_5].xyz) + ( pow ((max (0.0, dot (worldNormal_4, normalize((tmpvar_8 + eye_dir_6)) )) * float((tmpvar_9 >= 0.0))), 32.0) * light_specular[i_5].xyz)));
};
outBladeColor0 = (light_7 * diffuse_1);
}
#else
#error switch not defined.
#endif
For now I have to HACK it in CPU code: evaluate the dot() there and pre-apply the sign to the blend weights, and remove the dot() operation from the shader code.
With that, the animation result turns out fine.
#if DQ_GPU_SKINNING_HACK
To do this, I have to change the "blend weight" attribute from "normalized unsigned byte" to "half float".
// this C/C++ code modifies the blend weights before each draw call
...
if( dq0.real.dotProduct(dq.real) < 0 )
weight.weight[i] = -fWeight;
else
weight.weight[i] = fWeight;
...
#endif
This is only a HACK, and it requires uploading new data for the "blend weights" vertex attribute to GLES on each draw call. That's not acceptable.
It just helped me find out what the problem is.
Now I believe that the Adreno 3xx devices, or their driver/shader compiler, have a bug with dynamic branching on UNIFORM ARRAYS.
Maybe the optimizer treats the uniform array as a SINGLE UNIFORM, so that all shader units take the same branch.
Can anyone verify this problem? I've linked an APK in another thread; I'm also putting it here in case you need to test the APK & OBB:
https://drive.google.com/folderview?id=0B-jwAxcRPTTafmNob2l0OXRRR1VGUTJFTkNjNFFTUXhtUXpLUWRBUlNRZWtiQmx5enZfQWM&usp=sharing
Note: put the OBB in root folder of sdcard and the APK will work.
Thanks for your solution!
I'll try the workaround when possible.