Platform SM8350/SM8550
I have implemented a BoxFilter_31x31 in both an OpenGL Compute Shader and OpenCL, in two versions: one that reads directly from Global Memory and one that uses Local Memory. However, the Local Memory version performs significantly worse than the Global Memory version. Is this normal?
#version 320 es
// Each workgroup is 32 x 32 invocations (local_size_z defaults to 1).
layout(local_size_x = 32, local_size_y = 32) in;

// Source samples, read-only. main() indexes this as y * width + x.
// std430 is stated explicitly: without it the block uses the default
// `shared` layout, whose array stride is implementation-defined, so the
// host-side buffer is only guaranteed to be tightly packed floats with std430.
layout(std430, binding = 0) readonly buffer Input0 {
    float data[];
} input0;

// Filtered result, write-only, indexed the same way as input0.
// NOTE(review): binding 1 is unused in the visible source; binding = 2 is kept as-is.
layout(std430, binding = 2) writeonly buffer Output {
    float data[];
} output0;
// Halo width on each side of the workgroup tile: half the workgroup size per axis.
const ivec3 pad_size = ivec3(gl_WorkGroupSize) / ivec3(2);
// Extent of the staged tile: twice the workgroup size per axis
// (workgroup footprint plus pad_size on both sides).
const ivec3 local_size = ivec3(gl_WorkGroupSize) * 2;
// Workgroup-shared staging tile, stored row by row (index = y * local_size.x + x).
shared float local_data[local_size.y * local_size.x];
// Compute entry point. Each invocation produces one output element at its
// global (x, y) position.
//
// The body contains two box-filter variants back to back:
//   1. every sample is fetched directly from the input buffer;
//   2. the workgroup first stages a padded tile in shared memory and then
//      samples from that tile.
// Both variants write the same output element, so the result of variant 2
// overwrites the result of variant 1. They sample the same window (edges are
// replicated by clamping), but accumulate in a different order, so the two
// sums may differ by floating-point rounding.
void main()
{
    // Total number of invocations per axis. It is also used as the image
    // extent for row indexing and for edge clamping.
    // NOTE(review): this assumes the buffers hold exactly
    // global_size.x * global_size.y elements - confirm against the dispatch code.
    ivec3 global_size = ivec3(gl_NumWorkGroups) * ivec3(gl_WorkGroupSize);

    //////////////////////////////////////////////////////
    // 1. box filter with global memory
    //////////////////////////////////////////////////////
    // simple box filter
    {
        // Alternative radii left in place by the author; they make R vary per
        // invocation or per workgroup:
        // int R = 15 + (int(gl_GlobalInvocationID.x) + int(gl_GlobalInvocationID.y)) % 2;
        // int R = 15 + (int(gl_WorkGroupID.x) + int(gl_WorkGroupID.y)) % 2;
        // NOTE(review): R = 16 gives a 33 x 33 window; a 31 x 31 filter would
        // need R = 15 - confirm the intended size.
        int R = 16;
        int D = 2 * R + 1;  // window edge length
        int S = D * D;      // number of samples, used to normalise the sum
        float dst = 0.0;
        # pragma unroll(1)
        for(int dx = -R; dx <= R; ++dx){
            # pragma unroll(1)
            for(int dy = -R; dy <= R; ++dy){
                // calc global coord, replicating the edge sample outside the image
                int global_x = int(gl_GlobalInvocationID.x) + dx;
                int global_y = int(gl_GlobalInvocationID.y) + dy;
                global_x = clamp(global_x, 0, global_size.x - 1);
                global_y = clamp(global_y, 0, global_size.y - 1);
                int global_idx = global_y * global_size.x + global_x;
                // boxfilter
                dst += input0.data[global_idx];
            }
        }
        dst /= float(S);
        // set output
        {
            int global_x = int(gl_GlobalInvocationID.x);
            int global_y = int(gl_GlobalInvocationID.y);
            int global_idx = global_y * global_size.x + global_x;
            output0.data[global_idx] = dst;
        }
    }

    //////////////////////////////////////////////////////
    // 2. box filter with local memory
    //////////////////////////////////////////////////////
    // load to local memory
    // Each invocation copies a 2 x 2 pattern of samples, spaced one workgroup
    // size apart, so the workgroup as a whole fills the local_size tile. The
    // tile origin is the workgroup origin shifted back by pad_size.
    #pragma unroll(2)
    for(int dy = 0; dy < 2; ++dy){
        #pragma unroll(2)
        for(int dx = 0; dx < 2; ++dx){
            // calc global coord, replicating the edge sample outside the image
            int global_x = int(gl_GlobalInvocationID.x);
            int global_y = int(gl_GlobalInvocationID.y);
            global_x = global_x + dx * int(gl_WorkGroupSize.x) - pad_size.x;
            global_y = global_y + dy * int(gl_WorkGroupSize.y) - pad_size.y;
            global_x = min(max(global_x, 0), global_size.x - 1);
            global_y = min(max(global_y, 0), global_size.y - 1);
            int global_idx = global_y * global_size.x + global_x;
            // calc local coord
            int local_x = int(gl_LocalInvocationID.x) + dx * int(gl_WorkGroupSize.x);
            int local_y = int(gl_LocalInvocationID.y) + dy * int(gl_WorkGroupSize.y);
            int local_idx = local_y * local_size.x + local_x;
            // copy data
            local_data[local_idx] = input0.data[global_idx];
        }
    }
    // Order the shared-memory writes above before the execution barrier, so
    // that every invocation in the workgroup reads a fully populated tile.
    memoryBarrierShared();
    barrier();

    // do box filter
    {
        // int R = 15 + (int(gl_GlobalInvocationID.x) + int(gl_GlobalInvocationID.y)) % 2;
        int R = 16;
        int D = 2 * R + 1;  // window edge length
        int S = D * D;      // number of samples, used to normalise the sum
        float dst = 0.0;
        // Position of this invocation's own sample inside the padded tile.
        int local_cx = int(gl_LocalInvocationID.x) + pad_size.x;
        int local_cy = int(gl_LocalInvocationID.y) + pad_size.y;
        for(int dy = -R; dy <= R; ++dy){
            for(int dx = -R; dx <= R; ++dx){
                // calc local coord
                int local_x = local_cx + dx;
                int local_y = local_cy + dy;
                // With R = 16 and a 32 x 32 workgroup the coordinates already
                // lie in [0, local_size - 1]; the clamps only guard against a
                // radius larger than pad_size.
                local_x = clamp(local_x, 0, local_size.x - 1);
                local_y = clamp(local_y, 0, local_size.y - 1);
                int local_idx = local_y * local_size.x + local_x;
                // boxfilter
                dst += local_data[local_idx];
            }
        }
        dst /= float(S);
        // set output
        {
            int global_x = int(gl_GlobalInvocationID.x);
            int global_y = int(gl_GlobalInvocationID.y);
            int global_idx = global_y * global_size.x + global_x;
            output0.data[global_idx] = dst;
        }
    }
}