Forums - Local Memory is Slower than Global Memory

1 post / 0 new
Local Memory is Slower than Global Memory
zhangdengsu
Join Date: 21 Sep 23
Posts: 1
Posted: Fri, 2023-10-20 01:45

Platform SM8350/SM8550

I have implemented a BoxFilter_31x31 using both an OpenGL compute shader and OpenCL, in two variants: one that reads from Local Memory and one that reads from Global Memory. However, the Local Memory variant performs significantly worse than the Global Memory variant. Is this normal?

#version 320 es
 
// Work-group shape: 32 x 32 invocations (1024 per group), one invocation per output element.
layout(local_size_x = 32, local_size_y = 32) in;
 
// Source data, read as a flat float array (main() indexes it as y * width + x).
layout(binding = 0) readonly buffer Input0 {
    float data[];
} input0;
// Destination data, written with the same flat y * width + x indexing.
layout(binding = 2) writeonly buffer Output {
    float data[];
} output0;
 
// Halo width on each side of the work-group tile: half the work-group size, i.e. 16 in x and y.
const ivec3 pad_size = ivec3(gl_WorkGroupSize) / 2;
// Extent of the shared tile: twice the work-group size, i.e. 64 in x and y.
const ivec3 local_size = 2 * ivec3(gl_WorkGroupSize);
 
// Shared tile of local_size.x * local_size.y = 64 * 64 = 4096 floats, stored row-major.
shared float local_data[local_size.x * local_size.y];
 
 
// Entry point. Each invocation produces one output element, and does so twice:
//   1. a (2R+1) x (2R+1) box filter whose taps are read straight from input0, then
//   2. the same filter whose taps are read from the shared tile local_data.
// Both passes store to the same element of output0, so the value left in the
// buffer is the one written by pass 2.
//
// The image extent is taken to be gl_NumWorkGroups * gl_WorkGroupSize; taps that
// fall outside it are clamped to the nearest edge element.
void main()
{
    ivec3 global_size = ivec3(gl_NumWorkGroups) * ivec3(gl_WorkGroupSize);

    // Coordinates of this invocation's element and its flat output index.
    // Both passes write to this same index, so it is computed once.
    int pixel_x = int(gl_GlobalInvocationID.x);
    int pixel_y = int(gl_GlobalInvocationID.y);
    int out_idx = pixel_y * global_size.x + pixel_x;

    //////////////////////////////////////////////////////
    // 1. box filter with global memory
    //////////////////////////////////////////////////////

    // simple box filter
    {
        // int R = 15 + (int(gl_GlobalInvocationID.x) + int(gl_GlobalInvocationID.y)) % 2;
        // int R = 15 + (int(gl_WorkGroupID.x) + int(gl_WorkGroupID.y)) % 2;
        int R = 16;
        int D = 2 * R + 1;
        int S = D * D;
        float dst = 0.0;
        # pragma unroll(1)
        for(int dx = -R; dx <= R; ++dx){
            // The clamped column depends only on dx: compute it once per column
            // instead of once per tap.
            int global_x = clamp(pixel_x + dx, 0, global_size.x - 1);
            # pragma unroll(1)
            for(int dy = -R; dy <= R; ++dy){
                int global_y = clamp(pixel_y + dy, 0, global_size.y - 1);
                // boxfilter
                dst += input0.data[global_y * global_size.x + global_x];
            }
        }
        dst /= float(S);
        // set output
        output0.data[out_idx] = dst;
    }

    //////////////////////////////////////////////////////
    // 2. box filter with local memory
    //////////////////////////////////////////////////////

    ivec2 lid = ivec2(gl_LocalInvocationID.xy);

    // load to local memory
    // Each invocation copies a 2 x 2 pattern of elements, spaced one work-group
    // size apart, so the group as a whole fills the local_size.x * local_size.y
    // tile. The tile origin sits pad_size before the group's first element.
    #pragma unroll(2)
    for(int dy = 0; dy < 2; ++dy){
        // Row-only quantities are invariant in the inner loop.
        int tile_dy = dy * int(gl_WorkGroupSize.y);
        int global_y = clamp(pixel_y + tile_dy - pad_size.y, 0, global_size.y - 1);
        int local_row = (lid.y + tile_dy) * local_size.x;
        #pragma unroll(2)
        for(int dx = 0; dx < 2; ++dx){
            int tile_dx = dx * int(gl_WorkGroupSize.x);
            int global_x = clamp(pixel_x + tile_dx - pad_size.x, 0, global_size.x - 1);
            // copy data
            local_data[local_row + lid.x + tile_dx] = input0.data[global_y * global_size.x + global_x];
        }
    }
    // Every invocation must have finished its tile writes before any reads them.
    barrier();
    // do box filter
    {
        // int R = 15 + (int(gl_GlobalInvocationID.x) + int(gl_GlobalInvocationID.y)) % 2;
        int R = 16;
        int D = 2 * R + 1;
        int S = D * D;
        float dst = 0.0;

        // Position of this invocation's element inside the shared tile.
        int local_cx = lid.x + pad_size.x;
        int local_cy = lid.y + pad_size.y;
        // NOTE(review): with R = 16 and pad_size = 16 the taps span
        // [lid, lid + 32], which always lies inside the 64-wide tile, so the two
        // clamps below never change a value; they only guard against a larger R.
        // NOTE(review): the global-memory loops above carry "# pragma unroll(1)"
        // while these loops carry no pragma, so the compiler may unroll the two
        // passes differently - confirm this is intended when comparing timings.
        for(int dy = -R; dy <= R; ++dy){
            // Start of the tap row inside the tile; invariant in the inner loop.
            int local_row = clamp(local_cy + dy, 0, local_size.y - 1) * local_size.x;
            for(int dx = -R; dx <= R; ++dx){
                int local_x = clamp(local_cx + dx, 0, local_size.x - 1);
                // boxfilter
                dst += local_data[local_row + local_x];
            }
        }
        dst /= float(S);
        // set output (overwrites the value stored by pass 1)
        output0.data[out_idx] = dst;
    }
}

 

 

  • Up0
  • Down0

Opinions expressed in the content posted here are the personal opinions of the original authors, and do not necessarily reflect those of Qualcomm Incorporated or its subsidiaries (“Qualcomm”). The content is provided for informational purposes only and is not meant to be an endorsement or representation by Qualcomm or any other party. This site may also provide links or references to non-Qualcomm sites and resources. Qualcomm makes no representations, warranties, or other commitments whatsoever about any non-Qualcomm sites or third-party resources that may be referenced, accessible from, or linked to this site.