Vulkan Compute Shader implementation of Matrix Multiplication takes more time than the CPU on a Snapdragon 8 Gen 1 chipset

ramees.t
Join Date: 20 Mar 23
Posts: 1
Posted: Wed, 2023-03-22 03:54

Hi,

I tried implementing matrix multiplication using Vulkan compute shaders (please see the two implementations below).

Both implementations take more time to execute than the CPU version.

Please let me know if I am doing anything wrong, or if you have any suggestions for improving performance.

Thanks in advance

Compute shader implementation 1:

using milliseconds = std::chrono::duration<double, std::milli>;
milliseconds tmProcess;

auto tm0 = std::chrono::high_resolution_clock::now();

// Submit the pre-recorded command buffer and wait for the GPU to finish.
VkResult result = vkQueueSubmit(device.queue_, 1, &submitInfo, 0);
CALL_VK(vkQueueWaitIdle(device.queue_));

auto tm1 = std::chrono::high_resolution_clock::now();

tmProcess = tm1 - tm0;
std::cout << "Execution Time GPU:(Size=" << Mat_Size << "X" << Mat_Size << ")=" << tmProcess.count() << "ms" << std::endl;
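Note: the timer above also counts command submission and the vkQueueWaitIdle round trip, not just the dispatch itself. A minimal sketch of isolating GPU execution time with timestamp queries would look roughly like this (device.device_ and device.physicalDevice_ are assumed handle names here, and the compute queue family is assumed to report timestampValidBits > 0):

// Sketch only; device.device_ / device.physicalDevice_ are assumed names.
VkQueryPoolCreateInfo qpInfo{};
qpInfo.sType = VK_STRUCTURE_TYPE_QUERY_POOL_CREATE_INFO;
qpInfo.queryType = VK_QUERY_TYPE_TIMESTAMP;
qpInfo.queryCount = 2;
VkQueryPool queryPool;
CALL_VK(vkCreateQueryPool(device.device_, &qpInfo, nullptr, &queryPool));

// While recording the command buffer, bracket the dispatch with two timestamps.
vkCmdResetQueryPool(commandBuffer, queryPool, 0, 2);
vkCmdWriteTimestamp(commandBuffer, VK_PIPELINE_STAGE_TOP_OF_PIPE_BIT, queryPool, 0);
vkCmdDispatch(commandBuffer, Mat_Size/8, Mat_Size/4, 1);
vkCmdWriteTimestamp(commandBuffer, VK_PIPELINE_STAGE_BOTTOM_OF_PIPE_BIT, queryPool, 1);

// After vkQueueWaitIdle(), read both 64-bit timestamps and convert ticks to ms.
uint64_t ts[2] = {};
CALL_VK(vkGetQueryPoolResults(device.device_, queryPool, 0, 2, sizeof(ts), ts,
                              sizeof(uint64_t),
                              VK_QUERY_RESULT_64_BIT | VK_QUERY_RESULT_WAIT_BIT));
VkPhysicalDeviceProperties props;
vkGetPhysicalDeviceProperties(device.physicalDevice_, &props);
double gpuMs = double(ts[1] - ts[0]) * props.limits.timestampPeriod * 1e-6;
std::cout << "GPU-only time: " << gpuMs << " ms" << std::endl;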

// local_size is 8x4, so Mat_Size/8 x Mat_Size/4 workgroups cover the full Mat_Size x Mat_Size output.
vkCmdDispatch(commandBuffer, Mat_Size/8, Mat_Size/4, 1);

// Naive kernel: one invocation per output element; row comes from global X, column from global Y.
std::string glsl_data;
    glsl_data += "#version 460\n";
    glsl_data += "#pragma shader_stage(compute)\n";
    glsl_data += "#define DIM 2048\n";
    glsl_data += "layout(std430, binding = 0) readonly buffer lay0 { float matA[]; };\n";
    glsl_data += "layout(std430, binding = 1) readonly buffer lay1 { float matB[]; };\n";
    glsl_data += "layout(std430, binding = 2) writeonly buffer lay2 { float matOut[]; };\n";
    glsl_data += "layout (local_size_x = 8, local_size_y = 4, local_size_z = 1) in;\n";
    glsl_data += "void main() {\n";
    glsl_data += "    const uint row = gl_GlobalInvocationID.x;\n";
    glsl_data += "    const uint col = gl_GlobalInvocationID.y;\n";
    glsl_data += "    if (row >= DIM || col >= DIM) {\n";
    glsl_data += "        return;\n";
    glsl_data += "    }\n";
    glsl_data += "    float sum = 0.0;\n";
    glsl_data += "    for (uint i = 0; i < DIM; i++) {\n";
    glsl_data += "        sum += matA[row*DIM + i] * matB[i*DIM + col];\n";
    glsl_data += "    }\n";
    glsl_data += "    matOut[row*DIM + col] = sum;\n";
    glsl_data += "}\n";

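The CPU reference is not shown above; for context, a plain triple-loop baseline over the same 2048x2048 float matrices, timed the same way, would look roughly like this (illustrative sketch only, not the exact code I ran):

#include <chrono>
#include <iostream>
#include <vector>

// Illustrative CPU baseline: naive O(N^3) multiply over row-major float matrices.
void cpuMatMul(const std::vector<float>& a, const std::vector<float>& b,
               std::vector<float>& out, size_t n) {
    for (size_t row = 0; row < n; ++row) {
        for (size_t col = 0; col < n; ++col) {
            float sum = 0.0f;
            for (size_t i = 0; i < n; ++i) {
                sum += a[row * n + i] * b[i * n + col];
            }
            out[row * n + col] = sum;
        }
    }
}

int main() {
    const size_t n = 2048;
    std::vector<float> a(n * n, 1.0f), b(n * n, 1.0f), out(n * n, 0.0f);

    auto t0 = std::chrono::high_resolution_clock::now();
    cpuMatMul(a, b, out, n);
    auto t1 = std::chrono::high_resolution_clock::now();

    std::chrono::duration<double, std::milli> ms = t1 - t0;
    std::cout << "Execution Time CPU:(Size=" << n << "X" << n << ")="
              << ms.count() << "ms" << std::endl;
    return 0;
}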

Compute shader implementation 2 (using shared memory):

// Tiled kernel: each workgroup computes a BLOCK_SIZE x BLOCK_SIZE tile of the output,
// staging tiles of matA and matB in shared memory (sa/sb) on each loop iteration.
std::string glsl_data;
    glsl_data += "#version 460\n";
    glsl_data += "#define BLOCK_SIZE 8\n";
    glsl_data += "#define DIM 2048\n";
    glsl_data += "layout(std430, binding = 0) buffer lay0 { float matA[]; };\n";
    glsl_data += "layout(std430, binding = 1) buffer lay1 { float matB[]; };\n";
    glsl_data += "layout(std430, binding = 2) buffer lay2 { float matOut[]; };\n";
    glsl_data += "layout (local_size_x = BLOCK_SIZE, local_size_y = BLOCK_SIZE, local_size_z = 1) in;\n";
    glsl_data += "shared float sa[BLOCK_SIZE*BLOCK_SIZE];\n";
    glsl_data += "shared float sb[BLOCK_SIZE*BLOCK_SIZE];\n";
    glsl_data += "void main() {\n";
    glsl_data += "    const uint lx = gl_LocalInvocationID.x;\n";
    glsl_data += "    const uint ly = gl_LocalInvocationID.y;\n";
    glsl_data += "    const uint dx = gl_WorkGroupID.x * BLOCK_SIZE;\n";
    glsl_data += "    const uint dy = gl_WorkGroupID.y * BLOCK_SIZE;\n";
    glsl_data += "    float sum = 0.0;\n";
    glsl_data += "    for (uint i = 0; i < DIM; i += BLOCK_SIZE) {\n";
    glsl_data += "        sa[ly*BLOCK_SIZE + lx] = matA[(dy + ly) * DIM + (i + lx)];\n";
    glsl_data += "        sb[ly*BLOCK_SIZE + lx] = matB[(i + ly) * DIM + (dx + lx)];\n";
    glsl_data += "        barrier();\n";
    glsl_data += "        for (uint k = 0; k < BLOCK_SIZE; ++k) {\n";
    glsl_data += "            sum += sa[ly*BLOCK_SIZE + k] * sb[k*BLOCK_SIZE + lx];\n";
    glsl_data += "        }\n";
    glsl_data += "        barrier();\n";
    glsl_data += "    }\n";
    glsl_data += "    matOut[(dy + ly) * DIM + (dx + lx)] = sum;\n";
    glsl_data += "}\n";
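The vkCmdDispatch shown earlier is for the first shader; for this shared-memory version the workgroup is BLOCK_SIZE x BLOCK_SIZE, so the matching dispatch (inferred from the shader, using the same variable names) would be:

// Each workgroup produces one BLOCK_SIZE x BLOCK_SIZE (8x8) tile of the output,
// so the grid is (Mat_Size/BLOCK_SIZE) x (Mat_Size/BLOCK_SIZE) workgroups.
vkCmdDispatch(commandBuffer, Mat_Size/8, Mat_Size/8, 1);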


 
