Hi,
I tried implementing matrix multiplication using Vulkan compute shaders (please see the two implementations below);
both implementations take more time to execute than the CPU version.
Please let me know if I am doing anything wrong, or if you have any suggestions for improving performance.
Thanks in advance
Compute shader implementation 1:
Compute shader implementation 2 (using shared memory):
// GLSL source of a tiled matrix-multiplication compute shader.
//
// The text is written as adjacent string literals, which the compiler joins
// into one string; the C++ indentation below mirrors the GLSL nesting but is
// not part of the shader text.
//
// Shader outline: matA, matB and matOut are flat float arrays indexed as
// row * DIM + column. Each invocation accumulates one element of matOut.
// For every BLOCK_SIZE-wide step along the shared dimension, the work group
// copies one BLOCK_SIZE x BLOCK_SIZE tile of matA and one of matB into the
// shared arrays sa / sb, then each invocation adds BLOCK_SIZE products to
// its running sum.
//
// NOTE(review): the shader contains no bounds checks, so it presumably relies
// on DIM being a multiple of BLOCK_SIZE and on the host dispatching
// DIM / BLOCK_SIZE work groups along x and y -- confirm against the dispatch
// code.
std::string glsl_data =
    "#version 460\n"
    "#define BLOCK_SIZE 8\n"
    "#define DIM 2048\n"
    // Bindings 0 and 1 are the two inputs, binding 2 receives the product.
    "layout(std430, binding = 0) buffer lay0 { float matA[];};\n"
    "layout(std430, binding = 1) buffer lay1 { float matB[];};\n"
    "layout(std430, binding = 2) buffer lay2 { float matOut[];};\n"
    // One work group is BLOCK_SIZE x BLOCK_SIZE invocations.
    "layout (local_size_x = BLOCK_SIZE,local_size_y = BLOCK_SIZE,local_size_z = 1 ) in;\n"
    // Per-work-group tiles of matA (sa) and matB (sb).
    "shared float sa[BLOCK_SIZE*BLOCK_SIZE];\n"
    "shared float sb[BLOCK_SIZE*BLOCK_SIZE];\n"
    "void main() {\n"
        // lx / ly: position inside the work group; dx / dy: origin of the
        // output tile this work group writes.
        "const uint lx = gl_LocalInvocationID.x;\n"
        "const uint ly = gl_LocalInvocationID.y;\n"
        "const uint dx = gl_WorkGroupID.x * BLOCK_SIZE;\n"
        "const uint dy = gl_WorkGroupID.y * BLOCK_SIZE;\n"
        "float sum=0.0;\n"
        "for(uint i = 0; i < DIM; i+=BLOCK_SIZE) {\n"
            // Every invocation loads one element of each tile.
            "sa[ly*BLOCK_SIZE+lx] = matA[(dy+ly) * DIM + (i+ lx)];\n"
            "sb[ly*BLOCK_SIZE+lx] = matB[(i +ly) * DIM + (dx+lx)];\n"
            // Wait until both tiles are completely written before reading.
            "barrier();\n"
            "for(uint k = 0; k < BLOCK_SIZE; ++k) {\n"
                "sum += sa[ly*BLOCK_SIZE + k] * sb[k * BLOCK_SIZE + lx];\n"
            "}\n"
            // Wait until all reads finish before the next step overwrites
            // the tiles.
            "barrier();\n"
        "}\n"
        "matOut[(dy+ly) * DIM + (dx+lx)] = sum;\n"
    "}\n";