Forums - GPU Slower than CPU

1 post / 0 new
GPU Slower than CPU
konan91
Join Date: 2 Apr 19
Posts: 8
Posted: Sun, 2019-09-29 20:19

Hi guys,

I'm working on a Qualcomm QCS605 with an Adreno 645 GPU.

I am trying to run OpenCV (3.4.6) through OpenCL on the Android platform.

Creating the OpenCL context works, but the GPU runtime looks slower than the CPU runtime.

I run the HoughLines transform once with Mat and once with UMat.

I don't know why UMat is slower than Mat.

My understanding was that a UMat object is faster than a Mat object, but when I measure the runtime, UMat is slower.

Here are my functions and the timings they log.

Can anyone tell me why?

regards.

extern "C" void initCL()
{
    dumpCLinfo();

    EGLDisplay mEglDisplay = eglGetCurrentDisplay();
    if (mEglDisplay == EGL_NO_DISPLAY)
        LOGE("initCL: eglGetCurrentDisplay() returned 'EGL_NO_DISPLAY', error = %x", eglGetError());

    EGLContext mEglContext = eglGetCurrentContext();
    if (mEglContext == EGL_NO_CONTEXT)
        LOGE("initCL: eglGetCurrentContext() returned 'EGL_NO_CONTEXT', error = %x", eglGetError());

    cl_context_properties props[] =
            {   CL_CONTEXT_PERF_HINT_QCOM,CL_PERF_HINT_HIGH_QCOM, CL_GL_CONTEXT_KHR,   (cl_context_properties) mEglContext,
                CL_EGL_DISPLAY_KHR,  (cl_context_properties) mEglDisplay,
                CL_CONTEXT_PLATFORM, 0,
                0 };

    try
    {
        haveOpenCL = false;
        cl::Platform p = cl::Platform::getDefault();
        std::string ext = p.getInfo<CL_PLATFORM_EXTENSIONS>();
        if(ext.find("cl_khr_gl_sharing") == std::string::npos)
            LOGE("Warning: CL-GL sharing isn't supported by PLATFORM");
        props[7] = (cl_context_properties) p();

        theContext = cl::Context(CL_DEVICE_TYPE_GPU, props);
        std::vector<cl::Device> devs = theContext.getInfo<CL_CONTEXT_DEVICES>();
        LOGD("Context returned %d devices, taking the 1st one", devs.size());
        ext = devs[0].getInfo<CL_DEVICE_EXTENSIONS>();
        if(ext.find("cl_khr_gl_sharing") == std::string::npos)
            LOGE("Warning: CL-GL sharing isn't supported by DEVICE");

        theQueue = cl::CommandQueue(theContext, devs[0]);

        cl::Program::Sources src(1, std::make_pair(oclProgI2I, sizeof(oclProgI2I)));
        theProgI2I = cl::Program(theContext, src);
        theProgI2I.build(devs);
        cv::ocl::attachContext(p.getInfo<CL_PLATFORM_NAME>(), p(), theContext(), devs[0]());

        if( cv::ocl::useOpenCL() )
            LOGD("OpenCV+OpenCL works OK!");
        else
            LOGE("Can't init OpenCV with OpenCL TAPI");
        haveOpenCL = true;
    }
    catch(const cl::Error& e)
    {
        LOGE("cl::Error: %s (%d)", e.what(), e.err());
    }
    catch(const std::exception& e)
    {
        LOGE("std::exception: %s", e.what());
    }
    catch(...)
    {
        LOGE( "OpenCL info: unknown error while initializing OpenCL stuff" );
    }
    LOGD("initCL completed");
}

 

void procOCL_OCV(int texIn, int texOut, int w, int h)
{
    LOGD("Processing OpenCL via OpenCV");
    if(!haveOpenCL)
    {
        LOGE("OpenCL isn't initialized");
        return;
    }

    int64_t t = getTimeMs();
    cl::ImageGL imgIn (theContext, CL_MEM_READ_ONLY,  GL_TEXTURE_2D, 0, texIn);
    std::vector < cl::Memory > images(1, imgIn);
    theQueue.enqueueAcquireGLObjects(&images);
    theQueue.finish();
    cv::UMat uIn, uOut, uTmp,cdst;
    std::vector<cv::Vec2f> lines;
    cv::ocl::convertFromImage(imgIn(), uIn);
    LOGD("loading texture data to OpenCV UMat costs %d ms", getTimeInterval(t));
    theQueue.enqueueReleaseGLObjects(&images);

    t = getTimeMs();
    cv::cvtColor(uIn, uTmp, CV_RGB2GRAY);
    cv::Canny(uTmp,cdst,125,350);
    cv::HoughLines(cdst, lines,1,PI/180,60);  

    cv::Mat result(cdst.rows, cdst.cols, CV_8U, cv::Scalar(255));

    std::vector<cv::Vec2f>::const_iterator it= lines.begin();
    while (it!=lines.end()) {
        float rho = (*it)[0];   
        float theta = (*it)[1]; 
        if (theta < PI/4. || theta > 3.*PI/4.) { 
            cv::Point pt1(rho/cos(theta), 0); 
            cv::Point pt2((rho-result.rows*sin(theta))/cos(theta), result.rows);
            cv::line(uIn, pt1, pt2, cv::Scalar(255), 1); 

        } else { 
            cv::Point pt1(0,rho/sin(theta)); 
            cv::Point pt2(result.cols,(rho-result.cols*cos(theta))/sin(theta));
            cv::line(uIn, pt1, pt2, cv::Scalar(255), 1); 
        }
        ++it;
    }

    LOGD("HoughLines() costs %d ms", getTimeInterval(t));

    t = getTimeMs();
    cl::ImageGL imgOut(theContext, CL_MEM_WRITE_ONLY, GL_TEXTURE_2D, 0, texOut);
    images.clear();
    images.push_back(imgOut);
    theQueue.enqueueAcquireGLObjects(&images);
    cl_mem clBuffer = (cl_mem)uIn.handle(cv::ACCESS_READ);
    cl_command_queue q = (cl_command_queue)cv::ocl::Queue::getDefault().ptr();
    size_t offset = 0;
    size_t origin[3] = { 0, 0, 0 };
    size_t region[3] = {static_cast<size_t>(w), static_cast<size_t>(h), 1 };
    CV_Assert(clEnqueueCopyBufferToImage (q, clBuffer, imgOut(), offset, origin, region, 0, NULL, NULL) == CL_SUCCESS);
    theQueue.enqueueReleaseGLObjects(&images);
    cv::ocl::finish();
    LOGD("uploading results to texture costs %d ms", getTimeInterval(t));
}
 
D/JNIpart: Processing OpenCL via OpenCV
D/JNIpart: loading texture data to OpenCV UMat costs 3 ms
D/JNIpart: HoughLines() costs 35 ms
D/JNIpart: uploading results to texture costs 2 ms
 
void drawFrameProcCPU(int w, int h, int texOut)
{
    LOGD("Processing on CPU");
    int64_t t;

    // let's modify pixels in FBO texture in C++ code (on CPU)
    static cv::Mat m;
    static cv::Mat dst;
    static cv::Mat cdst;
    std::vector<cv::Vec2f> lines;
    m.create(h, w, CV_8UC4);

    // read
    t = getTimeMs();
    // expecting FBO to be bound
    glReadPixels(0, 0, w, h, GL_RGBA, GL_UNSIGNED_BYTE, m.data);
    LOGD("glReadPixels() costs %d ms", getTimeInterval(t));

    // modify

    t = getTimeMs();
    cv::cvtColor(m, cdst, CV_RGB2GRAY);
    cv::Canny(cdst,dst,125,350);
    cv::HoughLines(dst, lines,1,PI/180,60); 
    //m *= 10;

    cv::Mat result(dst.rows, dst.cols, CV_8U, cv::Scalar(255));

    std::vector<cv::Vec2f>::const_iterator it= lines.begin();
    while (it!=lines.end()) {
        float rho = (*it)[0];  
        float theta = (*it)[1]; 
        if (theta < PI/4. || theta > 3.*PI/4.) { 
            cv::Point pt1(rho/cos(theta), 0); 
            cv::Point pt2((rho-result.rows*sin(theta))/cos(theta), result.rows);
            cv::line(m, pt1, pt2, cv::Scalar(255), 1); 

        } else {
            cv::Point pt1(0,rho/sin(theta));
            cv::Point pt2(result.cols,(rho-result.cols*cos(theta))/sin(theta));
            cv::line(m, pt1, pt2, cv::Scalar(255), 1);
        }
        ++it;
    }

    LOGD("HoughLines() costs %d ms", getTimeInterval(t));

    // write back
    glActiveTexture(GL_TEXTURE0);
    glBindTexture(GL_TEXTURE_2D, texOut);
    t = getTimeMs();
    glTexSubImage2D(GL_TEXTURE_2D, 0, 0, 0, w, h, GL_RGBA, GL_UNSIGNED_BYTE, m.data);
    LOGD("glTexSubImage2D() costs %d ms", getTimeInterval(t));
}
D/JNIpart: Processing on CPU
D/JNIpart: glReadPixels() costs 1 ms
D/JNIpart: HoughLines() costs 13 ms
D/JNIpart: glTexSubImage2D() costs 1 ms

 

 

 

 

  • Up0
  • Down0

Opinions expressed in the content posted here are the personal opinions of the original authors, and do not necessarily reflect those of Qualcomm Incorporated or its subsidiaries (“Qualcomm”). The content is provided for informational purposes only and is not meant to be an endorsement or representation by Qualcomm or any other party. This site may also provide links or references to non-Qualcomm sites and resources. Qualcomm makes no representations, warranties, or other commitments whatsoever about any non-Qualcomm sites or third-party resources that may be referenced, accessible from, or linked to this site.