GPU slower than CPU
Hi guys,

I'm working on a Qualcomm QCS605 with an Adreno 645 GPU.

I'm now trying to run OpenCV (3.4.6) with OpenCL on the Android platform.

Creating the OpenCL context works, but the GPU runtime looks slower than the CPU runtime.

I tried the HoughLines transform with both Mat and UMat.

I don't know why UMat is slower than Mat.

I understood that a UMat object should be faster than a Mat object, but when I measure the runtime, UMat is slower.

Here is my function.

Can anyone tell me why?


// Sets up OpenCL for use alongside the current EGL/OpenGL context and hands the
// resulting context to OpenCV through cv::ocl::attachContext().
//
// Steps visible in this listing: query the current EGL display and context,
// build a context-property list, create a GPU cl::Context and a command queue,
// construct the oclProgI2I program object, attach the context to OpenCV, and
// set the haveOpenCL flag.
//
// NOTE(review): as pasted, this function has no braces, the catch clauses have
// no visible `try`, and there is no `else` / `catch(...)` in front of the two
// trailing LOGE calls. Presumably these tokens were lost when the post was
// extracted -- confirm against the real source file before drawing conclusions.
extern "C" void initCL()

    // Query the current EGL display and context; a failure is only logged.
    EGLDisplay mEglDisplay = eglGetCurrentDisplay();
    if (mEglDisplay == EGL_NO_DISPLAY)
        LOGE("initCL: eglGetCurrentDisplay() returned 'EGL_NO_DISPLAY', error = %x", eglGetError());

    EGLContext mEglContext = eglGetCurrentContext();
    if (mEglContext == EGL_NO_CONTEXT)
        LOGE("initCL: eglGetCurrentContext() returned 'EGL_NO_CONTEXT', error = %x", eglGetError());

    // Zero-terminated key/value list: Qualcomm perf hint, GL context, EGL display,
    // and the platform. The platform value (index 7) starts as 0 and is filled in
    // below once the default platform is known.
    cl_context_properties props[] =
            {   CL_CONTEXT_PERF_HINT_QCOM,CL_PERF_HINT_HIGH_QCOM, CL_GL_CONTEXT_KHR,   (cl_context_properties) mEglContext,
                CL_EGL_DISPLAY_KHR,  (cl_context_properties) mEglDisplay,
                CL_CONTEXT_PLATFORM, 0,
                0 };

        // The flag stays false unless the whole sequence below completes.
        haveOpenCL = false;
        cl::Platform p = cl::Platform::getDefault();
        std::string ext = p.getInfo<CL_PLATFORM_EXTENSIONS>();
        // Missing CL-GL sharing is reported but does not stop initialization.
        if(ext.find("cl_khr_gl_sharing") == std::string::npos)
            LOGE("Warning: CL-GL sharing isn't supported by PLATFORM");
        // props[7] is the value slot paired with CL_CONTEXT_PLATFORM above.
        props[7] = (cl_context_properties) p();

        theContext = cl::Context(CL_DEVICE_TYPE_GPU, props);
        std::vector<cl::Device> devs = theContext.getInfo<CL_CONTEXT_DEVICES>();
        // NOTE(review): devs.size() is a size_t passed to a %d conversion --
        // presumably needs a cast or %zu; confirm against the LOGD definition.
        LOGD("Context returned %d devices, taking the 1st one", devs.size());
        // NOTE(review): devs[0] is used without checking that devs is non-empty.
        ext = devs[0].getInfo<CL_DEVICE_EXTENSIONS>();
        if(ext.find("cl_khr_gl_sharing") == std::string::npos)
            LOGE("Warning: CL-GL sharing isn't supported by DEVICE");

        theQueue = cl::CommandQueue(theContext, devs[0]);

        // Program object created from the oclProgI2I source text.
        // NOTE(review): no build call for theProgI2I is visible in this listing, and
        // the statement ends in a doubled `;;` -- possibly a stripped call; confirm.
        cl::Program::Sources src(1, std::make_pair(oclProgI2I, sizeof(oclProgI2I)));
        theProgI2I = cl::Program(theContext, src);;
        // Give OpenCV the platform, context and device created above.
        cv::ocl::attachContext(p.getInfo<CL_PLATFORM_NAME>(), p(), theContext(), devs[0]());

        // NOTE(review): with no visible `else`, the LOGE below reads as unconditional;
        // presumably it is the else-branch of this check -- confirm.
        if( cv::ocl::useOpenCL() )
            LOGD("OpenCV+OpenCL works OK!");
            LOGE("Can't init OpenCV with OpenCL TAPI");
        // Set regardless of what cv::ocl::useOpenCL() returned.
        haveOpenCL = true;
    // Failures are logged only; nothing is rethrown in the visible code.
    catch(const cl::Error& e)
        LOGE("cl::Error: %s (%d)", e.what(), e.err());
    catch(const std::exception& e)
        LOGE("std::exception: %s", e.what());
        // NOTE(review): presumably the body of a catch(...) handler that is not visible here.
        LOGE( "OpenCL info: unknown error while initializing OpenCL stuff" );
    LOGD("initCL completed");


// Runs the grayscale + Hough-line + line-drawing pipeline on a GL texture using
// OpenCV UMat objects, then copies the result into another GL texture.
//
// texIn  - GL texture id wrapped as the read-only input cl::ImageGL
// texOut - GL texture id wrapped as the write-only output cl::ImageGL
// w, h   - extent of the region copied into the output image
//
// Three stages are timed and logged: texture -> UMat, processing, UMat -> texture.
//
// NOTE(review): braces and apparently some statements are missing from this
// paste (see the notes below), so what is shown may not be exactly the code
// that produced the posted timings -- confirm against the real source file.
void procOCL_OCV(int texIn, int texOut, int w, int h)
    LOGD("Processing OpenCL via OpenCV");
        // NOTE(review): presumably guarded by a check of haveOpenCL followed by an
        // early return; neither is visible here.
        LOGE("OpenCL isn't initialized");

    // Stage 1: wrap the input texture as an OpenCL image and convert it to a UMat.
    int64_t t = getTimeMs();
    cl::ImageGL imgIn (theContext, CL_MEM_READ_ONLY,  GL_TEXTURE_2D, 0, texIn);
    // NOTE(review): `images` is not used anywhere in the visible code; presumably it
    // fed GL acquire/release calls that were stripped -- confirm.
    std::vector < cl::Memory > images(1, imgIn);
    cv::UMat uIn, uOut, uTmp,cdst;
    // Detected lines are collected in a host-side std::vector.
    std::vector<cv::Vec2f> lines;
    cv::ocl::convertFromImage(imgIn(), uIn);
    LOGD("loading texture data to OpenCV UMat costs %d ms", getTimeInterval(t));

    // Stage 2: everything from here to the "HoughLines() costs" log is timed together,
    // i.e. color conversion, the Hough transform and the drawing loop.
    t = getTimeMs();
    cv::cvtColor(uIn, uTmp, CV_RGB2GRAY);
    // NOTE(review): cvtColor writes uTmp, but HoughLines reads cdst, which no visible
    // statement fills. Either a step between the two calls (e.g. edge detection
    // producing cdst) was lost from the paste, or the wrong variable is passed -- confirm.
    cv::HoughLines(cdst, lines,1,PI/180,60);  

    // `result` is only used below for its rows/cols, which are taken from cdst.
    cv::Mat result(cdst.rows, cdst.cols, CV_8U, cv::Scalar(255));

    // Draw every detected (rho, theta) line into uIn.
    // NOTE(review): no increment of `it` is visible, so as shown the loop would never
    // terminate when `lines` is non-empty; presumably a `++it;` was stripped -- confirm.
    std::vector<cv::Vec2f>::const_iterator it= lines.begin();
    while (it!=lines.end()) {
        float rho = (*it)[0];   
        float theta = (*it)[1]; 
        // Branch on the line angle: this branch intersects the line with the top
        // (y = 0) and bottom (y = rows) edges.
        if (theta < PI/4. || theta > 3.*PI/4.) { 
            cv::Point pt1(rho/cos(theta), 0); 
            cv::Point pt2((rho-result.rows*sin(theta))/cos(theta), result.rows);
            // NOTE(review): the target is a UMat while `lines` lives on the host;
            // presumably this drawing forces device<->host transfers inside the
            // timed section, which would matter for the timing question -- TODO confirm.
            cv::line(uIn, pt1, pt2, cv::Scalar(255), 1); 

        } else { 
            // Otherwise intersect with the left (x = 0) and right (x = cols) edges.
            cv::Point pt1(0,rho/sin(theta)); 
            cv::Point pt2(result.cols,(rho-result.cols*cos(theta))/sin(theta));
            cv::line(uIn, pt1, pt2, cv::Scalar(255), 1); 

    // NOTE(review): no queue synchronization call is visible before this log; if none
    // exists in the real code, part of the GPU work may finish outside the interval -- confirm.
    LOGD("HoughLines() costs %d ms", getTimeInterval(t));

    // Stage 3: copy the UMat's OpenCL buffer into the output texture image,
    // using OpenCV's default OpenCL queue.
    t = getTimeMs();
    cl::ImageGL imgOut(theContext, CL_MEM_WRITE_ONLY, GL_TEXTURE_2D, 0, texOut);
    cl_mem clBuffer = (cl_mem)uIn.handle(cv::ACCESS_READ);
    cl_command_queue q = (cl_command_queue)cv::ocl::Queue::getDefault().ptr();
    size_t offset = 0;
    size_t origin[3] = { 0, 0, 0 };
    size_t region[3] = {static_cast<size_t>(w), static_cast<size_t>(h), 1 };
    // The copy itself is enqueued inside the CV_Assert expression.
    CV_Assert(clEnqueueCopyBufferToImage (q, clBuffer, imgOut(), offset, origin, region, 0, NULL, NULL) == CL_SUCCESS);
    LOGD("uploading results to texture costs %d ms", getTimeInterval(t));
D/JNIpart: Processing OpenCL via OpenCV
D/JNIpart: loading texture data to OpenCV UMat costs 3 ms
D/JNIpart: HoughLines() costs 35 ms
D/JNIpart: uploading results to texture costs 2 ms
// CPU counterpart of the pipeline: reads pixels back with glReadPixels, runs
// grayscale conversion, the Hough transform and line drawing on cv::Mat, and
// writes pixels to the texture texOut with glTexSubImage2D.
//
// w, h   - size of the pixel rectangle read and written
// texOut - GL texture id bound before the write-back
//
// Three stages are timed and logged: read-back, processing, write-back.
//
// NOTE(review): braces and some tokens are missing from this paste (see the
// notes below) -- confirm against the real source file.
void drawFrameProcCPU(int w, int h, int texOut)
    LOGD("Processing on CPU");
    int64_t t;

    // let's modify pixels in FBO texture in C++ code (on CPU)
    // The matrices are function-local statics, so they persist between calls.
    static cv::Mat m;
    static cv::Mat dst;
    static cv::Mat cdst;
    std::vector<cv::Vec2f> lines;
    // 4-channel 8-bit buffer of h rows by w columns.
    m.create(h, w, CV_8UC4);

    // read
    t = getTimeMs();
    // expecting FBO to be bound
    // NOTE(review): the final argument is cut off (`GL_UNSIGNED_BYTE,;`); presumably it
    // was the pixel buffer of m -- confirm.
    glReadPixels(0, 0, w, h, GL_RGBA, GL_UNSIGNED_BYTE,;
    LOGD("glReadPixels() costs %d ms", getTimeInterval(t));

    // modify

    // Everything from here to the "HoughLines() costs" log is timed together.
    t = getTimeMs();
    cv::cvtColor(m, cdst, CV_RGB2GRAY);
    // NOTE(review): cvtColor writes cdst, but HoughLines reads dst, which no visible
    // statement fills. Either a step producing dst (e.g. edge detection) was lost from
    // the paste, or the wrong variable is passed -- confirm.
    cv::HoughLines(dst, lines,1,PI/180,60); 
    //m *= 10;

    // `result` is only used below for its rows/cols, which are taken from dst.
    cv::Mat result(dst.rows, dst.cols, CV_8U, cv::Scalar(255));

    // Draw every detected (rho, theta) line into m.
    // NOTE(review): no increment of `it` is visible; presumably a `++it;` was stripped -- confirm.
    std::vector<cv::Vec2f>::const_iterator it= lines.begin();
    while (it!=lines.end()) {
        float rho = (*it)[0];  
        float theta = (*it)[1]; 
        // Branch on the line angle: this branch intersects the line with the top
        // (y = 0) and bottom (y = rows) edges.
        if (theta < PI/4. || theta > 3.*PI/4.) { 
            cv::Point pt1(rho/cos(theta), 0); 
            cv::Point pt2((rho-result.rows*sin(theta))/cos(theta), result.rows);
            cv::line(m, pt1, pt2, cv::Scalar(255), 1); 

        } else {
            // Otherwise intersect with the left (x = 0) and right (x = cols) edges.
            cv::Point pt1(0,rho/sin(theta));
            cv::Point pt2(result.cols,(rho-result.cols*cos(theta))/sin(theta));
            cv::line(m, pt1, pt2, cv::Scalar(255), 1);

    LOGD("HoughLines() costs %d ms", getTimeInterval(t));

    // write back
    glBindTexture(GL_TEXTURE_2D, texOut);
    t = getTimeMs();
    // NOTE(review): the final argument is cut off here as well; presumably the pixel
    // buffer of m -- confirm.
    glTexSubImage2D(GL_TEXTURE_2D, 0, 0, 0, w, h, GL_RGBA, GL_UNSIGNED_BYTE,;
    LOGD("glTexSubImage2D() costs %d ms", getTimeInterval(t));
D/JNIpart: Processing on CPU
D/JNIpart: glReadPixels() costs 1 ms
D/JNIpart: HoughLines() costs 13 ms
D/JNIpart: glTexSubImage2D() costs 1 ms





