Hi guys,
I'm working on a Qualcomm QCS605 with an Adreno 645 GPU.
I am trying to run OpenCV (3.4.6) with OpenCL on the Android platform.
Creating the OpenCL context works, but the GPU runtime looks slower than the CPU runtime.
I run the HoughLines transform once with Mat and once with UMat.
I don't know why UMat is slower than Mat.
I understood that a UMat object should be faster than a Mat object, but when I measure the runtime, UMat is slower.
Here are my functions.
Can anyone tell me why?
Regards.
extern "C" void initCL() { dumpCLinfo(); EGLDisplay mEglDisplay = eglGetCurrentDisplay(); if (mEglDisplay == EGL_NO_DISPLAY) LOGE("initCL: eglGetCurrentDisplay() returned 'EGL_NO_DISPLAY', error = %x", eglGetError()); EGLContext mEglContext = eglGetCurrentContext(); if (mEglContext == EGL_NO_CONTEXT) LOGE("initCL: eglGetCurrentContext() returned 'EGL_NO_CONTEXT', error = %x", eglGetError()); cl_context_properties props[] = { CL_CONTEXT_PERF_HINT_QCOM,CL_PERF_HINT_HIGH_QCOM, CL_GL_CONTEXT_KHR, (cl_context_properties) mEglContext, CL_EGL_DISPLAY_KHR, (cl_context_properties) mEglDisplay, CL_CONTEXT_PLATFORM, 0, 0 }; try { haveOpenCL = false; cl::Platform p = cl::Platform::getDefault(); std::string ext = p.getInfo<CL_PLATFORM_EXTENSIONS>(); if(ext.find("cl_khr_gl_sharing") == std::string::npos) LOGE("Warning: CL-GL sharing isn't supported by PLATFORM"); props[7] = (cl_context_properties) p(); theContext = cl::Context(CL_DEVICE_TYPE_GPU, props); std::vector<cl::Device> devs = theContext.getInfo<CL_CONTEXT_DEVICES>(); LOGD("Context returned %d devices, taking the 1st one", devs.size()); ext = devs[0].getInfo<CL_DEVICE_EXTENSIONS>(); if(ext.find("cl_khr_gl_sharing") == std::string::npos) LOGE("Warning: CL-GL sharing isn't supported by DEVICE"); theQueue = cl::CommandQueue(theContext, devs[0]); cl::Program::Sources src(1, std::make_pair(oclProgI2I, sizeof(oclProgI2I))); theProgI2I = cl::Program(theContext, src); theProgI2I.build(devs); cv::ocl::attachContext(p.getInfo<CL_PLATFORM_NAME>(), p(), theContext(), devs[0]()); if( cv::ocl::useOpenCL() ) LOGD("OpenCV+OpenCL works OK!"); else LOGE("Can't init OpenCV with OpenCL TAPI"); haveOpenCL = true; } catch(const cl::Error& e) { LOGE("cl::Error: %s (%d)", e.what(), e.err()); } catch(const std::exception& e) { LOGE("std::exception: %s", e.what()); } catch(...) { LOGE( "OpenCL info: unknown error while initializing OpenCL stuff" ); } LOGD("initCL completed"); }
void procOCL_OCV(int texIn, int texOut, int w, int h) { LOGD("Processing OpenCL via OpenCV"); if(!haveOpenCL) { LOGE("OpenCL isn't initialized"); return; } int64_t t = getTimeMs(); cl::ImageGL imgIn (theContext, CL_MEM_READ_ONLY, GL_TEXTURE_2D, 0, texIn); std::vector < cl::Memory > images(1, imgIn); theQueue.enqueueAcquireGLObjects(&images); theQueue.finish(); cv::UMat uIn, uOut, uTmp,cdst; std::vector<cv::Vec2f> lines; cv::ocl::convertFromImage(imgIn(), uIn); LOGD("loading texture data to OpenCV UMat costs %d ms", getTimeInterval(t)); theQueue.enqueueReleaseGLObjects(&images); t = getTimeMs(); cv::cvtColor(uIn, uTmp, CV_RGB2GRAY); cv::Canny(uTmp,cdst,125,350); cv::HoughLines(cdst, lines,1,PI/180,60); cv::Mat result(cdst.rows, cdst.cols, CV_8U, cv::Scalar(255)); std::vector<cv::Vec2f>::const_iterator it= lines.begin(); while (it!=lines.end()) { float rho = (*it)[0]; float theta = (*it)[1]; if (theta < PI/4. || theta > 3.*PI/4.) { cv::Point pt1(rho/cos(theta), 0); cv::Point pt2((rho-result.rows*sin(theta))/cos(theta), result.rows); cv::line(uIn, pt1, pt2, cv::Scalar(255), 1); } else { cv::Point pt1(0,rho/sin(theta)); cv::Point pt2(result.cols,(rho-result.cols*cos(theta))/sin(theta)); cv::line(uIn, pt1, pt2, cv::Scalar(255), 1); } ++it; } LOGD("HoughLines() costs %d ms", getTimeInterval(t)); t = getTimeMs(); cl::ImageGL imgOut(theContext, CL_MEM_WRITE_ONLY, GL_TEXTURE_2D, 0, texOut); images.clear(); images.push_back(imgOut); theQueue.enqueueAcquireGLObjects(&images); cl_mem clBuffer = (cl_mem)uIn.handle(cv::ACCESS_READ); cl_command_queue q = (cl_command_queue)cv::ocl::Queue::getDefault().ptr(); size_t offset = 0; size_t origin[3] = { 0, 0, 0 }; size_t region[3] = {static_cast<size_t>(w), static_cast<size_t>(h), 1 }; CV_Assert(clEnqueueCopyBufferToImage (q, clBuffer, imgOut(), offset, origin, region, 0, NULL, NULL) == CL_SUCCESS); theQueue.enqueueReleaseGLObjects(&images); cv::ocl::finish(); LOGD("uploading results to texture costs %d ms", getTimeInterval(t)); }
void drawFrameProcCPU(int w, int h, int texOut) { LOGD("Processing on CPU"); int64_t t; // let's modify pixels in FBO texture in C++ code (on CPU) static cv::Mat m; static cv::Mat dst; static cv::Mat cdst; std::vector<cv::Vec2f> lines; m.create(h, w, CV_8UC4); // read t = getTimeMs(); // expecting FBO to be bound glReadPixels(0, 0, w, h, GL_RGBA, GL_UNSIGNED_BYTE, m.data); LOGD("glReadPixels() costs %d ms", getTimeInterval(t)); // modify t = getTimeMs(); cv::cvtColor(m, cdst, CV_RGB2GRAY); cv::Canny(cdst,dst,125,350); cv::HoughLines(dst, lines,1,PI/180,60); //m *= 10; cv::Mat result(dst.rows, dst.cols, CV_8U, cv::Scalar(255)); std::vector<cv::Vec2f>::const_iterator it= lines.begin(); while (it!=lines.end()) { float rho = (*it)[0]; float theta = (*it)[1]; if (theta < PI/4. || theta > 3.*PI/4.) { cv::Point pt1(rho/cos(theta), 0); cv::Point pt2((rho-result.rows*sin(theta))/cos(theta), result.rows); cv::line(m, pt1, pt2, cv::Scalar(255), 1); } else { cv::Point pt1(0,rho/sin(theta)); cv::Point pt2(result.cols,(rho-result.cols*cos(theta))/sin(theta)); cv::line(m, pt1, pt2, cv::Scalar(255), 1); } ++it; } LOGD("HoughLines() costs %d ms", getTimeInterval(t)); // write back glActiveTexture(GL_TEXTURE0); glBindTexture(GL_TEXTURE_2D, texOut); t = getTimeMs(); glTexSubImage2D(GL_TEXTURE_2D, 0, 0, 0, w, h, GL_RGBA, GL_UNSIGNED_BYTE, m.data); LOGD("glTexSubImage2D() costs %d ms", getTimeInterval(t)); }