Index: modules/gpu/include/opencv2/gpu/gpu.hpp =================================================================== --- modules/gpu/include/opencv2/gpu/gpu.hpp (revision 7044) +++ modules/gpu/include/opencv2/gpu/gpu.hpp (working copy) @@ -913,7 +913,7 @@ //!performs labeling via graph cuts CV_EXPORTS void graphcut(GpuMat& terminals, GpuMat& leftTransp, GpuMat& rightTransp, GpuMat& top, GpuMat& bottom, GpuMat& labels, - GpuMat& buf, Stream& stream = Stream::Null()); + Stream& stream = Stream::Null()); ////////////////////////////////// Histograms ////////////////////////////////// Index: modules/gpu/src/matrix_reductions.cpp =================================================================== --- modules/gpu/src/matrix_reductions.cpp (revision 7044) +++ modules/gpu/src/matrix_reductions.cpp (working copy) @@ -115,11 +115,17 @@ sz.width = src.cols; sz.height = src.rows; + int nBufferSize; + nppiMeanStdDev8uC1RGetBufferHostSize (sz, &nBufferSize); + Npp8u * pDeviceBuffer; + cudaSafeCall( cudaMalloc((void **)& pDeviceBuffer, nBufferSize) ); + DeviceBuffer dbuf(2); - nppSafeCall( nppiMean_StdDev_8u_C1R(src.ptr<Npp8u>(), static_cast<int>(src.step), sz, dbuf, (double*)dbuf + 1) ); + nppSafeCall( nppiMean_StdDev_8u_C1R(src.ptr<Npp8u>(), static_cast<int>(src.step), sz, pDeviceBuffer, dbuf, (double*)dbuf + 1) ); cudaSafeCall( cudaDeviceSynchronize() ); + cudaSafeCall( cudaFree(pDeviceBuffer) ); double* ptrs[2] = {mean.val, stddev.val}; dbuf.download(ptrs); Index: modules/gpu/src/element_operations.cpp =================================================================== --- modules/gpu/src/element_operations.cpp (revision 7044) +++ modules/gpu/src/element_operations.cpp (working copy) @@ -656,39 +656,27 @@ sz.width = src1.cols * src1.channels(); sz.height = src1.rows; - if (src1.depth() == CV_8U && (src1.cols * src1.channels()) % 4 == 0) + if (src1.depth() == CV_8U) { NppStreamHandler h(stream); - sz.width /= 4; - - nppSafeCall( nppiAbsDiff_8u_C4R(src1.ptr<Npp8u>(), static_cast<int>(src1.step), src2.ptr<Npp8u>(), static_cast<int>(src2.step), - dst.ptr<Npp8u>(), static_cast<int>(dst.step), sz) ); - - if (stream == 0) - cudaSafeCall( cudaDeviceSynchronize() ); - } - else if (src1.depth() == CV_8U) - { - NppStreamHandler h(stream); - nppSafeCall( nppiAbsDiff_8u_C1R(src1.ptr<Npp8u>(), static_cast<int>(src1.step), src2.ptr<Npp8u>(), static_cast<int>(src2.step), dst.ptr<Npp8u>(), static_cast<int>(dst.step), sz) ); if (stream == 0) cudaSafeCall( cudaDeviceSynchronize() ); } - else if (src1.depth() == CV_32S) + else if (src1.depth() == CV_16U) { NppStreamHandler h(stream); - nppSafeCall( nppiAbsDiff_32s_C1R(src1.ptr<Npp32s>(), static_cast<int>(src1.step), src2.ptr<Npp32s>(), static_cast<int>(src2.step), - dst.ptr<Npp32s>(), static_cast<int>(dst.step), sz) ); + nppSafeCall( nppiAbsDiff_16u_C1R(src1.ptr<Npp16u>(), static_cast<int>(src1.step), src2.ptr<Npp16u>(), static_cast<int>(src2.step), + dst.ptr<Npp16u>(), static_cast<int>(dst.step), sz) ); if (stream == 0) cudaSafeCall( cudaDeviceSynchronize() ); } - else if (src1.depth() == CV_32F) + else if ((src1.depth() == CV_32F) || (src1.depth() == CV_32S)) { NppStreamHandler h(stream); Index: modules/gpu/src/graphcuts.cpp =================================================================== --- modules/gpu/src/graphcuts.cpp (revision 7044) +++ modules/gpu/src/graphcuts.cpp (working copy) @@ -48,7 +48,7 @@ #else /* !defined (HAVE_CUDA) */ -void cv::gpu::graphcut(GpuMat& terminals, GpuMat& leftTransp, GpuMat& rightTransp, GpuMat& top, GpuMat& bottom, GpuMat& labels, GpuMat& buf, Stream& s) +void cv::gpu::graphcut(GpuMat& terminals, GpuMat& leftTransp, GpuMat& rightTransp, GpuMat& top, GpuMat& bottom, GpuMat& labels, Stream& s) { Size src_size = terminals.size(); CV_Assert(terminals.type() == CV_32S); @@ -68,20 +68,26 @@ sznpp.height = src_size.height; int bufsz; + NppiGraphcutState *pState; + Npp8u *pDeviceMem; + nppSafeCall( nppiGraphcutGetSize(sznpp, &bufsz) ); + cudaSafeCall( cudaMalloc((void **)& pDeviceMem, bufsz) ); - if ((size_t)bufsz > buf.cols * buf.rows * buf.elemSize()) - buf.create(1, bufsz, CV_8U); + nppSafeCall( nppiGraphcutInitAlloc(sznpp, &pState, pDeviceMem) ); cudaStream_t stream = StreamAccessor::getStream(s); NppStreamHandler h(stream); nppSafeCall( nppiGraphcut_32s8u(terminals.ptr<Npp32s>(), leftTransp.ptr<Npp32s>(), rightTransp.ptr<Npp32s>(), top.ptr<Npp32s>(), bottom.ptr<Npp32s>(), - static_cast<int>(terminals.step), static_cast<int>(leftTransp.step), sznpp, labels.ptr<Npp8u>(), static_cast<int>(labels.step), buf.ptr<Npp8u>()) ); + static_cast<int>(terminals.step), static_cast<int>(leftTransp.step), sznpp, labels.ptr<Npp8u>(), static_cast<int>(labels.step), pState) ); if (stream == 0) cudaSafeCall( cudaDeviceSynchronize() ); + + nppSafeCall ( nppiGraphcutFree(pState)); + cudaSafeCall( cudaFree(pDeviceMem) ); } Index: modules/gpu/src/imgproc.cpp =================================================================== --- modules/gpu/src/imgproc.cpp (revision 7044) +++ modules/gpu/src/imgproc.cpp (working copy) @@ -928,7 +928,7 @@ NppStreamHandler h(stream); - nppSafeCall( nppiRectStdDev_32s32f_C1R(src.ptr<Npp32s>(), static_cast<int>(src.step), sqr.ptr<Npp32f>(), static_cast<int>(sqr.step), + nppSafeCall( nppiRectStdDev_32s32f_C1R(src.ptr<Npp32s>(), static_cast<int>(src.step), sqr.ptr<Npp64f>(), static_cast<int>(sqr.step), dst.ptr<Npp32f>(), static_cast<int>(dst.step), sz, nppRect) ); if (stream == 0) Index: modules/stitching/src/seam_finders.cpp =================================================================== --- modules/stitching/src/seam_finders.cpp (revision 7044) +++ modules/stitching/src/seam_finders.cpp (working copy) @@ -529,9 +529,9 @@ gpu::GpuMat rightT_d(rightT); gpu::GpuMat top_d(top); gpu::GpuMat bottom_d(bottom); - gpu::GpuMat labels_d, buf_d; + gpu::GpuMat labels_d; - gpu::graphcut(terminals_d, leftT_d, rightT_d, top_d, bottom_d, labels_d, buf_d); + gpu::graphcut(terminals_d, leftT_d, rightT_d, top_d, bottom_d, labels_d); Mat_<uchar> labels = (Mat)labels_d; for (int y = 0; y < roi.height; ++y)