Index: modules/gpu/include/opencv2/gpu/gpu.hpp
===================================================================
--- modules/gpu/include/opencv2/gpu/gpu.hpp (revision 7044)
+++ modules/gpu/include/opencv2/gpu/gpu.hpp (working copy)
@@ -913,7 +913,7 @@
 
 //!performs labeling via graph cuts
 CV_EXPORTS void graphcut(GpuMat& terminals, GpuMat& leftTransp, GpuMat& rightTransp, GpuMat& top, GpuMat& bottom, GpuMat& labels, 
-                         GpuMat& buf, Stream& stream = Stream::Null());
+                         Stream& stream = Stream::Null());
 
 ////////////////////////////////// Histograms //////////////////////////////////
 
Index: modules/gpu/src/matrix_reductions.cpp
===================================================================
--- modules/gpu/src/matrix_reductions.cpp   (revision 7044)
+++ modules/gpu/src/matrix_reductions.cpp   (working copy)
@@ -115,11 +115,17 @@
     sz.width  = src.cols;
     sz.height = src.rows;
 
+    int nBufferSize;
+    nppiMeanStdDev8uC1RGetBufferHostSize (sz, &nBufferSize);
+    Npp8u * pDeviceBuffer;
+    cudaSafeCall( cudaMalloc((void **)& pDeviceBuffer, nBufferSize) );
+
     DeviceBuffer dbuf(2);
 
-    nppSafeCall( nppiMean_StdDev_8u_C1R(src.ptr<Npp8u>(), static_cast<int>(src.step), sz, dbuf, (double*)dbuf + 1) );
+    nppSafeCall( nppiMean_StdDev_8u_C1R(src.ptr<Npp8u>(), static_cast<int>(src.step), sz, pDeviceBuffer, dbuf, (double*)dbuf + 1) );
 
     cudaSafeCall( cudaDeviceSynchronize() );
+    cudaSafeCall( cudaFree(pDeviceBuffer) );
     
     double* ptrs[2] = {mean.val, stddev.val};
     dbuf.download(ptrs);
Index: modules/gpu/src/element_operations.cpp
===================================================================
--- modules/gpu/src/element_operations.cpp  (revision 7044)
+++ modules/gpu/src/element_operations.cpp  (working copy)
@@ -656,39 +656,27 @@
     sz.width  = src1.cols * src1.channels();
     sz.height = src1.rows;
 
-    if (src1.depth() == CV_8U && (src1.cols * src1.channels()) % 4 == 0)
+    if (src1.depth() == CV_8U)
     {
         NppStreamHandler h(stream);
 
-        sz.width /= 4;
-
-        nppSafeCall( nppiAbsDiff_8u_C4R(src1.ptr<Npp8u>(), static_cast<int>(src1.step), src2.ptr<Npp8u>(), static_cast<int>(src2.step), 
-            dst.ptr<Npp8u>(), static_cast<int>(dst.step), sz) );
-
-        if (stream == 0)
-            cudaSafeCall( cudaDeviceSynchronize() );
-    }
-    else if (src1.depth() == CV_8U)
-    {
-        NppStreamHandler h(stream);
-
         nppSafeCall( nppiAbsDiff_8u_C1R(src1.ptr<Npp8u>(), static_cast<int>(src1.step), src2.ptr<Npp8u>(), static_cast<int>(src2.step), 
             dst.ptr<Npp8u>(), static_cast<int>(dst.step), sz) );
 
         if (stream == 0)
             cudaSafeCall( cudaDeviceSynchronize() );
     }
-    else if (src1.depth() == CV_32S)
+    else if (src1.depth() == CV_16U)
     {
         NppStreamHandler h(stream);
 
-        nppSafeCall( nppiAbsDiff_32s_C1R(src1.ptr<Npp32s>(), static_cast<int>(src1.step), src2.ptr<Npp32s>(), static_cast<int>(src2.step), 
-            dst.ptr<Npp32s>(), static_cast<int>(dst.step), sz) );
+        nppSafeCall( nppiAbsDiff_16u_C1R(src1.ptr<Npp16u>(), static_cast<int>(src1.step), src2.ptr<Npp16u>(), static_cast<int>(src2.step), 
+            dst.ptr<Npp16u>(), static_cast<int>(dst.step), sz) );
 
         if (stream == 0)
             cudaSafeCall( cudaDeviceSynchronize() );
     }
-    else if (src1.depth() == CV_32F)
+    else if ((src1.depth() == CV_32F) || (src1.depth() == CV_32S))
     {
         NppStreamHandler h(stream);
 
Index: modules/gpu/src/graphcuts.cpp
===================================================================
--- modules/gpu/src/graphcuts.cpp   (revision 7044)
+++ modules/gpu/src/graphcuts.cpp   (working copy)
@@ -48,7 +48,7 @@
 
 #else /* !defined (HAVE_CUDA) */
 
-void cv::gpu::graphcut(GpuMat& terminals, GpuMat& leftTransp, GpuMat& rightTransp, GpuMat& top, GpuMat& bottom, GpuMat& labels, GpuMat& buf, Stream& s)
+void cv::gpu::graphcut(GpuMat& terminals, GpuMat& leftTransp, GpuMat& rightTransp, GpuMat& top, GpuMat& bottom, GpuMat& labels, Stream& s)
 {
     Size src_size = terminals.size();
     CV_Assert(terminals.type() == CV_32S);
@@ -68,20 +68,26 @@
     sznpp.height = src_size.height;
 
     int bufsz;
+    NppiGraphcutState *pState;
+    Npp8u *pDeviceMem;
+
     nppSafeCall( nppiGraphcutGetSize(sznpp, &bufsz) );
+    cudaSafeCall( cudaMalloc((void **)& pDeviceMem, bufsz) );
 
-    if ((size_t)bufsz > buf.cols * buf.rows * buf.elemSize())
-        buf.create(1, bufsz, CV_8U);
+    nppSafeCall( nppiGraphcutInitAlloc(sznpp, &pState, pDeviceMem) );
 
     cudaStream_t stream = StreamAccessor::getStream(s);
 
     NppStreamHandler h(stream);
 
     nppSafeCall( nppiGraphcut_32s8u(terminals.ptr<Npp32s>(), leftTransp.ptr<Npp32s>(), rightTransp.ptr<Npp32s>(), top.ptr<Npp32s>(), bottom.ptr<Npp32s>(),
-        static_cast<int>(terminals.step), static_cast<int>(leftTransp.step), sznpp, labels.ptr<Npp8u>(), static_cast<int>(labels.step), buf.ptr<Npp8u>()) );
+        static_cast<int>(terminals.step), static_cast<int>(leftTransp.step), sznpp, labels.ptr<Npp8u>(), static_cast<int>(labels.step), pState) );
 
     if (stream == 0)
         cudaSafeCall( cudaDeviceSynchronize() );
+
+    nppSafeCall ( nppiGraphcutFree(pState));
+    cudaSafeCall( cudaFree(pDeviceMem) );
 }
 
 
Index: modules/gpu/src/imgproc.cpp
===================================================================
--- modules/gpu/src/imgproc.cpp (revision 7044)
+++ modules/gpu/src/imgproc.cpp (working copy)
@@ -928,7 +928,7 @@
 
     NppStreamHandler h(stream);
 
-    nppSafeCall( nppiRectStdDev_32s32f_C1R(src.ptr<Npp32s>(), static_cast<int>(src.step), sqr.ptr<Npp32f>(), static_cast<int>(sqr.step),
+    nppSafeCall( nppiRectStdDev_32s32f_C1R(src.ptr<Npp32s>(), static_cast<int>(src.step), sqr.ptr<Npp64f>(), static_cast<int>(sqr.step),
                 dst.ptr<Npp32f>(), static_cast<int>(dst.step), sz, nppRect) );
 
     if (stream == 0)
Index: modules/stitching/src/seam_finders.cpp
===================================================================
--- modules/stitching/src/seam_finders.cpp  (revision 7044)
+++ modules/stitching/src/seam_finders.cpp  (working copy)
@@ -529,9 +529,9 @@
     gpu::GpuMat rightT_d(rightT);
     gpu::GpuMat top_d(top);
     gpu::GpuMat bottom_d(bottom);
-    gpu::GpuMat labels_d, buf_d;
+    gpu::GpuMat labels_d;
 
-    gpu::graphcut(terminals_d, leftT_d, rightT_d, top_d, bottom_d, labels_d, buf_d);
+    gpu::graphcut(terminals_d, leftT_d, rightT_d, top_d, bottom_d, labels_d);
 
     Mat_<uchar> labels = (Mat)labels_d;
     for (int y = 0; y < roi.height; ++y)