/*M///////////////////////////////////////////////////////////////////////////////////////
//
//  IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING.
//
//  By downloading, copying, installing or using the software you agree to this license.
//  If you do not agree to this license, do not download, install,
//  copy or use the software.
//
//                          For Open Source Computer Vision Library
//
// Copyright (C) 2000-2008, Intel Corporation, all rights reserved.
// Copyright (C) 2009, Willow Garage Inc., all rights reserved.
// Third party copyrights are property of their respective owners.
//
// Redistribution and use in source and binary forms, with or without modification,
// are permitted provided that the following conditions are met:
//
//   * Redistribution's of source code must retain the above copyright notice,
//     this list of conditions and the following disclaimer.
//
//   * Redistribution's in binary form must reproduce the above copyright notice,
//     this list of conditions and the following disclaimer in the documentation
//     and/or other materials provided with the distribution.
//
//   * The name of the copyright holders may not be used to endorse or promote products
//     derived from this software without specific prior written permission.
//
// This software is provided by the copyright holders and contributors "as is" and
// any express or implied warranties, including, but not limited to, the implied
// warranties of merchantability and fitness for a particular purpose are disclaimed.
// In no event shall the Intel Corporation or contributors be liable for any direct,
// indirect, incidental, special, exemplary, or consequential damages
// (including, but not limited to, procurement of substitute goods or services;
// loss of use, data, or profits; or business interruption) however caused
// and on any theory of liability, whether in contract, strict liability,
// or tort (including negligence or otherwise) arising in any way out of
// the use of this software, even if advised of the possibility of such damage.
//
//M*/
#include "precomp.hpp"
46
using namespace cv::cuda;
48
/////////////////////////////////////////////////////////////
60
uchar* requestMemory(size_t size);
61
void returnMemory(uchar* ptr);
71
std::vector<size_t> allocations;
75
uchar* MemoryStack::requestMemory(size_t size)
77
const size_t freeMem = dataend - tip;
87
allocations.push_back(size);
93
void MemoryStack::returnMemory(uchar* ptr)
95
CV_DbgAssert( ptr >= datastart && ptr < dataend );
98
const size_t allocSize = tip - ptr;
99
CV_Assert( allocSize == allocations.back() );
100
allocations.pop_back();
109
/////////////////////////////////////////////////////////////
121
void initialize(size_t stackSize, int stackCount);
124
MemoryStack* getFreeMemStack();
125
void returnMemStack(MemoryStack* memStack);
128
void initilizeImpl();
138
std::vector<MemoryStack> stacks_;
141
MemoryPool::MemoryPool() : initialized_(false), mem_(0)
143
// default : 10 Mb, 5 stacks
144
stackSize_ = 10 * 1024 * 1024;
148
void MemoryPool::initialize(size_t stackSize, int stackCount)
154
stackSize_ = stackSize;
155
stackCount_ = stackCount;
160
void MemoryPool::initilizeImpl()
162
const size_t totalSize = stackSize_ * stackCount_;
166
cudaError_t err = cudaMalloc(&mem_, totalSize);
167
if (err != cudaSuccess)
170
stacks_.resize(stackCount_);
174
for (int i = 0; i < stackCount_; ++i)
176
stacks_[i].datastart = ptr;
177
stacks_[i].dataend = ptr + stackSize_;
178
stacks_[i].tip = ptr;
179
stacks_[i].isFree = true;
180
stacks_[i].pool = this;
189
void MemoryPool::release()
194
for (int i = 0; i < stackCount_; ++i)
196
CV_DbgAssert( stacks_[i].isFree );
197
CV_DbgAssert( stacks_[i].tip == stacks_[i].datastart );
204
initialized_ = false;
208
MemoryStack* MemoryPool::getFreeMemStack()
218
for (int i = 0; i < stackCount_; ++i)
220
if (stacks_[i].isFree)
222
stacks_[i].isFree = false;
230
void MemoryPool::returnMemStack(MemoryStack* memStack)
234
CV_DbgAssert( !memStack->isFree );
238
for (int i = 0; i < stackCount_; ++i)
240
if (memStack == &stacks_[i])
246
CV_DbgAssert( found );
249
CV_DbgAssert( memStack->tip == memStack->datastart );
251
memStack->isFree = true;
257
////////////////////////////////////////////////////////////////
262
class cv::cuda::Stream::Impl
276
class StackAllocator;
279
class cv::cuda::Stream::Impl
285
Ptr<StackAllocator> stackAllocator;
288
explicit Impl(cudaStream_t stream);
293
cv::cuda::Stream::Impl::Impl() : stream(0), ownStream(false)
295
cudaSafeCall( cudaStreamCreate(&stream) );
298
stackAllocator = makePtr<StackAllocator>(stream);
301
// Wrap an externally created CUDA stream. ownStream stays false, so the
// wrapped stream is never destroyed by this Impl; only the per-stream
// stack allocator is created for it.
cv::cuda::Stream::Impl::Impl(cudaStream_t stream_) : stream(stream_), ownStream(false)
{
    stackAllocator = makePtr<StackAllocator>(stream);
}
306
// Release the per-stream allocator first, then destroy the stream —
// but only if this Impl created it (ownStream); wrapped streams are
// left for their creator to destroy.
cv::cuda::Stream::Impl::~Impl()
{
    stackAllocator.release();

    if (stream && ownStream)
    {
        cudaStreamDestroy(stream);
    }
}
318
/////////////////////////////////////////////////////////////
319
/// DefaultDeviceInitializer
323
namespace cv { namespace cuda
325
class DefaultDeviceInitializer
328
DefaultDeviceInitializer();
329
~DefaultDeviceInitializer();
331
Stream& getNullStream(int deviceId);
332
MemoryPool* getMemoryPool(int deviceId);
338
std::vector<Ptr<Stream> > streams_;
341
std::vector<MemoryPool> pools_;
345
DefaultDeviceInitializer::DefaultDeviceInitializer()
349
DefaultDeviceInitializer::~DefaultDeviceInitializer()
353
for (size_t i = 0; i < pools_.size(); ++i)
355
cudaSetDevice(static_cast<int>(i));
362
Stream& DefaultDeviceInitializer::getNullStream(int deviceId)
364
AutoLock lock(streams_mtx_);
366
if (streams_.empty())
368
int deviceCount = getCudaEnabledDeviceCount();
371
streams_.resize(deviceCount);
374
CV_DbgAssert( deviceId >= 0 && deviceId < static_cast<int>(streams_.size()) );
376
if (streams_[deviceId].empty())
378
cudaStream_t stream = NULL;
379
Ptr<Stream::Impl> impl = makePtr<Stream::Impl>(stream);
380
streams_[deviceId] = Ptr<Stream>(new Stream(impl));
383
return *streams_[deviceId];
386
MemoryPool* DefaultDeviceInitializer::getMemoryPool(int deviceId)
388
AutoLock lock(pools_mtx_);
392
int deviceCount = getCudaEnabledDeviceCount();
395
pools_.resize(deviceCount);
398
CV_DbgAssert( deviceId >= 0 && deviceId < static_cast<int>(pools_.size()) );
400
return &pools_[deviceId];
403
DefaultDeviceInitializer initializer;
408
/////////////////////////////////////////////////////////////
411
cv::cuda::Stream::Stream()
416
impl_ = makePtr<Impl>();
420
bool cv::cuda::Stream::queryIfComplete() const
426
cudaError_t err = cudaStreamQuery(impl_->stream);
428
if (err == cudaErrorNotReady || err == cudaSuccess)
429
return err == cudaSuccess;
436
void cv::cuda::Stream::waitForCompletion()
441
cudaSafeCall( cudaStreamSynchronize(impl_->stream) );
445
void cv::cuda::Stream::waitEvent(const Event& event)
451
cudaSafeCall( cudaStreamWaitEvent(impl_->stream, EventAccessor::getEvent(event), 0) );
455
#if defined(HAVE_CUDA) && (CUDART_VERSION >= 5000)
461
Stream::StreamCallback callback;
464
CallbackData(Stream::StreamCallback callback_, void* userData_) : callback(callback_), userData(userData_) {}
467
void CUDART_CB cudaStreamCallback(cudaStream_t, cudaError_t status, void* userData)
469
CallbackData* data = reinterpret_cast<CallbackData*>(userData);
470
data->callback(static_cast<int>(status), data->userData);
477
void cv::cuda::Stream::enqueueHostCallback(StreamCallback callback, void* userData)
484
#if CUDART_VERSION < 5000
487
CV_Error(cv::Error::StsNotImplemented, "This function requires CUDA >= 5.0");
489
CallbackData* data = new CallbackData(callback, userData);
491
cudaSafeCall( cudaStreamAddCallback(impl_->stream, cudaStreamCallback, data, 0) );
496
Stream& cv::cuda::Stream::Null()
500
static Stream stream;
503
const int deviceId = getDevice();
504
return initializer.getNullStream(deviceId);
508
cv::cuda::Stream::operator bool_type() const
513
return (impl_->stream != 0) ? &Stream::this_type_does_not_support_comparisons : 0;
519
// Expose the raw cudaStream_t backing a cv::cuda::Stream (for interop
// with code that calls the CUDA runtime directly).
cudaStream_t cv::cuda::StreamAccessor::getStream(const Stream& stream)
{
    return stream.impl_->stream;
}
524
// Build a cv::cuda::Stream around an existing cudaStream_t. The stream
// is not owned (see Impl(cudaStream_t): ownStream == false), so the
// caller remains responsible for destroying it.
Stream cv::cuda::StreamAccessor::wrapStream(cudaStream_t stream)
{
    return Stream(makePtr<Stream::Impl>(stream));
}
531
/////////////////////////////////////////////////////////////
538
bool enableMemoryPool = true;
540
class StackAllocator : public GpuMat::Allocator
543
explicit StackAllocator(cudaStream_t stream);
546
bool allocate(GpuMat* mat, int rows, int cols, size_t elemSize);
547
void free(GpuMat* mat);
550
StackAllocator(const StackAllocator&);
551
StackAllocator& operator =(const StackAllocator&);
553
cudaStream_t stream_;
554
MemoryStack* memStack_;
558
StackAllocator::StackAllocator(cudaStream_t stream) : stream_(stream), memStack_(0)
560
if (enableMemoryPool)
562
const int deviceId = getDevice();
563
memStack_ = initializer.getMemoryPool(deviceId)->getFreeMemStack();
564
DeviceInfo devInfo(deviceId);
565
alignment_ = devInfo.textureAlignment();
569
StackAllocator::~StackAllocator()
571
cudaStreamSynchronize(stream_);
574
memStack_->pool->returnMemStack(memStack_);
577
// Round `what` up to the next multiple of `alignment`.
// Precondition: `alignment` is a power of two — the mask trick
// ((what + a-1) & ~(a-1)) is only correct in that case.
size_t alignUp(size_t what, size_t alignment)
{
    size_t alignMask = alignment - 1;
    size_t inverseAlignMask = ~alignMask;
    size_t res = (what + alignMask) & inverseAlignMask;
    return res;
}
585
bool StackAllocator::allocate(GpuMat* mat, int rows, int cols, size_t elemSize)
590
size_t pitch, memSize;
592
if (rows > 1 && cols > 1)
594
pitch = alignUp(cols * elemSize, alignment_);
595
memSize = pitch * rows;
599
// Single row or single column must be continuous
600
pitch = elemSize * cols;
601
memSize = alignUp(elemSize * cols * rows, 64);
604
uchar* ptr = memStack_->requestMemory(memSize);
611
mat->refcount = (int*) fastMalloc(sizeof(int));
616
void StackAllocator::free(GpuMat* mat)
621
memStack_->returnMemory(mat->datastart);
622
fastFree(mat->refcount);
628
/////////////////////////////////////////////////////////////
631
void cv::cuda::setBufferPoolUsage(bool on)
637
enableMemoryPool = on;
641
void cv::cuda::setBufferPoolConfig(int deviceId, size_t stackSize, int stackCount)
649
const int currentDevice = getDevice();
654
initializer.getMemoryPool(deviceId)->initialize(stackSize, stackCount);
658
const int deviceCount = getCudaEnabledDeviceCount();
660
for (deviceId = 0; deviceId < deviceCount; ++deviceId)
663
initializer.getMemoryPool(deviceId)->initialize(stackSize, stackCount);
667
setDevice(currentDevice);
673
cv::cuda::BufferPool::BufferPool(Stream& stream) : allocator_(stream.impl_->stackAllocator.get())
677
GpuMat cv::cuda::BufferPool::getBuffer(int rows, int cols, int type)
679
GpuMat buf(allocator_);
680
buf.create(rows, cols, type);
686
////////////////////////////////////////////////////////////////
691
class cv::cuda::Event::Impl
702
class cv::cuda::Event::Impl
708
explicit Impl(unsigned int flags);
709
explicit Impl(cudaEvent_t event);
713
cv::cuda::Event::Impl::Impl(unsigned int flags) : event(0), ownEvent(false)
715
cudaSafeCall( cudaEventCreateWithFlags(&event, flags) );
719
// Wrap an externally created CUDA event. ownEvent stays false, so the
// wrapped event is not destroyed when this Impl is destroyed.
cv::cuda::Event::Impl::Impl(cudaEvent_t e) : event(e), ownEvent(false)
{
}
723
cv::cuda::Event::Impl::~Impl()
{
    // Destroy the event only if this Impl created it; events wrapped
    // via Impl(cudaEvent_t) are owned by the caller.
    if (event && ownEvent)
    {
        cudaEventDestroy(event);
    }
}
731
// Expose the raw cudaEvent_t backing a cv::cuda::Event (for interop
// with code that calls the CUDA runtime directly).
cudaEvent_t cv::cuda::EventAccessor::getEvent(const Event& event)
{
    return event.impl_->event;
}
736
// Build a cv::cuda::Event around an existing cudaEvent_t. The event is
// not owned (see Impl(cudaEvent_t): ownEvent == false), so the caller
// remains responsible for destroying it.
Event cv::cuda::EventAccessor::wrapEvent(cudaEvent_t event)
{
    return Event(makePtr<Event::Impl>(event));
}
743
cv::cuda::Event::Event(CreateFlags flags)
749
impl_ = makePtr<Impl>(flags);
753
void cv::cuda::Event::record(Stream& stream)
759
cudaSafeCall( cudaEventRecord(impl_->event, StreamAccessor::getStream(stream)) );
763
bool cv::cuda::Event::queryIfComplete() const
769
cudaError_t err = cudaEventQuery(impl_->event);
771
if (err == cudaErrorNotReady || err == cudaSuccess)
772
return err == cudaSuccess;
779
void cv::cuda::Event::waitForCompletion()
784
cudaSafeCall( cudaEventSynchronize(impl_->event) );
788
float cv::cuda::Event::elapsedTime(const Event& start, const Event& end)
797
cudaSafeCall( cudaEventElapsedTime(&ms, start.impl_->event, end.impl_->event) );