274 lines
8.7 KiB
C++
274 lines
8.7 KiB
C++
// Copyright (C) 2017 Davis E. King (davis@dlib.net)
|
|
// License: Boost Software License See LICENSE.txt for the full license.
|
|
#ifndef DLIB_DNN_CuDA_DATA_PTR_H_
|
|
#define DLIB_DNN_CuDA_DATA_PTR_H_
|
|
|
|
#include "../assert.h"
|
|
|
|
#ifdef DLIB_USE_CUDA
|
|
|
|
#include <memory>
|
|
#include <vector>
|
|
|
|
namespace dlib
|
|
{
|
|
namespace cuda
|
|
{
|
|
|
|
// ------------------------------------------------------------------------------------
|
|
|
|
class cuda_data_void_ptr
|
|
{
|
|
/*!
|
|
WHAT THIS OBJECT REPRESENTS
|
|
This is a block of memory on a CUDA device.
|
|
!*/
|
|
public:
|
|
|
|
cuda_data_void_ptr() = default;
|
|
|
|
cuda_data_void_ptr(size_t n);
|
|
/*!
|
|
ensures
|
|
- This object will allocate a device memory buffer of n bytes.
|
|
- #size() == n
|
|
!*/
|
|
|
|
void* data() { return pdata.get(); }
|
|
const void* data() const { return pdata.get(); }
|
|
operator void*() { return pdata.get(); }
|
|
operator const void*() const { return pdata.get(); }
|
|
|
|
void reset() { pdata.reset(); }
|
|
|
|
size_t size() const { return num; }
|
|
/*!
|
|
ensures
|
|
- returns the length of this buffer, in bytes.
|
|
!*/
|
|
|
|
cuda_data_void_ptr operator+ (size_t offset) const
|
|
/*!
|
|
requires
|
|
- offset < size()
|
|
ensures
|
|
- returns a pointer that is offset by the given amount.
|
|
!*/
|
|
{
|
|
DLIB_CASSERT(offset < num);
|
|
cuda_data_void_ptr temp;
|
|
temp.num = num-offset;
|
|
temp.pdata = std::shared_ptr<void>(pdata, ((char*)pdata.get())+offset);
|
|
return temp;
|
|
}
|
|
|
|
private:
|
|
|
|
size_t num = 0;
|
|
std::shared_ptr<void> pdata;
|
|
};
|
|
|
|
inline cuda_data_void_ptr operator+(size_t offset, const cuda_data_void_ptr& rhs)
/*!
    requires
        - offset < rhs.size()
    ensures
        - returns rhs+offset.  This overload only exists so the offset can be
          written on the left hand side of the +.
!*/
{
    return rhs + offset;
}
|
|
|
|
// ------------------------------------------------------------------------------------
|
|
|
|
void memcpy(
|
|
void* dest,
|
|
const cuda_data_void_ptr& src
|
|
);
|
|
/*!
|
|
requires
|
|
- dest == a pointer to at least src.size() bytes on the host machine.
|
|
ensures
|
|
- copies the GPU data from src into dest.
|
|
- This routine is equivalent to performing: memcpy(dest,src,src.size())
|
|
!*/
|
|
|
|
void memcpy(
|
|
void* dest,
|
|
const cuda_data_void_ptr& src,
|
|
const size_t num
|
|
);
|
|
/*!
|
|
requires
|
|
- dest == a pointer to at least num bytes on the host machine.
|
|
- num <= src.size()
|
|
ensures
|
|
- copies the GPU data from src into dest. Copies only the first num bytes
|
|
of src to dest.
|
|
!*/
|
|
|
|
// ------------------------------------------------------------------------------------
|
|
|
|
void memcpy(cuda_data_void_ptr dest, const void* src);
/*!
    requires
        - src == a pointer to at least dest.size() bytes on the host machine.
    ensures
        - copies the host data from src to the GPU memory buffer dest.
        - This routine is equivalent to performing: memcpy(dest,src,dest.size())
!*/
|
|
|
|
void memcpy(cuda_data_void_ptr dest, const void* src, const size_t num);
/*!
    requires
        - src == a pointer to at least num bytes on the host machine.
        - num <= dest.size()
    ensures
        - copies the host data from src to the GPU memory buffer dest.  Copies only
          the first num bytes of src to dest.
!*/
|
|
|
|
// ------------------------------------------------------------------------------------
|
|
// ------------------------------------------------------------------------------------
|
|
// ------------------------------------------------------------------------------------
|
|
|
|
template <typename T>
|
|
class cuda_data_ptr
|
|
{
|
|
/*!
|
|
WHAT THIS OBJECT REPRESENTS
|
|
This is a block of memory on a CUDA device. It is just a type safe
|
|
version of cuda_data_void_ptr.
|
|
!*/
|
|
|
|
public:
|
|
|
|
static_assert(std::is_standard_layout<T>::value, "You can only create basic standard layout types on the GPU");
|
|
|
|
cuda_data_ptr() = default;
|
|
cuda_data_ptr(size_t n) : num(n)
|
|
/*!
|
|
ensures
|
|
- This object will allocate a device memory buffer of n T objects.
|
|
- #size() == n
|
|
!*/
|
|
{
|
|
if (n == 0)
|
|
return;
|
|
|
|
pdata = cuda_data_void_ptr(n*sizeof(T));
|
|
}
|
|
|
|
T* data() { return (T*)pdata.data(); }
|
|
const T* data() const { return (T*)pdata.data(); }
|
|
|
|
operator T*() { return (T*)pdata.data(); }
|
|
operator const T*() const { return (T*)pdata.data(); }
|
|
|
|
void reset() { pdata.reset(); }
|
|
|
|
size_t size() const { return num; }
|
|
|
|
|
|
friend void memcpy(
|
|
std::vector<T>& dest,
|
|
const cuda_data_ptr& src
|
|
)
|
|
{
|
|
dest.resize(src.size());
|
|
if (src.size() != 0)
|
|
memcpy(dest.data(), src.pdata);
|
|
}
|
|
|
|
friend void memcpy(
|
|
cuda_data_ptr& dest,
|
|
const std::vector<T>& src
|
|
)
|
|
{
|
|
if (src.size() != dest.size())
|
|
dest = cuda_data_ptr<T>(src.size());
|
|
|
|
if (dest.size() != 0)
|
|
memcpy(dest.pdata, src.data());
|
|
}
|
|
|
|
friend void memcpy(
|
|
cuda_data_ptr& dest,
|
|
const float* src
|
|
)
|
|
{
|
|
memcpy(dest.pdata, src);
|
|
}
|
|
|
|
friend void memcpy(
|
|
float* dest,
|
|
const cuda_data_ptr& src
|
|
)
|
|
{
|
|
memcpy(dest, src.pdata);
|
|
}
|
|
|
|
private:
|
|
|
|
size_t num = 0;
|
|
cuda_data_void_ptr pdata;
|
|
};
|
|
|
|
// ------------------------------------------------------------------------------------
|
|
|
|
class resizable_cuda_buffer
|
|
{
|
|
/*!
|
|
WHAT THIS OBJECT REPRESENTS
|
|
This is a block of memory on a CUDA device that will be automatically
|
|
resized if requested size is larger than allocated.
|
|
!*/
|
|
public:
|
|
cuda_data_void_ptr get(size_t size)
|
|
/*!
|
|
ensures
|
|
- This object will return the buffer of requested size or larger.
|
|
- buffer.size() >= size
|
|
- Client code should not hold the returned cuda_data_void_ptr for long
|
|
durations, but instead should call get() whenever the buffer is
|
|
needed. Doing so ensures that multiple buffers are not kept around
|
|
in the event of a resize.
|
|
!*/
|
|
{
|
|
if (buffer.size() < size)
|
|
{
|
|
buffer.reset();
|
|
buffer = cuda_data_void_ptr(size);
|
|
}
|
|
return buffer;
|
|
}
|
|
private:
|
|
cuda_data_void_ptr buffer;
|
|
};
|
|
|
|
// ----------------------------------------------------------------------------------------
|
|
|
|
std::shared_ptr<resizable_cuda_buffer> device_global_buffer();
/*!
    ensures
        - Returns a pointer to a CUDA memory buffer that is shared globally for
          the currently selected CUDA device.  The buffer is additionally thread
          local, meaning every host thread is handed its own buffer.  It is
          intended as scratch space for CUDA computations that all run on the
          default stream; used that way, there are no race conditions on the
          buffer.
        - Once every reference to the global buffer has been destructed the
          buffer is deallocated, and it is allocated again when next needed.
          To avoid those reallocations, keep a copy of the shared_ptr returned
          by this function.
!*/
|
|
|
|
// ----------------------------------------------------------------------------------------
|
|
|
|
}
|
|
}
|
|
|
|
#endif // DLIB_USE_CUDA
|
|
|
|
#endif // DLIB_DNN_CuDA_DATA_PTR_H_
|
|
|