373 lines
14 KiB
C++
373 lines
14 KiB
C++
// Copyright (C) 2017 Davis E. King (davis@dlib.net)
|
|
// License: Boost Software License See LICENSE.txt for the full license.
|
|
#ifndef DLIB_DNN_CuDA_DATA_PTR_H_
|
|
#define DLIB_DNN_CuDA_DATA_PTR_H_
|
|
|
|
#include "../assert.h"
|
|
|
|
#ifdef DLIB_USE_CUDA
|
|
|
|
#include <memory>
|
|
#include <vector>
|
|
#include <type_traits>
|
|
|
|
namespace dlib
|
|
{
|
|
namespace cuda
|
|
{
|
|
|
|
// ------------------------------------------------------------------------------------
|
|
|
|
class cuda_data_void_ptr;
|
|
class weak_cuda_data_void_ptr
|
|
{
|
|
/*!
|
|
WHAT THIS OBJECT REPRESENTS
|
|
This is just like a std::weak_ptr version of cuda_data_void_ptr. It allows you
|
|
to hold a non-owning reference to a cuda_data_void_ptr.
|
|
!*/
|
|
public:
|
|
weak_cuda_data_void_ptr() = default;
|
|
|
|
weak_cuda_data_void_ptr(const cuda_data_void_ptr& ptr);
|
|
|
|
void reset() { pdata.reset(); num = 0; }
|
|
|
|
cuda_data_void_ptr lock() const;
|
|
/*!
|
|
ensures
|
|
- if (the memory block referenced by this object hasn't been deleted) then
|
|
- returns a cuda_data_void_ptr referencing that memory block
|
|
- else
|
|
- returns a default initialized cuda_data_void_ptr (i.e. an empty one).
|
|
!*/
|
|
|
|
private:
|
|
size_t num = 0;
|
|
std::weak_ptr<void> pdata;
|
|
};
|
|
|
|
// ----------------------------------------------------------------------------------------
|
|
|
|
class cuda_data_void_ptr
|
|
{
|
|
/*!
|
|
WHAT THIS OBJECT REPRESENTS
|
|
This is a block of memory on a CUDA device.
|
|
!*/
|
|
public:
|
|
|
|
cuda_data_void_ptr() = default;
|
|
|
|
cuda_data_void_ptr(size_t n);
|
|
/*!
|
|
ensures
|
|
- This object will allocate a device memory buffer of n bytes.
|
|
- #size() == n
|
|
!*/
|
|
|
|
void* data() { return pdata.get(); }
|
|
const void* data() const { return pdata.get(); }
|
|
operator void*() { return pdata.get(); }
|
|
operator const void*() const { return pdata.get(); }
|
|
|
|
void reset() { pdata.reset(); }
|
|
|
|
size_t size() const { return num; }
|
|
/*!
|
|
ensures
|
|
- returns the length of this buffer, in bytes.
|
|
!*/
|
|
|
|
cuda_data_void_ptr operator+ (size_t offset) const
|
|
/*!
|
|
requires
|
|
- offset < size()
|
|
ensures
|
|
- returns a pointer that is offset by the given amount.
|
|
!*/
|
|
{
|
|
DLIB_CASSERT(offset < num);
|
|
cuda_data_void_ptr temp;
|
|
temp.num = num-offset;
|
|
temp.pdata = std::shared_ptr<void>(pdata, ((char*)pdata.get())+offset);
|
|
return temp;
|
|
}
|
|
|
|
void shrink(size_t new_size)
|
|
/*!
|
|
requires
|
|
- new_size <= num
|
|
ensures
|
|
- #size() == new_size
|
|
- Doesn't actually deallocate anything, just changes the size() metadata to a
|
|
smaller number and only for this instance of the pointer.
|
|
!*/
|
|
{
|
|
DLIB_CASSERT(new_size <= num);
|
|
num = new_size;
|
|
}
|
|
|
|
private:
|
|
|
|
friend class weak_cuda_data_void_ptr;
|
|
size_t num = 0;
|
|
std::shared_ptr<void> pdata;
|
|
};
|
|
|
|
inline cuda_data_void_ptr operator+(size_t offset, const cuda_data_void_ptr& rhs) { return rhs+offset; }
|
|
|
|
// ------------------------------------------------------------------------------------
|
|
|
|
void memcpy(
|
|
void* dest,
|
|
const cuda_data_void_ptr& src
|
|
);
|
|
/*!
|
|
requires
|
|
- dest == a pointer to at least src.size() bytes on the host machine.
|
|
ensures
|
|
- copies the GPU data from src into dest.
|
|
- This routine is equivalent to performing: memcpy(dest,src,src.size())
|
|
!*/
|
|
|
|
void memcpy(
|
|
void* dest,
|
|
const cuda_data_void_ptr& src,
|
|
const size_t num
|
|
);
|
|
/*!
|
|
requires
|
|
- dest == a pointer to at least num bytes on the host machine.
|
|
- num <= src.size()
|
|
ensures
|
|
- copies the GPU data from src into dest. Copies only the first num bytes
|
|
of src to dest.
|
|
!*/
|
|
|
|
// ------------------------------------------------------------------------------------
|
|
|
|
void memcpy(
|
|
cuda_data_void_ptr dest,
|
|
const void* src
|
|
);
|
|
/*!
|
|
requires
|
|
- dest == a pointer to at least src.size() bytes on the host machine.
|
|
ensures
|
|
- copies the host data from src to the GPU memory buffer dest.
|
|
- This routine is equivalent to performing: memcpy(dest,src,dest.size())
|
|
!*/
|
|
|
|
void memcpy(
|
|
cuda_data_void_ptr dest,
|
|
const void* src,
|
|
const size_t num
|
|
);
|
|
/*!
|
|
requires
|
|
- dest == a pointer to at least num bytes on the host machine.
|
|
- num <= dest.size()
|
|
ensures
|
|
- copies the host data from src to the GPU memory buffer dest. Copies only
|
|
the first num bytes of src to dest.
|
|
!*/
|
|
|
|
// ------------------------------------------------------------------------------------
|
|
// ------------------------------------------------------------------------------------
|
|
// ------------------------------------------------------------------------------------
|
|
|
|
template <typename T>
|
|
class cuda_data_ptr
|
|
{
|
|
/*!
|
|
WHAT THIS OBJECT REPRESENTS
|
|
This is a block of memory on a CUDA device. It is just a type safe
|
|
version of cuda_data_void_ptr.
|
|
!*/
|
|
|
|
public:
|
|
|
|
static_assert(std::is_standard_layout<T>::value, "You can only create basic standard layout types on the GPU");
|
|
|
|
cuda_data_ptr() = default;
|
|
cuda_data_ptr(size_t n) : num(n)
|
|
/*!
|
|
ensures
|
|
- This object will allocate a device memory buffer of n T objects.
|
|
- #size() == n
|
|
!*/
|
|
{
|
|
if (n == 0)
|
|
return;
|
|
|
|
pdata = cuda_data_void_ptr(n*sizeof(T));
|
|
}
|
|
|
|
cuda_data_ptr(
|
|
const cuda_data_ptr<typename std::remove_const<T>::type> &other
|
|
) : num(other.num), pdata(other.pdata) {}
|
|
/*!
|
|
ensures
|
|
- *this is a copy of other. This version of the copy constructor allows
|
|
assigning non-const pointers to const ones. For instance, converting from
|
|
cuda_data_ptr<float> to cuda_data_ptr<const float>.
|
|
!*/
|
|
|
|
T* data() { return (T*)pdata.data(); }
|
|
const T* data() const { return (T*)pdata.data(); }
|
|
|
|
operator T*() { return (T*)pdata.data(); }
|
|
operator const T*() const { return (T*)pdata.data(); }
|
|
|
|
void reset() { pdata.reset(); }
|
|
|
|
size_t size() const { return num; }
|
|
/*!
|
|
ensures
|
|
- returns the number of T instances pointed to by *this.
|
|
!*/
|
|
|
|
operator cuda_data_void_ptr() const
|
|
/*!
|
|
ensures
|
|
- returns *this as a cuda_data_void_ptr. Importantly, the returned size() will
|
|
reflect the number of bytes referenced by *this. To be clear, let P be the
|
|
returned pointer. Then:
|
|
- P.get() == get()
|
|
- P.size() == size() * sizeof(T)
|
|
!*/
|
|
{
|
|
cuda_data_void_ptr temp = pdata;
|
|
temp.shrink(size() * sizeof(T));
|
|
return temp;
|
|
}
|
|
|
|
private:
|
|
template <typename U>
|
|
friend cuda_data_ptr<U> static_pointer_cast(const cuda_data_void_ptr &ptr);
|
|
template <typename U>
|
|
friend cuda_data_ptr<U> static_pointer_cast(const cuda_data_void_ptr &ptr, size_t num);
|
|
template <typename U>
|
|
friend class cuda_data_ptr;
|
|
|
|
size_t num = 0;
|
|
cuda_data_void_ptr pdata;
|
|
};
|
|
|
|
template <typename T>
|
|
cuda_data_ptr<T> static_pointer_cast(const cuda_data_void_ptr &ptr)
|
|
{
|
|
DLIB_CASSERT(ptr.size() % sizeof(T) == 0,
|
|
"Size of memory buffer in ptr doesn't match sizeof(T). "
|
|
<< "\nptr.size(): "<< ptr.size()
|
|
<< "\nsizeof(T): "<< sizeof(T));
|
|
cuda_data_ptr<T> result;
|
|
result.pdata = ptr;
|
|
result.num = ptr.size() / sizeof(T);
|
|
return result;
|
|
}
|
|
|
|
template <typename T>
|
|
cuda_data_ptr<T> static_pointer_cast(const cuda_data_void_ptr &ptr, size_t num)
|
|
{
|
|
DLIB_CASSERT(num*sizeof(T) <= ptr.size(),
|
|
"Size of memory buffer in ptr isn't big enough to represent this many T objects. "
|
|
<< "\nnum: "<< num
|
|
<< "\nnum*sizeof(T): "<< num*sizeof(T)
|
|
<< "\nsizeof(T): "<< sizeof(T)
|
|
<< "\nptr.size(): "<< ptr.size());
|
|
|
|
cuda_data_ptr<T> result;
|
|
result.pdata = ptr;
|
|
result.num = num;
|
|
return result;
|
|
}
|
|
|
|
template <typename T>
|
|
void memcpy(std::vector<T>& dest, const cuda_data_ptr<T>& src)
|
|
{
|
|
dest.resize(src.size());
|
|
if (src.size() != 0)
|
|
memcpy(dest.data(), static_cast<cuda_data_void_ptr>(src));
|
|
}
|
|
|
|
template <typename T>
|
|
void memcpy(cuda_data_ptr<T>& dest, const std::vector<T>& src)
|
|
{
|
|
if (src.size() != dest.size())
|
|
dest = cuda_data_ptr<T>(src.size());
|
|
|
|
if (dest.size() != 0)
|
|
memcpy(static_cast<cuda_data_void_ptr>(dest), src.data());
|
|
}
|
|
|
|
template <typename T>
|
|
void memcpy(cuda_data_ptr<T>& dest, const T* src)
|
|
{
|
|
memcpy(static_cast<cuda_data_void_ptr>(dest), src);
|
|
}
|
|
template <typename T>
|
|
void memcpy(cuda_data_ptr<T>& dest, const T* src, size_t num)
|
|
{
|
|
DLIB_CASSERT(num <= dest.size());
|
|
memcpy(static_cast<cuda_data_void_ptr>(dest), src, num*sizeof(T));
|
|
}
|
|
|
|
template <typename T>
|
|
void memcpy(T* dest, const cuda_data_ptr<T>& src)
|
|
{
|
|
memcpy(dest, static_cast<cuda_data_void_ptr>(src));
|
|
}
|
|
template <typename T>
|
|
void memcpy(T* dest, const cuda_data_ptr<T>& src, size_t num)
|
|
{
|
|
DLIB_CASSERT(num <= src.size());
|
|
memcpy(dest, static_cast<cuda_data_void_ptr>(src), num*sizeof(T));
|
|
}
|
|
|
|
// ------------------------------------------------------------------------------------
|
|
|
|
cuda_data_void_ptr device_global_buffer(size_t size);
|
|
/*!
|
|
ensures
|
|
- Returns a pointer to a globally shared CUDA memory buffer on the
|
|
currently selected CUDA device. The buffer is also thread local. So
|
|
each host thread will get its own buffer. You can use this global buffer
|
|
as scratch space for CUDA computations that all take place on the default
|
|
stream. Using it in this way ensures that there aren't any race conditions
|
|
involving the use of the buffer.
|
|
- The returned pointer will point to at least size bytes. It may point to more.
|
|
- The global buffer is deallocated once all references to it are destructed.
|
|
However, if device_global_buffer() is called before then with a size <= the last
|
|
size requested, then the previously returned global buffer pointer is returned.
|
|
This avoids triggering expensive CUDA reallocations. So if you want to avoid
|
|
these reallocations then hold a copy of the pointer returned by this function.
|
|
However, as a general rule, client code should not hold the returned
|
|
cuda_data_void_ptr for long durations, but instead should call
|
|
device_global_buffer() whenever the buffer is needed, and overwrite the previously
|
|
returned pointer with the new pointer. Doing so ensures multiple buffers are not
|
|
kept around in the event that multiple sized buffers are requested. To explain
|
|
this, consider this code, assumed to execute at program startup:
|
|
auto ptr1 = device_global_buffer(1);
|
|
auto ptr2 = device_global_buffer(2);
|
|
auto ptr3 = device_global_buffer(3);
|
|
since the sizes increased at each call 3 separate buffers were allocated. First
|
|
one of size 1, then of size 2, then of size 3. If we then executed:
|
|
ptr1 = device_global_buffer(1);
|
|
ptr2 = device_global_buffer(2);
|
|
ptr3 = device_global_buffer(3);
|
|
all three of these pointers would now point to the same buffer, since the smaller
|
|
requests can be satisfied by returning the size 3 buffer in each case.
|
|
!*/
|
|
|
|
// ----------------------------------------------------------------------------------------
|
|
|
|
}
|
|
}
|
|
|
|
#endif // DLIB_USE_CUDA
|
|
|
|
#endif // DLIB_DNN_CuDA_DATA_PTR_H_
|
|
|