255 lines
9.2 KiB
C++
255 lines
9.2 KiB
C++
// Copyright (C) 2015 Davis E. King (davis@dlib.net)
|
|
// License: Boost Software License See LICENSE.txt for the full license.
|
|
#ifndef DLIB_GPU_DaTA_CPP_
|
|
#define DLIB_GPU_DaTA_CPP_
|
|
|
|
// Only things that require CUDA are declared in this cpp file. Everything else is in the
|
|
// gpu_data.h header so that it can operate as "header-only" code when using just the CPU.
|
|
#ifdef DLIB_USE_CUDA
|
|
|
|
#include "gpu_data.h"
|
|
#include <iostream>
|
|
#include "cuda_utils.h"
|
|
#include <cstring>
|
|
#include <cuda.h>
|
|
|
|
namespace dlib
|
|
{
|
|
|
|
// ----------------------------------------------------------------------------------------
|
|
|
|
void memcpy (
|
|
gpu_data& dest,
|
|
const gpu_data& src
|
|
)
|
|
{
|
|
DLIB_CASSERT(dest.size() == src.size());
|
|
if (src.size() == 0 || &dest == &src)
|
|
return;
|
|
|
|
memcpy(dest,0, src, 0, src.size());
|
|
}
|
|
|
|
void memcpy (
|
|
gpu_data& dest,
|
|
size_t dest_offset,
|
|
const gpu_data& src,
|
|
size_t src_offset,
|
|
size_t num
|
|
)
|
|
{
|
|
DLIB_CASSERT(dest_offset + num <= dest.size());
|
|
DLIB_CASSERT(src_offset + num <= src.size());
|
|
if (num == 0)
|
|
return;
|
|
|
|
// if there is aliasing
|
|
if (&dest == &src && std::max(dest_offset, src_offset) < std::min(dest_offset,src_offset)+num)
|
|
{
|
|
// if they perfectly alias each other then there is nothing to do
|
|
if (dest_offset == src_offset)
|
|
return;
|
|
else
|
|
std::memmove(dest.host()+dest_offset, src.host()+src_offset, sizeof(float)*num);
|
|
}
|
|
else
|
|
{
|
|
// if we write to the entire thing then we can use device_write_only()
|
|
if (dest_offset == 0 && num == dest.size())
|
|
{
|
|
// copy the memory efficiently based on which copy is current in each object.
|
|
if (src.device_ready())
|
|
CHECK_CUDA(cudaMemcpy(dest.device_write_only(), src.device()+src_offset, num*sizeof(float), cudaMemcpyDeviceToDevice));
|
|
else
|
|
CHECK_CUDA(cudaMemcpy(dest.device_write_only(), src.host()+src_offset, num*sizeof(float), cudaMemcpyHostToDevice));
|
|
}
|
|
else
|
|
{
|
|
// copy the memory efficiently based on which copy is current in each object.
|
|
if (dest.device_ready() && src.device_ready())
|
|
CHECK_CUDA(cudaMemcpy(dest.device()+dest_offset, src.device()+src_offset, num*sizeof(float), cudaMemcpyDeviceToDevice));
|
|
else if (!dest.device_ready() && src.device_ready())
|
|
CHECK_CUDA(cudaMemcpy(dest.host()+dest_offset, src.device()+src_offset, num*sizeof(float), cudaMemcpyDeviceToHost));
|
|
else if (dest.device_ready() && !src.device_ready())
|
|
CHECK_CUDA(cudaMemcpy(dest.device()+dest_offset, src.host()+src_offset, num*sizeof(float), cudaMemcpyHostToDevice));
|
|
else
|
|
CHECK_CUDA(cudaMemcpy(dest.host()+dest_offset, src.host()+src_offset, num*sizeof(float), cudaMemcpyHostToHost));
|
|
}
|
|
}
|
|
}
|
|
// ----------------------------------------------------------------------------------------
|
|
|
|
void synchronize_stream(cudaStream_t stream)
|
|
{
|
|
#if !defined CUDA_VERSION
|
|
#error CUDA_VERSION not defined
|
|
#elif CUDA_VERSION >= 9020 && CUDA_VERSION < 11000
|
|
// We will stop using this alternative version with cuda V11, hopefully the bug in
|
|
// cudaStreamSynchronize is fixed by then.
|
|
//
|
|
// This should be pretty much the same as cudaStreamSynchronize, which for some
|
|
// reason makes training freeze in some cases.
|
|
// (see https://github.com/davisking/dlib/issues/1513)
|
|
while (true)
|
|
{
|
|
cudaError_t err = cudaStreamQuery(stream);
|
|
switch (err)
|
|
{
|
|
case cudaSuccess: return; // now we are synchronized
|
|
case cudaErrorNotReady: break; // continue waiting
|
|
default: CHECK_CUDA(err); // unexpected error: throw
|
|
}
|
|
}
|
|
#else // CUDA_VERSION
|
|
CHECK_CUDA(cudaStreamSynchronize(stream));
|
|
#endif // CUDA_VERSION
|
|
}
|
|
|
|
void gpu_data::
|
|
wait_for_transfer_to_finish() const
|
|
{
|
|
if (have_active_transfer)
|
|
{
|
|
synchronize_stream((cudaStream_t)cuda_stream.get());
|
|
have_active_transfer = false;
|
|
// Check for errors. These calls to cudaGetLastError() are what help us find
|
|
// out if our kernel launches have been failing.
|
|
CHECK_CUDA(cudaGetLastError());
|
|
}
|
|
}
|
|
|
|
void gpu_data::
|
|
copy_to_device() const
|
|
{
|
|
// We want transfers to the device to always be concurrent with any device
|
|
// computation. So we use our non-default stream to do the transfer.
|
|
async_copy_to_device();
|
|
wait_for_transfer_to_finish();
|
|
}
|
|
|
|
void gpu_data::
|
|
copy_to_host() const
|
|
{
|
|
if (!host_current)
|
|
{
|
|
wait_for_transfer_to_finish();
|
|
CHECK_CUDA(cudaMemcpy(data_host.get(), data_device.get(), data_size*sizeof(float), cudaMemcpyDeviceToHost));
|
|
host_current = true;
|
|
// At this point we know our RAM block isn't in use because cudaMemcpy()
|
|
// implicitly syncs with the device.
|
|
device_in_use = false;
|
|
// Check for errors. These calls to cudaGetLastError() are what help us find
|
|
// out if our kernel launches have been failing.
|
|
CHECK_CUDA(cudaGetLastError());
|
|
}
|
|
}
|
|
|
|
void gpu_data::
|
|
async_copy_to_device() const
|
|
{
|
|
if (!device_current)
|
|
{
|
|
if (device_in_use)
|
|
{
|
|
// Wait for any possible CUDA kernels that might be using our memory block to
|
|
// complete before we overwrite the memory.
|
|
synchronize_stream(0);
|
|
device_in_use = false;
|
|
}
|
|
CHECK_CUDA(cudaMemcpyAsync(data_device.get(), data_host.get(), data_size*sizeof(float), cudaMemcpyHostToDevice, (cudaStream_t)cuda_stream.get()));
|
|
have_active_transfer = true;
|
|
device_current = true;
|
|
}
|
|
}
|
|
|
|
void gpu_data::
|
|
set_size(
|
|
size_t new_size
|
|
)
|
|
{
|
|
if (new_size == 0)
|
|
{
|
|
if (device_in_use)
|
|
{
|
|
// Wait for any possible CUDA kernels that might be using our memory block to
|
|
// complete before we free the memory.
|
|
synchronize_stream(0);
|
|
device_in_use = false;
|
|
}
|
|
wait_for_transfer_to_finish();
|
|
data_size = 0;
|
|
host_current = true;
|
|
device_current = true;
|
|
device_in_use = false;
|
|
data_host.reset();
|
|
data_device.reset();
|
|
}
|
|
else if (new_size != data_size)
|
|
{
|
|
if (device_in_use)
|
|
{
|
|
// Wait for any possible CUDA kernels that might be using our memory block to
|
|
// complete before we free the memory.
|
|
synchronize_stream(0);
|
|
device_in_use = false;
|
|
}
|
|
wait_for_transfer_to_finish();
|
|
data_size = new_size;
|
|
host_current = true;
|
|
device_current = true;
|
|
device_in_use = false;
|
|
|
|
try
|
|
{
|
|
CHECK_CUDA(cudaGetDevice(&the_device_id));
|
|
|
|
// free memory blocks before we allocate new ones.
|
|
data_host.reset();
|
|
data_device.reset();
|
|
|
|
void* data;
|
|
CHECK_CUDA(cudaMallocHost(&data, new_size*sizeof(float)));
|
|
// Note that we don't throw exceptions since the free calls are invariably
|
|
// called in destructors. They also shouldn't fail anyway unless someone
|
|
// is resetting the GPU card in the middle of their program.
|
|
data_host.reset((float*)data, [](float* ptr){
|
|
auto err = cudaFreeHost(ptr);
|
|
if(err!=cudaSuccess)
|
|
std::cerr << "cudaFreeHost() failed. Reason: " << cudaGetErrorString(err) << std::endl;
|
|
});
|
|
|
|
CHECK_CUDA(cudaMalloc(&data, new_size*sizeof(float)));
|
|
data_device.reset((float*)data, [](float* ptr){
|
|
auto err = cudaFree(ptr);
|
|
if(err!=cudaSuccess)
|
|
std::cerr << "cudaFree() failed. Reason: " << cudaGetErrorString(err) << std::endl;
|
|
});
|
|
|
|
if (!cuda_stream)
|
|
{
|
|
cudaStream_t cstream;
|
|
CHECK_CUDA(cudaStreamCreateWithFlags(&cstream, cudaStreamNonBlocking));
|
|
cuda_stream.reset(cstream, [](void* ptr){
|
|
auto err = cudaStreamDestroy((cudaStream_t)ptr);
|
|
if(err!=cudaSuccess)
|
|
std::cerr << "cudaStreamDestroy() failed. Reason: " << cudaGetErrorString(err) << std::endl;
|
|
});
|
|
}
|
|
|
|
}
|
|
catch(...)
|
|
{
|
|
set_size(0);
|
|
throw;
|
|
}
|
|
}
|
|
}
|
|
|
|
// ----------------------------------------------------------------------------------------
|
|
}
|
|
|
|
#endif // DLIB_USE_CUDA
|
|
|
|
#endif // DLIB_GPU_DaTA_CPP_
|
|
|