373 lines
10 KiB
C++
373 lines
10 KiB
C++
// Copyright (C) 2010 Davis E. King (davis@dlib.net)
|
|
// License: Boost Software License See LICENSE.txt for the full license.
|
|
#ifndef DLIB_RANDOM_SUBSeT_SELECTOR_H_
|
|
#define DLIB_RANDOM_SUBSeT_SELECTOR_H_
|
|
|
|
#include "random_subset_selector_abstract.h"
|
|
#include "../rand.h"
|
|
#include <vector>
|
|
#include "../algs.h"
|
|
#include "../string.h"
|
|
#include "../serialize.h"
|
|
#include "../matrix/matrix_mat.h"
|
|
#include <iostream>
|
|
|
|
namespace dlib
|
|
{
|
|
template <
|
|
typename T,
|
|
typename Rand_type = dlib::rand
|
|
>
|
|
class random_subset_selector
|
|
{
|
|
/*!
|
|
INITIAL VALUE
|
|
- _max_size == 0
|
|
- items.size() == 0
|
|
- count == 0
|
|
- _next_add_accepts == false
|
|
|
|
CONVENTION
|
|
- count == the number of times add() has been called since the last
|
|
time this object was empty.
|
|
- items.size() == size()
|
|
- max_size() == _max_size
|
|
- next_add_accepts() == _next_add_accepts
|
|
!*/
|
|
public:
|
|
typedef T type;
|
|
typedef T value_type;
|
|
typedef default_memory_manager mem_manager_type;
|
|
typedef Rand_type rand_type;
|
|
|
|
typedef typename std::vector<T>::iterator iterator;
|
|
typedef typename std::vector<T>::const_iterator const_iterator;
|
|
|
|
|
|
random_subset_selector (
|
|
)
|
|
{
|
|
_max_size = 0;
|
|
make_empty();
|
|
}
|
|
|
|
void set_seed(const std::string& value)
|
|
{
|
|
rnd.set_seed(value);
|
|
}
|
|
|
|
void make_empty (
|
|
)
|
|
{
|
|
items.resize(0);
|
|
count = 0;
|
|
update_next_add_accepts();
|
|
}
|
|
|
|
const std::vector<T>& to_std_vector(
|
|
) const { return items; }
|
|
|
|
size_t size (
|
|
) const
|
|
{
|
|
return items.size();
|
|
}
|
|
|
|
void set_max_size (
|
|
unsigned long new_max_size
|
|
)
|
|
{
|
|
items.reserve(new_max_size);
|
|
make_empty();
|
|
_max_size = new_max_size;
|
|
update_next_add_accepts();
|
|
}
|
|
|
|
unsigned long max_size (
|
|
) const
|
|
{
|
|
return _max_size;
|
|
}
|
|
|
|
T& operator[] (
|
|
unsigned long idx
|
|
)
|
|
{
|
|
// make sure requires clause is not broken
|
|
DLIB_ASSERT(idx < size(),
|
|
"\tvoid random_subset_selector::operator[]()"
|
|
<< "\n\t idx is out of range"
|
|
<< "\n\t idx: " << idx
|
|
<< "\n\t size(): " << size()
|
|
<< "\n\t this: " << this
|
|
);
|
|
|
|
return items[idx];
|
|
}
|
|
|
|
const T& operator[] (
|
|
unsigned long idx
|
|
) const
|
|
{
|
|
// make sure requires clause is not broken
|
|
DLIB_ASSERT(idx < size(),
|
|
"\tvoid random_subset_selector::operator[]()"
|
|
<< "\n\t idx is out of range"
|
|
<< "\n\t idx: " << idx
|
|
<< "\n\t size(): " << size()
|
|
<< "\n\t this: " << this
|
|
);
|
|
|
|
return items[idx];
|
|
}
|
|
|
|
iterator begin() { return items.begin(); }
|
|
const_iterator begin() const { return items.begin(); }
|
|
iterator end() { return items.end(); }
|
|
const_iterator end() const { return items.end(); }
|
|
|
|
bool next_add_accepts (
|
|
) const
|
|
{
|
|
return _next_add_accepts;
|
|
}
|
|
|
|
void add (
|
|
const T& new_item
|
|
)
|
|
{
|
|
if (items.size() < _max_size)
|
|
{
|
|
items.push_back(new_item);
|
|
// swap into a random place
|
|
exchange(items[rnd.get_random_32bit_number()%items.size()], items.back());
|
|
}
|
|
else if (_next_add_accepts)
|
|
{
|
|
// pick a random element of items and replace it.
|
|
items[rnd.get_random_32bit_number()%items.size()] = new_item;
|
|
}
|
|
|
|
update_next_add_accepts();
|
|
++count;
|
|
}
|
|
|
|
void add (
|
|
)
|
|
{
|
|
// make sure requires clause is not broken
|
|
DLIB_ASSERT(next_add_accepts() == false,
|
|
"\tvoid random_subset_selector::add()"
|
|
<< "\n\t You should be calling the version of add() that takes an argument"
|
|
<< "\n\t this: " << this
|
|
);
|
|
|
|
update_next_add_accepts();
|
|
++count;
|
|
}
|
|
|
|
void swap (
|
|
random_subset_selector& a
|
|
)
|
|
{
|
|
items.swap(a.items);
|
|
std::swap(_max_size, a._max_size);
|
|
std::swap(count, a.count);
|
|
rnd.swap(a.rnd);
|
|
std::swap(_next_add_accepts, a._next_add_accepts);
|
|
}
|
|
|
|
template <typename T1, typename T2>
|
|
friend void serialize (
|
|
const random_subset_selector<T1,T2>& item,
|
|
std::ostream& out
|
|
);
|
|
|
|
template <typename T1, typename T2>
|
|
friend void deserialize (
|
|
random_subset_selector<T1,T2>& item,
|
|
std::istream& in
|
|
);
|
|
|
|
private:
|
|
|
|
void update_next_add_accepts (
|
|
)
|
|
{
|
|
if (items.size() < _max_size)
|
|
{
|
|
_next_add_accepts = true;
|
|
}
|
|
else if (_max_size == 0)
|
|
{
|
|
_next_add_accepts = false;
|
|
}
|
|
else
|
|
{
|
|
// At this point each element of items has had an equal chance of being in this object.
|
|
// In particular, the probability that each arrived here is currently items.size()/count.
|
|
// We need to be able to say that, after this function ends, the probability of any
|
|
// particular object ending up in items is items.size()/(count+1). So this means that
|
|
// we should decide to add a new item into items with this probability. Also, if we do
|
|
// so then we pick one of the current items and replace it at random with the new item.
|
|
|
|
// Make me a random 64 bit number. This might seem excessive but I want this object
|
|
// to be able to handle an effectively infinite number of calls to add(). So count
|
|
// might get very large and we need to deal with that properly.
|
|
const unsigned long num1 = rnd.get_random_32bit_number();
|
|
const unsigned long num2 = rnd.get_random_32bit_number();
|
|
uint64 num = num1;
|
|
num <<= 32;
|
|
num |= num2;
|
|
|
|
num %= (count+1);
|
|
|
|
_next_add_accepts = (num < items.size());
|
|
}
|
|
|
|
}
|
|
|
|
std::vector<T> items;
|
|
unsigned long _max_size;
|
|
uint64 count;
|
|
|
|
rand_type rnd;
|
|
|
|
bool _next_add_accepts;
|
|
|
|
};
|
|
|
|
template <
|
|
typename T,
|
|
typename rand_type
|
|
>
|
|
void swap (
|
|
random_subset_selector<T,rand_type>& a,
|
|
random_subset_selector<T,rand_type>& b
|
|
) { a.swap(b); }
|
|
|
|
// ----------------------------------------------------------------------------------------
|
|
|
|
template <typename T1, typename T2>
|
|
void serialize (
|
|
const random_subset_selector<T1,T2>& item,
|
|
std::ostream& out
|
|
)
|
|
{
|
|
serialize(item.items, out);
|
|
serialize(item._max_size, out);
|
|
serialize(item.count, out);
|
|
serialize(item.rnd, out);
|
|
serialize(item._next_add_accepts, out);
|
|
}
|
|
|
|
template <typename T1, typename T2>
|
|
void deserialize (
|
|
random_subset_selector<T1,T2>& item,
|
|
std::istream& in
|
|
)
|
|
{
|
|
deserialize(item.items, in);
|
|
deserialize(item._max_size, in);
|
|
deserialize(item.count, in);
|
|
deserialize(item.rnd, in);
|
|
deserialize(item._next_add_accepts, in);
|
|
}
|
|
|
|
// ----------------------------------------------------------------------------------------
|
|
|
|
template <
|
|
typename T,
|
|
typename alloc
|
|
>
|
|
random_subset_selector<T> randomly_subsample (
|
|
const std::vector<T,alloc>& samples,
|
|
unsigned long num
|
|
)
|
|
{
|
|
random_subset_selector<T> subset;
|
|
subset.set_max_size(num);
|
|
for (unsigned long i = 0; i < samples.size(); ++i)
|
|
subset.add(samples[i]);
|
|
return subset;
|
|
}
|
|
|
|
// ----------------------------------------------------------------------------------------
|
|
|
|
template <
|
|
typename T,
|
|
typename alloc,
|
|
typename U
|
|
>
|
|
random_subset_selector<T> randomly_subsample (
|
|
const std::vector<T,alloc>& samples,
|
|
unsigned long num,
|
|
const U& random_seed
|
|
)
|
|
{
|
|
random_subset_selector<T> subset;
|
|
subset.set_seed(cast_to_string(random_seed));
|
|
subset.set_max_size(num);
|
|
for (unsigned long i = 0; i < samples.size(); ++i)
|
|
subset.add(samples[i]);
|
|
return subset;
|
|
}
|
|
|
|
// ----------------------------------------------------------------------------------------
|
|
|
|
template <
|
|
typename T
|
|
>
|
|
random_subset_selector<T> randomly_subsample (
|
|
const random_subset_selector<T>& samples,
|
|
unsigned long num
|
|
)
|
|
{
|
|
random_subset_selector<T> subset;
|
|
subset.set_max_size(num);
|
|
for (unsigned long i = 0; i < samples.size(); ++i)
|
|
subset.add(samples[i]);
|
|
return subset;
|
|
}
|
|
|
|
// ----------------------------------------------------------------------------------------
|
|
|
|
template <
|
|
typename T,
|
|
typename U
|
|
>
|
|
random_subset_selector<T> randomly_subsample (
|
|
const random_subset_selector<T>& samples,
|
|
unsigned long num,
|
|
const U& random_seed
|
|
)
|
|
{
|
|
random_subset_selector<T> subset;
|
|
subset.set_seed(cast_to_string(random_seed));
|
|
subset.set_max_size(num);
|
|
for (unsigned long i = 0; i < samples.size(); ++i)
|
|
subset.add(samples[i]);
|
|
return subset;
|
|
}
|
|
|
|
// ----------------------------------------------------------------------------------------
|
|
|
|
template <
|
|
typename T
|
|
>
|
|
const matrix_op<op_array_to_mat<random_subset_selector<T> > > mat (
|
|
const random_subset_selector<T>& m
|
|
)
|
|
{
|
|
typedef op_array_to_mat<random_subset_selector<T> > op;
|
|
return matrix_op<op>(op(m));
|
|
}
|
|
|
|
// ----------------------------------------------------------------------------------------
|
|
|
|
}
|
|
|
|
#endif // DLIB_RANDOM_SUBSeT_SELECTOR_H_
|
|
|
|
|