1388 lines
44 KiB
C++
1388 lines
44 KiB
C++
// Copyright (C) 2008 Davis E. King (davis@dlib.net), Steve Taylor
|
|
// License: Boost Software License See LICENSE.txt for the full license.
|
|
#undef DLIB_STATISTICs_ABSTRACT_
|
|
#ifdef DLIB_STATISTICs_ABSTRACT_
|
|
|
|
#include <limits>
|
|
#include <cmath>
|
|
#include "../matrix/matrix_abstract.h"
|
|
#include "../svm/sparse_vector_abstract.h"
|
|
|
|
namespace dlib
|
|
{
|
|
|
|
// ----------------------------------------------------------------------------------------
|
|
|
|
template <
    typename T,
    typename alloc
    >
double mean_sign_agreement (
    const std::vector<T,alloc>& a,
    const std::vector<T,alloc>& b
);
/*!
    requires
        - a.size() == b.size()
    ensures
        - returns the number of times a[i] has the same sign as b[i] divided by
          a.size().  So we return the probability that elements of a and b have
          the same sign.
!*/
|
|
|
|
// ----------------------------------------------------------------------------------------
|
|
|
|
template <
    typename T,
    typename alloc
    >
double correlation (
    const std::vector<T,alloc>& a,
    const std::vector<T,alloc>& b
);
/*!
    requires
        - a.size() == b.size()
        - a.size() > 1
    ensures
        - returns the correlation coefficient between all the elements of a and b.
          (i.e. how correlated is a[i] with b[i])
!*/
|
|
|
|
// ----------------------------------------------------------------------------------------
|
|
|
|
template <
    typename T,
    typename alloc
    >
double covariance (
    const std::vector<T,alloc>& a,
    const std::vector<T,alloc>& b
);
/*!
    requires
        - a.size() == b.size()
        - a.size() > 1
    ensures
        - returns the covariance between all the elements of a and b.
          (i.e. how does a[i] vary with b[i])
!*/
|
|
|
|
// ----------------------------------------------------------------------------------------
|
|
|
|
template <
    typename T,
    typename alloc
    >
double r_squared (
    const std::vector<T,alloc>& a,
    const std::vector<T,alloc>& b
);
/*!
    requires
        - a.size() == b.size()
        - a.size() > 1
    ensures
        - returns the R^2 coefficient of determination between all the elements
          of a and b.  This value is just the square of correlation(a,b).
!*/
|
|
|
|
// ----------------------------------------------------------------------------------------
|
|
|
|
template <
    typename T,
    typename alloc
    >
double mean_squared_error (
    const std::vector<T,alloc>& a,
    const std::vector<T,alloc>& b
);
/*!
    requires
        - a.size() == b.size()
    ensures
        - returns the mean squared error between all the elements of a and b.
          (i.e. mean(squared(mat(a)-mat(b))))
!*/
|
|
|
|
// ----------------------------------------------------------------------------------------
|
|
|
|
double binomial_random_vars_are_different (
    uint64_t k1,
    uint64_t n1,
    uint64_t k2,
    uint64_t n2
);
/*!
    requires
        - k1 <= n1
        - k2 <= n2
    ensures
        - Given two binomially distributed random variables, X1 and X2, we want to know
          if these variables have the same parameter (i.e. the chance of "success").
          So assume that:
            - You observed X1 to give k1 successes out of n1 trials.
            - You observed X2 to give k2 successes out of n2 trials.
        - This function performs a simple likelihood ratio test to determine if X1 and
          X2 have the same parameter.  The return value of this function will be:
            - Close to 0 if they are probably the same.
            - Larger than 0 if X1 probably has a higher "success" rate than X2.
            - Smaller than 0 if X2 probably has a higher "success" rate than X1.
          Moreover, the larger the absolute magnitude of the return value the more
          likely it is that X1 and X2 have different distributions.
        - For a discussion of the technique and applications see:
            Dunning, Ted. "Accurate methods for the statistics of surprise and
            coincidence." Computational linguistics 19.1 (1993): 61-74.
!*/
|
|
|
|
// ----------------------------------------------------------------------------------------
|
|
|
|
double event_correlation (
    uint64_t A_count,
    uint64_t B_count,
    uint64_t AB_count,
    uint64_t total_num_observations
);
/*!
    requires
        - AB_count <= A_count <= total_num_observations
        - AB_count <= B_count <= total_num_observations
        - A_count + B_count - AB_count <= total_num_observations
    ensures
        - This function does a statistical test to determine if two events co-occur in
          a statistically significant way.  In particular, we assume you performed
          total_num_observations measurements and during those measurements you:
            - Observed event A to happen A_count times.
            - Observed event B to happen B_count times.
            - Observed AB_count co-occurrences of the events.  That is, AB_count is the
              number of times the events happened together during the same measurement.
        - This function returns a number, COR, which can take any real value.  It has
          the following interpretations:
            - COR == 0: there is no evidence of correlation between the two events.
              They appear to be unrelated.
            - COR > 0: There is evidence that A and B co-occur together.  That is,
              they happen at the same times more often than you would expect if they
              were independent events.  The larger the magnitude of COR the more
              evidence we have for the correlation.
            - COR < 0: There is evidence that A and B are anti-correlated.  That is,
              when A happens B is unlikely to happen and vice versa.  The larger the
              magnitude of COR the more evidence we have for the anti-correlation.
        - This function implements the simple likelihood ratio test discussed in the
          following paper:
            Dunning, Ted. "Accurate methods for the statistics of surprise and
            coincidence." Computational linguistics 19.1 (1993): 61-74.
          So for an extended discussion of the method see the above paper.
!*/
|
|
|
|
// ----------------------------------------------------------------------------------------
|
|
|
|
template <
    typename T
    >
class running_stats
{
    /*!
        REQUIREMENTS ON T
            - T must be a float, double, or long double type

        INITIAL VALUE
            - mean() == 0
            - current_n() == 0

        WHAT THIS OBJECT REPRESENTS
            This object represents something that can compute the running mean,
            variance, skewness, and excess kurtosis of a stream of real numbers.
    !*/
public:

    running_stats(
    );
    /*!
        ensures
            - this object is properly initialized
    !*/

    void clear(
    );
    /*!
        ensures
            - this object has its initial value
            - clears all memory of any previous data points
    !*/

    T current_n (
    ) const;
    /*!
        ensures
            - returns the number of points given to this object so far.
    !*/

    void add (
        const T& val
    );
    /*!
        ensures
            - updates the mean, variance, skewness, and kurtosis stored in this object
              so that the new value is factored into them.
            - #mean() == mean()*current_n()/(current_n()+1) + val/(current_n()+1).
              (i.e. the updated mean value that takes the new value into account)
            - #variance() == the updated variance that takes this new value into account.
            - #skewness() == the updated skewness that takes this new value into account.
            - #ex_kurtosis() == the updated kurtosis that takes this new value into account.
            - #current_n() == current_n() + 1
    !*/

    T mean (
    ) const;
    /*!
        ensures
            - returns the mean of all the values presented to this object
              so far.
    !*/

    T variance (
    ) const;
    /*!
        requires
            - current_n() > 1
        ensures
            - returns the unbiased sample variance of all the values presented to this
              object so far.
    !*/

    T stddev (
    ) const;
    /*!
        requires
            - current_n() > 1
        ensures
            - returns the unbiased sample standard deviation of all the values
              presented to this object so far.
    !*/

    T skewness (
    ) const;
    /*!
        requires
            - current_n() > 2
        ensures
            - returns the unbiased sample skewness of all the values presented
              to this object so far.
    !*/

    T ex_kurtosis(
    ) const;
    /*!
        requires
            - current_n() > 3
        ensures
            - returns the unbiased sample excess kurtosis of all the values
              presented to this object so far.
    !*/

    T max (
    ) const;
    /*!
        requires
            - current_n() > 1
        ensures
            - returns the largest value presented to this object so far.
    !*/

    T min (
    ) const;
    /*!
        requires
            - current_n() > 1
        ensures
            - returns the smallest value presented to this object so far.
    !*/

    T scale (
        const T& val
    ) const;
    /*!
        requires
            - current_n() > 1
        ensures
            - return (val-mean())/stddev();
    !*/

    running_stats operator+ (
        const running_stats& rhs
    ) const;
    /*!
        ensures
            - returns a new running_stats object that represents the combination of all
              the values given to *this and rhs.  That is, this function returns a
              running_stats object, R, that is equivalent to what you would obtain if
              all calls to this->add() and rhs.add() had instead been done to R.
    !*/
};
|
|
|
|
template <typename T>
|
|
void serialize (
|
|
const running_stats<T>& item,
|
|
std::ostream& out
|
|
);
|
|
/*!
|
|
provides serialization support
|
|
!*/
|
|
|
|
template <typename T>
|
|
void deserialize (
|
|
running_stats<T>& item,
|
|
std::istream& in
|
|
);
|
|
/*!
|
|
provides serialization support
|
|
!*/
|
|
|
|
// ----------------------------------------------------------------------------------------
|
|
|
|
template <
    typename T
    >
class running_scalar_covariance
{
    /*!
        REQUIREMENTS ON T
            - T must be a float, double, or long double type

        INITIAL VALUE
            - mean_x() == 0
            - mean_y() == 0
            - current_n() == 0

        WHAT THIS OBJECT REPRESENTS
            This object represents something that can compute the running covariance
            of a stream of real number pairs.
    !*/

public:

    running_scalar_covariance(
    );
    /*!
        ensures
            - this object is properly initialized
    !*/

    void clear(
    );
    /*!
        ensures
            - this object has its initial value
            - clears all memory of any previous data points
    !*/

    void add (
        const T& x,
        const T& y
    );
    /*!
        ensures
            - updates the statistics stored in this object so that
              the new pair (x,y) is factored into them.
            - #current_n() == current_n() + 1
    !*/

    T current_n (
    ) const;
    /*!
        ensures
            - returns the number of points given to this object so far.
    !*/

    T mean_x (
    ) const;
    /*!
        ensures
            - returns the mean value of all x samples presented to this object
              via add().
    !*/

    T mean_y (
    ) const;
    /*!
        ensures
            - returns the mean value of all y samples presented to this object
              via add().
    !*/

    T covariance (
    ) const;
    /*!
        requires
            - current_n() > 1
        ensures
            - returns the covariance between all the x and y samples presented
              to this object via add()
    !*/

    T correlation (
    ) const;
    /*!
        requires
            - current_n() > 1
        ensures
            - returns the correlation coefficient between all the x and y samples
              presented to this object via add()
    !*/

    T variance_x (
    ) const;
    /*!
        requires
            - current_n() > 1
        ensures
            - returns the unbiased sample variance value of all x samples presented
              to this object via add().
    !*/

    T variance_y (
    ) const;
    /*!
        requires
            - current_n() > 1
        ensures
            - returns the unbiased sample variance value of all y samples presented
              to this object via add().
    !*/

    T stddev_x (
    ) const;
    /*!
        requires
            - current_n() > 1
        ensures
            - returns the unbiased sample standard deviation of all x samples
              presented to this object via add().
    !*/

    T stddev_y (
    ) const;
    /*!
        requires
            - current_n() > 1
        ensures
            - returns the unbiased sample standard deviation of all y samples
              presented to this object via add().
    !*/

    // The right hand side is another running_scalar_covariance (the same type
    // as *this), matching the contract described below.
    running_scalar_covariance operator+ (
        const running_scalar_covariance& rhs
    ) const;
    /*!
        ensures
            - returns a new running_scalar_covariance object that represents the
              combination of all the values given to *this and rhs.  That is, this
              function returns a running_scalar_covariance object, R, that is
              equivalent to what you would obtain if all calls to this->add() and
              rhs.add() had instead been done to R.
    !*/
};
|
|
|
|
// ----------------------------------------------------------------------------------------
|
|
|
|
template <
    typename T
    >
class running_scalar_covariance_decayed
{
    /*!
        REQUIREMENTS ON T
            - T must be a float, double, or long double type

        INITIAL VALUE
            - mean_x() == 0
            - mean_y() == 0
            - current_n() == 0

        WHAT THIS OBJECT REPRESENTS
            This object represents something that can compute the running covariance of
            a stream of real number pairs.  It is essentially the same as
            running_scalar_covariance except that it forgets about data it has seen
            after a certain period of time.  It does this by exponentially decaying old
            statistics.
    !*/

public:

    running_scalar_covariance_decayed(
        T decay_halflife = 1000
    );
    /*!
        requires
            - decay_halflife > 0
        ensures
            - #forget_factor() == std::pow(0.5, 1/decay_halflife);
              (i.e. after decay_halflife calls to add() the data given to the first add
              will be down weighted by 0.5 in the statistics stored in this object).
    !*/

    T forget_factor (
    ) const;
    /*!
        ensures
            - returns the exponential forget factor used to forget old statistics when
              add() is called.
    !*/

    void add (
        const T& x,
        const T& y
    );
    /*!
        ensures
            - updates the statistics stored in this object so that
              the new pair (x,y) is factored into them.
            - #current_n() == current_n()*forget_factor() + forget_factor()
            - Down weights old statistics by a factor of forget_factor().
    !*/

    T current_n (
    ) const;
    /*!
        ensures
            - returns the effective number of points given to this object.  As add()
              is called this value will converge to a constant, the value of which is
              based on the decay_halflife supplied to the constructor.
    !*/

    T mean_x (
    ) const;
    /*!
        ensures
            - returns the mean value of all x samples presented to this object
              via add().
    !*/

    T mean_y (
    ) const;
    /*!
        ensures
            - returns the mean value of all y samples presented to this object
              via add().
    !*/

    T covariance (
    ) const;
    /*!
        requires
            - current_n() > 1
        ensures
            - returns the covariance between all the x and y samples presented
              to this object via add()
    !*/

    T correlation (
    ) const;
    /*!
        requires
            - current_n() > 1
        ensures
            - returns the correlation coefficient between all the x and y samples
              presented to this object via add()
    !*/

    T variance_x (
    ) const;
    /*!
        requires
            - current_n() > 1
        ensures
            - returns the sample variance value of all x samples presented
              to this object via add().
    !*/

    T variance_y (
    ) const;
    /*!
        requires
            - current_n() > 1
        ensures
            - returns the sample variance value of all y samples presented
              to this object via add().
    !*/

    T stddev_x (
    ) const;
    /*!
        requires
            - current_n() > 1
        ensures
            - returns the sample standard deviation of all x samples
              presented to this object via add().
    !*/

    T stddev_y (
    ) const;
    /*!
        requires
            - current_n() > 1
        ensures
            - returns the sample standard deviation of all y samples
              presented to this object via add().
    !*/
};
|
|
|
|
// ----------------------------------------------------------------------------------------
|
|
|
|
template <
    typename T
    >
class running_stats_decayed
{
    /*!
        REQUIREMENTS ON T
            - T must be a float, double, or long double type

        INITIAL VALUE
            - mean() == 0
            - current_n() == 0

        WHAT THIS OBJECT REPRESENTS
            This object represents something that can compute the running mean and
            variance of a stream of real numbers.  It is similar to running_stats
            except that it forgets about data it has seen after a certain period of
            time.  It does this by exponentially decaying old statistics.
    !*/

public:

    running_stats_decayed(
        T decay_halflife = 1000
    );
    /*!
        requires
            - decay_halflife > 0
        ensures
            - #forget_factor() == std::pow(0.5, 1/decay_halflife);
              (i.e. after decay_halflife calls to add() the data given to the first add
              will be down weighted by 0.5 in the statistics stored in this object).
    !*/

    T forget_factor (
    ) const;
    /*!
        ensures
            - returns the exponential forget factor used to forget old statistics when
              add() is called.
    !*/

    void add (
        const T& x
    );
    /*!
        ensures
            - updates the statistics stored in this object so that x is factored into
              them.
            - #current_n() == current_n()*forget_factor() + forget_factor()
            - Down weights old statistics by a factor of forget_factor().
    !*/

    T current_n (
    ) const;
    /*!
        ensures
            - returns the effective number of points given to this object.  As add()
              is called this value will converge to a constant, the value of which is
              based on the decay_halflife supplied to the constructor.
    !*/

    T mean (
    ) const;
    /*!
        ensures
            - returns the mean value of all x samples presented to this object
              via add().
    !*/

    T variance (
    ) const;
    /*!
        requires
            - current_n() > 1
        ensures
            - returns the sample variance value of all x samples presented to this
              object via add().
    !*/

    T stddev (
    ) const;
    /*!
        requires
            - current_n() > 1
        ensures
            - returns the sample standard deviation of all x samples presented to this
              object via add().
    !*/

};
|
|
|
|
template <typename T>
|
|
void serialize (
|
|
const running_stats_decayed<T>& item,
|
|
std::ostream& out
|
|
);
|
|
/*!
|
|
provides serialization support
|
|
!*/
|
|
|
|
template <typename T>
|
|
void deserialize (
|
|
running_stats_decayed<T>& item,
|
|
std::istream& in
|
|
);
|
|
/*!
|
|
provides serialization support
|
|
!*/
|
|
|
|
// ----------------------------------------------------------------------------------------
|
|
|
|
template <
|
|
typename matrix_type
|
|
>
|
|
class running_covariance
|
|
{
|
|
/*!
|
|
REQUIREMENTS ON matrix_type
|
|
Must be some type of dlib::matrix.
|
|
|
|
INITIAL VALUE
|
|
- in_vector_size() == 0
|
|
- current_n() == 0
|
|
|
|
WHAT THIS OBJECT REPRESENTS
|
|
This object is a simple tool for computing the mean and
|
|
covariance of a sequence of vectors.
|
|
!*/
|
|
public:
|
|
|
|
typedef typename matrix_type::mem_manager_type mem_manager_type;
|
|
typedef typename matrix_type::type scalar_type;
|
|
typedef typename matrix_type::layout_type layout_type;
|
|
typedef matrix<scalar_type,0,0,mem_manager_type,layout_type> general_matrix;
|
|
typedef matrix<scalar_type,0,1,mem_manager_type,layout_type> column_matrix;
|
|
|
|
running_covariance(
|
|
);
|
|
/*!
|
|
ensures
|
|
- this object is properly initialized
|
|
!*/
|
|
|
|
void clear(
|
|
);
|
|
/*!
|
|
ensures
|
|
- this object has its initial value
|
|
- clears all memory of any previous data points
|
|
!*/
|
|
|
|
long current_n (
|
|
) const;
|
|
/*!
|
|
ensures
|
|
- returns the number of samples that have been presented to this object
|
|
!*/
|
|
|
|
long in_vector_size (
|
|
) const;
|
|
/*!
|
|
ensures
|
|
- if (this object has been presented with any input vectors or
|
|
set_dimension() has been called) then
|
|
- returns the dimension of the column vectors used with this object
|
|
- else
|
|
- returns 0
|
|
!*/
|
|
|
|
void set_dimension (
|
|
long size
|
|
);
|
|
/*!
|
|
requires
|
|
- size > 0
|
|
ensures
|
|
- #in_vector_size() == size
|
|
- #current_n() == 0
|
|
!*/
|
|
|
|
template <typename T>
|
|
void add (
|
|
const T& val
|
|
);
|
|
/*!
|
|
requires
|
|
- val must represent a column vector. It can either be a dlib::matrix
|
|
object or some kind of unsorted sparse vector type. See the top of
|
|
dlib/svm/sparse_vector_abstract.h for a definition of unsorted sparse vector.
|
|
- val must have a number of dimensions which is compatible with the current
|
|
setting of in_vector_size(). In particular, this means that the
|
|
following must hold:
|
|
- if (val is a dlib::matrix) then
|
|
- in_vector_size() == 0 || val.size() == val_vector_size()
|
|
- else
|
|
- max_index_plus_one(val) <= in_vector_size()
|
|
- in_vector_size() > 0
|
|
(i.e. you must call set_dimension() prior to calling add() if
|
|
you want to use sparse vectors.)
|
|
ensures
|
|
- updates the mean and covariance stored in this object so that
|
|
the new value is factored into them.
|
|
- if (val is a dlib::matrix) then
|
|
- #in_vector_size() == val.size()
|
|
!*/
|
|
|
|
const column_matrix mean (
|
|
) const;
|
|
/*!
|
|
requires
|
|
- in_vector_size() != 0
|
|
ensures
|
|
- returns the mean of all the vectors presented to this object
|
|
so far.
|
|
!*/
|
|
|
|
const general_matrix covariance (
|
|
) const;
|
|
/*!
|
|
requires
|
|
- in_vector_size() != 0
|
|
- current_n() > 1
|
|
ensures
|
|
- returns the unbiased sample covariance matrix for all the vectors
|
|
presented to this object so far.
|
|
!*/
|
|
|
|
const running_covariance operator+ (
|
|
const running_covariance& item
|
|
) const;
|
|
/*!
|
|
requires
|
|
- in_vector_size() == 0 || item.in_vector_size() == 0 || in_vector_size() == item.in_vector_size()
|
|
(i.e. the in_vector_size() of *this and item must match or one must be zero)
|
|
ensures
|
|
- returns a new running_covariance object that represents the combination of all
|
|
the vectors given to *this and item. That is, this function returns a
|
|
running_covariance object, R, that is equivalent to what you would obtain if all
|
|
calls to this->add() and item.add() had instead been done to R.
|
|
!*/
|
|
};
|
|
|
|
// ----------------------------------------------------------------------------------------
|
|
|
|
template <
|
|
typename matrix_type
|
|
>
|
|
class running_cross_covariance
|
|
{
|
|
/*!
|
|
REQUIREMENTS ON matrix_type
|
|
Must be some type of dlib::matrix.
|
|
|
|
INITIAL VALUE
|
|
- x_vector_size() == 0
|
|
- y_vector_size() == 0
|
|
- current_n() == 0
|
|
|
|
WHAT THIS OBJECT REPRESENTS
|
|
This object is a simple tool for computing the mean and cross-covariance
|
|
matrices of a sequence of pairs of vectors.
|
|
!*/
|
|
|
|
public:
|
|
|
|
typedef typename matrix_type::mem_manager_type mem_manager_type;
|
|
typedef typename matrix_type::type scalar_type;
|
|
typedef typename matrix_type::layout_type layout_type;
|
|
typedef matrix<scalar_type,0,0,mem_manager_type,layout_type> general_matrix;
|
|
typedef matrix<scalar_type,0,1,mem_manager_type,layout_type> column_matrix;
|
|
|
|
running_cross_covariance(
|
|
);
|
|
/*!
|
|
ensures
|
|
- this object is properly initialized
|
|
!*/
|
|
|
|
void clear(
|
|
);
|
|
/*!
|
|
ensures
|
|
- This object has its initial value.
|
|
- Clears all memory of any previous data points.
|
|
!*/
|
|
|
|
long x_vector_size (
|
|
) const;
|
|
/*!
|
|
ensures
|
|
- if (this object has been presented with any input vectors or
|
|
set_dimensions() has been called) then
|
|
- returns the dimension of the x vectors given to this object via add().
|
|
- else
|
|
- returns 0
|
|
!*/
|
|
|
|
long y_vector_size (
|
|
) const;
|
|
/*!
|
|
ensures
|
|
- if (this object has been presented with any input vectors or
|
|
set_dimensions() has been called) then
|
|
- returns the dimension of the y vectors given to this object via add().
|
|
- else
|
|
- returns 0
|
|
!*/
|
|
|
|
void set_dimensions (
|
|
long x_size,
|
|
long y_size
|
|
);
|
|
/*!
|
|
requires
|
|
- x_size > 0
|
|
- y_size > 0
|
|
ensures
|
|
- #x_vector_size() == x_size
|
|
- #y_vector_size() == y_size
|
|
- #current_n() == 0
|
|
!*/
|
|
|
|
long current_n (
|
|
) const;
|
|
/*!
|
|
ensures
|
|
- returns the number of samples that have been presented to this object.
|
|
!*/
|
|
|
|
template <typename T, typename U>
|
|
void add (
|
|
const T& x,
|
|
const U& y
|
|
);
|
|
/*!
|
|
requires
|
|
- x and y must represent column vectors. They can either be dlib::matrix
|
|
objects or some kind of unsorted sparse vector type. See the top of
|
|
dlib/svm/sparse_vector_abstract.h for a definition of unsorted sparse vector.
|
|
- x and y must have a number of dimensions which is compatible with the
|
|
current setting of x_vector_size() and y_vector_size(). In particular,
|
|
this means that the following must hold:
|
|
- if (x or y is a sparse vector type) then
|
|
- x_vector_size() > 0 && y_vector_size() > 0
|
|
(i.e. you must call set_dimensions() prior to calling add() if
|
|
you want to use sparse vectors.)
|
|
- if (x is a dlib::matrix) then
|
|
- x_vector_size() == 0 || x.size() == x_vector_size()
|
|
- else
|
|
- max_index_plus_one(x) <= x_vector_size()
|
|
- if (y is a dlib::matrix) then
|
|
- y_vector_size() == 0 || y.size() == y_vector_size()
|
|
- else
|
|
- max_index_plus_one(y) <= y_vector_size()
|
|
ensures
|
|
- updates the mean and cross-covariance matrices stored in this object so
|
|
that the new (x,y) vector pair is factored into them.
|
|
- if (x is a dlib::matrix) then
|
|
- #x_vector_size() == x.size()
|
|
- if (y is a dlib::matrix) then
|
|
- #y_vector_size() == y.size()
|
|
!*/
|
|
|
|
const column_matrix mean_x (
|
|
) const;
|
|
/*!
|
|
requires
|
|
- current_n() != 0
|
|
ensures
|
|
- returns the mean of all the x vectors presented to this object so far.
|
|
- The returned vector will have x_vector_size() dimensions.
|
|
!*/
|
|
|
|
const column_matrix mean_y (
|
|
) const;
|
|
/*!
|
|
requires
|
|
- current_n() != 0
|
|
ensures
|
|
- returns the mean of all the y vectors presented to this object so far.
|
|
- The returned vector will have y_vector_size() dimensions.
|
|
!*/
|
|
|
|
const general_matrix covariance_xy (
|
|
) const;
|
|
/*!
|
|
requires
|
|
- current_n() > 1
|
|
ensures
|
|
- returns the unbiased sample cross-covariance matrix for all the vector
|
|
pairs presented to this object so far. In particular, returns a matrix
|
|
M such that:
|
|
- M.nr() == x_vector_size()
|
|
- M.nc() == y_vector_size()
|
|
- M == the cross-covariance matrix of the data given to add().
|
|
!*/
|
|
|
|
const running_cross_covariance operator+ (
|
|
const running_cross_covariance& item
|
|
) const;
|
|
/*!
|
|
requires
|
|
- x_vector_size() == 0 || item.x_vector_size() == 0 || x_vector_size() == item.x_vector_size()
|
|
(i.e. the x_vector_size() of *this and item must match or one must be zero)
|
|
- y_vector_size() == 0 || item.y_vector_size() == 0 || y_vector_size() == item.y_vector_size()
|
|
(i.e. the y_vector_size() of *this and item must match or one must be zero)
|
|
ensures
|
|
- returns a new running_cross_covariance object that represents the
|
|
combination of all the vectors given to *this and item. That is, this
|
|
function returns a running_cross_covariance object, R, that is equivalent
|
|
to what you would obtain if all calls to this->add() and item.add() had
|
|
instead been done to R.
|
|
!*/
|
|
};
|
|
|
|
// ----------------------------------------------------------------------------------------
|
|
|
|
template <
    typename matrix_type
    >
class vector_normalizer
{
    /*!
        REQUIREMENTS ON matrix_type
            - must be a dlib::matrix object capable of representing column
              vectors

        INITIAL VALUE
            - in_vector_size() == 0
            - out_vector_size() == 0
            - means().size() == 0
            - std_devs().size() == 0

        WHAT THIS OBJECT REPRESENTS
            This object represents something that can learn to normalize a set
            of column vectors.  In particular, normalized column vectors should
            have zero mean and a variance of one.

            Note that this object does not change the number of elements in a
            vector (after train(), out_vector_size() == in_vector_size()).  See
            vector_normalizer_pca for a variant that also uses principal
            component analysis to reduce the number of elements in a vector.

        THREAD SAFETY
            Note that this object contains a cached matrix object it uses
            to store intermediate results for normalization.  This avoids
            needing to reallocate it every time this object performs normalization
            but also makes it non-thread safe.  So make sure you don't share
            instances of this object between threads.
    !*/

public:
    typedef typename matrix_type::mem_manager_type mem_manager_type;
    typedef typename matrix_type::type scalar_type;
    typedef matrix_type result_type;

    template <typename vector_type>
    void train (
        const vector_type& samples
    );
    /*!
        requires
            - samples.size() > 0
            - samples == a column matrix or something convertible to a column
              matrix via mat().  Also, samples should contain
              matrix_type objects that represent nonempty column vectors.
            - samples does not contain any infinite or NaN values
        ensures
            - #in_vector_size() == samples(0).nr()
            - #out_vector_size() == samples(0).nr()
            - This object has learned how to normalize vectors that look like
              vectors in the given set of samples.
            - #means() == mean(samples)
            - #std_devs() == reciprocal(sqrt(variance(samples)));
    !*/

    long in_vector_size (
    ) const;
    /*!
        ensures
            - returns the number of rows that input vectors are
              required to contain if they are to be normalized by
              this object.
    !*/

    long out_vector_size (
    ) const;
    /*!
        ensures
            - returns the number of rows in the normalized vectors
              that come out of this object.
    !*/

    const matrix_type& means (
    ) const;
    /*!
        ensures
            - returns a matrix M such that:
                - M.nc() == 1
                - M.nr() == in_vector_size()
                - M(i) == the mean of the ith input feature shown to train()
    !*/

    const matrix_type& std_devs (
    ) const;
    /*!
        ensures
            - returns a matrix SD such that:
                - SD.nc() == 1
                - SD.nr() == in_vector_size()
                - SD(i) == the reciprocal of the standard deviation of the ith
                  input feature shown to train()
    !*/

    const result_type& operator() (
        const matrix_type& x
    ) const;
    /*!
        requires
            - x.nr() == in_vector_size()
            - x.nc() == 1
        ensures
            - returns a normalized version of x, call it Z, that has the
              following properties:
                - Z.nr() == out_vector_size()
                - Z.nc() == 1
                - the mean of each element of Z is 0
                - the variance of each element of Z is 1
                - Z == pointwise_multiply(x-means(), std_devs());
    !*/

    void swap (
        vector_normalizer& item
    );
    /*!
        ensures
            - swaps *this and item
    !*/
};
|
|
|
|
template <
|
|
typename matrix_type
|
|
>
|
|
inline void swap (
|
|
vector_normalizer<matrix_type>& a,
|
|
vector_normalizer<matrix_type>& b
|
|
) { a.swap(b); }
|
|
/*!
|
|
provides a global swap function
|
|
!*/
|
|
|
|
template <
|
|
typename matrix_type,
|
|
>
|
|
void deserialize (
|
|
vector_normalizer<matrix_type>& item,
|
|
std::istream& in
|
|
);
|
|
/*!
|
|
provides deserialization support
|
|
!*/
|
|
|
|
template <
|
|
typename matrix_type,
|
|
>
|
|
void serialize (
|
|
const vector_normalizer<matrix_type>& item,
|
|
std::ostream& out
|
|
);
|
|
/*!
|
|
provides serialization support
|
|
!*/
|
|
|
|
// ----------------------------------------------------------------------------------------
|
|
|
|
template <
|
|
typename matrix_type
|
|
>
|
|
class vector_normalizer_pca
|
|
{
|
|
/*!
|
|
REQUIREMENTS ON matrix_type
|
|
- must be a dlib::matrix object capable of representing column
|
|
vectors
|
|
|
|
INITIAL VALUE
|
|
- in_vector_size() == 0
|
|
- out_vector_size() == 0
|
|
- means().size() == 0
|
|
- std_devs().size() == 0
|
|
- pca_matrix().size() == 0
|
|
|
|
WHAT THIS OBJECT REPRESENTS
|
|
This object represents something that can learn to normalize a set
|
|
of column vectors. In particular, normalized column vectors should
|
|
have zero mean and a variance of one.
|
|
|
|
Also, this object uses principal component analysis for the purposes
|
|
of reducing the number of elements in a vector.
|
|
|
|
THREAD SAFETY
|
|
Note that this object contains a cached matrix object it uses
|
|
to store intermediate results for normalization. This avoids
|
|
needing to reallocate it every time this object performs normalization
|
|
but also makes it non-thread safe. So make sure you don't share
|
|
instances of this object between threads.
|
|
!*/
|
|
|
|
public:
|
|
typedef typename matrix_type::mem_manager_type mem_manager_type;
|
|
typedef typename matrix_type::type scalar_type;
|
|
typedef matrix<scalar_type,0,1,mem_manager_type> result_type;
|
|
|
|
template <typename vector_type>
|
|
void train (
|
|
const vector_type& samples,
|
|
const double eps = 0.99
|
|
);
|
|
/*!
|
|
requires
|
|
- 0 < eps <= 1
|
|
- samples.size() > 0
|
|
- samples == a column matrix or something convertible to a column
|
|
matrix via mat(). Also, x should contain
|
|
matrix_type objects that represent nonempty column vectors.
|
|
- samples does not contain any infinite or NaN values
|
|
ensures
|
|
- This object has learned how to normalize vectors that look like
|
|
vectors in the given set of samples.
|
|
- Principal component analysis is performed to find a transform
|
|
that might reduce the number of output features.
|
|
- #in_vector_size() == samples(0).nr()
|
|
- 0 < #out_vector_size() <= samples(0).nr()
|
|
- eps is a number that controls how "lossy" the pca transform will be.
|
|
Large values of eps result in #out_vector_size() being larger and
|
|
smaller values of eps result in #out_vector_size() being smaller.
|
|
- #means() == mean(samples)
|
|
- #std_devs() == reciprocal(sqrt(variance(samples)));
|
|
- #pca_matrix() == the PCA transform matrix that is out_vector_size()
|
|
rows by in_vector_size() columns.
|
|
!*/
|
|
|
|
long in_vector_size (
|
|
) const;
|
|
/*!
|
|
ensures
|
|
- returns the number of rows that input vectors are
|
|
required to contain if they are to be normalized by
|
|
this object.
|
|
!*/
|
|
|
|
long out_vector_size (
|
|
) const;
|
|
/*!
|
|
ensures
|
|
- returns the number of rows in the normalized vectors
|
|
that come out of this object.
|
|
!*/
|
|
|
|
const matrix<scalar_type,0,1,mem_manager_type>& means (
|
|
) const;
|
|
/*!
|
|
ensures
|
|
- returns a matrix M such that:
|
|
- M.nc() == 1
|
|
- M.nr() == in_vector_size()
|
|
- M(i) == the mean of the ith input feature shown to train()
|
|
!*/
|
|
|
|
const matrix<scalar_type,0,1,mem_manager_type>& std_devs (
|
|
) const;
|
|
/*!
|
|
ensures
|
|
- returns a matrix SD such that:
|
|
- SD.nc() == 1
|
|
- SD.nr() == in_vector_size()
|
|
- SD(i) == the reciprocal of the standard deviation of the ith
|
|
input feature shown to train()
|
|
!*/
|
|
|
|
const matrix<scalar_type,0,0,mem_manager_type>& pca_matrix (
|
|
) const;
|
|
/*!
|
|
ensures
|
|
- returns a matrix PCA such that:
|
|
- PCA.nr() == out_vector_size()
|
|
- PCA.nc() == in_vector_size()
|
|
- PCA == the principal component analysis transformation
|
|
matrix
|
|
!*/
|
|
|
|
const result_type& operator() (
|
|
const matrix_type& x
|
|
) const;
|
|
/*!
|
|
requires
|
|
- x.nr() == in_vector_size()
|
|
- x.nc() == 1
|
|
ensures
|
|
- returns a normalized version of x, call it Z, that has the
|
|
following properties:
|
|
- Z.nr() == out_vector_size()
|
|
- Z.nc() == 1
|
|
- the mean of each element of Z is 0
|
|
- the variance of each element of Z is 1
|
|
- Z == pca_matrix()*pointwise_multiply(x-means(), std_devs());
|
|
!*/
|
|
|
|
void swap (
|
|
vector_normalizer_pca& item
|
|
);
|
|
/*!
|
|
ensures
|
|
- swaps *this and item
|
|
!*/
|
|
};
|
|
|
|
template <
|
|
typename matrix_type
|
|
>
|
|
inline void swap (
|
|
vector_normalizer_pca<matrix_type>& a,
|
|
vector_normalizer_pca<matrix_type>& b
|
|
) { a.swap(b); }
|
|
/*!
|
|
provides a global swap function
|
|
!*/
|
|
|
|
template <
|
|
typename matrix_type,
|
|
>
|
|
void deserialize (
|
|
vector_normalizer_pca<matrix_type>& item,
|
|
std::istream& in
|
|
);
|
|
/*!
|
|
provides deserialization support
|
|
!*/
|
|
|
|
template <
|
|
typename matrix_type,
|
|
>
|
|
void serialize (
|
|
const vector_normalizer_pca<matrix_type>& item,
|
|
std::ostream& out
|
|
);
|
|
/*!
|
|
provides serialization support
|
|
!*/
|
|
|
|
// ----------------------------------------------------------------------------------------
|
|
|
|
}
|
|
|
|
#endif // DLIB_STATISTICs_ABSTRACT_
|
|
|