409 lines
16 KiB
C++
409 lines
16 KiB
C++
// Copyright (C) 2010 Davis E. King (davis@dlib.net)
|
|
// License: Boost Software License See LICENSE.txt for the full license.
|
|
|
|
#include "tester.h"
|
|
#include <dlib/manifold_regularization.h>
|
|
#include <dlib/svm.h>
|
|
#include <dlib/rand.h>
|
|
#include <dlib/string.h>
|
|
#include <dlib/graph_utils_threaded.h>
|
|
#include <vector>
|
|
#include <sstream>
|
|
#include <ctime>
|
|
|
|
namespace
|
|
{
|
|
using namespace test;
|
|
using namespace dlib;
|
|
using namespace std;
|
|
dlib::logger dlog("test.linear_manifold_regularizer");
|
|
|
|
template <typename hash_type, typename samples_type>
|
|
void test_find_k_nearest_neighbors_lsh(
|
|
const samples_type& samples
|
|
)
|
|
{
|
|
std::vector<sample_pair> edges1, edges2;
|
|
|
|
find_k_nearest_neighbors(samples, cosine_distance(), 2, edges1);
|
|
find_k_nearest_neighbors_lsh(samples, cosine_distance(), hash_type(), 2, 6, edges2, 2);
|
|
|
|
std::sort(edges1.begin(), edges1.end(), order_by_index<sample_pair>);
|
|
std::sort(edges2.begin(), edges2.end(), order_by_index<sample_pair>);
|
|
|
|
DLIB_TEST_MSG(edges1.size() == edges2.size(), edges1.size() << " " << edges2.size());
|
|
for (unsigned long i = 0; i < edges1.size(); ++i)
|
|
{
|
|
DLIB_TEST(edges1[i] == edges2[i]);
|
|
DLIB_TEST_MSG(std::abs(edges1[i].distance() - edges2[i].distance()) < 1e-7,
|
|
edges1[i].distance() - edges2[i].distance());
|
|
}
|
|
}
|
|
|
|
template <typename scalar_type>
|
|
void test_knn_lsh_sparse()
|
|
{
|
|
dlib::rand rnd;
|
|
std::vector<std::map<unsigned long,scalar_type> > samples;
|
|
samples.resize(20);
|
|
for (unsigned int i = 0; i < samples.size(); ++i)
|
|
{
|
|
samples[i][0] = rnd.get_random_gaussian();
|
|
samples[i][2] = rnd.get_random_gaussian();
|
|
}
|
|
|
|
test_find_k_nearest_neighbors_lsh<hash_similar_angles_64>(samples);
|
|
test_find_k_nearest_neighbors_lsh<hash_similar_angles_128>(samples);
|
|
test_find_k_nearest_neighbors_lsh<hash_similar_angles_256>(samples);
|
|
test_find_k_nearest_neighbors_lsh<hash_similar_angles_512>(samples);
|
|
}
|
|
|
|
template <typename scalar_type>
|
|
void test_knn_lsh_dense()
|
|
{
|
|
dlib::rand rnd;
|
|
std::vector<matrix<scalar_type,0,1> > samples;
|
|
samples.resize(20);
|
|
for (unsigned int i = 0; i < samples.size(); ++i)
|
|
{
|
|
samples[i].set_size(2);
|
|
samples[i](0) = rnd.get_random_gaussian();
|
|
samples[i](1) = rnd.get_random_gaussian();
|
|
}
|
|
|
|
test_find_k_nearest_neighbors_lsh<hash_similar_angles_64>(samples);
|
|
test_find_k_nearest_neighbors_lsh<hash_similar_angles_128>(samples);
|
|
test_find_k_nearest_neighbors_lsh<hash_similar_angles_256>(samples);
|
|
test_find_k_nearest_neighbors_lsh<hash_similar_angles_512>(samples);
|
|
}
|
|
|
|
|
|
|
|
class linear_manifold_regularizer_tester : public tester
|
|
{
|
|
/*!
|
|
WHAT THIS OBJECT REPRESENTS
|
|
This object represents a unit test. When it is constructed
|
|
it adds itself into the testing framework.
|
|
!*/
|
|
public:
|
|
linear_manifold_regularizer_tester (
|
|
) :
|
|
tester (
|
|
"test_linear_manifold_regularizer", // the command line argument name for this test
|
|
"Run tests on the linear_manifold_regularizer object.", // the command line argument description
|
|
0 // the number of command line arguments for this test
|
|
)
|
|
{
|
|
seed = 1;
|
|
}
|
|
|
|
dlib::rand rnd;
|
|
|
|
unsigned long seed;
|
|
|
|
typedef matrix<double, 0, 1> sample_type;
|
|
typedef radial_basis_kernel<sample_type> kernel_type;
|
|
|
|
void do_the_test()
|
|
{
|
|
print_spinner();
|
|
std::vector<sample_type> samples;
|
|
|
|
// Declare an instance of the kernel we will be using.
|
|
const kernel_type kern(0.1);
|
|
|
|
const unsigned long num_points = 200;
|
|
|
|
// create a large dataset with two concentric circles.
|
|
generate_circle(samples, 1, num_points); // circle of radius 1
|
|
generate_circle(samples, 5, num_points); // circle of radius 5
|
|
|
|
std::vector<sample_pair> edges;
|
|
find_percent_shortest_edges_randomly(samples, squared_euclidean_distance(0.1, 4), 1, 10000, "random seed", edges);
|
|
|
|
dlog << LTRACE << "number of edges generated: " << edges.size();
|
|
|
|
empirical_kernel_map<kernel_type> ekm;
|
|
|
|
ekm.load(kern, randomly_subsample(samples, 100));
|
|
|
|
// Project all the samples into the span of our 50 basis samples
|
|
for (unsigned long i = 0; i < samples.size(); ++i)
|
|
samples[i] = ekm.project(samples[i]);
|
|
|
|
|
|
// Now create the manifold regularizer. The result is a transformation matrix that
|
|
// embodies the manifold assumption discussed above.
|
|
linear_manifold_regularizer<sample_type> lmr;
|
|
lmr.build(samples, edges, use_gaussian_weights(0.1));
|
|
matrix<double> T = lmr.get_transformation_matrix(10000);
|
|
|
|
print_spinner();
|
|
|
|
// generate the T matrix manually and make sure it matches. The point of this test
|
|
// is to make sure that the more complex version of this that happens inside the linear_manifold_regularizer
|
|
// is correct. It uses a tedious block of loops to do it in a way that is a lot faster for sparse
|
|
// W matrices but isn't super straight forward.
|
|
matrix<double> X(samples[0].size(), samples.size());
|
|
for (unsigned long i = 0; i < samples.size(); ++i)
|
|
set_colm(X,i) = samples[i];
|
|
|
|
matrix<double> W(samples.size(), samples.size());
|
|
W = 0;
|
|
for (unsigned long i = 0; i < edges.size(); ++i)
|
|
{
|
|
W(edges[i].index1(), edges[i].index2()) = use_gaussian_weights(0.1)(edges[i]);
|
|
W(edges[i].index2(), edges[i].index1()) = use_gaussian_weights(0.1)(edges[i]);
|
|
}
|
|
matrix<double> L = diagm(sum_rows(W)) - W;
|
|
matrix<double> trueT = inv_lower_triangular(chol(identity_matrix<double>(X.nr()) + (10000.0/sum(lowerm(W)))*X*L*trans(X)));
|
|
|
|
dlog << LTRACE << "T error: "<< max(abs(T - trueT));
|
|
DLIB_TEST(max(abs(T - trueT)) < 1e-7);
|
|
|
|
|
|
print_spinner();
|
|
// Apply the transformation generated by the linear_manifold_regularizer to
|
|
// all our samples.
|
|
for (unsigned long i = 0; i < samples.size(); ++i)
|
|
samples[i] = T*samples[i];
|
|
|
|
|
|
// For convenience, generate a projection_function and merge the transformation
|
|
// matrix T into it.
|
|
projection_function<kernel_type> proj = ekm.get_projection_function();
|
|
proj.weights = T*proj.weights;
|
|
|
|
|
|
// Pick 2 different labeled points. One on the inner circle and another on the outer.
|
|
// For each of these test points we will see if using the single plane that separates
|
|
// them is a good way to separate the concentric circles. Also do this a bunch
|
|
// of times with different randomly chosen points so we can see how robust the result is.
|
|
for (int itr = 0; itr < 10; ++itr)
|
|
{
|
|
print_spinner();
|
|
std::vector<sample_type> test_points;
|
|
// generate a random point from the radius 1 circle
|
|
generate_circle(test_points, 1, 1);
|
|
// generate a random point from the radius 5 circle
|
|
generate_circle(test_points, 5, 1);
|
|
|
|
// project the two test points into kernel space. Recall that this projection_function
|
|
// has the manifold regularizer incorporated into it.
|
|
const sample_type class1_point = proj(test_points[0]);
|
|
const sample_type class2_point = proj(test_points[1]);
|
|
|
|
double num_wrong = 0;
|
|
|
|
// Now attempt to classify all the data samples according to which point
|
|
// they are closest to. The output of this program shows that without manifold
|
|
// regularization this test will fail but with it it will perfectly classify
|
|
// all the points.
|
|
for (unsigned long i = 0; i < samples.size(); ++i)
|
|
{
|
|
double distance_to_class1 = length(samples[i] - class1_point);
|
|
double distance_to_class2 = length(samples[i] - class2_point);
|
|
|
|
bool predicted_as_class_1 = (distance_to_class1 < distance_to_class2);
|
|
|
|
bool really_is_class_1 = (i < num_points);
|
|
|
|
// now count how many times we make a mistake
|
|
if (predicted_as_class_1 != really_is_class_1)
|
|
++num_wrong;
|
|
}
|
|
|
|
DLIB_TEST_MSG(num_wrong == 0, num_wrong);
|
|
}
|
|
|
|
}
|
|
|
|
void generate_circle (
|
|
std::vector<sample_type>& samples,
|
|
double radius,
|
|
const long num
|
|
)
|
|
{
|
|
sample_type m(2,1);
|
|
|
|
for (long i = 0; i < num; ++i)
|
|
{
|
|
double sign = 1;
|
|
if (rnd.get_random_double() < 0.5)
|
|
sign = -1;
|
|
m(0) = 2*radius*rnd.get_random_double()-radius;
|
|
m(1) = sign*sqrt(radius*radius - m(0)*m(0));
|
|
|
|
samples.push_back(m);
|
|
}
|
|
}
|
|
|
|
|
|
void test_knn1()
|
|
{
|
|
std::vector<matrix<double,2,1> > samples;
|
|
|
|
matrix<double,2,1> test;
|
|
|
|
test = 0,0; samples.push_back(test);
|
|
test = 1,1; samples.push_back(test);
|
|
test = 1,-1; samples.push_back(test);
|
|
test = -1,1; samples.push_back(test);
|
|
test = -1,-1; samples.push_back(test);
|
|
|
|
std::vector<sample_pair> edges;
|
|
find_k_nearest_neighbors(samples, squared_euclidean_distance(), 1, edges);
|
|
DLIB_TEST(edges.size() == 4);
|
|
|
|
std::sort(edges.begin(), edges.end(), &order_by_index<sample_pair>);
|
|
|
|
DLIB_TEST(edges[0] == sample_pair(0,1,0));
|
|
DLIB_TEST(edges[1] == sample_pair(0,2,0));
|
|
DLIB_TEST(edges[2] == sample_pair(0,3,0));
|
|
DLIB_TEST(edges[3] == sample_pair(0,4,0));
|
|
|
|
find_k_nearest_neighbors(samples, squared_euclidean_distance(), 3, edges);
|
|
DLIB_TEST(edges.size() == 8);
|
|
|
|
find_k_nearest_neighbors(samples, squared_euclidean_distance(3.9, 4.1), 3, edges);
|
|
DLIB_TEST(edges.size() == 4);
|
|
|
|
std::sort(edges.begin(), edges.end(), &order_by_index<sample_pair>);
|
|
|
|
DLIB_TEST(edges[0] == sample_pair(1,2,0));
|
|
DLIB_TEST(edges[1] == sample_pair(1,3,0));
|
|
DLIB_TEST(edges[2] == sample_pair(2,4,0));
|
|
DLIB_TEST(edges[3] == sample_pair(3,4,0));
|
|
|
|
find_k_nearest_neighbors(samples, squared_euclidean_distance(30000, 4.1), 3, edges);
|
|
DLIB_TEST(edges.size() == 0);
|
|
}
|
|
|
|
void test_knn1_approx()
|
|
{
|
|
std::vector<matrix<double,2,1> > samples;
|
|
|
|
matrix<double,2,1> test;
|
|
|
|
test = 0,0; samples.push_back(test);
|
|
test = 1,1; samples.push_back(test);
|
|
test = 1,-1; samples.push_back(test);
|
|
test = -1,1; samples.push_back(test);
|
|
test = -1,-1; samples.push_back(test);
|
|
|
|
std::vector<sample_pair> edges;
|
|
find_approximate_k_nearest_neighbors(samples, squared_euclidean_distance(), 1, 10000, seed, edges);
|
|
DLIB_TEST(edges.size() == 4);
|
|
|
|
std::sort(edges.begin(), edges.end(), &order_by_index<sample_pair>);
|
|
|
|
DLIB_TEST(edges[0] == sample_pair(0,1,0));
|
|
DLIB_TEST(edges[1] == sample_pair(0,2,0));
|
|
DLIB_TEST(edges[2] == sample_pair(0,3,0));
|
|
DLIB_TEST(edges[3] == sample_pair(0,4,0));
|
|
|
|
find_approximate_k_nearest_neighbors(samples, squared_euclidean_distance(), 3, 10000, seed, edges);
|
|
DLIB_TEST(edges.size() == 8);
|
|
|
|
find_approximate_k_nearest_neighbors(samples, squared_euclidean_distance(3.9, 4.1), 3, 10000, seed, edges);
|
|
DLIB_TEST(edges.size() == 4);
|
|
|
|
std::sort(edges.begin(), edges.end(), &order_by_index<sample_pair>);
|
|
|
|
DLIB_TEST(edges[0] == sample_pair(1,2,0));
|
|
DLIB_TEST(edges[1] == sample_pair(1,3,0));
|
|
DLIB_TEST(edges[2] == sample_pair(2,4,0));
|
|
DLIB_TEST(edges[3] == sample_pair(3,4,0));
|
|
|
|
find_approximate_k_nearest_neighbors(samples, squared_euclidean_distance(30000, 4.1), 3, 10000, seed, edges);
|
|
DLIB_TEST(edges.size() == 0);
|
|
}
|
|
|
|
void test_knn2()
|
|
{
|
|
std::vector<matrix<double,2,1> > samples;
|
|
|
|
matrix<double,2,1> test;
|
|
|
|
test = 1,1; samples.push_back(test);
|
|
test = 1,-1; samples.push_back(test);
|
|
test = -1,1; samples.push_back(test);
|
|
test = -1,-1; samples.push_back(test);
|
|
|
|
std::vector<sample_pair> edges;
|
|
find_k_nearest_neighbors(samples, squared_euclidean_distance(), 2, edges);
|
|
DLIB_TEST(edges.size() == 4);
|
|
|
|
std::sort(edges.begin(), edges.end(), &order_by_index<sample_pair>);
|
|
|
|
DLIB_TEST(edges[0] == sample_pair(0,1,0));
|
|
DLIB_TEST(edges[1] == sample_pair(0,2,0));
|
|
DLIB_TEST(edges[2] == sample_pair(1,3,0));
|
|
DLIB_TEST(edges[3] == sample_pair(2,3,0));
|
|
|
|
find_k_nearest_neighbors(samples, squared_euclidean_distance(), 200, edges);
|
|
DLIB_TEST(edges.size() == 4*3/2);
|
|
}
|
|
|
|
void test_knn2_approx()
|
|
{
|
|
std::vector<matrix<double,2,1> > samples;
|
|
|
|
matrix<double,2,1> test;
|
|
|
|
test = 1,1; samples.push_back(test);
|
|
test = 1,-1; samples.push_back(test);
|
|
test = -1,1; samples.push_back(test);
|
|
test = -1,-1; samples.push_back(test);
|
|
|
|
std::vector<sample_pair> edges;
|
|
// For this simple graph and high number of samples we will do we should obtain the exact
|
|
// knn solution.
|
|
find_approximate_k_nearest_neighbors(samples, squared_euclidean_distance(), 2, 10000, seed, edges);
|
|
DLIB_TEST(edges.size() == 4);
|
|
|
|
std::sort(edges.begin(), edges.end(), &order_by_index<sample_pair>);
|
|
|
|
DLIB_TEST(edges[0] == sample_pair(0,1,0));
|
|
DLIB_TEST(edges[1] == sample_pair(0,2,0));
|
|
DLIB_TEST(edges[2] == sample_pair(1,3,0));
|
|
DLIB_TEST(edges[3] == sample_pair(2,3,0));
|
|
|
|
|
|
find_approximate_k_nearest_neighbors(samples, squared_euclidean_distance(), 200, 10000, seed, edges);
|
|
DLIB_TEST(edges.size() == 4*3/2);
|
|
}
|
|
|
|
void perform_test (
|
|
)
|
|
{
|
|
for (int i = 0; i < 5; ++i)
|
|
{
|
|
do_the_test();
|
|
|
|
++seed;
|
|
test_knn1_approx();
|
|
test_knn2_approx();
|
|
}
|
|
test_knn1();
|
|
test_knn2();
|
|
test_knn_lsh_sparse<double>();
|
|
test_knn_lsh_sparse<float>();
|
|
test_knn_lsh_dense<double>();
|
|
test_knn_lsh_dense<float>();
|
|
|
|
}
|
|
};
|
|
|
|
// Create an instance of this object. Doing this causes this test
|
|
// to be automatically inserted into the testing framework whenever this cpp file
|
|
// is linked into the project. Note that since we are inside an unnamed-namespace
|
|
// we won't get any linker errors about the symbol a being defined multiple times.
|
|
linear_manifold_regularizer_tester a;
|
|
|
|
}
|
|
|
|
|
|
|