254 lines
8.7 KiB
C++
254 lines
8.7 KiB
C++
// Copyright (C) 2015 Davis E. King (davis@dlib.net)
|
|
// License: Boost Software License See LICENSE.txt for the full license.
|
|
#ifndef DLIB_BOTTOM_uP_CLUSTER_Hh_
|
|
#define DLIB_BOTTOM_uP_CLUSTER_Hh_
|
|
|
|
#include <queue>
|
|
#include <map>
|
|
|
|
#include "bottom_up_cluster_abstract.h"
|
|
#include "../algs.h"
|
|
#include "../matrix.h"
|
|
#include "../disjoint_subsets.h"
|
|
#include "../graph_utils.h"
|
|
|
|
|
|
namespace dlib
|
|
{
|
|
|
|
// ----------------------------------------------------------------------------------------
|
|
|
|
namespace buc_impl
|
|
{
|
|
inline void merge_sets (
|
|
matrix<double>& dists,
|
|
unsigned long dest,
|
|
unsigned long src
|
|
)
|
|
{
|
|
for (long r = 0; r < dists.nr(); ++r)
|
|
dists(dest,r) = dists(r,dest) = std::max(dists(r,dest), dists(r,src));
|
|
}
|
|
|
|
struct compare_dist
|
|
{
|
|
bool operator() (
|
|
const sample_pair& a,
|
|
const sample_pair& b
|
|
) const
|
|
{
|
|
return a.distance() > b.distance();
|
|
}
|
|
};
|
|
}
|
|
|
|
// ----------------------------------------------------------------------------------------
|
|
|
|
template <
|
|
typename EXP
|
|
>
|
|
unsigned long bottom_up_cluster (
|
|
const matrix_exp<EXP>& dists_,
|
|
std::vector<unsigned long>& labels,
|
|
unsigned long min_num_clusters,
|
|
double max_dist = std::numeric_limits<double>::infinity()
|
|
)
|
|
{
|
|
matrix<double> dists = matrix_cast<double>(dists_);
|
|
// make sure requires clause is not broken
|
|
DLIB_CASSERT(dists.nr() == dists.nc() && min_num_clusters > 0,
|
|
"\t unsigned long bottom_up_cluster()"
|
|
<< "\n\t Invalid inputs were given to this function."
|
|
<< "\n\t dists.nr(): " << dists.nr()
|
|
<< "\n\t dists.nc(): " << dists.nc()
|
|
<< "\n\t min_num_clusters: " << min_num_clusters
|
|
);
|
|
|
|
using namespace buc_impl;
|
|
|
|
labels.resize(dists.nr());
|
|
disjoint_subsets sets;
|
|
sets.set_size(dists.nr());
|
|
if (labels.size() == 0)
|
|
return 0;
|
|
|
|
// push all the edges in the graph into a priority queue so the best edges to merge
|
|
// come first.
|
|
std::priority_queue<sample_pair, std::vector<sample_pair>, compare_dist> que;
|
|
for (long r = 0; r < dists.nr(); ++r)
|
|
for (long c = r+1; c < dists.nc(); ++c)
|
|
que.push(sample_pair(r,c,dists(r,c)));
|
|
|
|
// Now start merging nodes.
|
|
for (unsigned long iter = min_num_clusters; iter < sets.size(); ++iter)
|
|
{
|
|
// find the next best thing to merge.
|
|
double best_dist = que.top().distance();
|
|
unsigned long a = sets.find_set(que.top().index1());
|
|
unsigned long b = sets.find_set(que.top().index2());
|
|
que.pop();
|
|
// we have been merging and modifying the distances, so make sure this distance
|
|
// is still valid and these guys haven't been merged already.
|
|
while(a == b || best_dist < dists(a,b))
|
|
{
|
|
// Haven't merged it yet, so put it back in with updated distance for
|
|
// reconsideration later.
|
|
if (a != b)
|
|
que.push(sample_pair(a, b, dists(a, b)));
|
|
|
|
best_dist = que.top().distance();
|
|
a = sets.find_set(que.top().index1());
|
|
b = sets.find_set(que.top().index2());
|
|
que.pop();
|
|
}
|
|
|
|
|
|
// now merge these sets if the best distance is small enough
|
|
if (best_dist > max_dist)
|
|
break;
|
|
unsigned long news = sets.merge_sets(a,b);
|
|
unsigned long olds = (news==a)?b:a;
|
|
merge_sets(dists, news, olds);
|
|
}
|
|
|
|
// figure out which cluster each element is in. Also make sure the labels are
|
|
// contiguous.
|
|
std::map<unsigned long, unsigned long> relabel;
|
|
for (unsigned long r = 0; r < labels.size(); ++r)
|
|
{
|
|
unsigned long l = sets.find_set(r);
|
|
// relabel to make contiguous
|
|
if (relabel.count(l) == 0)
|
|
{
|
|
unsigned long next = relabel.size();
|
|
relabel[l] = next;
|
|
}
|
|
labels[r] = relabel[l];
|
|
}
|
|
|
|
|
|
return relabel.size();
|
|
}
|
|
|
|
// ----------------------------------------------------------------------------------------
|
|
// ----------------------------------------------------------------------------------------
|
|
|
|
struct snl_range
|
|
{
|
|
snl_range() = default;
|
|
snl_range(double val) : lower(val), upper(val) {}
|
|
snl_range(double l, double u) : lower(l), upper(u) { DLIB_ASSERT(lower <= upper)}
|
|
|
|
double lower = 0;
|
|
double upper = 0;
|
|
|
|
double width() const { return upper-lower; }
|
|
bool operator<(const snl_range& item) const { return lower < item.lower; }
|
|
};
|
|
|
|
inline snl_range merge(const snl_range& a, const snl_range& b)
|
|
{
|
|
return snl_range(std::min(a.lower, b.lower), std::max(a.upper, b.upper));
|
|
}
|
|
|
|
inline double distance (const snl_range& a, const snl_range& b)
|
|
{
|
|
return std::max(a.lower,b.lower) - std::min(a.upper,b.upper);
|
|
}
|
|
|
|
inline std::ostream& operator<< (std::ostream& out, const snl_range& item )
|
|
{
|
|
out << "["<<item.lower<<","<<item.upper<<"]";
|
|
return out;
|
|
}
|
|
|
|
// ----------------------------------------------------------------------------------------
|
|
|
|
inline std::vector<snl_range> segment_number_line (
|
|
const std::vector<double>& x,
|
|
const double max_range_width
|
|
)
|
|
{
|
|
DLIB_CASSERT(max_range_width >= 0);
|
|
|
|
// create initial ranges, one for each value in x. So initially, all the ranges have
|
|
// width of 0.
|
|
std::vector<snl_range> ranges;
|
|
for (auto v : x)
|
|
ranges.push_back(v);
|
|
std::sort(ranges.begin(), ranges.end());
|
|
|
|
std::vector<snl_range> greedy_final_ranges;
|
|
if (ranges.size() == 0)
|
|
return greedy_final_ranges;
|
|
// We will try two different clustering strategies. One that does a simple greedy left
|
|
// to right sweep and another that does a bottom up agglomerative clustering. This
|
|
// first loop runs the greedy left to right sweep. Then at the end of this routine we
|
|
// will return the results that produced the tightest clustering.
|
|
greedy_final_ranges.push_back(ranges[0]);
|
|
for (size_t i = 1; i < ranges.size(); ++i)
|
|
{
|
|
auto m = merge(greedy_final_ranges.back(), ranges[i]);
|
|
if (m.width() <= max_range_width)
|
|
greedy_final_ranges.back() = m;
|
|
else
|
|
greedy_final_ranges.push_back(ranges[i]);
|
|
}
|
|
|
|
|
|
// Here we do the bottom up clustering. So compute the edges connecting our ranges.
|
|
// We will simply say there are edges between ranges if and only if they are
|
|
// immediately adjacent on the number line.
|
|
std::vector<sample_pair> edges;
|
|
for (size_t i = 1; i < ranges.size(); ++i)
|
|
edges.push_back(sample_pair(i-1,i, distance(ranges[i-1],ranges[i])));
|
|
std::sort(edges.begin(), edges.end(), order_by_distance<sample_pair>);
|
|
|
|
disjoint_subsets sets;
|
|
sets.set_size(ranges.size());
|
|
|
|
// Now start merging nodes.
|
|
for (auto edge : edges)
|
|
{
|
|
// find the next best thing to merge.
|
|
unsigned long a = sets.find_set(edge.index1());
|
|
unsigned long b = sets.find_set(edge.index2());
|
|
|
|
// merge it if it doesn't result in an interval that's too big.
|
|
auto m = merge(ranges[a], ranges[b]);
|
|
if (m.width() <= max_range_width)
|
|
{
|
|
unsigned long news = sets.merge_sets(a,b);
|
|
ranges[news] = m;
|
|
}
|
|
}
|
|
|
|
// Now create a list of the final ranges. We will do this by keeping track of which
|
|
// range we already added to final_ranges.
|
|
std::vector<snl_range> final_ranges;
|
|
std::vector<bool> already_output(ranges.size(), false);
|
|
for (unsigned long i = 0; i < sets.size(); ++i)
|
|
{
|
|
auto s = sets.find_set(i);
|
|
if (!already_output[s])
|
|
{
|
|
final_ranges.push_back(ranges[s]);
|
|
already_output[s] = true;
|
|
}
|
|
}
|
|
|
|
// only use the greedy clusters if they found a clustering with fewer clusters.
|
|
// Otherwise, the bottom up clustering probably produced a more sensible clustering.
|
|
if (final_ranges.size() <= greedy_final_ranges.size())
|
|
return final_ranges;
|
|
else
|
|
return greedy_final_ranges;
|
|
}
|
|
|
|
// ----------------------------------------------------------------------------------------
|
|
|
|
}
|
|
|
|
#endif // DLIB_BOTTOM_uP_CLUSTER_Hh_
|
|
|