429 lines
16 KiB
C++
429 lines
16 KiB
C++
// Copyright (C) 2010 Davis E. King (davis@dlib.net)
|
|
// License: Boost Software License See LICENSE.txt for the full license.
|
|
#ifndef DLIB_OPTIMIZATIoN_OCA_Hh_
|
|
#define DLIB_OPTIMIZATIoN_OCA_Hh_
|
|
|
|
#include "optimization_oca_abstract.h"
|
|
|
|
#include "../matrix.h"
|
|
#include "optimization_solve_qp_using_smo.h"
|
|
#include <vector>
|
|
#include "../sequence.h"
|
|
#include <chrono>
|
|
|
|
// ----------------------------------------------------------------------------------------
|
|
|
|
namespace dlib
|
|
{
|
|
template <typename matrix_type>
|
|
class oca_problem
|
|
{
|
|
public:
|
|
typedef typename matrix_type::type scalar_type;
|
|
|
|
virtual ~oca_problem() {}
|
|
|
|
virtual bool risk_has_lower_bound (
|
|
scalar_type&
|
|
) const { return false; }
|
|
|
|
virtual bool optimization_status (
|
|
scalar_type ,
|
|
scalar_type ,
|
|
scalar_type ,
|
|
scalar_type ,
|
|
unsigned long,
|
|
unsigned long
|
|
) const = 0;
|
|
|
|
virtual scalar_type get_c (
|
|
) const = 0;
|
|
|
|
virtual long get_num_dimensions (
|
|
) const = 0;
|
|
|
|
virtual void get_risk (
|
|
matrix_type& current_solution,
|
|
scalar_type& risk_value,
|
|
matrix_type& risk_subgradient
|
|
) const = 0;
|
|
|
|
};
|
|
|
|
// ----------------------------------------------------------------------------------------
|
|
|
|
class oca
|
|
{
|
|
public:
|
|
|
|
oca ()
|
|
{
|
|
sub_eps = 1e-2;
|
|
sub_max_iter = 50000;
|
|
|
|
inactive_thresh = 20;
|
|
max_runtime = std::chrono::hours(24*356*290); // 290 years
|
|
}
|
|
|
|
void set_subproblem_epsilon (
|
|
double eps_
|
|
) { sub_eps = eps_; }
|
|
|
|
double get_subproblem_epsilon (
|
|
) const { return sub_eps; }
|
|
|
|
void set_subproblem_max_iterations (
|
|
unsigned long sub_max_iter_
|
|
)
|
|
{
|
|
// make sure requires clause is not broken
|
|
DLIB_ASSERT(sub_max_iter_ > 0,
|
|
"\t void oca::set_subproblem_max_iterations"
|
|
<< "\n\t max iterations must be greater than 0"
|
|
<< "\n\t sub_max_iter_: " << sub_max_iter_
|
|
<< "\n\t this: " << this
|
|
);
|
|
|
|
sub_max_iter = sub_max_iter_;
|
|
}
|
|
|
|
unsigned long get_subproblem_max_iterations (
|
|
) const { return sub_max_iter; }
|
|
|
|
void set_max_runtime (
|
|
const std::chrono::nanoseconds& max_runtime_
|
|
)
|
|
{
|
|
max_runtime = max_runtime_;
|
|
}
|
|
|
|
std::chrono::nanoseconds get_max_runtime (
|
|
) const
|
|
{
|
|
return max_runtime;
|
|
}
|
|
|
|
void set_inactive_plane_threshold (
|
|
unsigned long inactive_thresh_
|
|
)
|
|
{
|
|
// make sure requires clause is not broken
|
|
DLIB_ASSERT(inactive_thresh_ > 0,
|
|
"\t void oca::set_inactive_plane_threshold"
|
|
<< "\n\t inactive threshold must be greater than 0"
|
|
<< "\n\t inactive_thresh_: " << inactive_thresh_
|
|
<< "\n\t this: " << this
|
|
);
|
|
|
|
inactive_thresh = inactive_thresh_;
|
|
}
|
|
|
|
unsigned long get_inactive_plane_threshold (
|
|
) const { return inactive_thresh; }
|
|
|
|
template <
|
|
typename matrix_type
|
|
>
|
|
typename matrix_type::type operator() (
|
|
const oca_problem<matrix_type>& problem,
|
|
matrix_type& w,
|
|
unsigned long num_nonnegative = 0,
|
|
unsigned long force_weight_to_1 = std::numeric_limits<unsigned long>::max()
|
|
) const
|
|
{
|
|
matrix_type empty_prior;
|
|
return oca_impl(problem, w, empty_prior, false, num_nonnegative, force_weight_to_1, 0);
|
|
}
|
|
|
|
template <
|
|
typename matrix_type
|
|
>
|
|
typename matrix_type::type solve_with_elastic_net (
|
|
const oca_problem<matrix_type>& problem,
|
|
matrix_type& w,
|
|
double lasso_lambda,
|
|
unsigned long force_weight_to_1 = std::numeric_limits<unsigned long>::max()
|
|
) const
|
|
{
|
|
matrix_type empty_prior;
|
|
return oca_impl(problem, w, empty_prior, false, 0, force_weight_to_1, lasso_lambda);
|
|
}
|
|
|
|
template <
|
|
typename matrix_type
|
|
>
|
|
typename matrix_type::type operator() (
|
|
const oca_problem<matrix_type>& problem,
|
|
matrix_type& w,
|
|
const matrix_type& prior
|
|
) const
|
|
{
|
|
// make sure requires clause is not broken
|
|
DLIB_ASSERT(is_col_vector(prior) && prior.size() == problem.get_num_dimensions(),
|
|
"\t scalar_type oca::operator()"
|
|
<< "\n\t The prior vector does not have the correct dimensions."
|
|
<< "\n\t is_col_vector(prior): " << is_col_vector(prior)
|
|
<< "\n\t prior.size(): " << prior.size()
|
|
<< "\n\t problem.get_num_dimensions(): " << problem.get_num_dimensions()
|
|
<< "\n\t this: " << this
|
|
);
|
|
// disable the force weight to 1 option for this mode. We also disable the
|
|
// non-negative constraints.
|
|
unsigned long force_weight_to_1 = std::numeric_limits<unsigned long>::max();
|
|
return oca_impl(problem, w, prior, true, 0, force_weight_to_1, 0);
|
|
}
|
|
|
|
private:
|
|
|
|
template <
|
|
typename matrix_type
|
|
>
|
|
typename matrix_type::type oca_impl (
|
|
const oca_problem<matrix_type>& problem,
|
|
matrix_type& w,
|
|
const matrix_type& prior,
|
|
bool have_prior,
|
|
unsigned long num_nonnegative,
|
|
unsigned long force_weight_to_1,
|
|
const double lasso_lambda
|
|
) const
|
|
{
|
|
const unsigned long num_dims = problem.get_num_dimensions();
|
|
|
|
// make sure requires clause is not broken
|
|
DLIB_ASSERT(problem.get_c() > 0 &&
|
|
problem.get_num_dimensions() > 0 &&
|
|
0 <= lasso_lambda && lasso_lambda < 1,
|
|
"\t scalar_type oca::operator()"
|
|
<< "\n\t The oca_problem is invalid"
|
|
<< "\n\t problem.get_c(): " << problem.get_c()
|
|
<< "\n\t problem.get_num_dimensions(): " << num_dims
|
|
<< "\n\t lasso_lambda: " << lasso_lambda
|
|
<< "\n\t this: " << this
|
|
);
|
|
if (have_prior)
|
|
{
|
|
DLIB_ASSERT(lasso_lambda == 0, "Solver doesn't support using a prior with lasso.");
|
|
DLIB_ASSERT(num_nonnegative == 0, "Solver doesn't support using a prior with non-negative constraints.");
|
|
}
|
|
else if (lasso_lambda != 0)
|
|
{
|
|
DLIB_ASSERT(num_nonnegative == 0, "Solver doesn't support using lasso with non-negative constraints.");
|
|
}
|
|
|
|
const double ridge_lambda = 1-lasso_lambda;
|
|
|
|
if (num_nonnegative > num_dims)
|
|
num_nonnegative = num_dims;
|
|
|
|
typedef typename matrix_type::type scalar_type;
|
|
typedef typename matrix_type::layout_type layout_type;
|
|
typedef typename matrix_type::mem_manager_type mem_manager_type;
|
|
typedef matrix_type vect_type;
|
|
|
|
const scalar_type C = problem.get_c();
|
|
|
|
typename sequence<vect_type>::kernel_2a planes;
|
|
std::vector<scalar_type> bs, miss_count;
|
|
|
|
vect_type new_plane, alpha, btemp;
|
|
|
|
w.set_size(num_dims, 1);
|
|
w = 0;
|
|
|
|
// The current objective value. Note also that w always contains
|
|
// the current solution.
|
|
scalar_type cur_obj = std::numeric_limits<scalar_type>::max();
|
|
|
|
// This will hold the cutting plane objective value. This value is
|
|
// a lower bound on the true optimal objective value.
|
|
scalar_type cp_obj = 0;
|
|
|
|
matrix<scalar_type,0,0,mem_manager_type, layout_type> K, Ktmp;
|
|
matrix<scalar_type,0,1,mem_manager_type, layout_type> lambda, d;
|
|
if (lasso_lambda != 0)
|
|
d.set_size(num_dims);
|
|
else
|
|
d.set_size(num_nonnegative);
|
|
d = lasso_lambda*ones_matrix(d);
|
|
|
|
scalar_type R_lower_bound;
|
|
if (problem.risk_has_lower_bound(R_lower_bound))
|
|
{
|
|
// The flat lower bounding plane is always good to have if we know
|
|
// what it is.
|
|
bs.push_back(R_lower_bound);
|
|
new_plane = zeros_matrix(w);
|
|
planes.add(0, new_plane);
|
|
alpha = uniform_matrix<scalar_type>(1,1, C);
|
|
miss_count.push_back(0);
|
|
|
|
K.set_size(1,1);
|
|
K(0,0) = 0;
|
|
}
|
|
|
|
const double prior_norm = have_prior ? 0.5*dot(prior,prior) : 0;
|
|
|
|
const auto time_to_stop = std::chrono::steady_clock::now() + max_runtime;
|
|
|
|
unsigned long counter = 0;
|
|
while (true)
|
|
{
|
|
|
|
// add the next cutting plane
|
|
scalar_type cur_risk;
|
|
if (force_weight_to_1 < (unsigned long)w.size())
|
|
w(force_weight_to_1) = 1;
|
|
|
|
problem.get_risk(w, cur_risk, new_plane);
|
|
|
|
if (force_weight_to_1 < (unsigned long)w.size())
|
|
{
|
|
// We basically arrange for the w(force_weight_to_1) element and all
|
|
// subsequent elements of w to not be involved in the optimization at
|
|
// all. An easy way to do this is to just make sure the elements of w
|
|
// corresponding elements in the subgradient are always set to zero
|
|
// while we run the cutting plane algorithm. The only time
|
|
// w(force_weight_to_1) is 1 is when we pass it to the oca_problem.
|
|
set_rowm(w, range(force_weight_to_1, w.size()-1)) = 0;
|
|
set_rowm(new_plane, range(force_weight_to_1, new_plane.size()-1)) = 0;
|
|
}
|
|
|
|
if (have_prior)
|
|
bs.push_back(cur_risk - dot(w,new_plane) + dot(prior,new_plane));
|
|
else
|
|
bs.push_back(cur_risk - dot(w,new_plane));
|
|
planes.add(planes.size(), new_plane);
|
|
miss_count.push_back(0);
|
|
|
|
// If alpha is empty then initialize it (we must always have sum(alpha) == C).
|
|
// But otherwise, just append a zero.
|
|
if (alpha.size() == 0)
|
|
alpha = uniform_matrix<scalar_type>(1,1, C);
|
|
else
|
|
alpha = join_cols(alpha,zeros_matrix<scalar_type>(1,1));
|
|
|
|
const scalar_type wnorm = 0.5*ridge_lambda*trans(w)*w + lasso_lambda*sum(abs(w));
|
|
const double prior_part = have_prior? dot(w,prior) : 0;
|
|
cur_obj = wnorm + C*cur_risk + prior_norm-prior_part;
|
|
|
|
// report current status
|
|
const scalar_type risk_gap = cur_risk - (cp_obj-wnorm+prior_part-prior_norm)/C;
|
|
if (counter > 0 && problem.optimization_status(cur_obj, cur_obj - cp_obj,
|
|
cur_risk, risk_gap, planes.size(), counter))
|
|
{
|
|
break;
|
|
}
|
|
if (std::chrono::steady_clock::now() >= time_to_stop)
|
|
break;
|
|
|
|
// compute kernel matrix for all the planes
|
|
K.swap(Ktmp);
|
|
K.set_size(planes.size(), planes.size());
|
|
// copy over the old K matrix
|
|
set_subm(K, 0,0, Ktmp.nr(), Ktmp.nc()) = Ktmp;
|
|
|
|
// now add the new row and column to K
|
|
for (unsigned long c = 0; c < planes.size(); ++c)
|
|
{
|
|
K(c, Ktmp.nc()) = dot(planes[c], planes[planes.size()-1]);
|
|
K(Ktmp.nc(), c) = K(c,Ktmp.nc());
|
|
}
|
|
|
|
|
|
// solve the cutting plane subproblem for the next w. We solve it to an
|
|
// accuracy that is related to how big the error gap is. Also, we multiply
|
|
// by ridge_lambda because the objective function for the QP we solve was
|
|
// implicitly scaled by ridge_lambda. That is, we want to ask the QP
|
|
// solver to solve the problem until the duality gap is 0.1 times smaller
|
|
// than what it is now. So the factor of ridge_lambda is necessary to make
|
|
// this happen.
|
|
scalar_type eps = std::min<scalar_type>(sub_eps, 0.1*ridge_lambda*(cur_obj-cp_obj));
|
|
// just a sanity check
|
|
if (eps < 1e-16)
|
|
eps = 1e-16;
|
|
// Note that we warm start this optimization by using the alpha from the last
|
|
// iteration as the starting point.
|
|
if (lasso_lambda != 0)
|
|
{
|
|
// copy planes into a matrix so we can call solve_qp4_using_smo()
|
|
matrix<scalar_type,0,0,mem_manager_type, layout_type> planes_mat(num_dims,planes.size());
|
|
for (unsigned long i = 0; i < planes.size(); ++i)
|
|
set_colm(planes_mat,i) = planes[i];
|
|
|
|
btemp = ridge_lambda*mat(bs) - trans(planes_mat)*d;
|
|
solve_qp4_using_smo(planes_mat, K, btemp, d, alpha, lambda, eps, sub_max_iter, (scalar_type)(2*lasso_lambda));
|
|
}
|
|
else if (num_nonnegative != 0)
|
|
{
|
|
// copy planes into a matrix so we can call solve_qp4_using_smo()
|
|
matrix<scalar_type,0,0,mem_manager_type, layout_type> planes_mat(num_nonnegative,planes.size());
|
|
for (unsigned long i = 0; i < planes.size(); ++i)
|
|
set_colm(planes_mat,i) = colm(planes[i],0,num_nonnegative);
|
|
|
|
solve_qp4_using_smo(planes_mat, K, mat(bs), d, alpha, lambda, eps, sub_max_iter);
|
|
}
|
|
else
|
|
{
|
|
solve_qp_using_smo(K, mat(bs), alpha, eps, sub_max_iter);
|
|
}
|
|
|
|
// construct the w that minimized the subproblem.
|
|
w = -alpha(0)*planes[0];
|
|
for (unsigned long i = 1; i < planes.size(); ++i)
|
|
w -= alpha(i)*planes[i];
|
|
if (lasso_lambda != 0)
|
|
w = (lambda-d+w)/ridge_lambda;
|
|
else if (num_nonnegative != 0) // threshold the first num_nonnegative w elements if necessary.
|
|
set_rowm(w,range(0,num_nonnegative-1)) = lowerbound(rowm(w,range(0,num_nonnegative-1)),0);
|
|
|
|
for (long i = 0; i < alpha.size(); ++i)
|
|
{
|
|
if (alpha(i) != 0)
|
|
miss_count[i] = 0;
|
|
else
|
|
miss_count[i] += 1;
|
|
}
|
|
|
|
// Compute the lower bound on the true objective given to us by the cutting
|
|
// plane subproblem.
|
|
cp_obj = -0.5*ridge_lambda*trans(w)*w + trans(alpha)*mat(bs);
|
|
if (have_prior)
|
|
w += prior;
|
|
|
|
// If it has been a while since a cutting plane was an active constraint then
|
|
// we should throw it away.
|
|
while (max(mat(miss_count)) >= inactive_thresh)
|
|
{
|
|
const long idx = index_of_max(mat(miss_count));
|
|
bs.erase(bs.begin()+idx);
|
|
miss_count.erase(miss_count.begin()+idx);
|
|
K = removerc(K, idx, idx);
|
|
alpha = remove_row(alpha,idx);
|
|
planes.remove(idx, new_plane);
|
|
}
|
|
|
|
++counter;
|
|
}
|
|
|
|
if (force_weight_to_1 < (unsigned long)w.size())
|
|
w(force_weight_to_1) = 1;
|
|
|
|
return cur_obj;
|
|
}
|
|
|
|
double sub_eps;
|
|
|
|
unsigned long sub_max_iter;
|
|
|
|
unsigned long inactive_thresh;
|
|
|
|
std::chrono::nanoseconds max_runtime;
|
|
};
|
|
}
|
|
|
|
// ----------------------------------------------------------------------------------------
|
|
|
|
#endif // DLIB_OPTIMIZATIoN_OCA_Hh_
|
|
|