# Source code for pymanopt.solvers.trust_regions

# References, taken from trustregions.m in manopt:
# Please cite the Manopt paper as well as the research paper:
# @Article{genrtr,
# Title = {Trust-region methods on {Riemannian} manifolds},
# Author = {Absil, P.-A. and Baker, C. G. and Gallivan, K. A.},
# Journal = {Foundations of Computational Mathematics},
# Year = {2007},
# Number = {3},
# Pages = {303--330},
# Volume = {7},
# Doi = {10.1007/s10208-005-0179-9}
# }
#
# See also: steepestdescent conjugategradient manopt/examples
# An explicit, general listing of this algorithm, with preconditioning,
# can be found in the following paper:
# @Article{boumal2015lowrank,
# Title = {Low-rank matrix completion via preconditioned optimization
# on the {G}rassmann manifold},
# Author = {Boumal, N. and Absil, P.-A.},
# Journal = {Linear Algebra and its Applications},
# Year = {2015},
# Pages = {200--239},
# Volume = {475},
# Doi = {10.1016/j.laa.2015.02.027},
# }
# When the Hessian is not specified, it is approximated with
# finite-differences of the gradient. The resulting method is called
# RTR-FD. Some convergence theory for it is available in this paper:
# @incollection{boumal2015rtrfd,
# author={Boumal, N.},
# title={Riemannian trust regions with finite-difference Hessian
# approximations are globally convergent},
# year={2015},
# booktitle={Geometric Science of Information}
# }
# This file is part of Manopt: www.manopt.org.
# This code is an adaptation to Manopt of the original GenRTR code:
# RTR - Riemannian Trust-Region
# (c) 2004-2007, P.-A. Absil, C. G. Baker, K. A. Gallivan
# Florida State University
# School of Computational Science
# (http://www.math.fsu.edu/~cbaker/GenRTR/?page=download)
# See accompanying license file.
# The adaptation was executed by Nicolas Boumal.
# Ported to pymanopt by Jamie Townsend. January 2016.
import time
import numpy as np
from pymanopt.solvers.solver import Solver
class TrustRegions(Solver):
    """Riemannian trust-regions (RTR) solver.

    Port of ``trustregions.m`` from the Manopt MATLAB package (the GenRTR
    algorithm of Absil, Baker & Gallivan, 2007).  Each outer iteration
    approximately minimizes a quadratic model of the cost inside a
    trust region, using the truncated (Steihaug-Toint) conjugate-gradient
    method ported from Manopt's ``tCG.m``.
    """

    # Symbolic codes for the reason the inner tCG loop terminated; the
    # outer loop inspects these to decide whether to grow the TR radius.
    (NEGATIVE_CURVATURE, EXCEEDED_TR, REACHED_TARGET_LINEAR,
     REACHED_TARGET_SUPERLINEAR, MAX_INNER_ITER, MODEL_INCREASED) = range(6)

    # Human-readable descriptions of the stop codes, used for logging.
    TCG_STOP_REASONS = {
        NEGATIVE_CURVATURE: "negative curvature",
        EXCEEDED_TR: "exceeded trust region",
        REACHED_TARGET_LINEAR: "reached target residual-kappa (linear)",
        REACHED_TARGET_SUPERLINEAR: "reached target residual-theta "
                                    "(superlinear)",
        MAX_INNER_ITER: "maximum inner iterations",
        MODEL_INCREASED: "model increased"
    }

    def __init__(self, miniter=3, kappa=0.1, theta=1.0, rho_prime=0.1,
                 use_rand=False, rho_regularization=1e3, *args, **kwargs):
        """
        Trust regions algorithm based on trustregions.m from the
        Manopt MATLAB package.

        Also included is the Truncated (Steihaug-Toint) Conjugate-Gradient
        algorithm, based on tCG.m from the Manopt MATLAB package.

        Args:
            miniter: minimum number of outer (TR) iterations to perform.
            kappa: linear convergence target for the inner tCG residual.
            theta: superlinear convergence exponent for the inner tCG
                residual.
            rho_prime: acceptance threshold for the model-performance ratio
                rho; steps with rho <= rho_prime are rejected.
            use_rand: when True, start each tCG solve from a small random
                tangent vector instead of the zero vector, and compare the
                result against the Cauchy point.
            rho_regularization: scale of the regularization added to the
                numerator and denominator of rho to stabilize the
                acceptance test close to convergence.
        """
        super().__init__(*args, **kwargs)
        self.miniter = miniter
        self.kappa = kappa
        self.theta = theta
        self.rho_prime = rho_prime
        self.use_rand = use_rand
        self.rho_regularization = rho_regularization
[docs] def solve(self, problem, x=None, mininner=1, maxinner=None,
Delta_bar=None, Delta0=None):
man = problem.manifold
verbosity = problem.verbosity
if maxinner is None:
maxinner = man.dim
# Set default Delta_bar and Delta0 separately to deal with additional
# logic: if Delta_bar is provided but not Delta0, let Delta0
# automatically be some fraction of the provided Delta_bar.
if Delta_bar is None:
try:
Delta_bar = man.typicaldist
except NotImplementedError:
Delta_bar = np.sqrt(man.dim)
if Delta0 is None:
Delta0 = Delta_bar / 8
cost = problem.cost
grad = problem.grad
hess = problem.hess
# If no starting point is specified, generate one at random.
if x is None:
x = man.rand()
# Initializations
time0 = time.time()
# k counts the outer (TR) iterations. The semantic is that k counts the
# number of iterations fully executed so far.
k = 0
# Initialize solution and companion measures: f(x), fgrad(x)
fx = cost(x)
fgradx = grad(x)
norm_grad = man.norm(x, fgradx)
# Initialize the trust region radius
Delta = Delta0
# To keep track of consecutive radius changes, so that we can warn the
# user if it appears necessary.
consecutive_TRplus = 0
consecutive_TRminus = 0
# ** Display:
if verbosity >= 1:
print("Optimizing...")
if verbosity >= 2:
print("{:44s}f: {:+.6e} |grad|: {:.6e}".format(
" ", float(fx), norm_grad))
self._start_optlog()
while True:
# *************************
# ** Begin TR Subproblem **
# *************************
# Determine eta0
if not self.use_rand:
# Pick the zero vector
eta = man.zerovec(x)
else:
# Random vector in T_x M (this has to be very small)
eta = 1e-6 * man.randvec(x)
# Must be inside trust region
while man.norm(x, eta) > Delta:
eta = np.sqrt(np.sqrt(np.spacing(1)))
# Solve TR subproblem approximately
eta, Heta, numit, stop_inner = self._truncated_conjugate_gradient(
problem, x, fgradx, eta, Delta, self.theta, self.kappa,
mininner, maxinner)
srstr = self.TCG_STOP_REASONS[stop_inner]
# If using randomized approach, compare result with the Cauchy
# point. Convergence proofs assume that we achieve at least (a
# fraction of) the reduction of the Cauchy point. After this
# if-block, either all eta-related quantities have been changed
# consistently, or none of them have.
if self.use_rand:
used_cauchy = False
# Check the curvature
Hg = hess(x, fgradx)
g_Hg = man.inner(x, fgradx, Hg)
if g_Hg <= 0:
tau_c = 1
else:
tau_c = min(norm_grad ** 3 / (Delta * g_Hg), 1)
# and generate the Cauchy point.
eta_c = -tau_c * Delta / norm_grad * fgradx
Heta_c = -tau_c * Delta / norm_grad * Hg
# Now that we have computed the Cauchy point in addition to the
# returned eta, we might as well keep the best of them.
mdle = (fx + man.inner(x, fgradx, eta) +
0.5 * man.inner(x, Heta, eta))
mdlec = (fx + man.inner(x, fgradx, eta_c) +
0.5 * man.inner(x, Heta_c, eta_c))
if mdlec < mdle:
eta = eta_c
Heta = Heta_c
used_cauchy = True
# This is only computed for logging purposes, because it may be
# useful for some user-defined stopping criteria. If this is not
# cheap for specific applications (compared to evaluating the
# cost), we should reconsider this.
# norm_eta = man.norm(x, eta)
# Compute the tentative next iterate (the proposal)
x_prop = man.retr(x, eta)
# Compute the function value of the proposal
fx_prop = cost(x_prop)
# Will we accept the proposal or not? Check the performance of the
# quadratic model against the actual cost.
rhonum = fx - fx_prop
rhoden = -man.inner(x, fgradx, eta) - 0.5 * man.inner(x, eta, Heta)
# rhonum could be anything.
# rhoden should be nonnegative, as guaranteed by tCG, baring
# numerical errors.
# Heuristic -- added Dec. 2, 2013 (NB) to replace the former
# heuristic. This heuristic is documented in the book by Conn Gould
# and Toint on trust-region methods, section 17.4.2. rhonum
# measures the difference between two numbers. Close to
# convergence, these two numbers are very close to each other, so
# that computing their difference is numerically challenging: there
# may be a significant loss in accuracy. Since the acceptance or
# rejection of the step is conditioned on the ratio between rhonum
# and rhoden, large errors in rhonum result in a very large error
# in rho, hence in erratic acceptance / rejection. Meanwhile, close
# to convergence, steps are usually trustworthy and we should
# transition to a Newton- like method, with rho=1 consistently. The
# heuristic thus shifts both rhonum and rhoden by a small amount
# such that far from convergence, the shift is irrelevant and close
# to convergence, the ratio rho goes to 1, effectively promoting
# acceptance of the step. The rationale is that close to
# convergence, both rhonum and rhoden are quadratic in the distance
# between x and x_prop. Thus, when this distance is on the order of
# sqrt(eps), the value of rhonum and rhoden is on the order of eps,
# which is indistinguishable from the numerical error, resulting in
# badly estimated rho's.
# For abs(fx) < 1, this heuristic is invariant under offsets of f
# but not under scaling of f. For abs(fx) > 1, the opposite holds.
# This should not alarm us, as this heuristic only triggers at the
# very last iterations if very fine convergence is demanded.
rho_reg = max(1, abs(fx)) * np.spacing(1) * self.rho_regularization
rhonum = rhonum + rho_reg
rhoden = rhoden + rho_reg
# This is always true if a linear, symmetric operator is used for
# the Hessian (approximation) and if we had infinite numerical
# precision. In practice, nonlinear approximations of the Hessian
# such as the built-in finite difference approximation and finite
# numerical accuracy can cause the model to increase. In such
# scenarios, we decide to force a rejection of the step and a
# reduction of the trust-region radius. We test the sign of the
# regularized rhoden since the regularization is supposed to
# capture the accuracy to which rhoden is computed: if rhoden were
# negative before regularization but not after, that should not be
# (and is not) detected as a failure.
#
# Note (Feb. 17, 2015, NB): the most recent version of tCG already
# includes a mechanism to ensure model decrease if the Cauchy step
# attained a decrease (which is theoretically the case under very
# lax assumptions). This being said, it is always possible that
# numerical errors will prevent this, so that it is good to keep a
# safeguard.
#
# The current strategy is that, if this should happen, then we
# reject the step and reduce the trust region radius. This also
# ensures that the actual cost values are monotonically decreasing.
model_decreased = (rhoden >= 0)
if not model_decreased:
srstr = srstr + ", model did not decrease"
try:
rho = rhonum / rhoden
except ZeroDivisionError:
# Added June 30, 2015 following observation by BM. With this
# modification, it is guaranteed that a step rejection is
# always accompanied by a TR reduction. This prevents
# stagnation in this "corner case" (NaN's really aren't
# supposed to occur, but it's nice if we can handle them
# nonetheless).
print("rho is NaN! Forcing a radius decrease. This should "
"not happen.")
rho = np.nan
# Choose the new TR radius based on the model performance
trstr = " "
# If the actual decrease is smaller than 1/4 of the predicted
# decrease, then reduce the TR radius.
if rho < 1.0 / 4 or not model_decreased or np.isnan(rho):
trstr = "TR-"
Delta = Delta / 4
consecutive_TRplus = 0
consecutive_TRminus = consecutive_TRminus + 1
if consecutive_TRminus >= 5 and verbosity >= 1:
consecutive_TRminus = -np.inf
print(" +++ Detected many consecutive TR- (radius "
"decreases).")
print(" +++ Consider decreasing options.Delta_bar "
"by an order of magnitude.")
print(" +++ Current values: Delta_bar = {:g} and "
"Delta0 = {:g}".format(Delta_bar, Delta0))
# If the actual decrease is at least 3/4 of the precicted decrease
# and the tCG (inner solve) hit the TR boundary, increase the TR
# radius. We also keep track of the number of consecutive
# trust-region radius increases. If there are many, this may
# indicate the need to adapt the initial and maximum radii.
elif rho > 3.0 / 4 and (stop_inner == self.NEGATIVE_CURVATURE or
stop_inner == self.EXCEEDED_TR):
trstr = "TR+"
Delta = min(2 * Delta, Delta_bar)
consecutive_TRminus = 0
consecutive_TRplus = consecutive_TRplus + 1
if consecutive_TRplus >= 5 and verbosity >= 1:
consecutive_TRplus = -np.inf
print(" +++ Detected many consecutive TR+ (radius "
"increases).")
print(" +++ Consider increasing options.Delta_bar "
"by an order of magnitude.")
print(" +++ Current values: Delta_bar = {:g} and "
"Delta0 = {:g}.".format(Delta_bar, Delta0))
else:
# Otherwise, keep the TR radius constant.
consecutive_TRplus = 0
consecutive_TRminus = 0
# Choose to accept or reject the proposed step based on the model
# performance. Note the strict inequality.
if model_decreased and rho > self.rho_prime:
# accept = True
accstr = "acc"
x = x_prop
fx = fx_prop
fgradx = grad(x)
norm_grad = man.norm(x, fgradx)
else:
# accept = False
accstr = "REJ"
# k is the number of iterations we have accomplished.
k = k + 1
# ** Display:
if verbosity == 2:
print("{:.3s} {:.3s} k: {:5d} num_inner: "
"{:5d} f: {:+e} |grad|: {:e} "
"{:s}".format(accstr, trstr, k, numit,
float(fx), norm_grad, srstr))
elif verbosity > 2:
if self.use_rand and used_cauchy:
print("USED CAUCHY POINT")
print("{:.3s} {:.3s} k: {:5d} num_inner: "
"{:5d} {:s}".format(accstr, trstr, k, numit, srstr))
print(" f(x) : {:+e} |grad| : "
"{:e}".format(fx, norm_grad))
print(" rho : {:e}".format(rho))
# ** CHECK STOPPING criteria
stop_reason = self._check_stopping_criterion(
time0, gradnorm=norm_grad, iter=k)
if stop_reason:
if verbosity >= 1:
print(stop_reason)
print('')
break
if self._logverbosity <= 0:
return x
else:
self._stop_optlog(x, fx, stop_reason, time0,
gradnorm=norm_grad, iter=k)
return x, self._optlog
    def _truncated_conjugate_gradient(self, problem, x, fgradx, eta, Delta,
                                      theta, kappa, mininner, maxinner):
        """Approximately solve the TR subproblem with truncated CG.

        Port of Manopt's tCG.m (Steihaug-Toint truncated conjugate
        gradient): minimizes the quadratic model
        <fgradx, eta> + 0.5 <eta, hess(x)[eta]> over the tangent space at
        x subject to the preconditioner-norm constraint ||eta||_P <= Delta.
        When ``self.use_rand`` is True the caller supplies a nonzero
        initial eta and no preconditioning is applied.

        Returns:
            (eta, Heta, j, stop_tCG): the approximate minimizer, the
            Hessian (approximation) applied to it, the index of the last
            inner iteration executed (zero-based), and one of the
            ``TCG_STOP_REASONS`` codes.
        """
        man = problem.manifold
        inner = man.inner
        hess = problem.hess
        precon = problem.precon

        if not self.use_rand:  # and therefore, eta == 0
            Heta = man.zerovec(x)
            r = fgradx
            e_Pe = 0
        else:  # and therefore, no preconditioner
            # eta (presumably) ~= 0 was provided by the caller.
            Heta = hess(x, eta)
            r = fgradx + Heta
            e_Pe = inner(x, eta, eta)

        # Residual norm; norm_r0 is kept for the kappa/theta stopping test.
        r_r = inner(x, r, r)
        norm_r = np.sqrt(r_r)
        norm_r0 = norm_r

        # Precondition the residual
        if not self.use_rand:
            z = precon(x, r)
        else:
            z = r

        # Compute z'*r
        z_r = inner(x, z, r)
        d_Pd = z_r

        # Initial search direction
        delta = -z
        if not self.use_rand:
            e_Pd = 0
        else:
            e_Pd = inner(x, eta, delta)

        # If the Hessian or a linear Hessian approximation is in use, it is
        # theoretically guaranteed that the model value decreases strictly with
        # each iteration of tCG. Hence, there is no need to monitor the model
        # value. But, when a nonlinear Hessian approximation is used (such as
        # the built-in finite-difference approximation for example), the model
        # may increase. It is then important to terminate the tCG iterations
        # and return the previous (the best-so-far) iterate. The variable below
        # will hold the model value.
        def model_fun(eta, Heta):
            # Quadratic model value at eta (without the constant fx term).
            return inner(x, eta, fgradx) + 0.5 * inner(x, eta, Heta)
        if not self.use_rand:
            model_value = 0
        else:
            model_value = model_fun(eta, Heta)

        # Pre-assume termination because j == end.
        stop_tCG = self.MAX_INNER_ITER

        # Begin inner/tCG loop.
        for j in range(int(maxinner)):
            # This call is the computationally intensive step
            Hdelta = hess(x, delta)

            # Compute curvature (often called kappa)
            d_Hd = inner(x, delta, Hdelta)

            # Note that if d_Hd == 0, we will exit at the next "if" anyway.
            alpha = z_r / d_Hd
            # <neweta,neweta>_P =
            # <eta,eta>_P + 2*alpha*<eta,delta>_P + alpha*alpha*<delta,delta>_P
            e_Pe_new = e_Pe + 2 * alpha * e_Pd + alpha ** 2 * d_Pd

            # Check against negative curvature and trust-region radius
            # violation. If either condition triggers, we bail out.
            if d_Hd <= 0 or e_Pe_new >= Delta**2:
                # want
                #  ee = <eta,eta>_prec,x
                #  ed = <eta,delta>_prec,x
                #  dd = <delta,delta>_prec,x
                # tau is the positive root stepping exactly to the TR
                # boundary along delta [CGT2000, eq. 7.5.5].
                tau = ((-e_Pd +
                        np.sqrt(e_Pd * e_Pd +
                                d_Pd * (Delta ** 2 - e_Pe))) / d_Pd)
                eta = eta + tau * delta

                # If only a nonlinear Hessian approximation is available, this
                # is only approximately correct, but saves an additional
                # Hessian call.
                Heta = Heta + tau * Hdelta

                # Technically, we may want to verify that this new eta is
                # indeed better than the previous eta before returning it (this
                # is always the case if the Hessian approximation is linear,
                # but I am unsure whether it is the case or not for nonlinear
                # approximations.) At any rate, the impact should be limited,
                # so in the interest of code conciseness (if we can still hope
                # for that), we omit this.
                if d_Hd <= 0:
                    stop_tCG = self.NEGATIVE_CURVATURE
                else:
                    stop_tCG = self.EXCEEDED_TR
                break

            # No negative curvature and eta_prop inside TR: accept it.
            e_Pe = e_Pe_new
            new_eta = eta + alpha * delta

            # If only a nonlinear Hessian approximation is available, this is
            # only approximately correct, but saves an additional Hessian call.
            new_Heta = Heta + alpha * Hdelta

            # Verify that the model cost decreased in going from eta to
            # new_eta. If it did not (which can only occur if the Hessian
            # approximation is nonlinear or because of numerical errors), then
            # we return the previous eta (which necessarily is the best reached
            # so far, according to the model cost). Otherwise, we accept the
            # new eta and go on.
            new_model_value = model_fun(new_eta, new_Heta)
            if new_model_value >= model_value:
                stop_tCG = self.MODEL_INCREASED
                break

            eta = new_eta
            Heta = new_Heta
            model_value = new_model_value

            # Update the residual.
            r = r + alpha * Hdelta

            # Compute new norm of r.
            r_r = inner(x, r, r)
            norm_r = np.sqrt(r_r)

            # Check kappa/theta stopping criterion.
            # Note that it is somewhat arbitrary whether to check this stopping
            # criterion on the r's (the gradients) or on the z's (the
            # preconditioned gradients). [CGT2000], page 206, mentions both as
            # acceptable criteria.
            if (j >= mininner and
                    norm_r <= norm_r0 * min(norm_r0**theta, kappa)):
                # Residual is small enough to quit
                if kappa < norm_r0 ** theta:
                    stop_tCG = self.REACHED_TARGET_LINEAR
                else:
                    stop_tCG = self.REACHED_TARGET_SUPERLINEAR
                break

            # Precondition the residual.
            if not self.use_rand:
                z = precon(x, r)
            else:
                z = r

            # Save the old z'*r.
            zold_rold = z_r
            # Compute new z'*r.
            z_r = inner(x, z, r)

            # Compute new search direction
            beta = z_r / zold_rold
            delta = -z + beta * delta

            # Update new P-norms and P-dots [CGT2000, eq. 7.5.6 & 7.5.7].
            e_Pd = beta * (e_Pd + alpha * d_Pd)
            d_Pd = z_r + beta * beta * d_Pd

        # NOTE(review): j is the zero-based index of the last iteration
        # executed; assumes maxinner >= 1, otherwise j would be unbound.
        return eta, Heta, j, stop_tCG