low_rank_inverse_hessian_.Update(
    previous.search_direction * previous.step_size,
    current.gradient - previous.gradient);
search_direction->setZero();
low_rank_inverse_hessian_.RightMultiply(current.gradient.data(),
                                        search_direction->data());
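// For illustration only (not part of this change): a minimal sketch of the
// standard L-BFGS two-loop recursion (Nocedal & Wright, Algorithm 7.4) that
// a call like low_rank_inverse_hessian_.RightMultiply(gradient, direction)
// conceptually performs, i.e. computing H_k * gradient from the stored
// correction pairs {s_i, y_i}. The function name, the pair storage, and the
// use of Eigen::VectorXd are assumptions for this sketch, not Ceres'
// actual implementation. Requires <vector> and "Eigen/Core".
static Eigen::VectorXd LBFGSTwoLoopRecursion(
    const std::vector<Eigen::VectorXd>& s,  // Steps: x_{i+1} - x_i.
    const std::vector<Eigen::VectorXd>& y,  // Gradient changes.
    const Eigen::VectorXd& gradient) {
  const int m = static_cast<int>(s.size());
  std::vector<double> rho(m), alpha(m);
  Eigen::VectorXd q = gradient;
  // First loop: walk the correction pairs from newest to oldest.
  for (int i = m - 1; i >= 0; --i) {
    rho[i] = 1.0 / y[i].dot(s[i]);   // Well defined when s_i^T y_i > 0.
    alpha[i] = rho[i] * s[i].dot(q);
    q -= alpha[i] * y[i];
  }
  // Initial inverse Hessian H_0 = gamma * I, with the standard scaling
  // gamma = s^T y / y^T y taken from the most recent pair.
  const double gamma =
      m > 0 ? s[m - 1].dot(y[m - 1]) / y[m - 1].dot(y[m - 1]) : 1.0;
  Eigen::VectorXd r = gamma * q;
  // Second loop: oldest to newest correction pair.
  for (int i = 0; i < m; ++i) {
    const double beta = rho[i] * y[i].dot(r);
    r += (alpha[i] - beta) * s[i];
  }
  return r;  // r = H_k * gradient; the descent direction is then -r.
}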
const Vector delta_gradient = current.gradient - previous.gradient;
const double delta_x_dot_delta_gradient = delta_x.dot(delta_gradient);
// The (L)BFGS algorithm explicitly requires that the secant equation:
//
//   B_{k+1} * s_k = y_k
//
// is satisfied at each iteration, where B_{k+1} is the approximated
// Hessian at the (k+1)-th iteration, s_k = (x_{k+1} - x_k) and
// y_k = (grad_{k+1} - grad_k). As the approximated Hessian must be
// positive definite, this is equivalent to the condition:
//
//   s_k^T * y_k > 0   [s_k^T * B_{k+1} * s_k = s_k^T * y_k > 0]
//
// This condition would always be satisfied if the function were strictly
// convex; alternatively, it is always satisfied provided that a Wolfe line
// search is used (even if the function is not strictly convex). See [1]
// (p138) for a proof.
//
// Although Ceres will always use a Wolfe line search when using (L)BFGS,
// practical implementation considerations mean that the line search
// may return a point that satisfies only the Armijo condition, and thus
// could violate the secant equation. As such, we will only use a step
// to update the Hessian approximation if:
//
//   s_k^T * y_k > tolerance
//
// It is important that the tolerance is very small (and >= 0), as otherwise
// we might skip the update too often and fail to capture important curvature
// information in the Hessian. For example, going from 1e-10 to 1e-14
// improves the NIST benchmark score from 43/54 to 53/54.
//
// [1] Nocedal J., Wright S., Numerical Optimization, 2nd Ed., Springer, 1999.
//
// TODO(alexs.mac): Consider using a Damped BFGS update instead of
// skipping the update.
const double kBFGSSecantConditionHessianUpdateTolerance = 1e-14;
if (delta_x_dot_delta_gradient <=
    kBFGSSecantConditionHessianUpdateTolerance) {
  VLOG(2) << "Skipping BFGS Update, delta_x_dot_delta_gradient too "
          << "small: " << delta_x_dot_delta_gradient << ", tolerance: "
          << kBFGSSecantConditionHessianUpdateTolerance
          << " (Secant condition).";
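// For illustration only: the damped BFGS update mentioned in the TODO above
// (Powell's damping, see Nocedal & Wright, Procedure 18.2) would replace
// y_k with a convex combination r_k of y_k and B_k * s_k whenever the
// curvature condition fails, instead of skipping the update. This sketch
// operates on the (non-inverse) Hessian approximation B and assumes Eigen;
// the name DampedBFGSCorrection is hypothetical, not Ceres code.
static Eigen::VectorXd DampedBFGSCorrection(const Eigen::MatrixXd& B,
                                            const Eigen::VectorXd& s,
                                            const Eigen::VectorXd& y) {
  const Eigen::VectorXd Bs = B * s;
  const double sBs = s.dot(Bs);
  const double sy = s.dot(y);
  // theta = 1 leaves y unchanged when curvature is adequate; otherwise
  // interpolate towards B * s so that s^T r = 0.2 * s^T B s > 0.
  const double theta = (sy >= 0.2 * sBs) ? 1.0 : (0.8 * sBs) / (sBs - sy);
  return theta * y + (1.0 - theta) * Bs;
}
// Updating B with (s_k, r_k) in place of (s_k, y_k) keeps the approximation
// positive definite without discarding the step's curvature information.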
// Update dense inverse Hessian approximation.

// [1] Oren S.S., Self-scaling variable metric (SSVM) algorithms,
//     Part II: Implementation and experiments, Management Science,
//     20(5), 863-874, 1974.
// [2] Nocedal J., Wright S., Numerical Optimization, Springer, 1999.
const double approximate_eigenvalue_scale =
    delta_x_dot_delta_gradient / delta_gradient.dot(delta_gradient);
inverse_hessian_ *= approximate_eigenvalue_scale;

VLOG(4) << "Applying approximate_eigenvalue_scale: "
        << approximate_eigenvalue_scale << " to initial inverse "
        << "Hessian approximation.";

initialized_ = true;
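// For illustration only: the scale applied above is the inverse Rayleigh
// quotient s^T y / y^T y. Since y is approximately H * s for the true
// Hessian H, this approximates 1 / lambda for an eigenvalue lambda of H
// along the last step, sizing the identity-initialized inverse Hessian to
// the problem's curvature. A tiny standalone check, assuming Eigen; the
// helper name is hypothetical:
static double ApproximateEigenvalueScale(
    const Eigen::VectorXd& delta_x, const Eigen::VectorXd& delta_gradient) {
  return delta_x.dot(delta_gradient) / delta_gradient.dot(delta_gradient);
}
// E.g. delta_x = (1, 0) and delta_gradient = (4, 0), consistent with a
// Hessian diag(4, *), give a scale of 0.25 = 1/4, which is exactly the
// inverse Hessian's eigenvalue along delta_x.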