Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
68 changes: 61 additions & 7 deletions src/OptimKit.jl
Original file line number Diff line number Diff line change
Expand Up @@ -11,12 +11,13 @@ const LS_MAXITER = ScopedValue(10)
const LS_MAXFG = ScopedValue(20)
const LS_VERBOSITY = ScopedValue(1)

const GRADTOL = ScopedValue(1e-8)
const GRADTOL = ScopedValue(1.0e-8)
const MAXITER = ScopedValue(1_000_000)
const VERBOSITY = ScopedValue(1)

# Default values for the manifold structure
_retract(x, d, α) = (add(x, d, α), d)
_invretract(x, y) = add(y, x, -1)
_inner(x, v1, v2) = v1 === v2 ? norm(v1)^2 : real(inner(v1, v2))
_transport!(v, xold, d, α, xnew) = v
_scale!(v, α) = scale!!(v, α)
Expand All @@ -26,7 +27,7 @@ _precondition(x, g) = deepcopy(g)
_finalize!(x, f, g, numiter) = x, f, g

# Default structs for new convergence and termination keywords
@kwdef struct DefaultHasConverged{T<:Real}
@kwdef struct DefaultHasConverged{T <: Real}
gradtol::T
end

Expand All @@ -44,6 +45,7 @@ end

# Optimization
abstract type OptimizationAlgorithm end
abstract type FixedPointAlgorithm end

const _xlast = Ref{Any}()
const _glast = Ref{Any}()
Expand Down Expand Up @@ -104,11 +106,61 @@ Also see [`GradientDescent`](@ref), [`ConjugateGradient`](@ref), [`LBFGS`](@ref)
"""
function optimize end

"""
fixedpoint(fp, x, alg;
finalize! = _finalize!,
shouldstop = DefaultShouldStop(alg.maxiter),
hasconverged = DefaultHasConverged(alg.gradtol),
retract = _retract, invretract = _invretract,
inner = _inner, (transport!) = _transport!,
(scale!) = _scale!, (add!) = _add!,
isometrictransport = (transport! == _transport! && inner == _inner))
-> x, g, numfp, history

Find a fixed point of the function `fp` starting from an initial point `x₀` and using the fixed point algorithm `alg`,
which is an instance of `SimpleIteration` or `AndersonMixing`.

Returns the final point `x`, the residual `g`, the total number of calls to `fp`,
and the history of the norm of the residual across the different iterations.

The algorithm is run until either `hasconverged(x, false, g, normg)` returns `true` or
`shouldstop(x, false, g, numfp, numiter, time)` returns `true`. The latter case happening before
the former is considered to be a failure to converge, and a warning is issued.

The keyword arguments are:
- `finalize!::Function`: A function that takes the final point `x`, a dummy variable `f = false`,
the residual `g`, and the iteration number, and returns a possibly modified values for
`x`, `f` and `g`. By default, the identity is used.
It is the user's responsibility to ensure that the modified values do not lead to
inconsistencies within the optimization algorithm.
- `hasconverged::Function`: A function that takes the current point `x`, a dummy variable `f = false`,
the residual `r`, and the norm of the residual, and returns a boolean indicating whether
the fixed point iteration has converged. By default, the norm of the residual is compared to the
tolerance `gradtol` as encoded in the algorithm instance.
- `shouldstop::Function`: A function that takes the current point `x`, a dummy variable `f = false`,
the residuals `g`, the number of calls to `fg`, the iteration number, and the time spent
so far, and returns a boolean indicating whether the optimization should stop. By default,
the number of iterations is compared to the maximum number of iterations as encoded in the
algorithm instance.

Check the README of this package for further details on creating an algorithm instance,
as well as for the meaning of the remaining keyword arguments and their default values.

!!! Warning

The default values of `hasconverged` and `shouldstop` are provided to ensure continuity
with the previous versions of this package. However, this behaviour might change in the
future.

Also see [`SimpleIteration`](@ref) and [`AndersonMixing`](@ref).
"""
function fixedpoint end

function format_time(t::Float64)
if t < 1e-3
return @sprintf("%5.1f μs", 1e6*t)
if t < 1.0e-3
return @sprintf("%5.1f μs", 1.0e6 * t)
elseif t < 1
return @sprintf("%5.1f ms", 1e3*t)
return @sprintf("%5.1f ms", 1.0e3 * t)
elseif t < 60
return @sprintf("%5.2f s", t)
elseif t < 3600
Expand All @@ -122,7 +174,8 @@ include("linesearches.jl")
include("gd.jl")
include("cg.jl")
include("lbfgs.jl")

include("simpleiteration.jl")
include("anderson.jl")
const gd = GradientDescent()
const cg = ConjugateGradient()
const lbfgs = LBFGS()
Expand All @@ -131,6 +184,7 @@ export optimize, gd, cg, lbfgs, optimtest
export GradientDescent, ConjugateGradient, LBFGS
export FletcherReeves, HestenesStiefel, PolakRibiere, HagerZhang, DaiYuan
export HagerZhangLineSearch
export fixedpoint, SimpleIteration, AndersonMixing

"""
optimtest(fg, x, [d]; alpha = -0.1:0.001:0.1, retract = _retract, inner = _inner)
Expand All @@ -142,7 +196,7 @@ Test the compatibility between the computation of the gradient, the retraction a

It is up to the user to check that the values in `dfs1` and `dfs2` match up to expected precision, by inspecting the numerical values or plotting them. If these values don't match, the linesearch in `optimize` cannot be expected to work.
"""
function optimtest(fg, x, d=fg(x)[2]; alpha=-0.1:0.001:0.1, retract=_retract, inner=_inner)
function optimtest(fg, x, d = fg(x)[2]; alpha = -0.1:0.001:0.1, retract = _retract, inner = _inner)
# evaluate function at given edge points
fs_edges = map(alpha) do a
f, = fg(retract(x, d, a)[1])
Expand Down
258 changes: 258 additions & 0 deletions src/anderson.jl
Original file line number Diff line number Diff line change
@@ -0,0 +1,258 @@
"""
struct AndersonMixing{T <: Real} <: FixedPointAlgorithm
AndersonMixing(m::Int;
damping::Real = 1,
maxiter::Int=MAXITER[], # 1_000_000
gradtol::Real=GRADTOL[], # 1e-8
verbosity::Int=VERBOSITY[]) # 1

Anderson mixing, also known as Anderson acceleration, for fixed point problems.

## Parameters
- `m::Int`: The number of previous iterates to use for Anderson extrapolation.
- `damping::Real`: The damping parameter for Anderson extrapolation; a value of 1 corresponds to no damping, while a value between 0 and 1 corresponds to under-relaxation.
- `maxiter::Int`: The maximum number of iterations.
- `gradtol::T`: The tolerance for the norm of the residual.
- `verbosity::Int`: The verbosity level of the optimization algorithm.

The verbosity level use the following scheme:
- 0: no output
- 1: only warnings upon non-convergence
- 2: convergence information at the end of the algorithm
- 3: progress information after each iteration
"""
struct AndersonMixing{T <: Real} <: FixedPointAlgorithm
m::Int
damping::T
maxiter::Int
gradtol::T
verbosity::Int
end
function AndersonMixing(m::Int=8;
damping::Real = 1,
maxiter::Int=MAXITER[],
gradtol::Real=GRADTOL[],
verbosity::Int=VERBOSITY[])
damping′, gradtol′ = promote(damping, gradtol)
return AndersonMixing(m, damping′, maxiter, gradtol′, verbosity)
end

using LinearAlgebra

# Anderson mixing (a.k.a. Anderson acceleration) implementation of `fixedpoint`.
# The residual `g = invretract(x, fp(x))` plays the role of `F(x) - x` in the
# classical vector space formulation. Differences of successive residuals (Δg)
# and steps (Δx) are kept in a circular `AndersonHistory` buffer, and a small
# regularized least squares problem determines the extrapolation coefficients.
function fixedpoint(
    fp::F, x₀, alg::AndersonMixing;
    (finalize!) = _finalize!,
    shouldstop = DefaultShouldStop(alg.maxiter),
    hasconverged = DefaultHasConverged(alg.gradtol),
    retract = _retract, invretract = _invretract,
    inner = _inner, (transport!) = _transport!,
    (scale!) = _scale!, (add!) = _add!,
    isometrictransport = (transport! == _transport! && inner == _inner)
) where {F}

    t₀ = time()
    verbosity = alg.verbosity
    x = x₀
    fx = fp(x)
    numfp = 1
    numiter = 0
    g = invretract(x, fx) # residual, i.e. direction from x to fx, similar to F(x) - x in the vector case
    x, _, g = finalize!(x, false, g, numiter)
    innergg = inner(x, g, g)
    normg = sqrt(innergg)
    normghistory = [normg]
    verbosity >= 2 &&
        @info @sprintf("Anderson: initializing with ‖x - F(x)‖ = %.4e", normg)
    t = time() - t₀
    _hasconverged = hasconverged(x, false, g, normg)
    _shouldstop = shouldstop(x, false, g, numfp, numiter, t)

    # The first iteration is special: no history is available yet, so take a
    # plain fixed point step x ← retract(x, g, 1).
    if !(_hasconverged || _shouldstop)
        xprev = x
        gprev = g
        Δxprev = deepcopy(g)
        told = t
        x, Δx = retract(x, g, 1)
        fx = fp(x)
        numfp += 1
        numiter += 1
        g = invretract(x, fx) # residual, i.e. direction from x to fx, similar to F(x) - x in the vector case
        x, _, g = finalize!(x, false, g, numiter)
        innergg = inner(x, g, g)
        normg = sqrt(innergg)
        push!(normghistory, normg)
        t = time() - t₀
        Δt = t - told
        _hasconverged = hasconverged(x, false, g, normg)
        _shouldstop = shouldstop(x, false, g, numfp, numiter, t)
    end
    TangentType = typeof(g)
    H = AndersonHistory(alg.m, TangentType[], TangentType[])
    # Preallocated (maximal-size) storage for the Gram matrix of residual
    # differences and the corresponding right hand side; views of size m are
    # taken while the history is still filling up.
    overlap_full = zeros(typeof(innergg), alg.m, alg.m)
    rhs_full = zeros(typeof(innergg), alg.m)
    while !(_hasconverged || _shouldstop)
        verbosity >= 3 &&
            @info @sprintf("Anderson acceleration: iter %4d, Δt %s: ‖f(x) - x‖ = %.4e", numiter, format_time(Δt), normg)

        told = t
        # Transport the previous residual to the current point and form the
        # newest residual difference Δg = g - gprev.
        gprev = transport!(gprev, xprev, Δxprev, 1, x)
        Δg = add!(deepcopy(g), gprev, -1)

        mold = length(H)
        push!(H, (Δg, Δx))
        # Transport all previously stored residual/step differences to the current point.
        for k in 1:(length(H) - 1)
            (Δgₖ, Δxₖ) = H[k]
            Δgₖ = transport!(Δgₖ, xprev, Δxprev, 1, x)
            Δxₖ = transport!(Δxₖ, xprev, Δxprev, 1, x)
            H[k] = (Δgₖ, Δxₖ)
        end
        m = length(H)

        overlap = view(overlap_full, 1:m, 1:m)
        rhs = view(rhs_full, 1:m)
        if isometrictransport
            # With isometric transport, previously cached inner products remain
            # valid; only the row/column involving the newest Δg must be computed.
            if mold == m
                # History was full and the oldest entry was evicted: shift the
                # cached Gram matrix one row and one column up-left.
                @inbounds for j in 1:(m - 1)
                    for i in 1:(m - 1)
                        overlap[i, j] = overlap[i + 1, j + 1]
                    end
                end
            end
            @inbounds for i in 1:(m - 1)
                (Δgᵢ, Δxᵢ) = H[i]
                overlap[m, i] = overlap[i, m] = inner(x, Δgᵢ, Δg)
            end
            overlap[m, m] = inner(x, Δg, Δg)
        else
            # Non-isometric transport invalidates cached inner products:
            # recompute the full (symmetric) Gram matrix.
            @inbounds for j in 1:m
                (Δgⱼ, Δxⱼ) = H[j]
                for i in 1:(j - 1)
                    (Δgᵢ, Δxᵢ) = H[i]
                    overlap[j, i] = overlap[i, j] = inner(x, Δgᵢ, Δgⱼ)
                end
                overlap[j, j] = inner(x, Δgⱼ, Δgⱼ)
            end
        end
        @inbounds for i in 1:m
            (Δgᵢ, Δxᵢ) = H[i]
            rhs[i] = inner(x, Δgᵢ, g)
        end
        # Regularize proportionally to the mean diagonal entry to keep the
        # least squares problem well conditioned; this binds `overlap` to a new
        # matrix and leaves the cached `overlap_full` untouched.
        overlap += LinearAlgebra.tr(overlap) / size(overlap, 1) * sqrt(eps(one(eltype(overlap)))) * I
        Γ = overlap \ rhs # solve least squares problem to get Anderson coefficients
        ḡ = deepcopy(g)
        Δx = scale!(deepcopy(Δx), 0)
        @inbounds for k in 1:m
            (Δgₖ, Δxₖ) = H[k]
            ḡ = add!(ḡ, Δgₖ, -Γ[k])
            Δx = add!(Δx, Δxₖ, -Γ[k]) # accumulate the Anderson extrapolation direction
        end
        Δx = add!(Δx, ḡ, alg.damping) # add the (damped) extrapolated residual to the step

        # store current quantities as previous quantities
        xprev = x
        Δxprev = Δx
        gprev = g
        _xlast[] = x # store result in global variables to debug failures
        _glast[] = g

        # take step and evaluate new point
        x, Δx = retract(x, Δx, 1)
        fx = fp(x)
        numfp += 1
        numiter += 1
        g = invretract(x, fx) # direction from x to fx, i.e. similar to F(x) - x in the vector case
        x, _, g = finalize!(x, false, g, numiter)
        innergg = inner(x, g, g)
        normg = sqrt(innergg)
        push!(normghistory, normg)
        t = time() - t₀
        Δt = t - told
        _hasconverged = hasconverged(x, false, g, normg)
        _shouldstop = shouldstop(x, false, g, numfp, numiter, t)
    end
    if _hasconverged
        verbosity >= 2 &&
            @info @sprintf("Anderson acceleration: converged after %d iterations and time %s: ‖f(x) - x‖ = %.4e", numiter, format_time(t), normg)
    else
        verbosity >= 1 &&
            @warn @sprintf("Anderson acceleration: not converged to requested tol after %d iterations and time %s: ‖f(x) - x‖ = %.4e", numiter, format_time(t), normg)
    end
    return x, g, numfp, normghistory
end

"""
    AndersonHistory{TangentType}

Fixed-capacity circular buffer holding pairs `(Δg, Δx)` of residual differences
and step differences used by the Anderson mixing algorithm. Supports `length`,
`getindex`, `setindex!`, `push!` (evicting the oldest entry when full), `pop!`,
`popfirst!` and `empty!`; index `1` always refers to the oldest stored entry.
"""
mutable struct AndersonHistory{TangentType}
    maxlength::Int                     # capacity of the circular buffer
    length::Int                        # number of currently valid entries
    first::Int                         # storage index of the logically first entry
    Δresiduals::Vector{TangentType}    # residual differences Δg
    Δpositions::Vector{TangentType}    # step differences Δx
    function AndersonHistory{T}(maxlength::Int, Δresiduals::Vector{T}, Δpositions::Vector{T}) where {T}
        l = length(Δresiduals)
        l == length(Δpositions) ||
            throw(ArgumentError("AndersonHistory: Δresiduals and Δpositions must have the same length"))
        l <= maxlength ||
            throw(ArgumentError("AndersonHistory: initial history length cannot exceed maxlength"))
        # Copy and resize to full capacity up front; slots beyond `l` remain
        # undefined and are never read before being written (guarded by `length`).
        Δresiduals = resize!(copy(Δresiduals), maxlength)
        Δpositions = resize!(copy(Δpositions), maxlength)
        return new{T}(maxlength, l, 1, Δresiduals, Δpositions)
    end
end
function AndersonHistory(maxlength::Int, Δresiduals::Vector{T}, Δpositions::Vector{T}) where {T}
    return AndersonHistory{T}(maxlength, Δresiduals, Δpositions)
end

Base.length(H::AndersonHistory) = H.length

@inline function Base.getindex(H::AndersonHistory, i::Int)
    @boundscheck if i < 1 || i > H.length
        throw(BoundsError(H, i))
    end
    n = H.maxlength
    idx = H.first + i - 1
    idx = ifelse(idx > n, idx - n, idx) # cheap mod1: idx is at most 2n here
    return (getindex(H.Δresiduals, idx), getindex(H.Δpositions, idx))
end

@inline function Base.setindex!(H::AndersonHistory, (Δg, Δx), i)
    @boundscheck if i < 1 || i > H.length
        throw(BoundsError(H, i))
    end
    idx = mod1(H.first + i - 1, H.maxlength)
    setindex!(H.Δresiduals, Δg, idx)
    setindex!(H.Δpositions, Δx, idx)
    return (Δg, Δx)
end

@inline function Base.push!(H::AndersonHistory, value)
    if H.length < H.maxlength
        H.length += 1
    else
        H.first = mod1(H.first + 1, H.maxlength) # buffer full: drop the oldest entry
    end
    @inbounds setindex!(H, value, H.length)
    return H
end
@inline function Base.pop!(H::AndersonHistory)
    @inbounds v = H[H.length]
    H.length -= 1
    return v
end
@inline function Base.popfirst!(H::AndersonHistory)
    @inbounds v = H[1]
    H.first = mod1(H.first + 1, H.maxlength)
    H.length -= 1
    return v
end

@inline function Base.empty!(H::AndersonHistory)
    H.length = 0
    H.first = 1
    return H
end
Loading
Loading