Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
68 changes: 61 additions & 7 deletions src/OptimKit.jl
Original file line number Diff line number Diff line change
Expand Up @@ -11,12 +11,13 @@ const LS_MAXITER = ScopedValue(10)
const LS_MAXFG = ScopedValue(20)
const LS_VERBOSITY = ScopedValue(1)

const GRADTOL = ScopedValue(1e-8)
const GRADTOL = ScopedValue(1.0e-8)
const MAXITER = ScopedValue(1_000_000)
const VERBOSITY = ScopedValue(1)

# Default values for the manifold structure
_retract(x, d, α) = (add(x, d, α), d)
_invretract(x, y) = add(y, x, -1)
_inner(x, v1, v2) = v1 === v2 ? norm(v1)^2 : real(inner(v1, v2))
_transport!(v, xold, d, α, xnew) = v
_scale!(v, α) = scale!!(v, α)
Expand All @@ -26,7 +27,7 @@ _precondition(x, g) = deepcopy(g)
_finalize!(x, f, g, numiter) = x, f, g

# Default structs for new convergence and termination keywords
@kwdef struct DefaultHasConverged{T<:Real}
@kwdef struct DefaultHasConverged{T <: Real}
gradtol::T
end

Expand All @@ -44,6 +45,7 @@ end

# Optimization
abstract type OptimizationAlgorithm end
abstract type FixedPointAlgorithm end

const _xlast = Ref{Any}()
const _glast = Ref{Any}()
Expand Down Expand Up @@ -104,11 +106,61 @@ Also see [`GradientDescent`](@ref), [`ConjugateGradient`](@ref), [`LBFGS`](@ref)
"""
function optimize end

"""
fixedpoint(fp, x, alg;
finalize! = _finalize!,
shouldstop = DefaultShouldStop(alg.maxiter),
hasconverged = DefaultHasConverged(alg.gradtol),
retract = _retract, invretract = _invretract,
inner = _inner, (transport!) = _transport!,
(scale!) = _scale!, (add!) = _add!,
isometrictransport = (transport! == _transport! && inner == _inner))
-> x, g, numfp, history

Find a fixed point of the function `fp` starting from an initial point `x₀` and using the fixed point algorithm `alg`,
which is an instance of `SimpleIteration` or `AndersonMixing`.

Returns the final point `x`, the residual `g`, the total number of calls to `fp`,
and the history of the norm of the residual across the different iterations.

The algorithm is run until either `hasconverged(x, false, g, normg)` returns `true` or
`shouldstop(x, false, g, numfp, numiter, time)` returns `true`. The latter case happening before
the former is considered to be a failure to converge, and a warning is issued.

The keyword arguments are:
- `finalize!::Function`: A function that takes the final point `x`, a dummy variable `f = false`,
the residual `g`, and the iteration number, and returns a possibly modified values for
`x`, `f` and `g`. By default, the identity is used.
It is the user's responsibility to ensure that the modified values do not lead to
inconsistencies within the optimization algorithm.
- `hasconverged::Function`: A function that takes the current point `x`, a dummy variable `f = false`,
the residual `r`, and the norm of the residual, and returns a boolean indicating whether
the fixed point iteration has converged. By default, the norm of the residual is compared to the
tolerance `gradtol` as encoded in the algorithm instance.
- `shouldstop::Function`: A function that takes the current point `x`, a dummy variable `f = false`,
the residuals `g`, the number of calls to `fg`, the iteration number, and the time spent
so far, and returns a boolean indicating whether the optimization should stop. By default,
the number of iterations is compared to the maximum number of iterations as encoded in the
algorithm instance.

Check the README of this package for further details on creating an algorithm instance,
as well as for the meaning of the remaining keyword arguments and their default values.

!!! Warning

The default values of `hasconverged` and `shouldstop` are provided to ensure continuity
with the previous versions of this package. However, this behaviour might change in the
future.

Also see [`SimpleIteration`](@ref) and [`AndersonMixing`](@ref).
"""
function fixedpoint end

function format_time(t::Float64)
if t < 1e-3
return @sprintf("%5.1f μs", 1e6*t)
if t < 1.0e-3
return @sprintf("%5.1f μs", 1.0e6 * t)
elseif t < 1
return @sprintf("%5.1f ms", 1e3*t)
return @sprintf("%5.1f ms", 1.0e3 * t)
elseif t < 60
return @sprintf("%5.2f s", t)
elseif t < 3600
Expand All @@ -122,7 +174,8 @@ include("linesearches.jl")
include("gd.jl")
include("cg.jl")
include("lbfgs.jl")

include("simpleiteration.jl")
include("anderson.jl")
const gd = GradientDescent()
const cg = ConjugateGradient()
const lbfgs = LBFGS()
Expand All @@ -131,6 +184,7 @@ export optimize, gd, cg, lbfgs, optimtest
export GradientDescent, ConjugateGradient, LBFGS
export FletcherReeves, HestenesStiefel, PolakRibiere, HagerZhang, DaiYuan
export HagerZhangLineSearch
export fixedpoint, SimpleIteration, AndersonMixing

"""
optimtest(fg, x, [d]; alpha = -0.1:0.001:0.1, retract = _retract, inner = _inner)
Expand All @@ -142,7 +196,7 @@ Test the compatibility between the computation of the gradient, the retraction a

It is up to the user to check that the values in `dfs1` and `dfs2` match up to expected precision, by inspecting the numerical values or plotting them. If these values don't match, the linesearch in `optimize` cannot be expected to work.
"""
function optimtest(fg, x, d=fg(x)[2]; alpha=-0.1:0.001:0.1, retract=_retract, inner=_inner)
function optimtest(fg, x, d = fg(x)[2]; alpha = -0.1:0.001:0.1, retract = _retract, inner = _inner)
# evaluate function at given edge points
fs_edges = map(alpha) do a
f, = fg(retract(x, d, a)[1])
Expand Down
258 changes: 258 additions & 0 deletions src/anderson.jl
Original file line number Diff line number Diff line change
@@ -0,0 +1,258 @@
"""
struct AndersonMixing{T <: Real} <: FixedPointAlgorithm
AndersonMixing(m::Int;
damping::Real = 1,
maxiter::Int=MAXITER[], # 1_000_000
gradtol::Real=GRADTOL[], # 1e-8
verbosity::Int=VERBOSITY[]) # 1

Anderson mixing, also known as Anderson acceleration, for fixed point problems.

## Parameters
- `m::Int`: The number of previous iterates to use for Anderson extrapolation.
- `damping::Real`: The damping parameter for Anderson extrapolation; a value of 1 corresponds to no damping, while a value between 0 and 1 corresponds to under-relaxation.
- `maxiter::Int`: The maximum number of iterations.
- `gradtol::T`: The tolerance for the norm of the residual.
- `verbosity::Int`: The verbosity level of the optimization algorithm.

The verbosity level use the following scheme:
- 0: no output
- 1: only warnings upon non-convergence
- 2: convergence information at the end of the algorithm
- 3: progress information after each iteration
"""
struct AndersonMixing{T <: Real} <: FixedPointAlgorithm
m::Int
damping::T
maxiter::Int
gradtol::T
verbosity::Int
end
function AndersonMixing(m::Int=8;
damping::Real = 1,
maxiter::Int=MAXITER[],
gradtol::Real=GRADTOL[],
verbosity::Int=VERBOSITY[])
damping′, gradtol′ = promote(damping, gradtol)
return AndersonMixing(m, damping′, maxiter, gradtol′, verbosity)
end

using LinearAlgebra

# Anderson mixing (a.k.a. Anderson acceleration) implementation of `fixedpoint`.
# The residual `g = invretract(x, fp(x))` plays the role of `F(x) - x` in the
# classical vector space formulation. Differences of successive residuals (Δg)
# and steps (Δx) are kept in a circular `AndersonHistory` buffer, and a small
# regularized least squares problem determines the extrapolation coefficients.
function fixedpoint(
    fp::F, x₀, alg::AndersonMixing;
    (finalize!) = _finalize!,
    shouldstop = DefaultShouldStop(alg.maxiter),
    hasconverged = DefaultHasConverged(alg.gradtol),
    retract = _retract, invretract = _invretract,
    inner = _inner, (transport!) = _transport!,
    (scale!) = _scale!, (add!) = _add!,
    isometrictransport = (transport! == _transport! && inner == _inner)
) where {F}

    t₀ = time()
    verbosity = alg.verbosity
    x = x₀
    fx = fp(x)
    numfp = 1
    numiter = 0
    g = invretract(x, fx) # residual, i.e. direction from x to fx, similar to F(x) - x in the vector case
    x, _, g = finalize!(x, false, g, numiter)
    innergg = inner(x, g, g)
    normg = sqrt(innergg)
    normghistory = [normg]
    verbosity >= 2 &&
        @info @sprintf("Anderson: initializing with ‖x - F(x)‖ = %.4e", normg)
    t = time() - t₀
    _hasconverged = hasconverged(x, false, g, normg)
    _shouldstop = shouldstop(x, false, g, numfp, numiter, t)

    # The first iteration is special: no history is available yet, so take a
    # plain fixed point step x ← retract(x, g, 1).
    if !(_hasconverged || _shouldstop)
        xprev = x
        gprev = g
        Δxprev = deepcopy(g)
        told = t
        x, Δx = retract(x, g, 1)
        fx = fp(x)
        numfp += 1
        numiter += 1
        g = invretract(x, fx) # residual, i.e. direction from x to fx, similar to F(x) - x in the vector case
        x, _, g = finalize!(x, false, g, numiter)
        innergg = inner(x, g, g)
        normg = sqrt(innergg)
        push!(normghistory, normg)
        t = time() - t₀
        Δt = t - told
        _hasconverged = hasconverged(x, false, g, normg)
        _shouldstop = shouldstop(x, false, g, numfp, numiter, t)
    end
    TangentType = typeof(g)
    H = AndersonHistory(alg.m, TangentType[], TangentType[])
    # Preallocated (maximal-size) storage for the Gram matrix of residual
    # differences and the corresponding right hand side; views of size m are
    # taken while the history is still filling up.
    overlap_full = zeros(typeof(innergg), alg.m, alg.m)
    rhs_full = zeros(typeof(innergg), alg.m)
    while !(_hasconverged || _shouldstop)
        verbosity >= 3 &&
            @info @sprintf("Anderson acceleration: iter %4d, Δt %s: ‖f(x) - x‖ = %.4e", numiter, format_time(Δt), normg)

        told = t
        # Transport the previous residual to the current point and form the
        # newest residual difference Δg = g - gprev.
        gprev = transport!(gprev, xprev, Δxprev, 1, x)
        Δg = add!(deepcopy(g), gprev, -1)

        mold = length(H)
        push!(H, (Δg, Δx))
        # Transport all previously stored residual/step differences to the current point.
        for k in 1:(length(H) - 1)
            (Δgₖ, Δxₖ) = H[k]
            Δgₖ = transport!(Δgₖ, xprev, Δxprev, 1, x)
            Δxₖ = transport!(Δxₖ, xprev, Δxprev, 1, x)
            H[k] = (Δgₖ, Δxₖ)
        end
        m = length(H)

        overlap = view(overlap_full, 1:m, 1:m)
        rhs = view(rhs_full, 1:m)
        if isometrictransport
            # With isometric transport, previously cached inner products remain
            # valid; only the row/column involving the newest Δg must be computed.
            if mold == m
                # History was full and the oldest entry was evicted: shift the
                # cached Gram matrix one row and one column up-left.
                @inbounds for j in 1:(m - 1)
                    for i in 1:(m - 1)
                        overlap[i, j] = overlap[i + 1, j + 1]
                    end
                end
            end
            @inbounds for i in 1:(m - 1)
                (Δgᵢ, Δxᵢ) = H[i]
                overlap[m, i] = overlap[i, m] = inner(x, Δgᵢ, Δg)
            end
            overlap[m, m] = inner(x, Δg, Δg)
        else
            # Non-isometric transport invalidates cached inner products:
            # recompute the full (symmetric) Gram matrix.
            @inbounds for j in 1:m
                (Δgⱼ, Δxⱼ) = H[j]
                for i in 1:(j - 1)
                    (Δgᵢ, Δxᵢ) = H[i]
                    overlap[j, i] = overlap[i, j] = inner(x, Δgᵢ, Δgⱼ)
                end
                overlap[j, j] = inner(x, Δgⱼ, Δgⱼ)
            end
        end
        @inbounds for i in 1:m
            (Δgᵢ, Δxᵢ) = H[i]
            rhs[i] = inner(x, Δgᵢ, g)
        end
        # Regularize proportionally to the mean diagonal entry to keep the
        # least squares problem well conditioned; this binds `overlap` to a new
        # matrix and leaves the cached `overlap_full` untouched.
        overlap += LinearAlgebra.tr(overlap) / size(overlap, 1) * sqrt(eps(one(eltype(overlap)))) * I
        Γ = overlap \ rhs # solve least squares problem to get Anderson coefficients
        ḡ = deepcopy(g)
        Δx = scale!(deepcopy(Δx), 0)
        @inbounds for k in 1:m
            (Δgₖ, Δxₖ) = H[k]
            ḡ = add!(ḡ, Δgₖ, -Γ[k])
            Δx = add!(Δx, Δxₖ, -Γ[k]) # accumulate the Anderson extrapolation direction
        end
        Δx = add!(Δx, ḡ, alg.damping) # add the (damped) extrapolated residual to the step

        # store current quantities as previous quantities
        xprev = x
        Δxprev = Δx
        gprev = g
        _xlast[] = x # store result in global variables to debug failures
        _glast[] = g

        # take step and evaluate new point
        x, Δx = retract(x, Δx, 1)
        fx = fp(x)
        numfp += 1
        numiter += 1
        g = invretract(x, fx) # direction from x to fx, i.e. similar to F(x) - x in the vector case
        x, _, g = finalize!(x, false, g, numiter)
        innergg = inner(x, g, g)
        normg = sqrt(innergg)
        push!(normghistory, normg)
        t = time() - t₀
        Δt = t - told
        _hasconverged = hasconverged(x, false, g, normg)
        _shouldstop = shouldstop(x, false, g, numfp, numiter, t)
    end
    if _hasconverged
        verbosity >= 2 &&
            @info @sprintf("Anderson acceleration: converged after %d iterations and time %s: ‖f(x) - x‖ = %.4e", numiter, format_time(t), normg)
    else
        verbosity >= 1 &&
            @warn @sprintf("Anderson acceleration: not converged to requested tol after %d iterations and time %s: ‖f(x) - x‖ = %.4e", numiter, format_time(t), normg)
    end
    return x, g, numfp, normghistory
end

"""
    AndersonHistory{TangentType}

Fixed-capacity circular buffer holding pairs `(Δg, Δx)` of residual differences
and step differences used by the Anderson mixing algorithm. Supports `length`,
`getindex`, `setindex!`, `push!` (evicting the oldest entry when full), `pop!`,
`popfirst!` and `empty!`; index `1` always refers to the oldest stored entry.
"""
mutable struct AndersonHistory{TangentType}
    maxlength::Int                     # capacity of the circular buffer
    length::Int                        # number of currently valid entries
    first::Int                         # storage index of the logically first entry
    Δresiduals::Vector{TangentType}    # residual differences Δg
    Δpositions::Vector{TangentType}    # step differences Δx
    function AndersonHistory{T}(maxlength::Int, Δresiduals::Vector{T}, Δpositions::Vector{T}) where {T}
        l = length(Δresiduals)
        l == length(Δpositions) ||
            throw(ArgumentError("AndersonHistory: Δresiduals and Δpositions must have the same length"))
        l <= maxlength ||
            throw(ArgumentError("AndersonHistory: initial history length cannot exceed maxlength"))
        # Copy and resize to full capacity up front; slots beyond `l` remain
        # undefined and are never read before being written (guarded by `length`).
        Δresiduals = resize!(copy(Δresiduals), maxlength)
        Δpositions = resize!(copy(Δpositions), maxlength)
        return new{T}(maxlength, l, 1, Δresiduals, Δpositions)
    end
end
function AndersonHistory(maxlength::Int, Δresiduals::Vector{T}, Δpositions::Vector{T}) where {T}
    return AndersonHistory{T}(maxlength, Δresiduals, Δpositions)
end

Base.length(H::AndersonHistory) = H.length

@inline function Base.getindex(H::AndersonHistory, i::Int)
    @boundscheck if i < 1 || i > H.length
        throw(BoundsError(H, i))
    end
    n = H.maxlength
    idx = H.first + i - 1
    idx = ifelse(idx > n, idx - n, idx) # cheap mod1: idx is at most 2n here
    return (getindex(H.Δresiduals, idx), getindex(H.Δpositions, idx))
end

@inline function Base.setindex!(H::AndersonHistory, (Δg, Δx), i)
    @boundscheck if i < 1 || i > H.length
        throw(BoundsError(H, i))
    end
    idx = mod1(H.first + i - 1, H.maxlength)
    setindex!(H.Δresiduals, Δg, idx)
    setindex!(H.Δpositions, Δx, idx)
    return (Δg, Δx)
end

@inline function Base.push!(H::AndersonHistory, value)
    if H.length < H.maxlength
        H.length += 1
    else
        H.first = mod1(H.first + 1, H.maxlength) # buffer full: drop the oldest entry
    end
    @inbounds setindex!(H, value, H.length)
    return H
end
@inline function Base.pop!(H::AndersonHistory)
    @inbounds v = H[H.length]
    H.length -= 1
    return v
end
@inline function Base.popfirst!(H::AndersonHistory)
    @inbounds v = H[1]
    H.first = mod1(H.first + 1, H.maxlength)
    H.length -= 1
    return v
end

@inline function Base.empty!(H::AndersonHistory)
    H.length = 0
    H.first = 1
    return H
end
Loading
Loading