diff --git a/src/plans/cache.jl b/src/plans/cache.jl index 4d1b3efc79..8f83774c0a 100644 --- a/src/plans/cache.jl +++ b/src/plans/cache.jl @@ -20,7 +20,7 @@ Both `X` and `c` are accompanied by booleans to keep track of their validity. * `initialized` (`true`) – whether to initialize the cached `X` and `c` or not. """ mutable struct SimpleManifoldCachedObjective{ - E<:AbstractEvaluationType,TC,TG,O<:AbstractManifoldGradientObjective{E,TC,TG},P,T,C + E<:AbstractEvaluationType,O<:AbstractManifoldObjective{E},P,T,C } <: AbstractDecoratedManifoldObjective{E,O} objective::O p::P # a point @@ -37,9 +37,9 @@ function SimpleManifoldCachedObjective( p=rand(M), X=initialized ? get_gradient(M, obj, p) : zero_vector(M, p), c=initialized ? get_cost(M, obj, p) : 0.0, -) where {E<:AbstractEvaluationType,TC,TG,O<:AbstractManifoldGradientObjective{E,TC,TG}} +) where {E<:AbstractEvaluationType,O<:AbstractManifoldObjective{E}} q = copy(M, p) - return SimpleManifoldCachedObjective{E,TC,TG,O,typeof(q),typeof(X),typeof(c)}( + return SimpleManifoldCachedObjective{E,O,typeof(q),typeof(X),typeof(c)}( obj, q, X, initialized, c, initialized ) end @@ -91,10 +91,10 @@ end function get_cost( M::AbstractManifold, sco::SimpleManifoldCachedObjective{ - AllocatingEvaluation,TC,TG,<:ManifoldCostGradientObjective + AllocatingEvaluation,<:ManifoldCostGradientObjective }, p, -) where {TC,TG} +) scop_neq_p = sco.p != p if scop_neq_p || !sco.c_valid sco.c, sco.X = sco.objective.costgrad!!(M, p) @@ -106,11 +106,9 @@ function get_cost( end function get_cost( M::AbstractManifold, - sco::SimpleManifoldCachedObjective{ - InplaceEvaluation,TC,TG,<:ManifoldCostGradientObjective - }, + sco::SimpleManifoldCachedObjective{InplaceEvaluation,<:ManifoldCostGradientObjective}, p, -) where {TC,TG} +) scop_neq_p = sco.p != p if scop_neq_p || !sco.c_valid sco.c, _ = sco.objective.costgrad!!(M, sco.X, p) @@ -123,10 +121,10 @@ end function get_gradient( M::AbstractManifold, sco::SimpleManifoldCachedObjective{ - 
AllocatingEvaluation,TC,TG,<:ManifoldCostGradientObjective + AllocatingEvaluation,<:ManifoldCostGradientObjective }, p, -) where {TC,TG} +) scop_neq_p = sco.p != p if scop_neq_p || !sco.X_valid sco.c, sco.X = sco.objective.costgrad!!(M, p) @@ -139,11 +137,9 @@ function get_gradient( end function get_gradient( M::AbstractManifold, - sco::SimpleManifoldCachedObjective{ - InplaceEvaluation,TC,TG,<:ManifoldCostGradientObjective - }, + sco::SimpleManifoldCachedObjective{InplaceEvaluation,<:ManifoldCostGradientObjective}, p, -) where {TC,TG} +) scop_neq_p = sco.p != p if scop_neq_p || !sco.X_valid sco.c, _ = sco.objective.costgrad!!(M, sco.X, p) @@ -158,10 +154,10 @@ function get_gradient!( M::AbstractManifold, X, sco::SimpleManifoldCachedObjective{ - AllocatingEvaluation,TC,TG,<:ManifoldCostGradientObjective + AllocatingEvaluation,<:ManifoldCostGradientObjective }, p, -) where {TC,TG} +) scop_neq_p = sco.p != p if scop_neq_p || !sco.X_valid sco.c, sco.X = sco.objective.costgrad!!(M, p) @@ -175,11 +171,9 @@ end function get_gradient!( M::AbstractManifold, X, - sco::SimpleManifoldCachedObjective{ - InplaceEvaluation,TC,TG,<:ManifoldCostGradientObjective - }, + sco::SimpleManifoldCachedObjective{InplaceEvaluation,<:ManifoldCostGradientObjective}, p, -) where {TC,TG} +) scop_neq_p = sco.p != p if scop_neq_p || !sco.X_valid sco.c, _ = sco.objective.costgrad!!(M, sco.X, p) diff --git a/tutorials/CountAndCache.qmd b/tutorials/CountAndCache.qmd index d32c5e04d4..60eb12ea41 100644 --- a/tutorials/CountAndCache.qmd +++ b/tutorials/CountAndCache.qmd @@ -62,7 +62,7 @@ Pkg.develop(path="../") # a trick to work on the local dev version ```{julia} #| output: false -using Manopt, Manifolds, Random, LRUCache +using Manopt, Manifolds, Random, LRUCache, LinearAlgebra ``` ## Counting @@ -74,6 +74,7 @@ n = 100 σ = π / 8 M = Sphere(2) p = 1 / sqrt(2) * [1.0, 0.0, 1.0] +Random.seed!(42) data = [exp(M, p, σ * rand(M; vector_at=p)) for i in 1:n]; f(M, p) = sum(1 / (2 * n) * distance.(Ref(M), 
Ref(p), data) .^ 2) grad_f(M, p) = sum(1 / n * grad_distance.(Ref(M), data, Ref(p))); @@ -122,3 +123,221 @@ result as usual: ```{julia} get_solver_result(r) ``` + +## Advanced Caching Examples + +There are more options other than caching single calls to specific parts of the objective. +For example you may want to cache intermediate results +of computing the cost and share that with the gradient computation. We will present +three solutions to this: + +1. An easy approach from within `Manopt.jl`: The [`ManifoldCostGradientObjective`](@ref) +2. A shared storage approach using a functor +3. A shared (internal) cache approach also using a functor + +For that we switch to another example: +The Rayleigh quotient. We aim to maximize the Rayleigh quotient $\displaystyle\frac{x^{\mathrm{T}}Ax}{x^{\mathrm{T}}x}$, for some $A\in\mathbb R^{(m+1)\times(m+1)}$ and $x\in\mathbb R^{m+1}$, but since we consider this on the sphere and `Manopt.jl` +(as many other optimization toolboxes) minimizes, we consider + +```math +g(p) = -p^{\mathrm{T}}Ap,\qquad p\in\mathbb S^{m} +``` + +The Euclidean gradient (that is, in $\mathbb R^{m+1}$) is actually just $\nabla g(p) = -2Ap$; the Riemannian gradient is the projection of $\nabla g(p)$ onto the tangent space $T_p\mathbb S^{m}$. + +```{julia} +m = 25 +Random.seed!(42) +A = randn(m + 1, m + 1) +A = Symmetric(A) +p_star = eigvecs(A)[:, end] # minimizer (or similarly -p) +f_star = -eigvals(A)[end] # cost (note that we get - the largest Eigenvalue) + +N = Sphere(m); + +g(M, p) = -p' * A*p +∇g(p) = -2 * A * p +grad_g(M,p) = project(M, p, ∇g(p)) +grad_g!(M,X, p) = project!(M, X, p, ∇g(p)) +``` + +But since both the cost and the gradient require the computation of the matrix-vector product $Ap$, it might be beneficial to only compute this once. + +### The [`ManifoldCostGradientObjective`](@ref) approach + +The [`ManifoldCostGradientObjective`](@ref) uses a combined function to compute both the gradient and the cost at the same time. 
We define the inplace variant as + +```{julia} +function g_grad_g!(M::AbstractManifold, X, p) + X .= -A*p + c = p'*X + X .*= 2 + project!(M, X, p, X) + return (c, X) +end +``` + +where we only compute the matrix-vector product once. +The small disadvantage might be that we always compute _both_, the gradient and the cost. Luckily, the cache we used before takes this into account and caches both results, such that we indeed end up computing `A*p` only once when asking for a cost and a gradient. + +Let's compare both methods +```{julia} +p0 = [(1/5 .* ones(5))..., zeros(m-4)...]; +@time s1 = gradient_descent(N, g, grad_g!, p0; + stopping_criterion = StopWhenGradientNormLess(1e-5), + evaluation=InplaceEvaluation(), + count=[:Cost, :Gradient], + cache=(:LRU, [:Cost, :Gradient], 25), + return_objective=true, +) +``` + +versus + +```{julia} +obj = ManifoldCostGradientObjective(g_grad_g!; evaluation=InplaceEvaluation()) +@time s2 = gradient_descent(N, obj, p0; + stopping_criterion=StopWhenGradientNormLess(1e-5), + count=[:Cost, :Gradient], + cache=(:LRU, [:Cost, :Gradient], 25), + return_objective=true, +) +``` + +first of all both yield the same result + +```{julia} +p1 = get_solver_result(s1) +p2 = get_solver_result(s2) +[distance(N, p1, p2), g(N, p1), g(N, p2), f_star] +``` + +and we can see that the combined number of evaluations is once 2051, once +just the number of cost evaluations 1449. Note that the involved additional 847 +gradient evaluations are merely a multiplication with 2. On the other hand, the additional caching of the gradient in these cases might be less beneficial. +It is beneficial when the gradient and the cost are very often required together. + +### A shared storage approach using a functor + +An alternative to the previous approach is the usage of a functor that introduces a “shared storage” +of the result of computing `A*p`. 
We additionally have to store `p` though, +since we have to check that we are still evaluating the cost and/or gradient +at the same point at which the cached `A*p` was computed. +We again consider the (more efficient) inplace variant. +This can be done as follows + +```{julia} +struct StorageG{T,M} + A::M + Ap::T + p::T +end +function (g::StorageG)(::Val{:Cost}, M::AbstractManifold, p) + if !(p==g.p) #We are at a new point -> Update + g.Ap .= g.A*p + g.p .= p + end + return -g.p'*g.Ap +end +function (g::StorageG)(::Val{:Gradient}, M::AbstractManifold, X, p) + if !(p==g.p) #We are at a new point -> Update + g.Ap .= g.A*p + g.p .= p + end + X .= -2 .* g.Ap + project!(M, X, p, X) + return X +end +``` + +Here we use the first parameter to distinguish both functions. For the mutating +case the signatures are different regardless of the additional argument, but for the allocating case, the signatures of the cost and the gradient function are the same. + + +```{julia} +#Define the new functor +storage_g = StorageG(A, zero(p0), zero(p0)) +# and cost and gradient that use this functor as +g3(M,p) = storage_g(Val(:Cost), M, p) +grad_g3!(M, X, p) = storage_g(Val(:Gradient), M, X, p) +@time s3 = gradient_descent(N, g3, grad_g3!, p0; + stopping_criterion = StopWhenGradientNormLess(1e-5), + evaluation=InplaceEvaluation(), + count=[:Cost, :Gradient], + cache=(:LRU, [:Cost, :Gradient], 2), + return_objective=true#, return_state=true +) +``` + +This of course still yields the same result + +```{julia} +p3 = get_solver_result(s3) +g(N, p3) - f_star +``` + +And while we again have a split of the cost and gradient evaluations, we can observe that the allocations are less than half of those of the previous approach. + +### A local cache approach + +This variant is very similar to the previous one, but uses a whole cache instead +of just one place to store `A*p`. This makes the code a bit nicer, +and it is possible to store more than just the last `p` that either the cost or gradient +was called with. 
+ +```{julia} +struct CacheG{C,M} + A::M + cache::C +end +function (g::CacheG)(::Val{:Cost}, M, p) + Ap = get!(g.cache, copy(M,p)) do + g.A*p + end + return -p'*Ap +end +function (g::CacheG)(::Val{:Gradient}, M, X, p) + Ap = get!(g.cache, copy(M,p)) do + g.A*p + end + X .= -2 .* Ap + project!(M, X, p, X) + return X +end +``` + +However, the resulting solver run is not always faster, since +maintaining the whole cache, instead of storing just `Ap` and `p`, is a bit more costly. +Then the tradeoff is whether this pays off. + +```{julia} +#Define the new functor +cache_g = CacheG(A, LRU{typeof(p0),typeof(p0)}(; maxsize=25)) +# and cost and gradient that use this functor as +g4(M,p) = cache_g(Val(:Cost), M, p) +grad_g4!(M, X, p) = cache_g(Val(:Gradient), M, X, p) +@time s4 = gradient_descent(N, g4, grad_g4!, p0; + stopping_criterion = StopWhenGradientNormLess(1e-5), + evaluation=InplaceEvaluation(), + count=[:Cost, :Gradient], + cache=(:LRU, [:Cost, :Gradient], 25), + return_objective=true, +) +``` + +and for safety let's check that we are reasonably close + +```{julia} +p4 = get_solver_result(s4) +g(N, p4) - f_star +``` + +For this example, or maybe even [`gradient_descent`](@ref) in general, it seems +this additional (second, inner) cache does not improve the result further, +it is about the same effort both time and allocation-wise. + +## Summary + +While the second approach of [`ManifoldCostGradientObjective`](@ref) is very easy to implement, both the storage and the (local) cache approach are more efficient. +All three are an improvement over the first implementation without sharing intermediate results. +The results with storage or cache have the further advantage of being more flexible, i.e. the stored information could also be reused in a third function, for example when also computing the Hessian. \ No newline at end of file