diff --git a/python/ffsim/optimize/__init__.py b/python/ffsim/optimize/__init__.py
index 594e4191a..8cb5b45c5 100644
--- a/python/ffsim/optimize/__init__.py
+++ b/python/ffsim/optimize/__init__.py
@@ -11,7 +11,11 @@
 """Optimization algorithms."""
 
 from ffsim.optimize.linear_method import minimize_linear_method
+from ffsim.optimize.stochastic_reconfiguration import (
+    minimize_stochastic_reconfiguration,
+)
 
 __all__ = [
     "minimize_linear_method",
+    "minimize_stochastic_reconfiguration",
 ]
diff --git a/python/ffsim/optimize/_util.py b/python/ffsim/optimize/_util.py
new file mode 100644
index 000000000..b4669893d
--- /dev/null
+++ b/python/ffsim/optimize/_util.py
@@ -0,0 +1,112 @@
+# (C) Copyright IBM 2024.
+#
+# This code is licensed under the Apache License, Version 2.0. You may
+# obtain a copy of this license in the LICENSE.txt file in the root directory
+# of this source tree or at http://www.apache.org/licenses/LICENSE-2.0.
+#
+# Any modifications or derivative works of this code must retain this
+# copyright notice, and modified files need to carry a notice indicating
+# that they have been altered from the originals.
+
+from __future__ import annotations
+
+from typing import Callable
+
+import numpy as np
+from scipy.optimize import OptimizeResult
+from scipy.sparse.linalg import LinearOperator
+
+from ffsim import states
+
+
+class WrappedCallable:
+    """Callable wrapper used to count function calls."""
+
+    def __init__(
+        self, func: Callable[[np.ndarray], np.ndarray], optimize_result: OptimizeResult
+    ):
+        self.func = func
+        self.optimize_result = optimize_result
+
+    def __call__(self, x: np.ndarray) -> np.ndarray:
+        self.optimize_result.nfev += 1
+        return self.func(x)
+
+
+class WrappedLinearOperator:
+    """LinearOperator wrapper used to count LinearOperator applications."""
+
+    def __init__(self, linop: LinearOperator, optimize_result: OptimizeResult):
+        self.linop = linop
+        self.optimize_result = optimize_result
+
+    def __matmul__(self, other: np.ndarray):
+        if len(other.shape) == 1:
+            self.optimize_result.nlinop += 1
+        else:
+            _, n = other.shape
+            self.optimize_result.nlinop += n
+        return self.linop @ other
+
+    def __rmatmul__(self, other: np.ndarray):
+        if len(other.shape) == 1:
+            self.optimize_result.nlinop += 1
+        else:
+            n, _ = other.shape
+            self.optimize_result.nlinop += n
+        return other @ self.linop
+
+
+def gradient_finite_diff(
+    params_to_vec: Callable[[np.ndarray], np.ndarray],
+    theta: np.ndarray,
+    index: int,
+    epsilon: float,
+) -> np.ndarray:
+    """Return the derivative of a function with respect to one of its parameters.
+
+    Given a function that maps a vector of "parameters" to an output vector, return
+    the finite-difference derivative of the output with respect to one parameter.
+
+    Args:
+        params_to_vec: Function that maps a parameter vector to an output vector.
+        theta: The parameters at which to evaluate the gradient.
+        index: The index of the parameter to differentiate with respect to.
+        epsilon: Finite difference step size.
+
+    Returns:
+        The derivative of the output vector with respect to the indexed parameter.
+    """
+    unit = states.one_hot(len(theta), index, dtype=float)
+    plus = theta + epsilon * unit
+    minus = theta - epsilon * unit
+    return (params_to_vec(plus) - params_to_vec(minus)) / (2 * epsilon)
+
+
+def orthogonal_jacobian_finite_diff(
+    params_to_vec: Callable[[np.ndarray], np.ndarray],
+    theta: np.ndarray,
+    vec: np.ndarray,
+    epsilon: float,
+) -> np.ndarray:
+    """Return the "orthogonal" Jacobian matrix of a function.
+
+    Rather than the true Jacobian, the columns of the returned matrix contain only
+    the components of the gradients orthogonal to a given input vector.
+
+    Args:
+        params_to_vec: Function that maps a parameter vector to an output vector.
+        theta: The parameters at which to evaluate the Jacobian.
+        vec: The vector to use for computing the orthogonal component of the gradient.
+        epsilon: Finite difference step size.
+
+    Returns:
+        The "orthogonal" Jacobian matrix.
+    """
+    jac = np.zeros((len(vec), len(theta)), dtype=complex)
+    for i in range(len(theta)):
+        grad = gradient_finite_diff(params_to_vec, theta, i, epsilon)
+        # Subtract the component of the gradient along the vector
+        grad -= np.vdot(vec, grad) * vec
+        jac[:, i] = grad
+    return jac
diff --git a/python/ffsim/optimize/linear_method.py b/python/ffsim/optimize/linear_method.py
index e8f47a6f9..f88d1c6bf 100644
--- a/python/ffsim/optimize/linear_method.py
+++ b/python/ffsim/optimize/linear_method.py
@@ -20,45 +20,11 @@
 from scipy.optimize import OptimizeResult, minimize
 from scipy.sparse.linalg import LinearOperator
 
-from ffsim.states import one_hot
-
-
-class _WrappedCallable:
-    """Callable wrapper used to count function calls."""
-
-    def __init__(
-        self, func: Callable[[np.ndarray], np.ndarray], optimize_result: OptimizeResult
-    ):
-        self.func = func
-        self.optimize_result = optimize_result
-
-    def __call__(self, x: np.ndarray) -> np.ndarray:
-        self.optimize_result.nfev += 1
-        return self.func(x)
-
-
-class _WrappedLinearOperator:
-    """LinearOperator wrapper used to count LinearOperator applications."""
-
-    def __init__(self, linop: LinearOperator, optimize_result: OptimizeResult):
-        self.linop = linop
-        self.optimize_result = optimize_result
-
-    def __matmul__(self, other: np.ndarray):
-        if len(other.shape) == 1:
-            self.optimize_result.nlinop += 1
-        else:
-            _, n = other.shape
-            self.optimize_result.nlinop += n
-        return self.linop @ other
-
-    def __rmatmul__(self, other: np.ndarray):
-        if len(other.shape) == 1:
-            self.optimize_result.nlinop += 1
-        else:
-            n, _ = other.shape
-            self.optimize_result.nlinop += n
-        return other @ self.linop
+from ffsim.optimize._util import (
+    WrappedCallable,
+    WrappedLinearOperator,
+    orthogonal_jacobian_finite_diff,
+)
 
 
 def minimize_linear_method(
@@ -165,12 +131,14 @@
         x=None, fun=None, jac=None, nfev=0, njev=0, nit=0, nlinop=0
     )
 
-    params_to_vec = _WrappedCallable(params_to_vec, intermediate_result)
-    hamiltonian = _WrappedLinearOperator(hamiltonian, intermediate_result)
+    params_to_vec = WrappedCallable(params_to_vec, intermediate_result)
+    hamiltonian = WrappedLinearOperator(hamiltonian, intermediate_result)
 
     for i in range(maxiter):
         vec = params_to_vec(params)
-        jac = _jac(params_to_vec, params, vec, epsilon=epsilon)
+        jac = orthogonal_jacobian_finite_diff(
+            params_to_vec, params, vec, epsilon=epsilon
+        )
 
         energy_mat, overlap_mat = _linear_method_matrices(vec, jac, hamiltonian)
         energy = energy_mat[0, 0]
@@ -293,32 +261,6 @@ def _solve_linear_method_eigensystem(
     return eig, vec
 
 
-def _jac(
-    params_to_vec: Callable[[np.ndarray], np.ndarray],
-    theta: np.ndarray,
-    vec: np.ndarray,
-    epsilon: float,
-) -> np.ndarray:
-    jac = np.zeros((len(vec), len(theta)), dtype=complex)
-    for i in range(len(theta)):
-        grad = _grad(params_to_vec, theta, i, epsilon)
-        grad -= np.vdot(vec, grad) * vec
-        jac[:, i] = grad
-    return jac
-
-
-def _grad(
-    params_to_vec: Callable[[np.ndarray], np.ndarray],
-    theta: np.ndarray,
-    index: int,
-    epsilon: float,
-) -> np.ndarray:
-    unit = one_hot(len(theta), index, dtype=float)
-    plus = theta + epsilon * unit
-    minus = theta - epsilon * unit
-    return (params_to_vec(plus) - params_to_vec(minus)) / (2 * epsilon)
-
-
 def _get_param_update(
     energy_mat: np.ndarray,
     overlap_mat: np.ndarray,
diff --git a/python/ffsim/optimize/stochastic_reconfiguration.py b/python/ffsim/optimize/stochastic_reconfiguration.py
new file mode 100644
index 000000000..2e4af07b8
--- /dev/null
+++ b/python/ffsim/optimize/stochastic_reconfiguration.py
@@ -0,0 +1,228 @@
+# (C) Copyright IBM 2024.
+#
+# This code is licensed under the Apache License, Version 2.0. You may
+# obtain a copy of this license in the LICENSE.txt file in the root directory
+# of this source tree or at http://www.apache.org/licenses/LICENSE-2.0.
+#
+# Any modifications or derivative works of this code must retain this
+# copyright notice, and modified files need to carry a notice indicating
+# that they have been altered from the originals.
+
+"""Stochastic reconfiguration for optimization of a variational ansatz."""
+
+from __future__ import annotations
+
+import math
+from typing import Any, Callable
+
+import numpy as np
+import scipy.linalg
+from scipy.optimize import OptimizeResult, minimize
+from scipy.sparse.linalg import LinearOperator
+
+from ffsim.optimize._util import (
+    WrappedCallable,
+    WrappedLinearOperator,
+    orthogonal_jacobian_finite_diff,
+)
+
+
+def minimize_stochastic_reconfiguration(
+    params_to_vec: Callable[[np.ndarray], np.ndarray],
+    hamiltonian: LinearOperator,
+    x0: np.ndarray,
+    *,
+    maxiter: int = 1000,
+    variation: float = 1.0,
+    cond: float | None = None,
+    epsilon: float = 1e-8,
+    gtol: float = 1e-5,
+    optimize_hyperparameters: bool = True,
+    optimize_hyperparameters_args: dict | None = None,
+    callback: Callable[[OptimizeResult], Any] | None = None,
+) -> OptimizeResult:
+    """Minimize the energy of a variational ansatz using stochastic reconfiguration.
+
+    References:
+
+    - `Generalized Lanczos algorithm for variational quantum Monte Carlo`_
+
+    Args:
+        params_to_vec: Function representing the wavefunction ansatz. It takes as input
+            a vector of real-valued parameters and outputs the state vector represented
+            by those parameters.
+        hamiltonian: The Hamiltonian representing the energy to be minimized.
+        x0: Initial guess for the parameters.
+        maxiter: Maximum number of optimization iterations to perform.
+        variation: Hyperparameter controlling the size of parameter variations
+            used in the linear expansion of the wavefunction. Its value must be
+            strictly positive. A larger value results in larger variations.
+        cond: `cond` argument to pass to `scipy.linalg.lstsq`_, which is called to
+            solve for the parameter update.
+        epsilon: Increment to use for approximating the gradient using
+            finite difference.
+        gtol: Convergence threshold for the norm of the projected gradient.
+        optimize_hyperparameters: Whether to optimize the `variation` hyperparameter in
+            each iteration. Optimizing the hyperparameter incurs more function and
+            energy evaluations in each iteration, but may speed up convergence, leading
+            to fewer iterations overall. The optimization is performed using
+            `scipy.optimize.minimize`_.
+        optimize_hyperparameters_args: Arguments to use when calling
+            `scipy.optimize.minimize`_ to optimize the hyperparameters. The call is
+            constructed as
+
+            .. code::
+
+                scipy.optimize.minimize(f, x0, **optimize_hyperparameters_args)
+
+            If not specified, the default value of `dict(method="L-BFGS-B")` will be
+            used.
+
+        callback: A callable called after each iteration. It is called with the
+            signature
+
+            .. code::
+
+                callback(intermediate_result: OptimizeResult)
+
+            where ``intermediate_result`` is a `scipy.optimize.OptimizeResult`_
+            with attributes ``x`` and ``fun``, the present values of the parameter
+            vector and objective function. For all iterations except for the last,
+            it also contains the ``jac`` attribute holding the present value of the
+            gradient, an ``overlap_mat`` attribute holding the overlap matrix, and
+            a ``variation`` attribute holding the present value of the `variation`
+            hyperparameter.
+
+    Returns:
+        The optimization result represented as a `scipy.optimize.OptimizeResult`_
+        object. Note the following definitions of selected attributes:
+
+        - ``x``: The final parameters of the optimization.
+        - ``fun``: The energy associated with the final parameters. That is, the
+          expectation value of the Hamiltonian with respect to the state vector
+          corresponding to the parameters.
+        - ``nfev``: The number of times the ``params_to_vec`` function was called.
+        - ``nlinop``: The number of times the ``hamiltonian`` linear operator was
+          applied to a vector.
+
+    .. _Generalized Lanczos algorithm for variational quantum Monte Carlo: https://doi.org/10.1103/PhysRevB.64.024512
+    .. _scipy.optimize.OptimizeResult: https://docs.scipy.org/doc/scipy/reference/generated/scipy.optimize.OptimizeResult.html#scipy.optimize.OptimizeResult
+    .. _scipy.linalg.lstsq: https://docs.scipy.org/doc/scipy/reference/generated/scipy.linalg.lstsq.html
+    .. _scipy.optimize.minimize: https://docs.scipy.org/doc/scipy/reference/generated/scipy.optimize.minimize.html
+    """  # noqa: E501
+    if variation <= 0:
+        raise ValueError(f"variation must be positive. Got {variation}.")
+    if maxiter < 1:
+        raise ValueError(f"maxiter must be at least 1. Got {maxiter}.")
+
+    if optimize_hyperparameters_args is None:
+        optimize_hyperparameters_args = dict(method="L-BFGS-B")
+
+    variation_param = math.sqrt(variation)
+    params = x0.copy()
+    converged = False
+    intermediate_result = OptimizeResult(
+        x=None, fun=None, jac=None, nfev=0, njev=0, nit=0, nlinop=0
+    )
+
+    params_to_vec = WrappedCallable(params_to_vec, intermediate_result)
+    hamiltonian = WrappedLinearOperator(hamiltonian, intermediate_result)
+
+    for i in range(maxiter):
+        vec = params_to_vec(params)
+        jac = orthogonal_jacobian_finite_diff(
+            params_to_vec, params, vec, epsilon=epsilon
+        )
+
+        energy, grad, overlap_mat = _sr_matrices(vec, jac, hamiltonian)
+        intermediate_result.njev += 1
+
+        if i > 0 and callback is not None:
+            intermediate_result.x = params
+            intermediate_result.fun = energy
+            intermediate_result.jac = grad
+            intermediate_result.overlap_mat = overlap_mat
+            intermediate_result.variation = variation
+            callback(intermediate_result)
+
+        if np.linalg.norm(grad) < gtol:
+            converged = True
+            break
+
+        if optimize_hyperparameters:
+
+            def f(x: np.ndarray) -> float:
+                (variation_param,) = x
+                variation = variation_param**2
+                param_update = _get_param_update(
+                    grad,
+                    overlap_mat,
+                    variation,
+                    cond=cond,
+                )
+                vec = params_to_vec(params + param_update)
+                return np.vdot(vec, hamiltonian @ vec).real
+
+            result = minimize(
+                f,
+                x0=[variation_param],
+                **optimize_hyperparameters_args,
+            )
+            (variation_param,) = result.x
+            variation = variation_param**2
+
+        param_update = _get_param_update(
+            grad,
+            overlap_mat,
+            variation,
+            cond=cond,
+        )
+        params = params + param_update
+        intermediate_result.nit += 1
+
+    vec = params_to_vec(params)
+    energy = np.vdot(vec, hamiltonian @ vec).real
+
+    intermediate_result.x = params
+    intermediate_result.fun = energy
+    del intermediate_result.jac
+    if callback is not None:
+        callback(intermediate_result)
+
+    if converged:
+        success = True
+        message = "Convergence: Norm of projected gradient <= gtol."
+    else:
+        success = False
+        message = "Stop: Total number of iterations reached limit."
+
+    return OptimizeResult(
+        x=params,
+        success=success,
+        message=message,
+        fun=energy,
+        jac=grad,
+        nfev=intermediate_result.nfev,
+        njev=intermediate_result.njev,
+        nlinop=intermediate_result.nlinop,
+        nit=intermediate_result.nit,
+    )
+
+
+def _sr_matrices(
+    vec: np.ndarray, jac: np.ndarray, hamiltonian: LinearOperator
+) -> tuple[np.ndarray, np.ndarray, np.ndarray]:
+    energy = np.vdot(vec, hamiltonian @ vec)
+    grad = vec.conj() @ (hamiltonian @ jac)
+    overlap_mat = jac.T.conj() @ jac
+    return energy.real, 2 * grad.real, overlap_mat.real
+
+
+def _get_param_update(
+    grad: np.ndarray,
+    overlap_mat: np.ndarray,
+    variation: float,
+    cond: float | None,
+) -> np.ndarray:
+    x, _, _, _ = scipy.linalg.lstsq(overlap_mat, -0.5 * variation * grad, cond=cond)
+    return x
diff --git a/tests/python/optimize/linear_method_test.py b/tests/python/optimize/linear_method_test.py
index cb475f47a..90c5ac48c 100644
--- a/tests/python/optimize/linear_method_test.py
+++ b/tests/python/optimize/linear_method_test.py
@@ -81,7 +81,7 @@ def callback(intermediate_result):
     assert result.nit <= 7
     assert result.nit < result.nlinop < result.nfev
 
-    # optimization with optimizing hyperparameters
+    # optimization without optimizing hyperparameters
     info = defaultdict(list)
     result = ffsim.optimize.minimize_linear_method(
         params_to_vec,
diff --git a/tests/python/optimize/stochastic_reconfiguration_test.py b/tests/python/optimize/stochastic_reconfiguration_test.py
new file mode 100644
index 000000000..f5b57c88f
--- /dev/null
+++ b/tests/python/optimize/stochastic_reconfiguration_test.py
@@ -0,0 +1,118 @@
+# (C) Copyright IBM 2024.
+#
+# This code is licensed under the Apache License, Version 2.0. You may
+# obtain a copy of this license in the LICENSE.txt file in the root directory
+# of this source tree or at http://www.apache.org/licenses/LICENSE-2.0.
+#
+# Any modifications or derivative works of this code must retain this
+# copyright notice, and modified files need to carry a notice indicating
+# that they have been altered from the originals.
+
+"""Tests for stochastic reconfiguration optimization algorithm."""
+
+from __future__ import annotations
+
+from collections import defaultdict
+
+import numpy as np
+import pyscf
+import pytest
+
+import ffsim
+
+
+def test_minimize_stochastic_reconfiguration():
+    # Build an H2 molecule
+    mol = pyscf.gto.Mole()
+    mol.build(
+        atom=[["H", (0, 0, 0)], ["H", (0, 0, 1.8)]],
+        basis="sto-6g",
+    )
+    hartree_fock = pyscf.scf.RHF(mol)
+    hartree_fock.kernel()
+
+    # Initialize parameters
+    n_reps = 2
+    n_params = ffsim.UCJOperator.n_params(hartree_fock.mol.nao_nr(), n_reps)
+    rng = np.random.default_rng(1804)
+    x0 = rng.uniform(-10, 10, size=n_params)
+
+    # Get molecular data and molecular Hamiltonian (one- and two-body tensors)
+    mol_data = ffsim.MolecularData.from_scf(hartree_fock)
+    norb = mol_data.norb
+    nelec = mol_data.nelec
+    mol_hamiltonian = mol_data.hamiltonian
+    hamiltonian = ffsim.linear_operator(mol_hamiltonian, norb=norb, nelec=nelec)
+    reference_state = ffsim.hartree_fock_state(norb, nelec)
+
+    def params_to_vec(x: np.ndarray):
+        operator = ffsim.UCJOperator.from_parameters(x, norb=norb, n_reps=n_reps)
+        return ffsim.apply_unitary(reference_state, operator, norb=norb, nelec=nelec)
+
+    def energy(x: np.ndarray):
+        vec = params_to_vec(x)
+        return np.real(np.vdot(vec, hamiltonian @ vec))
+
+    info = defaultdict(list)
+
+    def callback(intermediate_result):
+        info["x"].append(intermediate_result.x)
+        info["fun"].append(intermediate_result.fun)
+        np.testing.assert_allclose(
+            energy(intermediate_result.x), intermediate_result.fun
+        )
+        if hasattr(intermediate_result, "jac"):
+            info["jac"].append(intermediate_result.jac)
+        if hasattr(intermediate_result, "variation"):
+            info["variation"].append(intermediate_result.variation)
+
+    # default optimization
+    result = ffsim.optimize.minimize_stochastic_reconfiguration(
+        params_to_vec, x0=x0, hamiltonian=hamiltonian, callback=callback
+    )
+    np.testing.assert_allclose(energy(result.x), result.fun)
+    np.testing.assert_allclose(result.fun, -0.970773)
+    np.testing.assert_allclose(info["fun"][0], -0.853501, atol=1e-5)
+    np.testing.assert_allclose(info["fun"][-1], -0.970773, atol=1e-5)
+    for params, fun in zip(info["x"], info["fun"]):
+        np.testing.assert_allclose(energy(params), fun)
+    assert result.nit <= 20
+    assert result.nit < result.nlinop < result.nfev
+
+    # optimization without optimizing hyperparameters
+    info = defaultdict(list)
+    result = ffsim.optimize.minimize_stochastic_reconfiguration(
+        params_to_vec,
+        x0=x0,
+        hamiltonian=hamiltonian,
+        optimize_hyperparameters=False,
+        callback=callback,
+    )
+    np.testing.assert_allclose(energy(result.x), result.fun)
+    np.testing.assert_allclose(result.fun, -0.970773)
+    for params, fun in zip(info["x"], info["fun"]):
+        np.testing.assert_allclose(energy(params), fun)
+    assert result.nit <= 30
+    assert result.nit < result.nlinop < result.nfev
+    assert set(info["variation"]) == {1.0}
+
+    # optimization with maxiter
+    info = defaultdict(list)
+    result = ffsim.optimize.minimize_stochastic_reconfiguration(
+        params_to_vec, hamiltonian=hamiltonian, x0=x0, maxiter=3, callback=callback
+    )
+    assert result.nit == 3
+    assert len(info["x"]) == 3
+    assert len(info["fun"]) == 3
+    assert len(info["jac"]) == 2
+    np.testing.assert_allclose(energy(result.x), result.fun)
+
+    # test raising errors
+    with pytest.raises(ValueError, match="variation"):
+        result = ffsim.optimize.minimize_stochastic_reconfiguration(
+            params_to_vec, x0=x0, hamiltonian=hamiltonian, variation=-0.1
+        )
+    with pytest.raises(ValueError, match="maxiter"):
+        result = ffsim.optimize.minimize_stochastic_reconfiguration(
+            params_to_vec, x0=x0, hamiltonian=hamiltonian, maxiter=0
+        )
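
A minimal usage sketch of the new entry point, distilled from the test added above; the H2/STO-6G system, the UCJ ansatz depth, the random seed, and the maxiter value are illustrative choices rather than requirements:

import numpy as np
import pyscf

import ffsim

# Hartree-Fock reference for a small H2 molecule, as in the test above.
mol = pyscf.gto.Mole()
mol.build(atom=[["H", (0, 0, 0)], ["H", (0, 0, 1.8)]], basis="sto-6g")
hartree_fock = pyscf.scf.RHF(mol)
hartree_fock.kernel()

# Molecular Hamiltonian as a LinearOperator, plus the Hartree-Fock reference state.
mol_data = ffsim.MolecularData.from_scf(hartree_fock)
norb, nelec = mol_data.norb, mol_data.nelec
hamiltonian = ffsim.linear_operator(mol_data.hamiltonian, norb=norb, nelec=nelec)
reference_state = ffsim.hartree_fock_state(norb, nelec)

# Random initial parameters for a UCJ ansatz with two repetitions (illustrative seed).
n_reps = 2
rng = np.random.default_rng(1234)
x0 = rng.uniform(-10, 10, size=ffsim.UCJOperator.n_params(norb, n_reps))


def params_to_vec(x: np.ndarray) -> np.ndarray:
    # Map real-valued parameters to the ansatz state vector.
    operator = ffsim.UCJOperator.from_parameters(x, norb=norb, n_reps=n_reps)
    return ffsim.apply_unitary(reference_state, operator, norb=norb, nelec=nelec)


result = ffsim.optimize.minimize_stochastic_reconfiguration(
    params_to_vec, hamiltonian=hamiltonian, x0=x0, maxiter=50
)
print(result.fun, result.nit, result.nfev, result.nlinop)

As with minimize_linear_method, the return value is a scipy.optimize.OptimizeResult, so the final parameters, the energy, and the nfev/nlinop counters are available as attributes.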