From 99f498fb397f1ccef3b4a3365f4ed9b5f847cbb1 Mon Sep 17 00:00:00 2001
From: Francesco Biscani <bluescarni@gmail.com>
Date: Fri, 10 Nov 2023 15:25:32 +0100
Subject: [PATCH 01/13] Basic skeleton of initial support for sparse indices in
 diff_tensors, not functional yet.

---
 include/heyoka/expression.hpp |  14 +-
 src/expression_diff.cpp       | 274 +++++++++++++++++++++++-----------
 2 files changed, 196 insertions(+), 92 deletions(-)
diff --git a/include/heyoka/expression.hpp b/include/heyoka/expression.hpp
index 9c833a1aa..e7989e951 100644
--- a/include/heyoka/expression.hpp
+++ b/include/heyoka/expression.hpp
@@ -419,21 +419,23 @@ IGOR_MAKE_NAMED_ARGUMENT(diff_order);
 namespace detail
 {
 
-// Private aliases/utilities needed in the implementation of dtens.
-using dtens_v_idx_t = std::vector<std::uint32_t>;
+// Sparse structure used to index derivatives in dtens:
+// the first element of the pair is the function component index,
+// the second element the vector of derivative orders in sparse form.
+using dtens_sv_idx_t = std::pair<std::uint32_t, std::vector<std::pair<std::uint32_t, std::uint32_t>>>;
 
-struct dtens_v_idx_cmp {
-    [[nodiscard]] bool operator()(const dtens_v_idx_t &, const dtens_v_idx_t &) const;
+struct dtens_sv_idx_cmp {
+    [[nodiscard]] bool operator()(const dtens_sv_idx_t &, const dtens_sv_idx_t &) const;
 };
 
-using dtens_map_t = boost::container::flat_map<dtens_v_idx_t, expression, dtens_v_idx_cmp>;
+using dtens_map_t = boost::container::flat_map<dtens_sv_idx_t, expression, dtens_sv_idx_cmp>;
 
 } // namespace detail
 
 class HEYOKA_DLL_PUBLIC dtens
 {
 public:
-    using v_idx_t = detail::dtens_v_idx_t;
+    using v_idx_t = std::vector<std::uint32_t>;
     using size_type = detail::dtens_map_t::size_type;
 
 private:
diff --git a/src/expression_diff.cpp b/src/expression_diff.cpp
index 3334595f7..2b1f5bb0d 100644
--- a/src/expression_diff.cpp
+++ b/src/expression_diff.cpp
@@ -14,6 +14,7 @@
 #include <functional>
 #include <iterator>
 #include <memory>
+#include <numeric>
 #include <ostream>
 #include <stdexcept>
 #include <string>
@@ -453,15 +454,60 @@ auto diff_make_adj_dep(const std::vector<expression> &dc, std::vector<expression
     return std::tuple{std::move(adj), std::move(dep), std::move(revdep), std::move(subs_map)};
 }
 
+// Hasher for a pair of indices.
+struct idx_pair_hasher {
+    std::size_t operator()(const std::pair<std::uint32_t, std::uint32_t> &p) const noexcept
+    {
+        std::size_t seed = std::hash<std::uint32_t>{}(p.first);
+        boost::hash_combine(seed, std::hash<std::uint32_t>{}(p.second));
+
+        return seed;
+    }
+};
+
+// This is an alternative version of dtens_sv_idx_t that uses a dictionary
+// for storing the derivative orders instead of a sorted vector. Using a dictionary
+// allows for faster/easier manipulation of the derivative orders.
+using dtens_ss_idx_t = std::pair<std::uint32_t, fast_umap<std::uint32_t, std::uint32_t>>;
+
+// Helper to turn a dtens_sv_idx_t into a dtens_ss_idx_t.
+void vidx_v2s(dtens_ss_idx_t &output, const dtens_sv_idx_t &input)
+{
+    output.first = input.first;
+
+    output.second.clear();
+    for (const auto &p : input.second) {
+        [[maybe_unused]] auto [_, flag] = output.second.insert(p);
+        assert(flag);
+    }
+}
+
+// Helper to turn a dtens_ss_idx_t into a dtens_sv_idx_t.
+dtens_sv_idx_t vidx_s2v(const dtens_ss_idx_t &input)
+{
+    dtens_sv_idx_t retval{input.first, {}};
+
+    std::copy(input.second.begin(), input.second.end(), std::back_inserter(retval.second));
+    std::sort(retval.second.begin(), retval.second.end(),
+              [](const auto &p1, const auto &p2) { return p1.first < p2.first; });
+
+    return retval;
+}
+
 // Hasher for the local maps of derivatives used in the
 // forward/reverse mode implementations.
 struct diff_map_hasher {
-    std::size_t operator()(const dtens_v_idx_t &v) const noexcept
+    std::size_t operator()(const dtens_ss_idx_t &s) const noexcept
     {
-        std::size_t seed = std::hash<decltype(v.size())>{}(v.size());
-
-        for (auto idx : v) {
-            boost::hash_combine(seed, std::hash<decltype(idx)>{}(idx));
+        // Use as seed the component index.
+        std::size_t seed = std::hash<std::uint32_t>{}(s.first);
+
+        // Compose via additions the hashes of the derivative orders.
+        // NOTE: it is important that we use here a commutative operation
+        // so that the final hash is independent of the order in which
+        // the derivative orders are stored in the dictionary.
+        for (const auto &p : s.second) {
+            seed += idx_pair_hasher{}(p);
         }
 
         return seed;
@@ -507,8 +553,8 @@ void diff_tensors_forward_impl(
     // to prevent duplicates. For order-1 derivatives, no duplicate
     // derivatives will be produced and thus we can use a plain vector
     // which can be quite a bit faster.
-    using diff_map_t = fast_umap<dtens_v_idx_t, expression, diff_map_hasher>;
-    using diff_vec_t = std::vector<std::pair<dtens_v_idx_t, expression>>;
+    using diff_map_t = fast_umap<dtens_ss_idx_t, expression, diff_map_hasher>;
+    using diff_vec_t = std::vector<std::pair<dtens_ss_idx_t, expression>>;
     using local_diff_t = std::variant<diff_map_t, diff_vec_t>;
     auto local_diff = (cur_order == 1u) ? local_diff_t(diff_vec_t{}) : local_diff_t(diff_map_t{});
 
@@ -518,7 +564,7 @@ void diff_tensors_forward_impl(
     auto local_dvec = [&local_diff]() -> diff_vec_t & { return std::get<diff_vec_t>(local_diff); };
 
     // An indices vector used as temporary variable in several places below.
-    std::vector<std::uint32_t> tmp_v_idx;
+    dtens_ss_idx_t tmp_v_idx;
 
     // These two containers will be used to store the list of subexpressions
     // which depend on an input. They are used in the forward pass
@@ -557,9 +603,8 @@ void diff_tensors_forward_impl(
             for (std::vector<expression>::size_type out_idx = 0; out_idx < cur_nouts; ++out_idx, ++out_it) {
                 assert(out_it != diff_map.end());
 
-                tmp_v_idx = out_it->first;
-                assert(diff_arg_idx + 1u < tmp_v_idx.size());
-                tmp_v_idx[diff_arg_idx + 1u] += 1u;
+                vidx_v2s(tmp_v_idx, out_it->first);
+                tmp_v_idx.second[static_cast<std::uint32_t>(diff_arg_idx)] += 1u;
 
                 if (cur_order == 1u) {
                     local_dvec().emplace_back(tmp_v_idx, 0_dbl);
@@ -675,9 +720,8 @@ void diff_tensors_forward_impl(
         for (std::vector<expression>::size_type out_idx = 0; out_idx < cur_nouts; ++out_idx, ++out_it) {
             assert(out_it != diff_map.end());
 
-            tmp_v_idx = out_it->first;
-            assert(diff_arg_idx + 1u < tmp_v_idx.size());
-            tmp_v_idx[diff_arg_idx + 1u] += 1u;
+            vidx_v2s(tmp_v_idx, out_it->first);
+            tmp_v_idx.second[static_cast<std::uint32_t>(diff_arg_idx)] += 1u;
 
             if (cur_order == 1u) {
                 auto cur_der = diffs[diffs.size() - cur_nouts + out_idx];
@@ -697,9 +741,13 @@ void diff_tensors_forward_impl(
 
     // Merge the local map into diff_map.
     if (cur_order == 1u) {
-        diff_map.insert(diff_map.end(), local_dvec().begin(), local_dvec().end());
+        for (auto &p : local_dvec()) {
+            diff_map.emplace_back(vidx_s2v(p.first), std::move(p.second));
+        }
     } else {
-        diff_map.insert(diff_map.end(), local_dmap().begin(), local_dmap().end());
+        for (auto &p : local_dmap()) {
+            diff_map.emplace_back(vidx_s2v(p.first), std::move(p.second));
+        }
     }
 }
 
@@ -742,8 +790,8 @@ void diff_tensors_reverse_impl(
     // to prevent duplicates. For order-1 derivatives, no duplicate
     // derivatives will be produced and thus we can use a plain vector
     // which can be quite a bit faster.
-    using diff_map_t = fast_umap<dtens_v_idx_t, expression, diff_map_hasher>;
-    using diff_vec_t = std::vector<std::pair<dtens_v_idx_t, expression>>;
+    using diff_map_t = fast_umap<dtens_ss_idx_t, expression, diff_map_hasher>;
+    using diff_vec_t = std::vector<std::pair<dtens_ss_idx_t, expression>>;
     using local_diff_t = std::variant<diff_map_t, diff_vec_t>;
     auto local_diff = (cur_order == 1u) ? local_diff_t(diff_vec_t{}) : local_diff_t(diff_map_t{});
 
@@ -756,7 +804,7 @@ void diff_tensors_reverse_impl(
     const auto nargs = args.size();
 
     // An indices vector used as temporary variable in several places below.
-    std::vector<std::uint32_t> tmp_v_idx;
+    dtens_ss_idx_t tmp_v_idx;
 
     // These two containers will be used to store the list of subexpressions
     // on which an output depends. They are used in the reverse pass
@@ -893,12 +941,11 @@ void diff_tensors_reverse_impl(
         // Add the derivatives to the local map.
         for (decltype(args.size()) j = 0; j < nargs; ++j) {
             // Compute the indices vector for the current derivative.
-            tmp_v_idx = prev_begin->first;
-            assert(j + 1u < tmp_v_idx.size());
+            vidx_v2s(tmp_v_idx, prev_begin->first);
             // NOTE: no need to overflow check here, because no derivative
             // order can end up being larger than the total diff order which
             // is representable by std::uint32_t.
-            tmp_v_idx[j + 1u] += 1u;
+            tmp_v_idx.second[static_cast<std::uint32_t>(j)] += 1u;
 
             if (cur_order == 1u) {
                 // Check if the diff argument is present in the
@@ -936,31 +983,53 @@ void diff_tensors_reverse_impl(
 
     // Merge the local map into diff_map.
     if (cur_order == 1u) {
-        diff_map.insert(diff_map.end(), local_dvec().begin(), local_dvec().end());
+        for (auto &p : local_dvec()) {
+            diff_map.emplace_back(vidx_s2v(p.first), std::move(p.second));
+        }
     } else {
-        diff_map.insert(diff_map.end(), local_dmap().begin(), local_dmap().end());
+        for (auto &p : local_dmap()) {
+            diff_map.emplace_back(vidx_s2v(p.first), std::move(p.second));
+        }
     }
 }
 
 } // namespace
 
-bool dtens_v_idx_cmp::operator()(const dtens_v_idx_t &v1, const dtens_v_idx_t &v2) const
+bool dtens_sv_idx_cmp::operator()(const dtens_sv_idx_t &v1, const dtens_sv_idx_t &v2) const
 {
-    assert(v1.size() == v2.size());
-    assert(v1.size() > 1u);
+#if !defined(NDEBUG)
+
+    // Sanity checks on the inputs.
+
+    // Check indices sorting.
+    auto cmp = [](const auto &p1, const auto &p2) { return p1.first < p2.first; };
+    assert(std::is_sorted(v1.second.begin(), v1.second.end(), cmp));
+    assert(std::is_sorted(v2.second.begin(), v2.second.end(), cmp));
 
-    // Compute the total derivative order for both
-    // vectors.
+    // Check no duplicate indices.
+    auto no_dup = [](const auto &p1, const auto &p2) { return p1.first == p2.first; };
+    assert(std::adjacent_find(v1.second.begin(), v1.second.end(), no_dup) == v1.second.end());
+    assert(std::adjacent_find(v2.second.begin(), v2.second.end(), no_dup) == v2.second.end());
+
+    // Check no zero diff orders.
+    auto nz_order = [](const auto &p) { return p.second != 0u; };
+    assert(std::all_of(v1.second.begin(), v1.second.end(), nz_order));
+    assert(std::all_of(v2.second.begin(), v2.second.end(), nz_order));
+
+#endif
+
+    // Compute the total derivative order for both vectors.
     // NOTE: here we have to use safe_numerics because this comparison operator
-    // might end up being invoked on a user-supplied v_idx_t, whose total degree
-    // may overflow. The v_idx_t in dtens, by contrast, are guaranteed to never
+    // might end up being invoked on a user-supplied index vector, whose total degree
+    // may overflow. The indices vector in dtens, by contrast, are guaranteed to never
     // overflow when computing the total degree.
-    boost::safe_numerics::safe<std::uint32_t> deg1 = 0, deg2 = 0;
-    const auto size = v1.size();
-    for (decltype(v1.size()) i = 1; i < size; ++i) {
-        deg1 += v1[i];
-        deg2 += v2[i];
-    }
+    using su32 = boost::safe_numerics::safe<std::uint32_t>;
+
+    // The accumulator.
+    auto acc = [](const auto &val, const auto &p) { return val + p.second; };
+
+    const auto deg1 = std::accumulate(v1.second.begin(), v1.second.end(), su32(0), acc);
+    const auto deg2 = std::accumulate(v2.second.begin(), v2.second.end(), su32(0), acc);
 
     if (deg1 < deg2) {
         return true;
@@ -972,18 +1041,54 @@ bool dtens_v_idx_cmp::operator()(const dtens_v_idx_t &v1, const dtens_v_idx_t &v
 
     // The total derivative order is the same, look at
     // the component index next.
-    if (v1[0] < v2[0]) {
+    if (v1.first < v2.first) {
         return true;
     }
 
-    if (v1[0] > v2[0]) {
+    if (v1.first > v2.first) {
         return false;
     }
 
     // Component and total derivative order are the same,
     // resort to reverse lexicographical compare on the
     // derivative orders.
-    return std::lexicographical_compare(v1.begin() + 1, v1.end(), v2.begin() + 1, v2.end(), std::greater{});
+    auto it1 = v1.second.begin(), it2 = v2.second.begin();
+    const auto end1 = v1.second.end(), end2 = v2.second.end();
+    for (; it1 != end1 && it2 != end2; ++it1, ++it2) {
+        const auto [idx1, n1] = *it1;
+        const auto [idx2, n2] = *it2;
+
+        if (idx2 > idx1) {
+            return true;
+        }
+
+        if (idx1 > idx2) {
+            return false;
+        }
+
+        if (n1 > n2) {
+            return true;
+        }
+
+        if (n2 > n1) {
+            return false;
+        }
+
+        assert(std::equal(v1.second.begin(), it1 + 1, v2.second.begin()));
+    }
+
+    if (it1 == end1 && it2 == end2) {
+        assert(v1.second == v2.second);
+        return false;
+    }
+
+    if (it1 == end1) {
+        return false;
+    }
+
+    assert(it2 == end2);
+
+    return true;
 }
 
 } // namespace detail
@@ -1097,9 +1202,9 @@ auto diff_tensors_impl(const std::vector<expression> &v_ex, const std::vector<ex
 
     // Helper to locate the vector of indices v in diff_map. If not present,
     // diff_map.end() will be returned.
-    auto search_diff_map = [&diff_map](const dtens_v_idx_t &v) {
+    auto search_diff_map = [&diff_map](const dtens_sv_idx_t &v) {
         auto it = std::lower_bound(diff_map.begin(), diff_map.end(), v, [](const auto &item, const auto &vec) {
-            return dtens_v_idx_cmp{}(item.first, vec);
+            return dtens_sv_idx_cmp{}(item.first, vec);
         });
 
         if (it != diff_map.end() && it->first == v) {
@@ -1109,10 +1214,8 @@ auto diff_tensors_impl(const std::vector<expression> &v_ex, const std::vector<ex
         }
     };
 
-    // An indices vector with preallocated storage,
-    // used as temporary variable in several places below.
-    std::vector<std::uint32_t> tmp_v_idx;
-    tmp_v_idx.resize(1 + boost::safe_numerics::safe<decltype(tmp_v_idx.size())>(nargs));
+    // An indices vector used as temporary variable in several places below.
+    dtens_sv_idx_t tmp_v_idx;
 
     // Vector that will store the previous-order derivatives in the loop below.
     // It will be used to construct the decomposition.
@@ -1121,7 +1224,7 @@ auto diff_tensors_impl(const std::vector<expression> &v_ex, const std::vector<ex
     // Init diff_map with the order 0 derivatives
     // (i.e., the original function components).
     for (decltype(v_ex.size()) i = 0; i < orig_nouts; ++i) {
-        tmp_v_idx[0] = boost::numeric_cast<std::uint32_t>(i);
+        tmp_v_idx.first = boost::numeric_cast<std::uint32_t>(i);
 
         assert(search_diff_map(tmp_v_idx) == diff_map.end());
         diff_map.emplace_back(tmp_v_idx, v_ex[i]);
@@ -1131,9 +1234,10 @@ auto diff_tensors_impl(const std::vector<expression> &v_ex, const std::vector<ex
     for (std::uint32_t cur_order = 0; cur_order < order; ++cur_order) {
         // Locate the iterator in diff_map corresponding to the beginning
         // of the previous-order derivatives.
-        tmp_v_idx[0] = 0;
-        tmp_v_idx[1] = cur_order;
-        std::fill(tmp_v_idx.begin() + 2, tmp_v_idx.end(), static_cast<std::uint32_t>(0));
+        tmp_v_idx.first = 0;
+        tmp_v_idx.second.clear();
+        tmp_v_idx.second.emplace_back(0, cur_order);
+
         const auto prev_begin = search_diff_map(tmp_v_idx);
         assert(prev_begin != diff_map.end());
 
@@ -1177,7 +1281,7 @@ auto diff_tensors_impl(const std::vector<expression> &v_ex, const std::vector<ex
 
         // Sort the derivatives for the current order.
         oneapi::tbb::parallel_sort(
-            cur_begin, cur_end, [](const auto &p1, const auto &p2) { return dtens_v_idx_cmp{}(p1.first, p2.first); });
+            cur_begin, cur_end, [](const auto &p1, const auto &p2) { return dtens_sv_idx_cmp{}(p1.first, p2.first); });
 
         // NOTE: the derivatives we just added to diff_map are still expressed in terms of u variables.
         // We need to apply the substitution map subs_map in order to recover the expressions in terms
@@ -1225,10 +1329,11 @@ auto diff_tensors_impl(const std::vector<expression> &v_ex, const std::vector<ex
 
     // Check sorting.
     assert(std::is_sorted(retval.begin(), retval.end(),
-                          [](const auto &p1, const auto &p2) { return dtens_v_idx_cmp{}(p1.first, p2.first); }));
+                          [](const auto &p1, const auto &p2) { return dtens_sv_idx_cmp{}(p1.first, p2.first); }));
     // Check the number of elements in the indices vectors.
-    assert(std::all_of(retval.begin(), retval.end(),
-                       [&nargs](const auto &p) { return p.first.size() >= 2u && p.first.size() - 1u == nargs; }));
+    assert(std::all_of(retval.begin(), retval.end(), [&nargs](const auto &p) {
+        return p.first.second.empty() || p.first.second.back().first < nargs;
+    }));
     // No duplicates in the indices vectors.
     assert(std::adjacent_find(retval.begin(), retval.end(),
                               [](const auto &p1, const auto &p2) { return p1.first == p2.first; })
@@ -1372,29 +1477,32 @@ std::uint32_t dtens::get_order() const
     // in the map (specifically, it is
     // the last element in the indices
     // vector of the last derivative).
-    return (end() - 1)->first.back();
+    assert(!(end() - 1)->first.second.empty());
+    return (end() - 1)->first.second.back().second;
 }
 
 dtens::iterator dtens::find(const v_idx_t &vidx) const
 {
-    // NOTE: we need to sanity check vidx as the
-    // custom comparison operator for the internal map
-    // has preconditions.
-
     // First we handle the empty case.
     if (p_impl->m_map.empty()) {
         return end();
     }
 
-    // Second, we check that the size of vidx is correct.
-    if (begin()->first.size() != vidx.size()) {
+    // vidx must at least contain the function component index.
+    if (vidx.empty()) {
         return end();
     }
 
-    assert(vidx.size() > 1u);
+    // Turn vidx into sparse format.
+    detail::dtens_sv_idx_t s_vidx{vidx[0], {}};
+    for (decltype(vidx.size()) i = 1; i < vidx.size(); ++i) {
+        if (vidx[i] != 0u) {
+            s_vidx.second.emplace_back(boost::numeric_cast<std::uint32_t>(i - 1u), vidx[i]);
+        }
+    }
 
-    // vidx is ok, look it up.
-    return p_impl->m_map.find(vidx);
+    // Lookup.
+    return p_impl->m_map.find(s_vidx);
 }
 
 const expression &dtens::operator[](const v_idx_t &vidx) const
@@ -1430,13 +1538,11 @@ dtens::subrange dtens::get_derivatives(std::uint32_t order) const
 
     // Create the indices vector corresponding to the first derivative
     // of component 0 for the given order in the map.
-    auto vidx = begin()->first;
-    assert(std::all_of(vidx.begin(), vidx.end(), [](auto x) { return x == 0u; }));
-    vidx[1] = order;
+    detail::dtens_sv_idx_t s_vidx{0, {{0, order}}};
 
     // Locate the corresponding derivative in the map.
     // NOTE: this could be end() for invalid order.
-    const auto b = p_impl->m_map.find(vidx);
+    const auto b = p_impl->m_map.find(s_vidx);
 
 #if !defined(NDEBUG)
 
@@ -1453,11 +1559,12 @@ dtens::subrange dtens::get_derivatives(std::uint32_t order) const
     // NOTE: get_nouts() can return zero only if the internal
     // map is empty, and we handled this corner case earlier.
     assert(get_nouts() > 0u);
-    vidx[0] = get_nouts() - 1u;
-    vidx[1] = 0;
-    vidx.back() = order;
+    s_vidx.first = get_nouts() - 1u;
+    assert(get_nvars() > 0u);
+    s_vidx.second[0].first = get_nvars() - 1u;
+
     // NOTE: this could be end() for invalid order.
-    auto e = p_impl->m_map.find(vidx);
+    auto e = p_impl->m_map.find(s_vidx);
 
 #if !defined(NDEBUG)
 
@@ -1489,14 +1596,11 @@ dtens::subrange dtens::get_derivatives(std::uint32_t component, std::uint32_t or
 
     // Create the indices vector corresponding to the first derivative
     // for the given order and component in the map.
-    auto vidx = begin()->first;
-    assert(std::all_of(vidx.begin(), vidx.end(), [](auto x) { return x == 0u; }));
-    vidx[0] = component;
-    vidx[1] = order;
+    detail::dtens_sv_idx_t s_vidx{component, {{0, order}}};
 
     // Locate the corresponding derivative in the map.
     // NOTE: this could be end() for invalid component/order.
-    const auto b = p_impl->m_map.find(vidx);
+    const auto b = p_impl->m_map.find(s_vidx);
 
 #if !defined(NDEBUG)
 
@@ -1510,10 +1614,11 @@ dtens::subrange dtens::get_derivatives(std::uint32_t component, std::uint32_t or
 
     // Modify vidx so that it now refers to the last derivative
     // for the given order and component in the map.
-    vidx[1] = 0;
-    vidx.back() = order;
+    assert(get_nvars() > 0u);
+    s_vidx.second[0].first = get_nvars() - 1u;
+
     // NOTE: this could be end() for invalid component/order.
-    auto e = p_impl->m_map.find(vidx);
+    auto e = p_impl->m_map.find(s_vidx);
 
 #if !defined(NDEBUG)
 
@@ -1588,8 +1693,7 @@ std::uint32_t dtens::get_nvars() const
     if (p_impl->m_map.empty()) {
         assert(ret == 0u);
     } else {
-        assert(!begin()->first.empty());
-        assert(ret == begin()->first.size() - 1u);
+        assert(!begin()->first.second.empty());
     }
 
 #endif
@@ -1605,12 +1709,10 @@ std::uint32_t dtens::get_nouts() const
 
     // Construct the indices vector corresponding
     // to the first derivative of order 1 of the first component.
-    auto vidx = begin()->first;
-    assert(std::all_of(vidx.begin(), vidx.end(), [](auto x) { return x == 0u; }));
-    vidx[1] = 1;
+    detail::dtens_sv_idx_t s_vidx{0, {{0, 1}}};
 
     // Try to find it in the map.
-    const auto it = p_impl->m_map.find(vidx);
+    const auto it = p_impl->m_map.find(s_vidx);
 
     // NOTE: the number of outputs is always representable by
     // std::uint32_t, otherwise we could not index the function

From e63aae3b7ba1fc5d680c9a09616e3b45971a091a Mon Sep 17 00:00:00 2001
From: Francesco Biscani <bluescarni@gmail.com>
Date: Fri, 10 Nov 2023 15:48:18 +0100
Subject: [PATCH 02/13] Minor.

---
 src/expression_diff.cpp | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/src/expression_diff.cpp b/src/expression_diff.cpp
index 2b1f5bb0d..cc967743d 100644
--- a/src/expression_diff.cpp
+++ b/src/expression_diff.cpp
@@ -551,7 +551,7 @@ void diff_tensors_forward_impl(
     // will produce several times the same derivative, and thus
     // we need to store the derivatives in a dictionary in order
     // to prevent duplicates. For order-1 derivatives, no duplicate
-    // derivatives will be produced and thus we can use a plain vector
+    // derivatives will be produced and thus we can use a plain vector,
     // which can be quite a bit faster.
     using diff_map_t = fast_umap<dtens_ss_idx_t, expression, diff_map_hasher>;
     using diff_vec_t = std::vector<std::pair<dtens_ss_idx_t, expression>>;
@@ -788,7 +788,7 @@ void diff_tensors_reverse_impl(
     // will produce several times the same derivative, and thus
     // we need to store the derivatives in a dictionary in order
     // to prevent duplicates. For order-1 derivatives, no duplicate
-    // derivatives will be produced and thus we can use a plain vector
+    // derivatives will be produced and thus we can use a plain vector,
     // which can be quite a bit faster.
     using diff_map_t = fast_umap<dtens_ss_idx_t, expression, diff_map_hasher>;
     using diff_vec_t = std::vector<std::pair<dtens_ss_idx_t, expression>>;

From 9016ef89dd33591a61316886dfade09e6e7409ac Mon Sep 17 00:00:00 2001
From: Francesco Biscani <bluescarni@gmail.com>
Date: Fri, 10 Nov 2023 22:07:57 +0100
Subject: [PATCH 03/13] Fix quadratic complexity when building a ffnn model.

---
 src/model/ffnn.cpp | 16 ++++++++++++----
 1 file changed, 12 insertions(+), 4 deletions(-)

diff --git a/src/model/ffnn.cpp b/src/model/ffnn.cpp
index 4e5358021..435b8839b 100644
--- a/src/model/ffnn.cpp
+++ b/src/model/ffnn.cpp
@@ -19,6 +19,7 @@
 
 #include <heyoka/config.hpp>
 #include <heyoka/expression.hpp>
+#include <heyoka/math/sum.hpp>
 #include <heyoka/model/ffnn.hpp>
 
 HEYOKA_BEGIN_NAMESPACE
@@ -43,21 +44,28 @@ std::vector<expression> compute_layer(su32 layer_id, const std::vector<expressio
     auto n_neurons_prev_layer = su32(inputs.size());
     auto n_neurons_curr_layer = n_neurons[layer_id];
 
-    std::vector<expression> retval(static_cast<std::vector<expression>::size_type>(n_neurons_curr_layer), 0_dbl);
+    std::vector<expression> retval, tmp_sum;
+    retval.reserve(n_neurons_curr_layer);
+
     for (su32 i = 0; i < n_neurons_curr_layer; ++i) {
+        // Clear the summation terms.
+        tmp_sum.clear();
+
         for (su32 j = 0; j < n_neurons_prev_layer; ++j) {
 
             // Add the weight and update the weight counter.
-            retval[i] += nn_wb[wcounter] * inputs[j];
+            tmp_sum.push_back(nn_wb[wcounter] * inputs[j]);
             ++wcounter;
         }
 
         // Add the bias and update the counter.
-        retval[i] += nn_wb[bcounter + n_net_w];
+        tmp_sum.push_back(nn_wb[bcounter + n_net_w]);
         ++bcounter;
+
         // Activation function.
-        retval[i] = activation(retval[i]);
+        retval.push_back(activation(sum(tmp_sum)));
     }
+
     return retval;
 }
 

From ef8e031f014ec470bafa5635594176a79d1e5011 Mon Sep 17 00:00:00 2001
From: Francesco Biscani <bluescarni@gmail.com>
Date: Fri, 10 Nov 2023 22:08:34 +0100
Subject: [PATCH 04/13] Several fixes, provisional WIP.

---
 src/expression_diff.cpp          | 40 ++++++++++++++++++++++++--------
 test/expression_diff_tensors.cpp |  4 ++++
 2 files changed, 34 insertions(+), 10 deletions(-)

diff --git a/src/expression_diff.cpp b/src/expression_diff.cpp
index cc967743d..8c95ddc9f 100644
--- a/src/expression_diff.cpp
+++ b/src/expression_diff.cpp
@@ -1236,7 +1236,9 @@ auto diff_tensors_impl(const std::vector<expression> &v_ex, const std::vector<ex
         // of the previous-order derivatives.
         tmp_v_idx.first = 0;
         tmp_v_idx.second.clear();
-        tmp_v_idx.second.emplace_back(0, cur_order);
+        if (cur_order != 0u) {
+            tmp_v_idx.second.emplace_back(0, cur_order);
+        }
 
         const auto prev_begin = search_diff_map(tmp_v_idx);
         assert(prev_begin != diff_map.end());
@@ -1477,8 +1479,12 @@ std::uint32_t dtens::get_order() const
     // in the map (specifically, it is
     // the last element in the indices
     // vector of the last derivative).
-    assert(!(end() - 1)->first.second.empty());
-    return (end() - 1)->first.second.back().second;
+    const auto &sv = (end() - 1)->first.second;
+    if (sv.empty()) {
+        return 0;
+    } else {
+        return sv.back().second;
+    }
 }
 
 dtens::iterator dtens::find(const v_idx_t &vidx) const
@@ -1493,6 +1499,12 @@ dtens::iterator dtens::find(const v_idx_t &vidx) const
         return end();
     }
 
+    // The size of vidx must be consistent with the number
+    // of diff args.
+    if (vidx.size() - 1u != get_nvars()) {
+        return end();
+    }
+
     // Turn vidx into sparse format.
     detail::dtens_sv_idx_t s_vidx{vidx[0], {}};
     for (decltype(vidx.size()) i = 1; i < vidx.size(); ++i) {
@@ -1538,7 +1550,10 @@ dtens::subrange dtens::get_derivatives(std::uint32_t order) const
 
     // Create the indices vector corresponding to the first derivative
     // of component 0 for the given order in the map.
-    detail::dtens_sv_idx_t s_vidx{0, {{0, order}}};
+    detail::dtens_sv_idx_t s_vidx{0, {}};
+    if (order != 0u) {
+        s_vidx.second.emplace_back(0, order);
+    }
 
     // Locate the corresponding derivative in the map.
     // NOTE: this could be end() for invalid order.
@@ -1560,8 +1575,10 @@ dtens::subrange dtens::get_derivatives(std::uint32_t order) const
     // map is empty, and we handled this corner case earlier.
     assert(get_nouts() > 0u);
     s_vidx.first = get_nouts() - 1u;
-    assert(get_nvars() > 0u);
-    s_vidx.second[0].first = get_nvars() - 1u;
+    if (order != 0u) {
+        assert(get_nvars() > 0u);
+        s_vidx.second[0].first = get_nvars() - 1u;
+    }
 
     // NOTE: this could be end() for invalid order.
     auto e = p_impl->m_map.find(s_vidx);
@@ -1596,7 +1613,10 @@ dtens::subrange dtens::get_derivatives(std::uint32_t component, std::uint32_t or
 
     // Create the indices vector corresponding to the first derivative
     // for the given order and component in the map.
-    detail::dtens_sv_idx_t s_vidx{component, {{0, order}}};
+    detail::dtens_sv_idx_t s_vidx{component, {}};
+    if (order != 0u) {
+        s_vidx.second.emplace_back(0, order);
+    }
 
     // Locate the corresponding derivative in the map.
     // NOTE: this could be end() for invalid component/order.
@@ -1615,7 +1635,9 @@ dtens::subrange dtens::get_derivatives(std::uint32_t component, std::uint32_t or
     // Modify vidx so that it now refers to the last derivative
     // for the given order and component in the map.
     assert(get_nvars() > 0u);
-    s_vidx.second[0].first = get_nvars() - 1u;
+    if (order != 0u) {
+        s_vidx.second[0].first = get_nvars() - 1u;
+    }
 
     // NOTE: this could be end() for invalid component/order.
     auto e = p_impl->m_map.find(s_vidx);
@@ -1692,8 +1714,6 @@ std::uint32_t dtens::get_nvars() const
 
     if (p_impl->m_map.empty()) {
         assert(ret == 0u);
-    } else {
-        assert(!begin()->first.second.empty());
     }
 
 #endif
diff --git a/test/expression_diff_tensors.cpp b/test/expression_diff_tensors.cpp
index aa04923c4..05fcc4a13 100644
--- a/test/expression_diff_tensors.cpp
+++ b/test/expression_diff_tensors.cpp
@@ -384,6 +384,8 @@ TEST_CASE("dtens basics")
     REQUIRE(dt3.get_args() == dt2.get_args());
 }
 
+#if 0
+
 TEST_CASE("fixed centres check")
 {
     std::uniform_real_distribution<double> rdist(-10., 10.);
@@ -577,6 +579,8 @@ TEST_CASE("speelpenning check")
     }
 }
 
+#endif
+
 TEST_CASE("speelpenning complexity")
 {
     fmt::print("Speelpenning's example\n");

From 56e79e7c8307a0300cf28c50ba4a14761d0b7ccc Mon Sep 17 00:00:00 2001
From: Francesco Biscani <bluescarni@gmail.com>
Date: Sat, 11 Nov 2023 18:38:06 +0100
Subject: [PATCH 05/13] Small tweaks.

---
 src/expression_diff.cpp | 30 +++++++++++++++++++++++-------
 1 file changed, 23 insertions(+), 7 deletions(-)

diff --git a/src/expression_diff.cpp b/src/expression_diff.cpp
index 80d3cfa3e..8dcf36db8 100644
--- a/src/expression_diff.cpp
+++ b/src/expression_diff.cpp
@@ -473,8 +473,10 @@ using dtens_ss_idx_t = std::pair<std::uint32_t, fast_umap<std::uint32_t, std::ui
 // Helper to turn a dtens_sv_idx_t into a dtens_ss_idx_t.
 void vidx_v2s(dtens_ss_idx_t &output, const dtens_sv_idx_t &input)
 {
+    // Assign the component.
     output.first = input.first;
 
+    // Assign the derivative indices.
     output.second.clear();
     for (const auto &p : input.second) {
         [[maybe_unused]] auto [_, flag] = output.second.insert(p);
@@ -485,9 +487,10 @@ void vidx_v2s(dtens_ss_idx_t &output, const dtens_sv_idx_t &input)
 // Helper to turn a dtens_ss_idx_t into a dtens_sv_idx_t.
 dtens_sv_idx_t vidx_s2v(const dtens_ss_idx_t &input)
 {
-    dtens_sv_idx_t retval{input.first, {}};
+    // Init retval.
+    dtens_sv_idx_t retval{input.first, {input.second.begin(), input.second.end()}};
 
-    std::copy(input.second.begin(), input.second.end(), std::back_inserter(retval.second));
+    // Sort the derivative indices.
     std::sort(retval.second.begin(), retval.second.end(),
               [](const auto &p1, const auto &p2) { return p1.first < p2.first; });
 
@@ -1726,18 +1729,31 @@ std::uint32_t dtens::get_nvars() const
     return ret;
 }
 
+namespace detail
+{
+
+namespace
+{
+
+// The indices vector corresponding
+// to the first derivative of order 1
+// of the first component.
+// NOLINTNEXTLINE(cert-err58-cpp)
+const dtens_sv_idx_t s_vidx_001{0, {{0, 1}}};
+
+} // namespace
+
+} // namespace detail
+
 std::uint32_t dtens::get_nouts() const
 {
     if (p_impl->m_map.empty()) {
         return 0;
     }
 
-    // Construct the indices vector corresponding
+    // Try to find in the map the indices vector corresponding
     // to the first derivative of order 1 of the first component.
-    detail::dtens_sv_idx_t s_vidx{0, {{0, 1}}};
-
-    // Try to find it in the map.
-    const auto it = p_impl->m_map.find(s_vidx);
+    const auto it = p_impl->m_map.find(detail::s_vidx_001);
 
     // NOTE: the number of outputs is always representable by
     // std::uint32_t, otherwise we could not index the function

From d196d27630223ee31be6fd0df5bb3c07e5c2d05b Mon Sep 17 00:00:00 2001
From: Francesco Biscani <bluescarni@gmail.com>
Date: Sun, 12 Nov 2023 00:43:29 +0100
Subject: [PATCH 06/13] Small tweaks, internal docs.

---
 include/heyoka/expression.hpp |   8 +-
 src/expression_diff.cpp       | 174 ++++++++++++++++++++++++----------
 2 files changed, 129 insertions(+), 53 deletions(-)

diff --git a/include/heyoka/expression.hpp b/include/heyoka/expression.hpp
index e7989e951..047a7bac5 100644
--- a/include/heyoka/expression.hpp
+++ b/include/heyoka/expression.hpp
@@ -420,8 +420,10 @@ namespace detail
 {
 
 // Sparse structure used to index derivatives in dtens:
-// the first element of the pair is the function component index,
-// the second element the vector of derivative orders in sparse form.
+// - the first element of the pair is the function component index,
+// - the second element is the vector of variable index/diff order pairs,
+//   which is kept sorted according to the variable index, and in which no
+//   diff order can be zero and no variable index can appear twice.
 using dtens_sv_idx_t = std::pair<std::uint32_t, std::vector<std::pair<std::uint32_t, std::uint32_t>>>;
 
 struct dtens_sv_idx_cmp {
@@ -435,7 +437,9 @@ using dtens_map_t = boost::container::flat_map<dtens_sv_idx_t, expression, dtens
 class HEYOKA_DLL_PUBLIC dtens
 {
 public:
+    // Derivative indexing vector in dense form.
     using v_idx_t = std::vector<std::uint32_t>;
+
     using size_type = detail::dtens_map_t::size_type;
 
 private:
diff --git a/src/expression_diff.cpp b/src/expression_diff.cpp
index 8dcf36db8..38b2339ff 100644
--- a/src/expression_diff.cpp
+++ b/src/expression_diff.cpp
@@ -454,20 +454,9 @@ auto diff_make_adj_dep(const std::vector<expression> &dc, std::vector<expression
     return std::tuple{std::move(adj), std::move(dep), std::move(revdep), std::move(subs_map)};
 }
 
-// Hasher for a pair of indices.
-struct idx_pair_hasher {
-    std::size_t operator()(const std::pair<std::uint32_t, std::uint32_t> &p) const noexcept
-    {
-        std::size_t seed = std::hash<std::uint32_t>{}(p.first);
-        boost::hash_combine(seed, std::hash<std::uint32_t>{}(p.second));
-
-        return seed;
-    }
-};
-
 // This is an alternative version of dtens_sv_idx_t that uses a dictionary
-// for storing the derivative orders instead of a sorted vector. Using a dictionary
-// allows for faster/easier manipulation of the derivative orders.
+// for storing the index/order pairs instead of a sorted vector. Using a dictionary
+// allows for faster/easier manipulation.
 using dtens_ss_idx_t = std::pair<std::uint32_t, fast_umap<std::uint32_t, std::uint32_t>>;
 
 // Helper to turn a dtens_sv_idx_t into a dtens_ss_idx_t.
@@ -476,7 +465,7 @@ void vidx_v2s(dtens_ss_idx_t &output, const dtens_sv_idx_t &input)
     // Assign the component.
     output.first = input.first;
 
-    // Assign the derivative indices.
+    // Assign the index/order pairs.
     output.second.clear();
     for (const auto &p : input.second) {
         [[maybe_unused]] auto [_, flag] = output.second.insert(p);
@@ -490,7 +479,7 @@ dtens_sv_idx_t vidx_s2v(const dtens_ss_idx_t &input)
     // Init retval.
     dtens_sv_idx_t retval{input.first, {input.second.begin(), input.second.end()}};
 
-    // Sort the derivative indices.
+    // Sort the index/order pairs.
     std::sort(retval.second.begin(), retval.second.end(),
               [](const auto &p1, const auto &p2) { return p1.first < p2.first; });
 
@@ -505,12 +494,15 @@ struct diff_map_hasher {
         // Use as seed the component index.
         std::size_t seed = std::hash<std::uint32_t>{}(s.first);
 
-        // Compose via additions the hashes of the derivative orders.
+        // Compose via additions the hashes of the index/order pairs.
         // NOTE: it is important that we use here a commutative operation
-        // so that the final hash is independent of the order in which
-        // the derivative orders are stored in the dictionary.
+        // for the composition so that the final hash is independent of the order
+        // in which the pairs are stored in the dictionary.
         for (const auto &p : s.second) {
-            seed += idx_pair_hasher{}(p);
+            std::size_t p_hash = std::hash<std::uint32_t>{}(p.first);
+            boost::hash_combine(p_hash, std::hash<std::uint32_t>{}(p.second));
+
+            seed += p_hash;
         }
 
         return seed;
@@ -566,7 +558,7 @@ void diff_tensors_forward_impl(
     auto local_dmap = [&local_diff]() -> diff_map_t & { return std::get<diff_map_t>(local_diff); };
     auto local_dvec = [&local_diff]() -> diff_vec_t & { return std::get<diff_vec_t>(local_diff); };
 
-    // An indices vector used as temporary variable in several places below.
+    // This is used as a temporary variable in several places below.
     dtens_ss_idx_t tmp_v_idx;
 
     // These two containers will be used to store the list of subexpressions
@@ -809,7 +801,7 @@ void diff_tensors_reverse_impl(
     // Cache the number of diff arguments.
     const auto nargs = args.size();
 
-    // An indices vector used as temporary variable in several places below.
+    // This is used as a temporary variable in several places below.
     dtens_ss_idx_t tmp_v_idx;
 
     // These two containers will be used to store the list of subexpressions
@@ -1001,35 +993,33 @@ void diff_tensors_reverse_impl(
     }
 }
 
-} // namespace
-
-bool dtens_sv_idx_cmp::operator()(const dtens_sv_idx_t &v1, const dtens_sv_idx_t &v2) const
+// Utility function to check that a dtens_sv_idx_t is well-formed.
+void sv_sanity_check([[maybe_unused]] const dtens_sv_idx_t &v)
 {
-#if !defined(NDEBUG)
-
-    // Sanity checks on the inputs.
-
-    // Check indices sorting.
+    // Check sorting according to the derivative indices.
     auto cmp = [](const auto &p1, const auto &p2) { return p1.first < p2.first; };
-    assert(std::is_sorted(v1.second.begin(), v1.second.end(), cmp));
-    assert(std::is_sorted(v2.second.begin(), v2.second.end(), cmp));
+    assert(std::is_sorted(v.second.begin(), v.second.end(), cmp));
 
-    // Check no duplicate indices.
+    // Check no duplicate derivative indices.
     auto no_dup = [](const auto &p1, const auto &p2) { return p1.first == p2.first; };
-    assert(std::adjacent_find(v1.second.begin(), v1.second.end(), no_dup) == v1.second.end());
-    assert(std::adjacent_find(v2.second.begin(), v2.second.end(), no_dup) == v2.second.end());
+    assert(std::adjacent_find(v.second.begin(), v.second.end(), no_dup) == v.second.end());
 
-    // Check no zero diff orders.
+    // Check no zero derivative orders.
     auto nz_order = [](const auto &p) { return p.second != 0u; };
-    assert(std::all_of(v1.second.begin(), v1.second.end(), nz_order));
-    assert(std::all_of(v2.second.begin(), v2.second.end(), nz_order));
+    assert(std::all_of(v.second.begin(), v.second.end(), nz_order));
+}
 
-#endif
+// Implementation of dtens_sv_idx_cmp::operator().
+bool dtens_sv_idx_cmp_impl(const dtens_sv_idx_t &v1, const dtens_sv_idx_t &v2)
+{
+    // Sanity checks on the inputs.
+    sv_sanity_check(v1);
+    sv_sanity_check(v2);
 
-    // Compute the total derivative order for both vectors.
+    // Compute the total derivative order for both v1 and v2.
     // NOTE: here we have to use safe_numerics because this comparison operator
-    // might end up being invoked on a user-supplied index vector, whose total degree
-    // may overflow. The indices vector in dtens, by contrast, are guaranteed to never
+    // might end up being invoked on a user-supplied dtens_sv_idx_t, whose total degree
+    // may overflow. The dtens_sv_idx_t in dtens, by contrast, are guaranteed to never
     // overflow when computing the total degree.
     using su32 = boost::safe_numerics::safe<std::uint32_t>;
 
@@ -1099,6 +1089,87 @@ bool dtens_sv_idx_cmp::operator()(const dtens_sv_idx_t &v1, const dtens_sv_idx_t
     return true;
 }
 
+#if !defined(NDEBUG)
+
+// Same comparison as the previous function, but in dense format.
+// Used only for debug.
+bool dtens_v_idx_cmp_impl(const dtens::v_idx_t &v1, const dtens::v_idx_t &v2)
+{
+    assert(v1.size() == v2.size());
+    assert(!v1.empty());
+
+    // Compute the total derivative order for both
+    // vectors.
+    boost::safe_numerics::safe<std::uint32_t> deg1 = 0, deg2 = 0;
+    const auto size = v1.size();
+    for (decltype(v1.size()) i = 1; i < size; ++i) {
+        deg1 += v1[i];
+        deg2 += v2[i];
+    }
+
+    if (deg1 < deg2) {
+        return true;
+    }
+
+    if (deg1 > deg2) {
+        return false;
+    }
+
+    // The total derivative order is the same, look at
+    // the component index next.
+    if (v1[0] < v2[0]) {
+        return true;
+    }
+
+    if (v1[0] > v2[0]) {
+        return false;
+    }
+
+    // Component and total derivative order are the same,
+    // resort to reverse lexicographical compare on the
+    // derivative orders.
+    return std::lexicographical_compare(v1.begin() + 1, v1.end(), v2.begin() + 1, v2.end(), std::greater{});
+}
+
+#endif
+
+} // namespace
+
+bool dtens_sv_idx_cmp::operator()(const dtens_sv_idx_t &v1, const dtens_sv_idx_t &v2) const
+{
+    auto ret = dtens_sv_idx_cmp_impl(v1, v2);
+
+#if !defined(NDEBUG)
+
+    // Convert to dense and re-run the same comparison.
+    auto to_dense = [](const dtens_sv_idx_t &v) {
+        dtens::v_idx_t dv{v.first};
+
+        std::uint32_t cur_d_idx = 0;
+        for (auto it = v.second.begin(); it != v.second.end(); ++cur_d_idx) {
+            if (cur_d_idx == it->first) {
+                dv.push_back(it->second);
+                ++it;
+            } else {
+                dv.push_back(0);
+            }
+        }
+
+        return dv;
+    };
+
+    auto dv1 = to_dense(v1);
+    auto dv2 = to_dense(v2);
+    dv1.resize(std::max(dv1.size(), dv2.size()));
+    dv2.resize(std::max(dv1.size(), dv2.size()));
+
+    assert(ret == dtens_v_idx_cmp_impl(dv1, dv2));
+
+#endif
+
+    return ret;
+}
+
 } // namespace detail
 
 // NOLINTNEXTLINE(bugprone-exception-escape)
@@ -1198,17 +1269,12 @@ auto diff_tensors_impl(const std::vector<expression> &v_ex, const std::vector<ex
     // in the dtens API get_nvars() can safely return std::uint32_t.
     (void)(boost::numeric_cast<std::uint32_t>(nargs));
 
-    // Map to associate a vector of indices to a derivative.
-    // The first index in each vector is the component index,
-    // the rest of the indices are the derivative orders for
-    // each diff args. E.g., with diff args = [x, y, z],
-    // then [0, 1, 2, 1] means d4f0/(dx dy**2 dz) (where f0 is the
-    // first component of the vector function f).
+    // Map to associate a dtens_sv_idx_t to a derivative.
     // This will be kept manually sorted according to dtens_v_idx_cmp
     // and it will be turned into a flat map at the end.
     dtens_map_t::sequence_type diff_map;
 
-    // Helper to locate the vector of indices v in diff_map. If not present,
+    // Helper to locate a dtens_sv_idx_t in diff_map. If not present,
     // diff_map.end() will be returned.
     auto search_diff_map = [&diff_map](const dtens_sv_idx_t &v) {
         auto it = std::lower_bound(diff_map.begin(), diff_map.end(), v, [](const auto &item, const auto &vec) {
@@ -1222,7 +1288,7 @@ auto diff_tensors_impl(const std::vector<expression> &v_ex, const std::vector<ex
         }
     };
 
-    // An indices vector used as temporary variable in several places below.
+    // This is used as a temporary variable in several places below.
     dtens_sv_idx_t tmp_v_idx;
 
     // Vector that will store the previous-order derivatives in the loop below.
@@ -1340,7 +1406,7 @@ auto diff_tensors_impl(const std::vector<expression> &v_ex, const std::vector<ex
     // Check sorting.
     assert(std::is_sorted(retval.begin(), retval.end(),
                           [](const auto &p1, const auto &p2) { return dtens_sv_idx_cmp{}(p1.first, p2.first); }));
-    // Check the number of elements in the indices vectors.
+    // Check the variable indices.
     assert(std::all_of(retval.begin(), retval.end(), [&nargs](const auto &p) {
         return p.first.second.empty() || p.first.second.back().first < nargs;
     }));
@@ -1478,6 +1544,7 @@ dtens::iterator dtens::end() const
 
 std::uint32_t dtens::get_order() const
 {
+    // First we handle the empty case.
     if (p_impl->m_map.empty()) {
         return 0;
     }
@@ -1489,6 +1556,11 @@ std::uint32_t dtens::get_order() const
     // vector of the last derivative).
     const auto &sv = (end() - 1)->first.second;
     if (sv.empty()) {
+        // NOTE: an empty index/order vector
+        // at the end means that the maximum
+        // diff order is zero and that we are
+        // only storing the original function
+        // components in the dtens object.
         return 0;
     } else {
         return sv.back().second;
@@ -1577,7 +1649,7 @@ dtens::subrange dtens::get_derivatives(std::uint32_t order) const
 
 #endif
 
-    // Modify vidx so that it now refers to the last derivative
+    // Modify s_vidx so that it now refers to the last derivative
     // for the last component at the given order in the map.
     // NOTE: get_nouts() can return zero only if the internal
     // map is empty, and we handled this corner case earlier.

From c5572e23459f38416362b941994ffa56ba6b4e5b Mon Sep 17 00:00:00 2001
From: Francesco Biscani <bluescarni@gmail.com>
Date: Sun, 12 Nov 2023 01:57:30 +0100
Subject: [PATCH 07/13] Fix and re-enable tests.

---
 test/expression_diff_tensors.cpp | 23 ++++++-----------------
 1 file changed, 6 insertions(+), 17 deletions(-)

diff --git a/test/expression_diff_tensors.cpp b/test/expression_diff_tensors.cpp
index 05fcc4a13..0cba38a63 100644
--- a/test/expression_diff_tensors.cpp
+++ b/test/expression_diff_tensors.cpp
@@ -384,8 +384,6 @@ TEST_CASE("dtens basics")
     REQUIRE(dt3.get_args() == dt2.get_args());
 }
 
-#if 0
-
 TEST_CASE("fixed centres check")
 {
     std::uniform_real_distribution<double> rdist(-10., 10.);
@@ -454,17 +452,13 @@ TEST_CASE("fixed centres check")
 
             const auto &v_idx = sr_it->first;
 
-            REQUIRE(v_idx.size() == diff_vars.size() + 1u);
-
             // Build the current derivative via repeated
             // invocations of diff().
-            auto ex = v_idx[0] == 0u ? fc_energy : fc_pot;
-
-            for (auto j = 1u; j < v_idx.size(); ++j) {
-                auto order = v_idx[j];
+            auto ex = v_idx.first == 0u ? fc_energy : fc_pot;
 
+            for (auto [var_idx, order] : v_idx.second) {
                 for (decltype(order) k = 0; k < order; ++k) {
-                    ex = diff(ex, diff_vars[j - 1u]);
+                    ex = diff(ex, diff_vars[var_idx]);
                 }
             }
 
@@ -545,18 +539,15 @@ TEST_CASE("speelpenning check")
 
             const auto &v_idx = sr_it->first;
 
-            REQUIRE(v_idx.size() == nvars + 1u);
-            REQUIRE(v_idx[0] == 0u);
+            REQUIRE(v_idx.first == 0u);
 
             // Build the current derivative via repeated
             // invocations of diff().
             auto ex = prod;
 
-            for (auto j = 1u; j < v_idx.size(); ++j) {
-                auto order = v_idx[j];
-
+            for (auto [var_idx, order] : v_idx.second) {
                 for (decltype(order) k = 0; k < order; ++k) {
-                    ex = diff(ex, vars[j - 1u]);
+                    ex = diff(ex, vars[var_idx]);
                 }
             }
 
@@ -579,8 +570,6 @@ TEST_CASE("speelpenning check")
     }
 }
 
-#endif
-
 TEST_CASE("speelpenning complexity")
 {
     fmt::print("Speelpenning's example\n");

From 19122f66cd8e1fca7f1f01d89737266ed78b38eb Mon Sep 17 00:00:00 2001
From: Francesco Biscani <bluescarni@gmail.com>
Date: Sun, 12 Nov 2023 11:42:43 +0100
Subject: [PATCH 08/13] Small coverage fixes.

---
 src/expression_diff.cpp | 6 ++++--
 1 file changed, 4 insertions(+), 2 deletions(-)

diff --git a/src/expression_diff.cpp b/src/expression_diff.cpp
index 38b2339ff..2ed5992e0 100644
--- a/src/expression_diff.cpp
+++ b/src/expression_diff.cpp
@@ -484,7 +484,7 @@ dtens_sv_idx_t vidx_s2v(const dtens_ss_idx_t &input)
               [](const auto &p1, const auto &p2) { return p1.first < p2.first; });
 
     return retval;
-}
+} // LCOV_EXCL_LINE
 
 // Hasher for the local maps of derivatives used in the
 // forward/reverse mode implementations.
@@ -987,9 +987,11 @@ void diff_tensors_reverse_impl(
             diff_map.emplace_back(vidx_s2v(p.first), std::move(p.second));
         }
     } else {
+        // LCOV_EXCL_START
         for (auto &p : local_dmap()) {
             diff_map.emplace_back(vidx_s2v(p.first), std::move(p.second));
         }
+        // LCOV_EXCL_STOP
     }
 }
 
@@ -1156,7 +1158,7 @@ bool dtens_sv_idx_cmp::operator()(const dtens_sv_idx_t &v1, const dtens_sv_idx_t
         }
 
         return dv;
-    };
+    }; // LCOV_EXCL_LINE
 
     auto dv1 = to_dense(v1);
     auto dv2 = to_dense(v2);

From 840ba1eb27be770f6b0ca0c64444672900d5c062 Mon Sep 17 00:00:00 2001
From: Francesco Biscani <bluescarni@gmail.com>
Date: Sun, 12 Nov 2023 11:52:33 +0100
Subject: [PATCH 09/13] Bump up s11n version number for dtens.

---
 include/heyoka/expression.hpp | 5 +++++
 src/expression_diff.cpp       | 9 ++++++++-
 2 files changed, 13 insertions(+), 1 deletion(-)

diff --git a/include/heyoka/expression.hpp b/include/heyoka/expression.hpp
index 047a7bac5..bf05a518c 100644
--- a/include/heyoka/expression.hpp
+++ b/include/heyoka/expression.hpp
@@ -515,6 +515,11 @@ HEYOKA_DLL_PUBLIC std::ostream &operator<<(std::ostream &, const dtens &);
 
 HEYOKA_END_NAMESPACE
 
+// Version changelog:
+// - version 1: switched from dense to sparse
+//   format for the indices vectors.
+BOOST_CLASS_VERSION(heyoka::dtens::impl, 1)
+
 // fmt formatter for dtens, implemented
 // on top of the streaming operator.
 namespace fmt
diff --git a/src/expression_diff.cpp b/src/expression_diff.cpp
index 2ed5992e0..e4ede9216 100644
--- a/src/expression_diff.cpp
+++ b/src/expression_diff.cpp
@@ -1202,8 +1202,15 @@ struct dtens::impl {
     // NOTE: as usual, we assume here that the archive contains
     // a correctly-serialised instance. In particular, we are assuming
     // that the elements in ar are sorted correctly.
-    void load(boost::archive::binary_iarchive &ar, unsigned)
+    void load(boost::archive::binary_iarchive &ar, unsigned version)
     {
+        // LCOV_EXCL_START
+        if (version < static_cast<unsigned>(boost::serialization::version<dtens>::type::value)) {
+            throw std::invalid_argument(
+                fmt::format("Unable to load a dtens object: the archive version ({}) is too old", version));
+        }
+        // LCOV_EXCL_STOP
+
         try {
             // Reset m_map.
             m_map.clear();

From 109743adb258226c4e0a674b6784b6aa68c03a61 Mon Sep 17 00:00:00 2001
From: Francesco Biscani <bluescarni@gmail.com>
Date: Sun, 12 Nov 2023 11:56:57 +0100
Subject: [PATCH 10/13] Small addition.

---
 include/heyoka/expression.hpp | 3 +++
 1 file changed, 3 insertions(+)

diff --git a/include/heyoka/expression.hpp b/include/heyoka/expression.hpp
index bf05a518c..f7113e225 100644
--- a/include/heyoka/expression.hpp
+++ b/include/heyoka/expression.hpp
@@ -440,6 +440,9 @@ class HEYOKA_DLL_PUBLIC dtens
     // Derivative indexing vector in dense form.
     using v_idx_t = std::vector<std::uint32_t>;
 
+    // Derivative indexing vector in sparse form.
+    using sv_idx_t = detail::dtens_sv_idx_t;
+
     using size_type = detail::dtens_map_t::size_type;
 
 private:

From da1288d0b84b8bc4b119af614199a3fd1da7ad76 Mon Sep 17 00:00:00 2001
From: Francesco Biscani <bluescarni@gmail.com>
Date: Sun, 12 Nov 2023 12:01:21 +0100
Subject: [PATCH 11/13] Add another test in the hope of triggering more corner
 cases.

---
 test/expression_diff_tensors.cpp | 17 +++++++++++++++++
 1 file changed, 17 insertions(+)

diff --git a/test/expression_diff_tensors.cpp b/test/expression_diff_tensors.cpp
index 0cba38a63..4cb91140c 100644
--- a/test/expression_diff_tensors.cpp
+++ b/test/expression_diff_tensors.cpp
@@ -25,6 +25,8 @@
 #include <heyoka/llvm_state.hpp>
 #include <heyoka/math/prod.hpp>
 #include <heyoka/math/sum.hpp>
+#include <heyoka/math/tanh.hpp>
+#include <heyoka/model/ffnn.hpp>
 #include <heyoka/model/fixed_centres.hpp>
 #include <heyoka/s11n.hpp>
 
@@ -700,3 +702,18 @@ TEST_CASE("jacobian")
         REQUIRE(jac == std::vector{1_dbl, 1_dbl, 1_dbl, -1_dbl, -1_dbl, -1_dbl});
     }
 }
+
+// A test on a neural network. No REQUIREs,
+// for this test we rely on the internal assertions
+// in debug mode.
+TEST_CASE("nn test")
+{
+    const auto nn_layer = 50u;
+
+    auto [x, y] = make_vars("x", "y");
+    auto ffnn = model::ffnn(kw::inputs = {x, y}, kw::nn_hidden = {nn_layer, nn_layer, nn_layer}, kw::n_out = 2,
+                            kw::activations = {heyoka::tanh, heyoka::tanh, heyoka::tanh, heyoka::tanh});
+
+    auto dt = diff_tensors(ffnn, kw::diff_args = diff_args::params);
+    auto dNdtheta = dt.get_jacobian();
+}

From 60399488667db540bb3d33e72ca52837a7e7c32a Mon Sep 17 00:00:00 2001
From: Francesco Biscani <bluescarni@gmail.com>
Date: Sun, 12 Nov 2023 12:05:39 +0100
Subject: [PATCH 12/13] Update changelog.

---
 doc/changelog.rst | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/doc/changelog.rst b/doc/changelog.rst
index 9a8e42762..b4a33aaba 100644
--- a/doc/changelog.rst
+++ b/doc/changelog.rst
@@ -23,7 +23,8 @@ Changes
 
 - Substantial speedups in the computation of first-order derivatives
   with respect to many variables/parameters
-  (`#358 <https://github.com/bluescarni/heyoka/pull/358>`__).
+  (`#360 <https://github.com/bluescarni/heyoka/pull/360>`__,
+  `#358 <https://github.com/bluescarni/heyoka/pull/358>`__).
 - Substantial performance improvements in the computation of
   derivative tensors of large expressions with a high degree
   of internal redundancy

From f164b7e344d372e2586b8df360026f52e121c5ce Mon Sep 17 00:00:00 2001
From: Francesco Biscani <bluescarni@gmail.com>
Date: Sun, 12 Nov 2023 14:32:09 +0100
Subject: [PATCH 13/13] Simplification in the comparison operator.

---
 src/expression_diff.cpp | 24 ++++++++++++------------
 1 file changed, 12 insertions(+), 12 deletions(-)

diff --git a/src/expression_diff.cpp b/src/expression_diff.cpp
index e4ede9216..138ca369d 100644
--- a/src/expression_diff.cpp
+++ b/src/expression_diff.cpp
@@ -1077,18 +1077,18 @@ bool dtens_sv_idx_cmp_impl(const dtens_sv_idx_t &v1, const dtens_sv_idx_t &v2)
         assert(std::equal(v1.second.begin(), it1 + 1, v2.second.begin()));
     }
 
-    if (it1 == end1 && it2 == end2) {
-        assert(v1.second == v2.second);
-        return false;
-    }
-
-    if (it1 == end1) {
-        return false;
-    }
-
-    assert(it2 == end2);
-
-    return true;
+    // NOTE: if we end up here, it means that:
+    // - component and diff order are the same, and
+    // - the two index/order lists share a common
+    //   (possibly empty) initial sequence.
+    // It follows then that both it1 and it2 must be
+    // end iterators, because if either index/order list
+    // had additional terms, then the diff orders
+    // could not possibly be equal.
+    assert(it1 == end1 && it2 == end2);
+    assert(v1.second == v2.second);
+
+    return false;
 }
 
 #if !defined(NDEBUG)