From 57f201728c306b1e93e4ed3ca7a02d12c6c4b399 Mon Sep 17 00:00:00 2001 From: Rupert Nash Date: Thu, 2 May 2024 21:06:43 +0100 Subject: [PATCH] Major simplification of geometry reading and optimisation I could not understand how the OptimisedDecomposition class worked while trying to track down a bug. This led to a complete re-write of how the distributed graph optimised by ParMETIS was constructed and how the "moves" (i.e. which vertices need to go to which process) were calculated. This has been rewritten in a much simpler and slightly faster way, using the net::SparseExchange class. The GeometryReader is also much simpler and leans more on the octree to avoid unnecessary work. The IO pattern is much simpler also now: the GMY block data is now collectively streamed onto node "leader" processes and then individual MPI processes copy out the parts they care about. --- CMake/HemeLbOptions.cmake | 2 - Code/CMakeLists.txt | 1 - Code/geometry/Domain.cc | 392 +++-- Code/geometry/GeometryBlock.h | 6 +- Code/geometry/GeometryReader.cc | 906 +++++----- Code/geometry/GeometryReader.h | 166 +- Code/geometry/GmyReadResult.cc | 21 +- Code/geometry/GmyReadResult.h | 63 + Code/geometry/LookupTree.h | 4 + .../decomposition/BasicDecomposition.cc | 6 +- .../decomposition/OptimisedDecomposition.cc | 1548 +++++------ .../decomposition/OptimisedDecomposition.h | 173 +- Code/reporting/BuildInfo.cc | 3 +- Code/resources/report.txt.ctp | 1 - Code/resources/report.xml.ctp | 3 +- Code/tests/geometry/LookupTreeTests.cc | 6 +- .../redblood/parallel/GraphCommsTests.cc | 2 +- .../parallel/MPIIntegrateVelocities.cc | 4 +- .../redblood/parallel/MPILockStepTests.cc | 4 +- .../parallel/MPIParallelIntegrationTests.cc | 4 +- .../redblood/parallel/MPISpreadForcesTests.cc | 4 +- .../redblood/parallel/ParallelFixtureTests.cc | 5 +- 22 files changed, 1320 insertions(+), 2004 deletions(-) diff --git a/CMake/HemeLbOptions.cmake b/CMake/HemeLbOptions.cmake index 5ecc6da06..a793c2a13 100644 --- 
a/CMake/HemeLbOptions.cmake +++ b/CMake/HemeLbOptions.cmake @@ -47,8 +47,6 @@ endif() # pass_cachevar(HEMELB HEMELB_EXECUTABLE "hemelb" STRING "File name of executable to produce") -pass_cachevar(HEMELB HEMELB_READING_GROUP_SIZE 5 - STRING "Number of cores to use to read geometry file.") pass_cachevar_choice(HEMELB HEMELB_LOG_LEVEL Info STRING "Log level" Critical Error Warning Info Debug Trace) diff --git a/Code/CMakeLists.txt b/Code/CMakeLists.txt index 9fed5225b..d1ec98a37 100644 --- a/Code/CMakeLists.txt +++ b/Code/CMakeLists.txt @@ -28,7 +28,6 @@ if (HEMELB_USE_ALL_WARNINGS_GNU) endif() add_definitions(-DHEMELB_CODE) -add_definitions(-DHEMELB_READING_GROUP_SIZE=${HEMELB_READING_GROUP_SIZE}) add_definitions(-DHEMELB_COMPUTE_ARCHITECTURE=${HEMELB_COMPUTE_ARCHITECTURE}) if(HEMELB_VALIDATE_GEOMETRY) diff --git a/Code/geometry/Domain.cc b/Code/geometry/Domain.cc index 868996a58..bc7f36c19 100644 --- a/Code/geometry/Domain.cc +++ b/Code/geometry/Domain.cc @@ -66,206 +66,224 @@ namespace hemelb::geometry blockCount = blockCounts.x() * blockCounts.y() * blockCounts.z(); } - void Domain::ProcessReadSites(const GmyReadResult & readResult) - { - log::Logger::Log("Processing sites assigned to each MPI process"); - const auto max_site_index = sites - util::Vector3D::Ones(); - blocks.resize(rank_for_site_store->GetBlockCount()); - - totalSharedFs = 0; - - std::vector domainEdgeSiteData[COLLISION_TYPES]; - std::vector midDomainSiteData[COLLISION_TYPES]; - std::vector domainEdgeBlockNumber[COLLISION_TYPES]; - std::vector midDomainBlockNumber[COLLISION_TYPES]; - std::vector domainEdgeSiteNumber[COLLISION_TYPES]; - std::vector midDomainSiteNumber[COLLISION_TYPES]; - std::vector > domainEdgeWallNormals[COLLISION_TYPES]; - std::vector > midDomainWallNormals[COLLISION_TYPES]; - std::vector domainEdgeWallDistance[COLLISION_TYPES]; - std::vector midDomainWallDistance[COLLISION_TYPES]; - - proc_t localRank = comms.Rank(); - - // Iterate over all blocks in site units - for (auto leaf: 
rank_for_site_store->GetTree().IterLeaves()) - { - auto block_ijk = leaf.coords(); - site_t blockGmyIdx = GetBlockGmyIdxFromBlockCoords(block_ijk); - auto const& blockReadIn = readResult.Blocks[blockGmyIdx]; - - if (blockReadIn.Sites.empty()) + void Domain::ProcessReadSites(const GmyReadResult & readResult) + { + log::Logger::Log("Processing sites assigned to each MPI process"); + const auto max_site_index = sites - util::Vector3D::Ones(); + blocks.resize(rank_for_site_store->GetBlockCount()); + + totalSharedFs = 0; + + std::vector domainEdgeSiteData[COLLISION_TYPES]; + std::vector midDomainSiteData[COLLISION_TYPES]; + std::vector domainEdgeBlockNumber[COLLISION_TYPES]; + std::vector midDomainBlockNumber[COLLISION_TYPES]; + std::vector domainEdgeSiteNumber[COLLISION_TYPES]; + std::vector midDomainSiteNumber[COLLISION_TYPES]; + std::vector > domainEdgeWallNormals[COLLISION_TYPES]; + std::vector > midDomainWallNormals[COLLISION_TYPES]; + std::vector domainEdgeWallDistance[COLLISION_TYPES]; + std::vector midDomainWallDistance[COLLISION_TYPES]; + + proc_t localRank = comms.Rank(); + + // We need to know the coordinates of all the domain edge + // sites below so we can loop over them to figure out which + // processes they live on once the RMA data structures are + // initialised. + std::vector> edge_sites; + + // This closure returns which rank a site (given by global + // 3D coordinate) lives on, according to the + // GmyReadResult. Since that is only guaranteed to know + // the rank of sites which are assigned to this rank, it + // may well return UNKNOWN_PROCESS + auto read_result_rank_for_site = [this, &max_site_index, &readResult]( + util::Vector3D const& global_site_coords, + Vec16& block_coords, site_t& site_gmy_idx + ) { + // If outside the bounding box it is definitely non-fluid + if (!global_site_coords.IsInRange( + util::Vector3D::Zero(), + max_site_index + )) + return SITE_OR_BLOCK_SOLID; + + // ... (that is actually being simulated and not a solid)... 
+ block_coords = (global_site_coords / readResult.GetBlockSize()).as(); + site_t block_gmy_idx = readResult.GetBlockIdFromBlockCoordinates(block_coords.x(), + block_coords.y(), + block_coords.z()); + + // Move on if the neighbour is in a block of solids + // in which case the block will contain zero sites + // Or on if the neighbour site is solid + // in which case the targetProcessor is SITE_OR_BLOCK_SOLID + // Or the neighbour is also on this processor + // in which case the targetProcessor is localRank + if (readResult.Blocks[block_gmy_idx].Sites.empty()) + return SITE_OR_BLOCK_SOLID; + + auto site_local_coords = global_site_coords % readResult.GetBlockSize(); + site_gmy_idx = readResult.GetSiteIdFromSiteCoordinates(site_local_coords.x(), + site_local_coords.y(), + site_local_coords.z()); + return readResult.Blocks[block_gmy_idx].Sites[site_gmy_idx].targetProcessor; + }; + + for (auto leaf: rank_for_site_store->GetTree().IterLeaves()) { + auto block_ijk = leaf.coords(); + site_t blockGmyIdx = GetBlockGmyIdxFromBlockCoords(block_ijk); + auto const& blockReadIn = readResult.Blocks[blockGmyIdx]; + + if (blockReadIn.Sites.empty()) + continue; + + auto blockOctIdx = leaf.index(); + auto lowest_site_in_block = block_ijk.as() * GetBlockSize(); + + // Iterate over all sites within the current block. + for ( + auto siteTraverser = SiteTraverser(*this); + siteTraverser.CurrentLocationValid(); + siteTraverser.TraverseOne() + ) { + site_t localSiteId = siteTraverser.GetCurrentIndex(); + // Create block if required + if (blocks[blockOctIdx].IsEmpty()) + blocks[blockOctIdx] = Block(GetSitesPerBlockVolumeUnit()); + + auto assignedRank = blockReadIn.Sites[localSiteId].targetProcessor; + blocks[blockOctIdx].SetProcessorRankForSite(localSiteId, assignedRank); + + // If the site is not on this processor, continue. 
+ if (assignedRank != localRank) continue; - auto blockOctIdx = leaf.index(); - auto lowest_site_in_block = block_ijk.as() * GetBlockSize(); - - // Iterate over all sites within the current block. - for (auto siteTraverser = SiteTraverser(*this); - siteTraverser.CurrentLocationValid(); siteTraverser.TraverseOne()) - { - site_t localSiteId = siteTraverser.GetCurrentIndex(); - // Create block if required - if (blocks[blockOctIdx].IsEmpty()) - blocks[blockOctIdx] = Block(GetSitesPerBlockVolumeUnit()); - - auto assignedRank = blockReadIn.Sites[localSiteId].targetProcessor; - blocks[blockOctIdx].SetProcessorRankForSite(localSiteId, assignedRank); - - // If the site is not on this processor, continue. - if (assignedRank != localRank) + bool isMidDomainSite = true; + util::Vector3D siteGlobalCoords = lowest_site_in_block + siteTraverser.GetCurrentLocation(); + // Iterate over all non-zero direction vectors. + for (unsigned int l = 1; l < latticeInfo.GetNumVectors(); l++) { + // Find the neighbour site co-ords in this direction. + util::Vector3D neighbourGlobalCoords = siteGlobalCoords + + latticeInfo.GetVector(l).as(); + Vec16 neighbourBlock; + site_t neighbourSiteId; + site_t neighbourProc = read_result_rank_for_site( + neighbourGlobalCoords, neighbourBlock, neighbourSiteId + ); + if (neighbourProc == SITE_OR_BLOCK_SOLID || localRank == neighbourProc) continue; + isMidDomainSite = false; + totalSharedFs++; + } - bool isMidDomainSite = true; - // Iterate over all non-zero direction vectors. - for (unsigned int l = 1; l < latticeInfo.GetNumVectors(); l++) - { - // Find the neighbour site co-ords in this direction. - util::Vector3D neighbourGlobalCoords = - lowest_site_in_block - + siteTraverser.GetCurrentLocation() - + latticeInfo.GetVector(l).as(); - - // If outside the bounding box it is definitely non-fluid - if (!neighbourGlobalCoords.IsInRange( - util::Vector3D::Zero(), - max_site_index - )) - continue; - - - // ... 
(that is actually being simulated and not a solid)... - auto neighbourBlock = neighbourGlobalCoords / readResult.GetBlockSize(); - auto neighbourSite = neighbourGlobalCoords % readResult.GetBlockSize(); - site_t neighBlockGmyIdx = readResult.GetBlockIdFromBlockCoordinates(neighbourBlock.x(), - neighbourBlock.y(), - neighbourBlock.z()); - - // Move on if the neighbour is in a block of solids - // in which case the block will contain zero sites - // Or on if the neighbour site is solid - // in which case the targetProcessor is SITE_OR_BLOCK_SOLID - // Or the neighbour is also on this processor - // in which case the targetProcessor is localRank - if (readResult.Blocks[neighBlockGmyIdx].Sites.empty()) - continue; - - site_t neighbourSiteId = readResult.GetSiteIdFromSiteCoordinates(neighbourSite.x(), - neighbourSite.y(), - neighbourSite.z()); - - proc_t neighbourProc = readResult.Blocks[neighBlockGmyIdx].Sites[neighbourSiteId].targetProcessor; - if (neighbourProc == SITE_OR_BLOCK_SOLID || localRank == neighbourProc) - continue; + if (!isMidDomainSite) + edge_sites.push_back(siteGlobalCoords); + + // Set the collision type data. map_block site data is renumbered according to + // fluid site numbers within a particular collision type. + SiteData siteData(blockReadIn.Sites[localSiteId]); + int l = -1; + switch (siteData.GetCollisionType()) { + case FLUID: + l = 0; + break; + case WALL: + l = 1; + break; + case INLET: + l = 2; + break; + case OUTLET: + l = 3; + break; + case (INLET | WALL): + l = 4; + break; + case (OUTLET | WALL): + l = 5; + break; + } - isMidDomainSite = false; - totalSharedFs++; - - // Iterate over neighbouring processors until we find the one with the - // neighbouring site on it. 
- auto neighProcWithSite = std::find_if( - neighbouringProcs.begin(), neighbouringProcs.end(), - [&](NeighbouringProcessor const& np) { - return np.Rank == neighbourProc; - }); - if (neighProcWithSite == neighbouringProcs.end()) { - // We didn't find a neighbour-proc with the - // neighbour-site on it, so we need a new - // neighbouring processor. - - // Store rank of neighbour in >neigh_proc[neigh_procs] - NeighbouringProcessor lNewNeighbour; - lNewNeighbour.SharedDistributionCount = 1; - lNewNeighbour.Rank = neighbourProc; - neighbouringProcs.push_back(lNewNeighbour); - - // if debugging then output decisions with reasoning for all neighbour processors - log::Logger::Log("domain_type: added %i as neighbour for %i because site %i in block %i is neighbour to site %i in block %i in direction (%i,%i,%i)\n", - (int) neighbourProc, - (int) localRank, - (int) neighbourSiteId, - (int) neighBlockGmyIdx, - (int) localSiteId, - (int) blockGmyIdx, - latticeInfo.GetVector(l).x(), - latticeInfo.GetVector(l).y(), - latticeInfo.GetVector(l).z()); - } else { - // Did find it, increment the shared count - ++neighProcWithSite->SharedDistributionCount; - } + const util::Vector3D& normal = blockReadIn.Sites[localSiteId].wallNormalAvailable ? + blockReadIn.Sites[localSiteId].wallNormal : + util::Vector3D(NO_VALUE); + + if (isMidDomainSite) { + midDomainBlockNumber[l].push_back(blockOctIdx); + midDomainSiteNumber[l].push_back(localSiteId); + midDomainSiteData[l].push_back(siteData); + midDomainWallNormals[l].push_back(normal); + for (Direction direction = 1; direction < latticeInfo.GetNumVectors(); direction++) { + midDomainWallDistance[l].push_back(blockReadIn.Sites[localSiteId].links[direction - 1].distanceToIntersection); } - - // Set the collision type data. map_block site data is renumbered according to - // fluid site numbers within a particular collision type. 
- SiteData siteData(blockReadIn.Sites[localSiteId]); - int l = -1; - switch (siteData.GetCollisionType()) - { - case FLUID: - l = 0; - break; - case WALL: - l = 1; - break; - case INLET: - l = 2; - break; - case OUTLET: - l = 3; - break; - case (INLET | WALL): - l = 4; - break; - case (OUTLET | WALL): - l = 5; - break; + } else { + domainEdgeBlockNumber[l].push_back(blockOctIdx); + domainEdgeSiteNumber[l].push_back(localSiteId); + domainEdgeSiteData[l].push_back(siteData); + domainEdgeWallNormals[l].push_back(normal); + for (Direction direction = 1; direction < latticeInfo.GetNumVectors(); direction++) { + domainEdgeWallDistance[l].push_back(blockReadIn.Sites[localSiteId].links[direction - 1].distanceToIntersection); } + } + } + } - const util::Vector3D& normal = blockReadIn.Sites[localSiteId].wallNormalAvailable ? - blockReadIn.Sites[localSiteId].wallNormal : - util::Vector3D(NO_VALUE); - - //SiteData siteData(blockReadIn.Sites[localSiteId]); + PopulateWithReadData(midDomainBlockNumber, + midDomainSiteNumber, + midDomainSiteData, + midDomainWallNormals, + midDomainWallDistance, + domainEdgeBlockNumber, + domainEdgeSiteNumber, + domainEdgeSiteData, + domainEdgeWallNormals, + domainEdgeWallDistance); + + // We now have the distributed store setup, so can find which + // process any site lives one. Set up the neighbouring + // processes for the edge sites. + for (auto const& site_global_coords: edge_sites) { + for (unsigned int l = 1; l < latticeInfo.GetNumVectors(); l++) { + // Find the neighbour site co-ords in this direction. 
+ util::Vector3D neigh_global_coords = site_global_coords + latticeInfo.GetVector(l).as(); + Vec16 neigh_block; + site_t neigh_site_id; + site_t rrProc = read_result_rank_for_site( + neigh_global_coords, neigh_block, neigh_site_id + ); + + if (rrProc == SITE_OR_BLOCK_SOLID || rrProc == localRank) + continue; - if (isMidDomainSite) - { - midDomainBlockNumber[l].push_back(blockOctIdx); - midDomainSiteNumber[l].push_back(localSiteId); - midDomainSiteData[l].push_back(siteData); - midDomainWallNormals[l].push_back(normal); - for (Direction direction = 1; direction < latticeInfo.GetNumVectors(); direction++) - { - midDomainWallDistance[l].push_back(blockReadIn.Sites[localSiteId].links[direction - 1].distanceToIntersection); - } + auto [neighbourProc, remoteSiteIdx] = rank_for_site_store->GetSiteData( + neigh_block, neigh_site_id + ); + auto neighProcWithSite = std::find_if( + neighbouringProcs.begin(), neighbouringProcs.end(), + [&](NeighbouringProcessor const& np) { + return np.Rank == neighbourProc; } - else - { - domainEdgeBlockNumber[l].push_back(blockOctIdx); - domainEdgeSiteNumber[l].push_back(localSiteId); - domainEdgeSiteData[l].push_back(siteData); - domainEdgeWallNormals[l].push_back(normal); - for (Direction direction = 1; direction < latticeInfo.GetNumVectors(); direction++) - { - domainEdgeWallDistance[l].push_back(blockReadIn.Sites[localSiteId].links[direction - 1].distanceToIntersection); - } - } - + ); + + if (neighProcWithSite == neighbouringProcs.end()) { + // We didn't find a neighbour-proc with the + // neighbour-site on it, so we need a new + // neighbouring processor. 
+ + // Store rank of neighbour in >neigh_proc[neigh_procs] + NeighbouringProcessor lNewNeighbour; + lNewNeighbour.SharedDistributionCount = 1; + lNewNeighbour.Rank = neighbourProc; + neighbouringProcs.push_back(lNewNeighbour); + } else { + // Did find it, increment the shared count + ++neighProcWithSite->SharedDistributionCount; } - } - - PopulateWithReadData(midDomainBlockNumber, - midDomainSiteNumber, - midDomainSiteData, - midDomainWallNormals, - midDomainWallDistance, - domainEdgeBlockNumber, - domainEdgeSiteNumber, - domainEdgeSiteData, - domainEdgeWallNormals, - domainEdgeWallDistance); } + } void Domain::PopulateWithReadData( const std::vector midDomainBlockNumbers[COLLISION_TYPES], diff --git a/Code/geometry/GeometryBlock.h b/Code/geometry/GeometryBlock.h index fcbb739f4..c89f4c729 100644 --- a/Code/geometry/GeometryBlock.h +++ b/Code/geometry/GeometryBlock.h @@ -9,20 +9,16 @@ #include #include "geometry/GeometrySite.h" -namespace hemelb +namespace hemelb::geometry { - namespace geometry - { /*** * Model of the information stored for a block in a geometry file. 
* Just gives the array of sites */ struct BlockReadResult { - public: std::vector Sites; }; - } } #endif /* HEMELB_GEOMETRY_GEOMETRYBLOCK_H */ diff --git a/Code/geometry/GeometryReader.cc b/Code/geometry/GeometryReader.cc index e98535bd2..46f7df2a6 100644 --- a/Code/geometry/GeometryReader.cc +++ b/Code/geometry/GeometryReader.cc @@ -16,9 +16,12 @@ #include "geometry/GeometryReader.h" #include "geometry/LookupTree.h" #include "net/net.h" +#include "net/SparseExchange.h" #include "net/IOCommunicator.h" #include "log/Logger.h" +#include "util/span.h" #include "util/utilityFunctions.h" +#include "util/Iterator.h" #include "constants.h" namespace hemelb::geometry @@ -76,43 +79,28 @@ namespace hemelb::geometry GmyReadResult GeometryReader::LoadAndDecompose(const std::string& dataFilePath) { - log::Logger::Log("Starting file read timer"); - timings[hemelb::reporting::Timers::fileRead].Start(); - - // Create hints about how we'll read the file. See Chapter 13, page 400 of the MPI 2.2 spec. - MPI_Info fileInfo; - HEMELB_MPI_CALL(MPI_Info_create, (&fileInfo)); - std::string accessStyle = "access_style"; - std::string accessStyleValue = "sequential"; - std::string buffering = "collective_buffering"; - std::string bufferingValue = "true"; - - HEMELB_MPI_CALL(MPI_Info_set, - (fileInfo, const_cast (accessStyle.c_str()), const_cast (accessStyleValue.c_str()))); - HEMELB_MPI_CALL(MPI_Info_set, - (fileInfo, const_cast (buffering.c_str()), const_cast (bufferingValue.c_str()))); - - // Open the file. 
- file = net::MpiFile::Open(computeComms, dataFilePath, MPI_MODE_RDONLY, fileInfo); - log::Logger::Log("Opened config file %s", dataFilePath.c_str()); + timings[reporting::Timers::fileRead].Start(); + + // Open the file for read on node leaders + if (computeComms.AmNodeLeader()) { + file = net::MpiFile::Open( + computeComms.GetLeadersComm(), dataFilePath, MPI_MODE_RDONLY, MPI_INFO_NULL + ); + } + + log::Logger::Log("Opened geometry file %s", dataFilePath.c_str()); // TODO: Why is there this fflush? fflush(nullptr); - // Set the view to the file. - file.SetView(0, MPI_CHAR, MPI_CHAR, "native", fileInfo); - log::Logger::Log("Reading file preamble"); GmyReadResult geometry = ReadPreamble(); log::Logger::Log("Reading file header"); ReadHeader(geometry.GetBlockCount()); - - // Close the file - only the ranks participating in the topology need to read it again. - file.Close(); - + timings[reporting::Timers::fileRead].Stop(); { - timings[hemelb::reporting::Timers::initialDecomposition].Start(); + timings[reporting::Timers::initialDecomposition].Start(); principalProcForEachBlock.resize(geometry.GetBlockCount()); log::Logger::Log("Creating block-level octree"); @@ -120,46 +108,53 @@ namespace hemelb::geometry geometry.GetBlockDimensions().as(), fluidSitesOnEachBlock ); + nFluidBlocks = blockTree.levels.back().node_ids.size(); + log::Logger::Log( + "Geometry has %lu / %ld active blocks, total %ld sites", + nFluidBlocks, + geometry.GetBlockCount(), + blockTree.levels[0].sites_per_node[0] + ); // Get an initial base-level decomposition of the domain macro-blocks over processors. // This will later be improved upon by ParMetis. 
log::Logger::Log("Beginning initial decomposition"); decomposition::BasicDecomposition basicDecomposer(geometry, - computeComms); + computeComms.Size()); - auto owner_for_block = basicDecomposer.Decompose(blockTree, principalProcForEachBlock); + // This vector only has entries for blocks that have a least one fluid site + procForBlockOct = basicDecomposer.Decompose(blockTree, principalProcForEachBlock); geometry.block_store = std::make_unique( geometry.GetSitesPerBlock(), std::move(blockTree), - std::move(owner_for_block), + procForBlockOct, computeComms ); - if (ShouldValidate()) { + if constexpr (build_info::VALIDATE_GEOMETRY) { log::Logger::Log("Validating initial decomposition"); - basicDecomposer.Validate(principalProcForEachBlock); + basicDecomposer.Validate(principalProcForEachBlock, computeComms); } - timings[hemelb::reporting::Timers::initialDecomposition].Stop(); + timings[reporting::Timers::initialDecomposition].Stop(); } - // Perform the initial read-in. - log::Logger::Log("Reading in my blocks"); - - // Reopen in the file just between the nodes in the topology decomposition. Read in blocks - // local to this node. 
- file = net::MpiFile::Open(computeComms, dataFilePath, MPI_MODE_RDONLY, fileInfo); - - ReadInBlocksWithHalo(geometry, principalProcForEachBlock, computeComms.Rank()); - - if (ShouldValidate()) + timings[reporting::Timers::fileRead].Start(); { + std::vector blocks_wanted; + blocks_wanted.reserve((2*nFluidBlocks) / computeComms.Size()); + for (U64 i = 0; i < nFluidBlocks; ++i) { + if (procForBlockOct[i] == computeComms.Rank()) + blocks_wanted.push_back(i); + } + ReadInBlocksWithHalo(geometry, blocks_wanted); + } + if constexpr (build_info::VALIDATE_GEOMETRY) { ValidateGeometry(geometry); } + timings[reporting::Timers::fileRead].Stop(); - timings[hemelb::reporting::Timers::fileRead].Stop(); - - hemelb::log::Logger::Log("Optimising the domain decomposition."); - timings[hemelb::reporting::Timers::domainDecomposition].Start(); + log::Logger::Log("Optimising the domain decomposition."); + timings[reporting::Timers::domainDecomposition].Start(); // Having done an initial decomposition of the geometry, and read in the data, we optimise the // domain decomposition. @@ -167,31 +162,28 @@ namespace hemelb::geometry OptimiseDomainDecomposition(geometry, principalProcForEachBlock); log::Logger::Log("Ending domain decomposition optimisation"); - if (ShouldValidate()) - { + if constexpr (build_info::VALIDATE_GEOMETRY) { log::Logger::Log("Validating optimised decomposition"); ValidateGeometry(geometry); } - file.Close(); - // Finish up - close the file, set the timings, deallocate memory. 
- HEMELB_MPI_CALL(MPI_Info_free, (&fileInfo)); - - timings[hemelb::reporting::Timers::domainDecomposition].Stop(); + timings[reporting::Timers::domainDecomposition].Stop(); return geometry; } - std::vector GeometryReader::ReadOnAllTasks(unsigned nBytes) + std::vector GeometryReader::ReadAllProcesses(std::size_t start, unsigned nBytes) { - std::vector buffer(nBytes); - const net::MpiCommunicator& comm = file.GetCommunicator(); - if (comm.Rank() == HEADER_READING_RANK) - { - file.Read(buffer); - } - comm.Broadcast(buffer, HEADER_READING_RANK); - return buffer; + // result + std::vector buffer(nBytes); + auto sp = to_span(buffer); + if (computeComms.AmNodeLeader()) { + // file is opened by the leaders comm only + file.ReadAtAll(start, sp); + } + // Broadcast from node leader to others + computeComms.GetNodeComm().Broadcast(sp, 0); + return buffer; } /** @@ -199,11 +191,11 @@ namespace hemelb::geometry */ GmyReadResult GeometryReader::ReadPreamble() { - std::vector preambleBuffer = ReadOnAllTasks(gmy::PreambleLength); + std::vector preambleBuffer = ReadAllProcesses(0, gmy::PreambleLength); // Create an Xdr translator based on the read-in data. 
auto preambleReader = io::XdrMemReader(preambleBuffer.data(), - gmy::PreambleLength); + gmy::PreambleLength); uint32_t hlbMagicNumber, gmyMagicNumber, version; // Read in housekeeping values @@ -267,11 +259,11 @@ namespace hemelb::geometry void GeometryReader::ReadHeader(site_t blockCount) { site_t headerByteCount = GetHeaderLength(blockCount); - std::vector headerBuffer = ReadOnAllTasks(headerByteCount); + std::vector headerBuffer = ReadAllProcesses(gmy::PreambleLength, headerByteCount); // Create a Xdr translation object to translate from binary - auto preambleReader = hemelb::io::XdrMemReader(headerBuffer.data(), - headerByteCount); + auto preambleReader = io::XdrMemReader(headerBuffer.data(), + headerByteCount); fluidSitesOnEachBlock.reserve(blockCount); bytesPerCompressedBlock.reserve(blockCount); @@ -291,169 +283,232 @@ namespace hemelb::geometry } } + // Args are vectors with index representing a block ID. + // First arg is which blocks this rank wants + // Second arg is which rank "owns" them + // Returns a pair with: + // - a map from source rank to vector of block IDs wanted from it + // - the total number of blocks wanted. + auto ComputeSources(std::vector const& wanted, std::vector const& ranks_they_are_on) { + std::map> ans; + auto const N = wanted.size(); + unsigned total = 0; + for (std::size_t i = 0; i < N; ++i) { + if (wanted[i]) { + // This will default construct an empty vec if this is the first time + auto& blocks = ans[ranks_they_are_on[i]]; + blocks.push_back(i); + total += 1; + } + } + return std::make_pair(std::move(ans), total); + } + + + // Return a map from block gmy index to compressed data, for + // those blocks that where the predicate returns true + template PredT> + auto GeometryReader::ReadCompressedBlockData(GmyReadResult const& geometry, PredT&& want) -> block_cache { + // Strategy: read the entire file once and filter out those + // blocks we want. 
+ + // Going to read the file in large chunks (but note using + // whole blocks) and only on the node leaders (using + // collective MPI IO). Going to use the node communicator to + // set up a shared memory allocation to hold this and then + // filter in parallel. + constexpr auto MiB = std::size_t(1) << 20; + constexpr std::size_t MAX_GMY_BUFFER_SIZE = 64U * MiB; + + log::Logger::Log("Buffering compressed site data onto owning processes."); + log::Logger::Log("Maximum buffer size %lu B", MAX_GMY_BUFFER_SIZE); + + block_cache ans; + + // Work out where blocks live in the gmy **file** + std::size_t const nBlocksGmy = bytesPerCompressedBlock.size(); + log::Logger::Log("Number of GMY blocks %lu", nBlocksGmy); + std::size_t const dataStart = gmy::PreambleLength + GetHeaderLength(geometry.GetBlockCount()); + + // N + 1 elements, elem i holds the start of block i, elem i+1 holds the end + auto const blockBoundsGmy = [&] () { + std::vector ans(nBlocksGmy + 1); + ans[0] = dataStart; + std::inclusive_scan( + bytesPerCompressedBlock.begin(), bytesPerCompressedBlock.end(), + ans.begin() + 1, std::plus(), dataStart + ); + return ans; + }(); + + log::Logger::Log("Setup node level shared memory"); + + MPI_Win win; + char* local_buf = nullptr; + + // Full size of buffer across node communicator + auto total_buf_size = std::min(blockBoundsGmy[nBlocksGmy] - blockBoundsGmy[0], + MAX_GMY_BUFFER_SIZE); + + auto&& nodeComm = computeComms.GetNodeComm(); + auto local_buf_size = (total_buf_size - 1) / nodeComm.Size() + 1; + + // Need contiguous memory. 
+ // Note local_buf has pointer into that process's "bit" of memory + net::MpiCall{MPI_Win_allocate_shared}( + local_buf_size, 1, MPI_INFO_NULL, nodeComm, &local_buf, &win + ); + // Get the whole thing's start address + char* buf = local_buf - nodeComm.Rank() * local_buf_size; + + // Open a passive access epoch to the shared buffer + net::MpiCall{MPI_Win_lock_all}(MPI_MODE_NOCHECK, win); + + // Get to work reading chunks + std::size_t const* blockBoundsGmy_end = &*blockBoundsGmy.end(); + for (std::size_t i_first_block = 0; i_first_block < nBlocksGmy; /* end of loop */) { + // Given we know which block we're starting at, figure out + // the most whole blocks we can fit in the buffer. + auto first_block_ptr = &blockBoundsGmy[i_first_block]; + auto max_read_pos = *first_block_ptr + total_buf_size; + // upper bound gives the first elem after max_read_pos or _end if none + auto end_ptr = std::upper_bound(first_block_ptr, blockBoundsGmy_end, max_read_pos) - 1; + std::size_t n_blocks = end_ptr - first_block_ptr; + auto read_size = *end_ptr - *first_block_ptr; + log::Logger::Log("Reading blocks from %lu count %lu", i_first_block, n_blocks); + + // Only read on node leader + if (computeComms.AmNodeLeader()) { + auto sp = std::span(buf, read_size); + // Collective on leaders comm + file.ReadAtAll(*first_block_ptr, sp); + } + // Need to wait for leader to read + nodeComm.Barrier(); + // Sync the memory + net::MpiCall{MPI_Win_sync}(win); + + // Now we've read a chunk of the file. Go through it, + // copying out the blocks we want. + for (std::size_t i = 0; i < n_blocks; ++i) { + auto block_gmy = i_first_block + i; + if (want(block_gmy)) { + // Recall std::map::operator[] will create the element. 
+ auto& block_data = ans[block_gmy]; + block_data.resize(bytesPerCompressedBlock[block_gmy]); + + auto buf_pos = blockBoundsGmy[block_gmy] - blockBoundsGmy[i_first_block]; + std::memcpy(block_data.data(), &buf[buf_pos], bytesPerCompressedBlock[block_gmy]); + } + } + + i_first_block += n_blocks; + } + // Close the access epoch + net::MpiCall{MPI_Win_unlock_all}(win); + // and free the window & buffer + net::MpiCall{MPI_Win_free}(&win); + + log::Logger::Log("Finished caching blocks"); + return ans; + } + /** - * Read in the necessary blocks from the file. + * Read and deserialise the necessary blocks from the file into + * `geometry`. Collective. + * + * Initially, we have procForBlock which is the rank where each + * block is wanted (or -1 if unknown). + * + * - Decide which blocks this rank want (i.e. procForBlock == + * Rank() plus neigbours). + * + * - Collectively read the file, filtering blocks to ranks that + * want them. + * + * - Deserialise those blocks into the geometry. */ void GeometryReader::ReadInBlocksWithHalo(GmyReadResult& geometry, - const std::vector& unitForEachBlock, - const proc_t localRank) + const std::vector& blocksWanted) { // Create a list of which blocks to read in. - timings[hemelb::reporting::Timers::readBlocksPrelim].Start(); + timings[reporting::Timers::readBlocksPrelim].Start(); // Populate the list of blocks to read (including a halo one block wide around all // local blocks). log::Logger::Log("Determining blocks to read"); - std::vector readBlock = DecideWhichBlocksToReadIncludingHalo(geometry, - unitForEachBlock, - localRank); - if (ShouldValidate()) - { - log::Logger::Log("Validating block sizes"); + auto halo_wanted = DecideWhichBlocksToReadIncludingHalo(geometry, blocksWanted); - // Validate the uncompressed length of the block on disk fits out expectations. 
- for (site_t block = 0; block < geometry.GetBlockCount(); ++block) - { - if (bytesPerUncompressedBlock[block] - > gmy::GetMaxBlockRecordLength(geometry.GetBlockSize(), - fluidSitesOnEachBlock[block])) - { - log::Logger::Log("Block %i is %i bytes when the longest possible block should be %i bytes", - block, - bytesPerUncompressedBlock[block], - gmy::GetMaxBlockRecordLength(geometry.GetBlockSize(), - fluidSitesOnEachBlock[block])); - } - } + // GMY file indexs of blocks we want. + std::vector wanted_gmys; + wanted_gmys.reserve(halo_wanted.size()); + auto&& tree = geometry.block_store->GetTree(); + for (auto idx: halo_wanted) { + auto ijk = tree.GetLeafCoords(idx); + wanted_gmys.push_back(geometry.GetBlockIdFromBlockCoordinates(ijk)); } + // Recall we hit every block in GMY order, so sort this. We now + // only have to test one value at a time and bump the iterator + // forward when it matches. + std::sort(wanted_gmys.begin(), wanted_gmys.end()); + + auto compressed_block_data = ReadCompressedBlockData( + geometry, + [lower=wanted_gmys.cbegin(), upper=wanted_gmys.cend()] (std::size_t gmy) mutable { + if (lower != upper && *lower == gmy) { + ++lower; + return true; + } else { + return false; + } + } + ); - // Next we spread round the lists of which blocks each core needs access to. - log::Logger::Log("Informing reading cores of block needs"); - net::Net net = net::Net(computeComms); - Needs needs(geometry.GetBlockCount(), - readBlock, - util::NumericalFunctions::min(READING_GROUP_SIZE, computeComms.Size()), - net, - ShouldValidate()); - - timings[hemelb::reporting::Timers::readBlocksPrelim].Stop(); - log::Logger::Log("Reading blocks"); - timings[hemelb::reporting::Timers::readBlocksAll].Start(); - - // Set the initial offset to the first block, which will be updated as we progress - // through the blocks. - MPI_Offset offset = gmy::PreambleLength - + GetHeaderLength(geometry.GetBlockCount()); - - // Iterate over each block. 
- for (site_t nextBlockToRead = 0; nextBlockToRead < geometry.GetBlockCount(); - ++nextBlockToRead) - { - // Read in the block on all cores (nothing will be done if this core doesn't need the block). - ReadInBlock(offset, - geometry, - needs.ProcessorsNeedingBlock(nextBlockToRead), - nextBlockToRead, - readBlock[nextBlockToRead]); - - // Update the offset to be ready for the next block. - offset += bytesPerCompressedBlock[nextBlockToRead]; + for (auto& [gmy_idx, data]: compressed_block_data) { + DeserialiseBlock(geometry, data, gmy_idx); } - - timings[hemelb::reporting::Timers::readBlocksAll].Stop(); } - void GeometryReader::ReadInBlock(MPI_Offset offsetSoFar, GmyReadResult& geometry, - const std::vector& procsWantingThisBlock, - const site_t blockNumber, const bool neededOnThisRank) - { - // Easy case if there are no sites on the block. - if (fluidSitesOnEachBlock[blockNumber] <= 0) - { - return; - } - std::vector compressedBlockData; - proc_t readingCore = GetReadingCoreForBlock(blockNumber); - - net::Net net = net::Net(computeComms); - - if (readingCore == computeComms.Rank()) - { - timings[hemelb::reporting::Timers::readBlock].Start(); - // Read the data. - compressedBlockData.resize(bytesPerCompressedBlock[blockNumber]); - file.ReadAt(offsetSoFar, compressedBlockData); - - // Spread it. 
- for (std::vector::const_iterator receiver = procsWantingThisBlock.begin(); - receiver != procsWantingThisBlock.end(); receiver++) - { - if (*receiver != computeComms.Rank()) - { - - net.RequestSendV(std::span(compressedBlockData), *receiver); - } - } - timings[hemelb::reporting::Timers::readBlock].Stop(); - } - else if (neededOnThisRank) - { - compressedBlockData.resize(bytesPerCompressedBlock[blockNumber]); - - net.RequestReceiveV(std::span(compressedBlockData), readingCore); - - } - else - { - return; - } - timings[hemelb::reporting::Timers::readNet].Start(); - net.Dispatch(); - timings[hemelb::reporting::Timers::readNet].Stop(); - timings[hemelb::reporting::Timers::readParse].Start(); - if (neededOnThisRank) - { + void GeometryReader::DeserialiseBlock( + GmyReadResult& geometry, std::vector const& compressedBlockData, + site_t block_gmy + ) { + timings[reporting::Timers::readParse].Start(); // Create an Xdr interpreter. - std::vector blockData = DecompressBlockData(compressedBlockData, - bytesPerUncompressedBlock[blockNumber]); + auto blockData = DecompressBlockData(compressedBlockData, + bytesPerUncompressedBlock[block_gmy]); io::XdrMemReader lReader(&blockData.front(), blockData.size()); - ParseBlock(geometry, blockNumber, lReader); + ParseBlock(geometry, block_gmy, lReader); // If debug-level logging, check that we've read in as many sites as anticipated. - if (ShouldValidate()) - { + if constexpr (build_info::VALIDATE_GEOMETRY) { // Count the sites read, site_t numSitesRead = 0; for (site_t site = 0; site < geometry.GetSitesPerBlock(); ++site) { - if (geometry.Blocks[blockNumber].Sites[site].targetProcessor != SITE_OR_BLOCK_SOLID) + if (geometry.Blocks[block_gmy].Sites[site].targetProcessor != SITE_OR_BLOCK_SOLID) { ++numSitesRead; } } // Compare with the sites we expected to read. 
- if (numSitesRead != fluidSitesOnEachBlock[blockNumber]) + if (numSitesRead != fluidSitesOnEachBlock[block_gmy]) { log::Logger::Log("Was expecting %i fluid sites on block %i but actually read %i", - fluidSitesOnEachBlock[blockNumber], - blockNumber, + fluidSitesOnEachBlock[block_gmy], + block_gmy, numSitesRead); } } - } - else if (!geometry.Blocks[blockNumber].Sites.empty()) - { - geometry.Blocks[blockNumber].Sites = std::vector(0, GeometrySite(false)); - } - timings[hemelb::reporting::Timers::readParse].Stop(); + timings[reporting::Timers::readParse].Stop(); } std::vector GeometryReader::DecompressBlockData(const std::vector& compressed, const unsigned int uncompressedBytes) { - timings[hemelb::reporting::Timers::unzip].Start(); + timings[reporting::Timers::unzip].Start(); // For zlib return codes. int ret; @@ -484,7 +539,7 @@ namespace hemelb::geometry if (ret != Z_OK) throw Exception() << "Decompression error for block"; - timings[hemelb::reporting::Timers::unzip].Stop(); + timings[reporting::Timers::unzip].Stop(); return uncompressed; } @@ -529,11 +584,11 @@ namespace hemelb::geometry for (auto&& dir: gmy::Neighbourhood) { // read the type of the intersection and create a link... 
- auto intersectionType = [&]() { - unsigned readType; - reader.read(readType); - return CutTypeValidator::Run(readType); - } (); + auto intersectionType = [&]() { + unsigned readType; + reader.read(readType); + return CutTypeValidator::Run(readType); + } (); GeometrySiteLink link; link.type = intersectionType; @@ -575,9 +630,9 @@ namespace hemelb::geometry } auto normalAvailable = [&]() { - unsigned normalAvailable; - reader.read(normalAvailable); - return WallNormalAvailabilityValidator::Run(normalAvailable); + unsigned normalAvailable; + reader.read(normalAvailable); + return WallNormalAvailabilityValidator::Run(normalAvailable); }(); readInSite.wallNormalAvailable = (normalAvailable == gmy::WallNormalAvailability::AVAILABLE); @@ -600,12 +655,6 @@ namespace hemelb::geometry return readInSite; } - proc_t GeometryReader::GetReadingCoreForBlock(site_t blockNumber) - { - return proc_t(blockNumber - % util::NumericalFunctions::min(READING_GROUP_SIZE, computeComms.Size())); - } - /** * This function is only called if in geometry-validation mode. * @param geometry @@ -614,156 +663,128 @@ namespace hemelb::geometry { log::Logger::Log("Validating the GlobalLatticeData"); - // We check the isFluid property and the link type for each direction + auto const SPB = geometry.GetSitesPerBlock(); + auto const NV = latticeInfo.GetNumVectors(); + constexpr auto UMAX = std::numeric_limits::max(); - std::vector myProcForSite; - std::vector dummySiteData; + std::vector myProcForSite(SPB); + std::vector dummySiteData(SPB * NV); + // We check the isFluid property and the link type for each direction // We also validate that each processor has the same beliefs about each site. 
- for (site_t block = 0; block < geometry.GetBlockCount(); ++block) - { - // Clear vectors - myProcForSite.clear(); - dummySiteData.clear(); - - if (geometry.Blocks[block].Sites.empty()) - { - for (site_t localSite = 0; localSite < geometry.GetSitesPerBlock(); ++localSite) - { - myProcForSite.push_back(SITE_OR_BLOCK_SOLID); - dummySiteData.push_back(std::numeric_limits::max()); - for (Direction direction = 1; direction < latticeInfo.GetNumVectors(); ++direction) - { - dummySiteData.push_back(std::numeric_limits::max()); + for (site_t block_gmy = 0; block_gmy < geometry.GetBlockCount(); ++block_gmy) { + auto const& block = geometry.Blocks[block_gmy]; + + if (block.Sites.empty()) { + std::fill(myProcForSite.begin(), myProcForSite.end(), SITE_OR_BLOCK_SOLID); + std::fill(dummySiteData.begin(), dummySiteData.end(), UMAX); + } else { + for (site_t localSite = 0; localSite < SPB; ++localSite) { + auto const& site = block.Sites[localSite]; + myProcForSite[localSite] = site.targetProcessor; + + auto dsd_start = localSite*NV; + dummySiteData[dsd_start] = site.isFluid; + for (Direction direction = 1; direction < NV; ++direction) { + dummySiteData[dsd_start + direction] = site.isFluid ? 
unsigned(site.links[direction-1].type) : UMAX; } } } - else - { - for (site_t localSite = 0; localSite < geometry.GetSitesPerBlock(); ++localSite) - { - myProcForSite.push_back(geometry.Blocks[block].Sites[localSite].targetProcessor); - - dummySiteData.push_back(geometry.Blocks[block].Sites[localSite].isFluid); - for (Direction direction = 1; direction < latticeInfo.GetNumVectors(); ++direction) - { - if (geometry.Blocks[block].Sites[localSite].isFluid) - { - dummySiteData.push_back(static_cast(geometry.Blocks[block].Sites[localSite].links[direction - 1].type)); - } - else - { - dummySiteData.push_back(std::numeric_limits::max()); - } - } - } - } - - // Reduce using a minimum to find the actual processor for each site (ignoring the + // Reduce using a maximum to find the actual processor for each site (ignoring the // invalid entries). - std::vector procForSiteRecv = computeComms.AllReduce(myProcForSite, MPI_MIN); + std::vector procForSiteRecv = computeComms.AllReduce(myProcForSite, MPI_MAX); std::vector siteDataRecv = computeComms.AllReduce(dummySiteData, MPI_MIN); - for (site_t site = 0; site < geometry.GetSitesPerBlock(); ++site) - { - if (procForSiteRecv[site] == computeComms.Rank() - && (myProcForSite[site] != computeComms.Rank())) - { - log::Logger::Log("Other cores think this core has site %li on block %li but it disagrees.", - site, - block); + for (site_t site = 0; site < SPB; ++site) { + if (myProcForSite[site] < 0) + continue; + + if (myProcForSite[site] != procForSiteRecv[site]) { + log::Logger::Log( + "Site %li of block %li believed to be on %li but other process thinks %li", + site, block_gmy, myProcForSite[site], procForSiteRecv[site] + ); } - else if (myProcForSite[site] != SITE_OR_BLOCK_SOLID && procForSiteRecv[site] - != myProcForSite[site]) - { - log::Logger::Log("This core thought that core %li has site %li on block %li but others think it's on core %li.", - myProcForSite[site], + + if (dummySiteData[site * NV] != siteDataRecv[site * NV]) { + 
log::Logger::Log("Different fluid state was found for site %li on block %li. One: %li, Two: %li .", site, - block, - procForSiteRecv[site]); + block_gmy, + dummySiteData[site*NV], + siteDataRecv[site*NV] + ); } - if (!geometry.Blocks[block].Sites.empty()) - { - if (dummySiteData[site * latticeInfo.GetNumVectors()] - != siteDataRecv[site * latticeInfo.GetNumVectors()]) - { - log::Logger::Log("Different fluid state was found for site %li on block %li. One: %li, Two: %li .", + for (Direction dir = 1; dir < NV; ++dir) { + if (dummySiteData[site * NV + dir] != siteDataRecv[site * NV+ dir]) { + log::Logger::Log("Different link type was found for site %li, link %i on block %li. One: %li, Two: %li .", site, - block, - dummySiteData[site - * latticeInfo.GetNumVectors()], - siteDataRecv[site - * latticeInfo.GetNumVectors()]); - } - - for (Direction direction = 1; direction < latticeInfo.GetNumVectors(); ++direction) - { - if (dummySiteData[site * latticeInfo.GetNumVectors() + direction] - != siteDataRecv[site * latticeInfo.GetNumVectors() + direction]) - { - log::Logger::Log("Different link type was found for site %li, link %i on block %li. One: %li, Two: %li .", - site, - direction, - block, - dummySiteData[site - * latticeInfo.GetNumVectors() - + direction], - siteDataRecv[site - * latticeInfo.GetNumVectors() - + direction]); - } + dir, + block_gmy, + dummySiteData[site*NV + dir], + siteDataRecv[site *NV + dir]); } - } + } } } - std::vector GeometryReader::DecideWhichBlocksToReadIncludingHalo( - const GmyReadResult& geometry, const std::vector& unitForEachBlock, proc_t localRank) - { - std::vector shouldReadBlock(geometry.GetBlockCount(), false); - // Read a block in if it has fluid sites and is to live on the current processor. Also read - // in any neighbours with fluid sites. + // Go through blocks_wanted and add any 26-neighbouring blocks that are non-solid, using OCT ids. 
+ std::vector GeometryReader::DecideWhichBlocksToReadIncludingHalo( + const GmyReadResult& geometry, const std::vector& blocks_wanted + ) const { + // Going to have to go from "compressed" octree order index + // (idx) -> 3D grid coordinate (ijk) to compute neighbours. Get + // set up for this. auto const block_dims = geometry.GetBlockDimensions(); - for (site_t blockI = 0; blockI < block_dims.x(); ++blockI) - { - for (site_t blockJ = 0; blockJ < block_dims.y(); ++blockJ) - { - for (site_t blockK = 0; blockK < block_dims.z(); ++blockK) - { - site_t lBlockId = geometry.GetBlockIdFromBlockCoordinates(blockI, blockJ, blockK); - - if (unitForEachBlock[lBlockId] != localRank) - { - continue; - } - - // Read in all neighbouring blocks. - for (site_t neighI = std::max(0, blockI - 1); - (neighI <= (blockI + 1)) && (neighI < block_dims.x()); ++neighI) - { - for (site_t neighJ = std::max(0, blockJ - 1); - (neighJ <= (blockJ + 1)) && (neighJ < block_dims.y()); ++neighJ) - { - for (site_t neighK = std::max(0, blockK - 1); - (neighK <= (blockK + 1)) && (neighK < block_dims.z()); - ++neighK) - { - site_t lNeighId = geometry.GetBlockIdFromBlockCoordinates(neighI, neighJ, neighK); - - shouldReadBlock[lNeighId] = true; - } - } - } + constexpr auto U16_MAX = std::numeric_limits::max(); + auto&& tree = geometry.block_store->GetTree(); + + // Start with no blocks wanted. + std::vector want_block_on_this_rank(nFluidBlocks, false); + + // Main loop + for (auto block_idx: blocks_wanted) { + // Could set this here, but will cover in loop over halo below + // want_block_on_this_rank[block_idx] = true; + + auto block_ijk = tree.GetLeafCoords(block_idx); + + // Compute the bounds of neighbours to consider + Vec16 lo, hi; + for (int d = 0; d < 3; ++d) { + // Beware the "usual arithmetic conversions"! + U16 const v = block_ijk[d]; + lo[d] = v > 0U ? v - 1U : 0U; + // Note, we will use inclusive hi limit below so max + // allowed value is block_dims[d] - 1 + U16 const w = v < U16_MAX ? 
v + 1U : v; + hi[d] = w < block_dims[d] ? w : block_dims[d] - 1; } - } + + // 3D loop + for (U16 ni = lo[0]; ni <= hi[0]; ++ni) + for (U16 nj = lo[1]; nj <= hi[1]; ++nj) + for (U16 nk = lo[2]; nk <= hi[2]; ++nk) { + auto neigh_ijk = Vec16{ni, nj, nk}; + auto neigh_idx = tree.GetPath(neigh_ijk).leaf(); + // Recall that the octree will return a path + // with "no child" for all levels where the + // requested node doesn't exist. + if (neigh_idx != octree::Level::NC) + want_block_on_this_rank[neigh_idx] = true; + } } - return shouldReadBlock; + std::vector ans; + for (unsigned i = 0; i < nFluidBlocks; ++i) { + if (want_block_on_this_rank[i]) + ans.push_back(i); + } + return ans; } void GeometryReader::OptimiseDomainDecomposition(GmyReadResult& geometry, @@ -772,27 +793,24 @@ namespace hemelb::geometry decomposition::OptimisedDecomposition optimiser(timings, computeComms, geometry, - latticeInfo, - procForEachBlock, - fluidSitesOnEachBlock); + latticeInfo); - timings[hemelb::reporting::Timers::reRead].Start(); + timings[reporting::Timers::reRead].Start(); log::Logger::Log("Rereading blocks"); // Reread the blocks based on the ParMetis decomposition. RereadBlocks(geometry, - optimiser.GetMovesCountPerCore(), - optimiser.GetMovesList(), - procForEachBlock); - timings[hemelb::reporting::Timers::reRead].Stop(); + optimiser.GetStaying(), + optimiser.GetArriving()); + timings[reporting::Timers::reRead].Stop(); - timings[hemelb::reporting::Timers::moves].Start(); + timings[reporting::Timers::moves].Start(); // Implement the decomposition now that we have read the necessary data. log::Logger::Log("Implementing moves"); ImplementMoves(geometry, - procForEachBlock, - optimiser.GetMovesCountPerCore(), - optimiser.GetMovesList()); - timings[hemelb::reporting::Timers::moves].Stop(); + optimiser.GetStaying(), + optimiser.GetArriving(), + optimiser.GetLeaving()); + timings[reporting::Timers::moves].Stop(); } // The header section of the config file contains a number of records. 
@@ -801,117 +819,131 @@ namespace hemelb::geometry return gmy::HeaderRecordLength * blockCount; } - void GeometryReader::RereadBlocks(GmyReadResult& geometry, const std::vector& movesPerProc, - const std::vector& movesList, - const std::vector& procForEachBlock) - { - // Initialise the array (of which proc each block belongs to) to what it was before. - std::vector newProcForEachBlock(geometry.GetBlockCount()); - - for (site_t blockNumber = 0; blockNumber < geometry.GetBlockCount(); ++blockNumber) - { - newProcForEachBlock[blockNumber] = procForEachBlock[blockNumber]; - } + // Iterator to advance through the moves vector to the first move + // in the next block, recalling that moves are sorted by block and + // there's a max number of sites per block. + struct only_block_id_iterator { + using Iter = SiteVec::const_iterator; - // Set the proc for each block to be the current proc whenever a site on that block is - // going to be moved to the current proc. - idx_t moveIndex = 0; + site_t max_step; + Iter pos; + Iter end; - for (proc_t fromProc = 0; fromProc < computeComms.Size(); ++fromProc) - { - for (idx_t moveNumber = 0; moveNumber < movesPerProc[fromProc]; ++moveNumber) - { - idx_t block = movesList[3 * moveIndex]; - idx_t toProc = movesList[3 * moveIndex + 2]; - ++moveIndex; + // Only care about block ID + auto operator*() const { + return (*pos)[0]; + } - if (toProc == (idx_t) computeComms.Rank()) - { - newProcForEachBlock[block] = computeComms.Rank(); - } + only_block_id_iterator& operator++() { + auto block = **this; + ++pos; // Guard the case where have a fully fluid block + auto limit = std::min(pos + max_step, end); + pos = std::upper_bound( + pos, limit, + block, + [](U64 l, SiteDesc const& r) {return l < r[0]; } + ); + return *this; } - } - // Reread the blocks into the GlobalLatticeData now. 
- ReadInBlocksWithHalo(geometry, newProcForEachBlock, computeComms.Rank()); - } + operator bool() const { + return pos < end; + } + }; - void GeometryReader::ImplementMoves(GmyReadResult& geometry, - const std::vector& procForEachBlock, - const std::vector& movesFromEachProc, - const std::vector& movesList) const + void GeometryReader::RereadBlocks(GmyReadResult& geometry, SiteVec const& staying, + MovesMap const& arriving) { - // First all, set the proc rank for each site to what it originally was before - // domain decomposition optimisation. Go through each block... - for (site_t block = 0; block < geometry.GetBlockCount(); ++block) - { - // If this proc has owned a fluid site on this block either before or after optimisation, - // the following will be non-null. - if (!geometry.Blocks[block].Sites.empty()) - { - // Get the original proc for that block. - proc_t originalProc = procForEachBlock[block]; + // Go through the sites that we will end up with, and compute + // the unique list of blocks (by OCT index) that we need. Recall + // that the moves lists are sorted by block and then site id + // (tho we only care about block). + std::vector optimal_blocks; + + // We can be simple for the staying blocks as opt is empty at + // start and know: + // - that the blocks monotonically increase thru the array + // - that there are at most SPB sites per block + auto const SPB = geometry.GetSitesPerBlock(); + for (auto it = only_block_id_iterator{SPB, staying.begin(), staying.end()}; it; ++it) { + optimal_blocks.push_back(*it); + } - // For each site on that block... - for (site_t siteIndex = 0; siteIndex < geometry.GetSitesPerBlock(); ++siteIndex) - { - // ... if the site is non-solid... - if (geometry.Blocks[block].Sites[siteIndex].targetProcessor != SITE_OR_BLOCK_SOLID) - { - // ... set its rank to be the rank it had before optimisation. 
- geometry.Blocks[block].Sites[siteIndex].targetProcessor = originalProc; - } + // For the arriving sites, need to deal with insertion + for (auto& [src_rank, data]: arriving) { + // Recall that each rank's data is sorted by block ID, thus we + // can search for the insert point more efficiently. + auto opt_pos = optimal_blocks.begin(); + + for (auto it = only_block_id_iterator{SPB, data.begin(), data.end()}; it; ++it) { + auto block = *it; + + auto lb = std::lower_bound(opt_pos, optimal_blocks.end(), block); + // BUT adding a block may invalidate the opt_pos iterator + if (lb == optimal_blocks.end()) { + // Not found and greater than any value + optimal_blocks.push_back(block); + opt_pos = optimal_blocks.end(); + } else if (*lb == block) { + // already there + ++opt_pos; + } else { + // Not found and need to insert and move past the inserted one + opt_pos = ++optimal_blocks.insert(lb, block); } } } - // Now implement the moves suggested by parmetis. - idx_t moveIndex = 0; + // Reread the blocks into the GlobalLatticeData now. + ReadInBlocksWithHalo(geometry, optimal_blocks); + } - // For each source proc, go through as many moves as it had. - for (proc_t fromProc = 0; fromProc < computeComms.Size(); ++fromProc) - { - for (idx_t moveNumber = 0; moveNumber < movesFromEachProc[fromProc]; ++moveNumber) - { - // For each move, get the block, site and destination proc. - idx_t block = movesList[3 * moveIndex]; - idx_t site = movesList[3 * moveIndex + 1]; - idx_t toProc = movesList[3 * moveIndex + 2]; + void GeometryReader::ImplementMoves(GmyReadResult& geometry, + SiteVec const& staying, + MovesMap const& arriving, + MovesMap const& leaving) const + { + // Given a vector of sites (sorted by block, then site index + // within the block), assign the process to that site in the + // GmyReadResult. 
+ auto set_rank_for_sites = [&] (SiteVec const& sites, int rank) { + + auto block_start = sites.begin(); + auto end = sites.end(); + auto&& tree = geometry.block_store->GetTree(); + + while (block_start != end) { + auto const block_oct = (*block_start)[0]; + site_t const nsites = tree.levels[tree.n_levels].sites_per_node[block_oct]; + auto const block_end = (++only_block_id_iterator{nsites, block_start, end}).pos; + + auto const block_ijk = tree.GetLeafCoords(block_oct); + auto const block_gmy = geometry.GetBlockIdFromBlockCoordinates(block_ijk); + + auto& block_sites = geometry.Blocks[block_gmy].Sites; + for (auto it = block_start; it < block_end; ++it) { + auto [block, site_idx] = *it; + HASSERT(block == block_oct); + HASSERT(!block_sites.empty()); + + auto& site = block_sites[site_idx]; + HASSERT(site.isFluid); + site.targetProcessor = rank; + } - // Only implement the move if we have read that block's data. - if (!geometry.Blocks[block].Sites.empty()) - { - // Some logging code - the unmodified rank for each move's site should equal - // lFromProc. - if (ShouldValidate()) - { - if (geometry.Blocks[block].Sites[site].targetProcessor - != fromProc) - { - log::Logger::Log("Block %ld, site %ld from move %u was originally on proc %i, not proc %u.", - block, - site, - moveIndex, - geometry.Blocks[block].Sites[site].targetProcessor, - fromProc); - } + block_start = block_end; } + }; - // Implement the move. - geometry.Blocks[block].Sites[site].targetProcessor = toProc; - } - - ++moveIndex; + // First set proc for staying sites + set_rank_for_sites(staying, computeComms.Rank()); + // Then arriving sites + for (auto const& [_, sites]: arriving) { + set_rank_for_sites(sites, computeComms.Rank()); } - } + // We could set the leaving sites, but can't guarantee we get + // all the neighbours of our sites, so no saving of time + // really. 
} - bool GeometryReader::ShouldValidate() const - { -#ifdef HEMELB_VALIDATE_GEOMETRY - return true; -#else - return false; -#endif - } } diff --git a/Code/geometry/GeometryReader.h b/Code/geometry/GeometryReader.h index 08a18587a..c12b135de 100644 --- a/Code/geometry/GeometryReader.h +++ b/Code/geometry/GeometryReader.h @@ -37,139 +37,101 @@ namespace hemelb::geometry GmyReadResult LoadAndDecompose(const std::string& dataFilePath); private: - /** - * Read from the file into a buffer. We read this on a single core then broadcast it. - * This has proven to be more efficient than reading in on every core (even using a collective - * read). - * - * Note this allocates memory and returns the pointer to you. - * - * @param nBytes - * @return - */ - std::vector ReadOnAllTasks(unsigned nBytes); + // Read from the file into a buffer on all processes. + // This is collective and start and nBytes must be the same on all ranks. + std::vector ReadAllProcesses(std::size_t startBytes, unsigned nBytes); + // Read the preamble and create the empty read result. GmyReadResult ReadPreamble(); + // Read the block header with basic size data for each block in + // the domain. void ReadHeader(site_t blockCount); - void ReadInBlocksWithHalo(GmyReadResult& geometry, const std::vector& unitForEachBlock, - const proc_t localRank); - - /** - * Compile a list of blocks to be read onto this core, including all the ones we perform - * LB on, and also any of their neighbouring blocks. - * - * NOTE: that the skipping-over of blocks without any fluid sites is dealt with by other - * code. - * - * @param geometry [in] GmyReadResult object as it has been read so far - * @param unitForEachBlock [in] The initial processor assigned to each block - * @param localRank [in] Local rank number - * @return Vector with true for each block we should read in. 
- */ - std::vector DecideWhichBlocksToReadIncludingHalo( - const GmyReadResult& geometry, const std::vector& unitForEachBlock, - const proc_t localRank); - - /** - * Reads in a single block and ensures it is distributed to all cores that need it. - * - * @param offsetSoFar [in] The offset into the file to read from to get the block. - * @param geometry [out] The geometry object to populate with info about the block. - * @param procsWantingThisBlock [in] A list of proc ids where info about this block is required. - * @param blockNumber [in] The id of the block we're reading. - * @param neededOnThisRank [in] A boolean indicating whether the block is required locally. - */ - void ReadInBlock(MPI_Offset offsetSoFar, GmyReadResult& geometry, - const std::vector& procsWantingThisBlock, const site_t blockNumber, - const bool neededOnThisRank); - - /** - * Decompress the block data. Uses the known number of sites to get an - * upper bound on the uncompressed data to simplify the code and avoid - * reallocation. - * @param compressed - * @param sites - * @return - */ + // map from block GMY index to vect of data + using block_cache = std::map>; + + // Load compressed data from the file if the predicate is true for that block's GMY index. + template PredT> + block_cache ReadCompressedBlockData(GmyReadResult const& geometry, PredT&& p); + + + // Given a vector of the block OCT ids that we want, add a + // one-block halo and read those blocks into the + // GmyReadResult. + void ReadInBlocksWithHalo(GmyReadResult& geometry, + const std::vector& blocksWanted); + + // Given a vector of the block OCT ids wanted, add a one-block + // halo and return a vector of those block OCT ids. 
+ std::vector DecideWhichBlocksToReadIncludingHalo( + const GmyReadResult& geometry, const std::vector& blocks_wanted + ) const; + + // Parse a compressed block of data into the geometry at the given GMY index + void DeserialiseBlock(GmyReadResult& geometry, + std::vector const& compressed_data, site_t block_gmy); + + // Decompress the block data std::vector DecompressBlockData(const std::vector& compressed, const unsigned int uncompressedBytes); + // Given a reader for a block's data, parse that into the + // GmyReadResult at the given index. void ParseBlock(GmyReadResult& geometry, const site_t block, io::XdrReader& reader); - /** - * Parse the next site from the XDR reader. Note that we return by copy here. - * @param reader - * @return - */ + // Parse the next site from the XDR reader GeometrySite ParseSite(io::XdrReader& reader); - /** - * Calculates the number of the rank used to read in a given block. - * Intent is to move this into Decomposition class, which will also handle knowledge of which procs to use for reading, and own the decomposition topology. - * - * @param blockNumber - * @return - */ - proc_t GetReadingCoreForBlock(site_t blockNumber); - - /** - * Optimise the domain decomposition using ParMetis. We take this approach because ParMetis - * is more efficient when given an initial decomposition to start with. - * @param geometry - * @param procForEachBlock - */ + // Use the OptimisedDecomposition class to refine a simple, + // block-level initial decomposition. void OptimiseDomainDecomposition(GmyReadResult& geometry, const std::vector& procForEachBlock); + // Check for self-consistency void ValidateGeometry(const GmyReadResult& geometry); - /** - * Get the length of the header section, given the number of blocks. - * - * @param blockCount The number of blocks. - * @return - */ + // Get the length of the header section, given the number of blocks. 
site_t GetHeaderLength(site_t blockCount) const; - void RereadBlocks(GmyReadResult& geometry, const std::vector& movesPerProc, - const std::vector& movesList, - const std::vector& procForEachBlock); - - void ImplementMoves(GmyReadResult& geometry, const std::vector& procForEachBlock, - const std::vector& movesFromEachProc, - const std::vector& movesList) const; + // Given knowledge of which sites are needed, read the blocks + // required (i.e. the blocks the sites belong to plus a + // one-block halo) into the GmyReadResult. + void RereadBlocks(GmyReadResult& geometry, SiteVec const& staying_sites, + MovesMap const& arriving_sites); - /** - * True if we should validate the geometry. - * @return - */ - bool ShouldValidate() const; - - //! The rank which reads in the header information. - static const proc_t HEADER_READING_RANK = 0; - //! The number of cores (0-READING_GROUP_SIZE-1) that read files in parallel - static const proc_t READING_GROUP_SIZE = HEMELB_READING_GROUP_SIZE; + // Set the rank of sites assigned to this process in the GmyReadResult. + void ImplementMoves(GmyReadResult& geometry, SiteVec const& staying, + MovesMap const& arriving, MovesMap const& leaving) const; //! Info about the connectivity of the lattice. const lb::LatticeInfo& latticeInfo; - //! File accessed to read in the geometry data. + + // File accessed to read in the geometry data. + // + // We are going + // to read the whole file collectively and sequentially in + // chunks. The node leaders will read data either into private + // memory and broadcast or into node-shared memory and do a + // barrier/fence. net::MpiFile file; - //! Information about the file, to give cues and hints to MPI. - //! Communication info for all ranks that will need a slice of the geometry - net::MpiCommunicator computeComms; + //! Communicator for all ranks that will need a slice of the geometry + net::IOCommunicator computeComms; - //! The number of fluid sites on each block in the geometry + //! 
How many blocks with at least one fluid site + U64 nFluidBlocks; + //! The number of fluid sites on each block in the file. std::vector fluidSitesOnEachBlock; - //! The number of bytes each block takes up while still compressed. + //! The number of bytes each block in the file takes up while still compressed. std::vector bytesPerCompressedBlock; - //! The number of bytes each block takes up when uncompressed. + //! The number of bytes each block in the file takes up when uncompressed. std::vector bytesPerUncompressedBlock; - //! The processor assigned to each block. + //! The process assigned to each block. std::vector principalProcForEachBlock; + //! The process for fluid-containing blocks in octree order + std::vector procForBlockOct; //! Timings object for recording the time taken for each step of the domain decomposition. hemelb::reporting::Timers &timings; diff --git a/Code/geometry/GmyReadResult.cc b/Code/geometry/GmyReadResult.cc index 0c4bcd13b..0a4f54b4b 100644 --- a/Code/geometry/GmyReadResult.cc +++ b/Code/geometry/GmyReadResult.cc @@ -39,20 +39,11 @@ namespace hemelb::geometry { site_t GmyReadResult::FindFluidSiteIndexInBlock(site_t fluidSiteBlock, site_t neighbourSiteId) const { - site_t SiteId = 0; - // Calculate the site's id over the whole geometry, - for (site_t neighSite = 0; neighSite < GetSitesPerBlock(); neighSite++) - { - if (neighSite == neighbourSiteId) - { - break; + auto& sites = Blocks[fluidSiteBlock].Sites; + return std::count_if( + &sites[0], &sites[neighbourSiteId], [](GeometrySite const& s) { + return s.isFluid; } - else if (Blocks[fluidSiteBlock].Sites[neighSite].targetProcessor != SITE_OR_BLOCK_SOLID) - { - SiteId++; - } - } - - return SiteId; + ); } -} \ No newline at end of file +} diff --git a/Code/geometry/GmyReadResult.h b/Code/geometry/GmyReadResult.h index c8b4bc7b9..77da7c4d7 100644 --- a/Code/geometry/GmyReadResult.h +++ b/Code/geometry/GmyReadResult.h @@ -7,6 +7,7 @@ #define HEMELB_GEOMETRY_GMYREADRESULT_H #include 
+#include #include #include "units.h" @@ -18,6 +19,15 @@ namespace hemelb::geometry { namespace octree { class DistributedStore; } + + // A site description is the block's OCT index (hence U64 to + // match the tree's choice) and the index of the site within + // the block (could be U16, but use 64 as padding will force + // that anyway). + using SiteDesc = std::array; + using SiteVec = std::vector; + using MovesMap = std::map; + /*** * Model of the information in a geometry file */ @@ -61,6 +71,11 @@ namespace hemelb::geometry return (blockI * dimensionsInBlocks.y() + blockJ) * dimensionsInBlocks.z() + blockK; } + inline U64 GetBlockIdFromBlockCoordinates(Vec16 ijk) const { + auto IJK = ijk.as(); + return (IJK[0] * dimensionsInBlocks[1] + IJK[1]) * dimensionsInBlocks[2] + IJK[2]; + } + /*** * Get the coordinates of a block from a block id. */ @@ -136,5 +151,53 @@ namespace hemelb::geometry std::unique_ptr block_store; }; + // This iterator will move through the sites that make up a + // block. On dereference it returns a pair of the current 3D + // position and the 1D index in geometry file order. + struct BlockSiteIterator { + GmyReadResult const* gmy; + Vec16 pos; + site_t idx; + + using value_type = std::pair; + inline value_type operator*() const { + return {pos, idx}; + } + + inline BlockSiteIterator& operator++() { + auto B = gmy->GetBlockSize(); + idx++; + if (++pos[2] == B) { + pos[2] = 0; + if (++pos[1] == B) { + pos[1] = 0; + ++pos[0]; + } + } + return *this; + } + + inline friend bool operator==(BlockSiteIterator const& l, BlockSiteIterator const& r) { + return l.idx == r.idx; + } + }; + + // Represent the range of valid sites in a block that belongs to + // the GmyReadResult. + struct BlockSiteRange { + GmyReadResult const* gmy; + auto begin() const { + return BlockSiteIterator{gmy, {0,0,0}, 0}; + } + auto end() const { + auto B = gmy->GetBlockSize(); + return BlockSiteIterator{gmy, {B, B, B}, B*B*B}; + } + }; + + // Helper for the above. 
+ inline auto IterSitesInBlock(GmyReadResult const& gmy) { + return BlockSiteRange{&gmy}; + } } #endif // HEMELB_GEOMETRY_GMYREADRESULT_H diff --git a/Code/geometry/LookupTree.h b/Code/geometry/LookupTree.h index a57d5d962..9b116e1f2 100644 --- a/Code/geometry/LookupTree.h +++ b/Code/geometry/LookupTree.h @@ -297,6 +297,10 @@ namespace hemelb::geometry::octree return storage_rank.size(); } + [[nodiscard]] inline auto& GetBlockOwnerRank() const { + return storage_rank; + } + // Allow many writes to the same block, wherever it is. // Constructed by a WriteSession below struct BlockWriteChunk { diff --git a/Code/geometry/decomposition/BasicDecomposition.cc b/Code/geometry/decomposition/BasicDecomposition.cc index 91ce68ee0..dd8828162 100644 --- a/Code/geometry/decomposition/BasicDecomposition.cc +++ b/Code/geometry/decomposition/BasicDecomposition.cc @@ -79,8 +79,8 @@ namespace hemelb::geometry::decomposition rank_for_block.begin(), rank_for_block.end(), comm_size); // We need to return data organised in GMY file order - // Initialise output to -1 => SOLID, we will overwrite the non-solid below - std::fill(procAssignedToEachBlock.begin(), procAssignedToEachBlock.end(), -1); + // Initialise output to SOLID, we will overwrite the non-solid below + std::fill(procAssignedToEachBlock.begin(), procAssignedToEachBlock.end(), SITE_OR_BLOCK_SOLID); // We need to know the octree ID to map to 3D coords. 
auto& block_oct_ids = tree.levels[tree.n_levels].node_ids; @@ -95,7 +95,7 @@ namespace hemelb::geometry::decomposition return rank_for_block; } - void BasicDecomposition::Validate(std::vector& procAssignedToEachBlock, net::MpiCommunicator const& communicator) const + void BasicDecomposition::Validate(std::vector& procAssignedToEachBlock, net::MpiCommunicator const& communicator) const { log::Logger::Log("Validating procForEachBlock"); diff --git a/Code/geometry/decomposition/OptimisedDecomposition.cc b/Code/geometry/decomposition/OptimisedDecomposition.cc index 70c7f1aa8..e216f062d 100644 --- a/Code/geometry/decomposition/OptimisedDecomposition.cc +++ b/Code/geometry/decomposition/OptimisedDecomposition.cc @@ -6,34 +6,35 @@ #include "geometry/ParmetisHeader.h" #include "geometry/decomposition/OptimisedDecomposition.h" #include "geometry/decomposition/DecompositionWeights.h" +#include "geometry/LookupTree.h" + #include "lb/lattices/D3Q27.h" #include "log/Logger.h" -#include "net/net.h" +#include "net/SparseExchange.h" +#include "util/Iterator.h" +#include "util/span.h" namespace hemelb::geometry::decomposition { - OptimisedDecomposition::OptimisedDecomposition( - reporting::Timers& timers, net::MpiCommunicator& comms, const GmyReadResult& geometry, - const lb::LatticeInfo& latticeInfo, const std::vector& procForEachBlock, - const std::vector& fluidSitesOnEachBlock) : - timers(timers), comms(comms), geometry(geometry), latticeInfo(latticeInfo), - procForEachBlock(procForEachBlock), fluidSitesPerBlock(fluidSitesOnEachBlock) - { - timers[hemelb::reporting::Timers::InitialGeometryRead].Start(); //overall dbg timing + using namespace util; + + OptimisedDecomposition::OptimisedDecomposition( + reporting::Timers& timers, + net::MpiCommunicator c, + const GmyReadResult& geometry, + const lb::LatticeInfo& latticeInfo + ) : timers(timers), comms(std::move(c)), geometry(geometry), + tree(geometry.block_store->GetTree()), latticeInfo(latticeInfo), + 
procForBlockOct(geometry.block_store->GetBlockOwnerRank()), + fluidSitesPerBlockOct(tree.levels.back().sites_per_node) + { + timers[reporting::Timers::InitialGeometryRead].Start(); //overall dbg timing // Calculate the site distribution and validate if appropriate. PopulateSiteDistribution(); - if (ShouldValidate()) - { + if constexpr (build_info::VALIDATE_GEOMETRY) { ValidateVertexDistribution(); - } - - // Calculate the firstSiteIndexOnEachBlock and validate if appropriate - PopulateFirstSiteIndexOnEachBlock(); - - if (ShouldValidate()) - { ValidateFirstSiteIndexOnEachBlock(); } @@ -42,45 +43,29 @@ namespace hemelb::geometry::decomposition PopulateAdjacencyData(localVertexCount); - if (ShouldValidate()) - { + if constexpr (build_info::VALIDATE_GEOMETRY) { ValidateAdjacencyData(localVertexCount); } log::Logger::Log("Adj length %i", localAdjacencies.size()); - timers[hemelb::reporting::Timers::InitialGeometryRead].Stop(); + timers[reporting::Timers::InitialGeometryRead].Stop(); // Call parmetis. - timers[hemelb::reporting::Timers::parmetis].Start(); + timers[reporting::Timers::parmetis].Start(); log::Logger::Log("Making the call to Parmetis"); - - bool do_decomposition = true; -#ifdef HEMELB_NO_DECOMPOSITION - do_decomposition = false; -#endif - - if (do_decomposition) - { - CallParmetis(localVertexCount); - timers[hemelb::reporting::Timers::parmetis].Stop(); - log::Logger::Log("Parmetis has finished."); - - // Convert the ParMetis results into a nice format. - timers[hemelb::reporting::Timers::PopulateOptimisationMovesList].Start(); - log::Logger::Log("Getting moves lists for this core."); - PopulateMovesList(); - } -#ifdef HEMELB_NO_DECOMPOSITION - else - { - LoadDecomposition(); //this is a modified version of PopulateMovesList(); - } -#endif - + CallParmetis(localVertexCount); + timers[reporting::Timers::parmetis].Stop(); + log::Logger::Log("Parmetis has finished."); + + // Now each process knows which rank all its sites belong + // on. 
Tell the destination ranks which sites they need. + timers[reporting::Timers::PopulateOptimisationMovesList].Start(); + log::Logger::Log("Getting moves lists for this core."); + PopulateMovesList(); log::Logger::Log("Done getting moves lists for this core"); - timers[hemelb::reporting::Timers::PopulateOptimisationMovesList].Stop(); - } + timers[reporting::Timers::PopulateOptimisationMovesList].Stop(); + } void OptimisedDecomposition::CallParmetis(idx_t localVertexCount) { @@ -112,22 +97,24 @@ namespace hemelb::geometry::decomposition // Weight all vertices evenly. //std::vector < idx_t > vertexWeight(localVertexCount, 1); - // Populate the vertex weight data arrays (for ParMetis) and print out the number of different fluid sites on each core + // Populate the vertex weight data arrays (for ParMetis) and + // print out the number of different fluid sites on each core PopulateVertexWeightData(localVertexCount); - // Set the weights of each partition to be even, and to sum to 1. - idx_t desiredPartitionSize = comms.Size(); + // Going to follow ParMETIS docs for naming parameters. Comments to make less cryptic! - std::vector domainWeights(desiredPartitionSize, - (real_t) (1.0) / ((real_t) (desiredPartitionSize))); + // Desired number of sub domains. + idx_t ncon = 1; // Number of constraints + idx_t wgtflag = 2; // Indicate weighting 2 => vertices only + idx_t numflag = 0; // Numbering scheme 0 => C style from 0 + idx_t nparts = comms.Size(); + + // Set the weights of each partition to be even, and to sum to 1. + std::vector tpwgts(nparts, real_t(1.0) / real_t(nparts)); // A bunch of values ParMetis needs. - idx_t noConstraints = 1; - idx_t weightFlag = 2; - idx_t numberingFlag = 0; idx_t edgesCut = 0; idx_t options[4] = { 0, 0, 0, 0 }; - if (ShouldValidate()) - { + if constexpr (build_info::VALIDATE_GEOMETRY) { // Specify that some options are set and that we should // debug everything. 
// Specify that we have set some options @@ -142,32 +129,33 @@ namespace hemelb::geometry::decomposition // info on remappining (64) options[1] = 1 | 2 | 4 | 8 | 32 | 64; } - real_t tolerance = 1.001F; - log::Logger::Log("Calling ParMetis"); - // Reserve 1 on these vectors so that the reference to their first element - // exists (even if it's unused). - // Reserve on the vectors to be certain they're at least 1 in capacity (so &vector[0] works) - partitionVector.reserve(1); - vtxDistribn.reserve(1); - adjacenciesPerVertex.reserve(1); - localAdjacencies.reserve(1); - vertexWeights.reserve(1); + // Tolerance. 1 => perfect balance, npars => perfect imbalance. + // Docs recommend 1.05 so we are being quite strict here. + real_t ubvec = 1.001F; MPI_Comm communicator = comms; - ParMETIS_V3_PartKway(&vtxDistribn[0], - &adjacenciesPerVertex[0], - &localAdjacencies[0], - &vertexWeights[0], - nullptr, - &weightFlag, - &numberingFlag, - &noConstraints, - &desiredPartitionSize, - &domainWeights[0], - &tolerance, - options, - &edgesCut, - &partitionVector[0], - &communicator); + + log::Logger::Log("Calling ParMetis"); + int err = ParMETIS_V3_PartKway( + vtxDistribn.data(), + adjacenciesPerVertex.data(), + localAdjacencies.data(), + vertexWeights.data(), + nullptr, // adjwgt, adjacency weights + &wgtflag, + &numflag, + &ncon, + &nparts, + tpwgts.data(), + &ubvec, + options, + &edgesCut, + partitionVector.data(), + &communicator + ); + + if (err != METIS_OK) { + throw Exception() << "ParMETIS error"; + } log::Logger::Log("ParMetis returned."); if (comms.Rank() == comms.Size() - 1) @@ -182,1097 +170,451 @@ namespace hemelb::geometry::decomposition } } - void OptimisedDecomposition::PopulateVertexWeightData(idx_t localVertexCount) - { + void OptimisedDecomposition::PopulateVertexWeightData(idx_t localVertexCount) + { // These counters will be used later on to count the number of each type of vertex site - int FluidSiteCounter = 0, WallSiteCounter = 0, IOSiteCounter = 0, 
WallIOSiteCounter = 0; - int localweight = 1; + std::array siteCounters; + std::fill(begin(siteCounters), end(siteCounters), 0); + vertexWeights.resize(localVertexCount); + idx_t i_wgt = 0; // For each block (counting up by lowest site id)... - auto const& block_dims = geometry.GetBlockDimensions(); - for (site_t blockI = 0; blockI < block_dims.x(); blockI++) - { - for (site_t blockJ = 0; blockJ < block_dims.y(); blockJ++) - { - for (site_t blockK = 0; blockK < block_dims.z(); blockK++) - { - const site_t blockNumber = geometry.GetBlockIdFromBlockCoordinates(blockI, - blockJ, - blockK); - - // Only consider sites on this processor. - if (procForEachBlock[blockNumber] != comms.Rank()) - { + for (auto [block_idx, block_rank]: enumerate_with(procForBlockOct)) { + if (block_rank != comms.Rank()) + // Only consider sites on this procesr. continue; - } - const BlockReadResult& blockReadResult = geometry.Blocks[blockNumber]; - - site_t m = -1; - const int block_size = geometry.GetBlockSize(); - const int blockXCoord = blockI * block_size; - const int blockYCoord = blockJ * block_size; - const int blockZCoord = blockK * block_size; - - // ... iterate over sites within the block... - for (site_t localSiteI = 0; localSiteI < block_size; localSiteI++) - { - for (site_t localSiteJ = 0; localSiteJ < block_size; localSiteJ++) - { - for (site_t localSiteK = 0; localSiteK < block_size; localSiteK++) - { - ++m; - - // ... only looking at non-solid sites... - if (blockReadResult.Sites[m].targetProcessor == SITE_OR_BLOCK_SOLID) - { - continue; + auto block_ijk = tree.GetLeafCoords(block_idx); + auto block_gmy = geometry.GetBlockIdFromBlockCoordinates(block_ijk); + + const BlockReadResult& blockReadResult = geometry.Blocks[block_gmy]; + + // util::Vector3D block_coord = BS * block_ijk; + // ... iterate over sites within the block... + for (auto [local_site_ijk, m]: IterSitesInBlock(geometry)) { + // ... only looking at non-solid sites... 
+ if (blockReadResult.Sites[m].targetProcessor == SITE_OR_BLOCK_SOLID) + continue; + + SiteData siteData(blockReadResult.Sites[m]); + int site_type_i = [&] () { + switch (siteData.GetCollisionType()) { + case FLUID: + return 0; + case WALL: + return 1; + case INLET: + return 2; + case OUTLET: + return 3; + case (INLET | WALL): + return 4; + case (OUTLET | WALL): + return 5; + default: + throw Exception() << "Bad collision type"; } - - //Getting Site ID to be able to identify site type - site_t localSiteId = geometry.GetSiteIdFromSiteCoordinates(localSiteI, - localSiteJ, - localSiteK); - - //Switch structure which identifies site type and assigns the proper weight to each vertex - - SiteData siteData(blockReadResult.Sites[localSiteId]); - - switch (siteData.GetCollisionType()) - { - case FLUID: - localweight = hemelbSiteWeights[0]; - ++FluidSiteCounter; - break; - - case WALL: - localweight = hemelbSiteWeights[1]; - ++WallSiteCounter; - break; - - case INLET: - localweight = hemelbSiteWeights[2]; - ++IOSiteCounter; - break; - - case OUTLET: - localweight = hemelbSiteWeights[3]; - ++IOSiteCounter; - break; - - case (INLET | WALL): - localweight = hemelbSiteWeights[4]; - ++WallIOSiteCounter; - break; - - case (OUTLET | WALL): - localweight = hemelbSiteWeights[5]; - ++WallIOSiteCounter; - break; - } - - vertexWeights.push_back(localweight); - vertexCoordinates.push_back(blockXCoord + localSiteI); - vertexCoordinates.push_back(blockYCoord + localSiteJ); - vertexCoordinates.push_back(blockZCoord + localSiteK); - - } - - } - - } - + }(); + ++siteCounters[site_type_i]; + vertexWeights[i_wgt++] = hemelbSiteWeights[site_type_i]; } - - } } - int TotalCoreWeight = ( (FluidSiteCounter * hemelbSiteWeights[0]) - + (WallSiteCounter * hemelbSiteWeights[1]) + (IOSiteCounter * hemelbSiteWeights[2]) - + (WallIOSiteCounter * hemelbSiteWeights[4])) / hemelbSiteWeights[0]; - int TotalSites = FluidSiteCounter + WallSiteCounter + WallIOSiteCounter; + if (i_wgt != localVertexCount) + throw 
Exception() << "Wrong number of vertices: expected " << localVertexCount << " got " << i_wgt; + + int TotalCoreWeight = std::inner_product(begin(siteCounters), end(siteCounters), + hemelbSiteWeights, 0); + int TotalSites = std::reduce(begin(siteCounters), end(siteCounters), 0); log::Logger::Log("There are %u Bulk Flow Sites, %u Wall Sites, %u IO Sites, %u WallIO Sites on core %u. Total: %u (Weighted %u Points)", - FluidSiteCounter, - WallSiteCounter, - IOSiteCounter, - WallIOSiteCounter, + siteCounters[0], + siteCounters[1], + siteCounters[2] + siteCounters[3], + siteCounters[4] + siteCounters[5], comms.Rank(), TotalSites, TotalCoreWeight); - } - - void OptimisedDecomposition::PopulateSiteDistribution() - { - vtxDistribn.resize(comms.Size() + 1, 0); - // Firstly, count the sites per processor. Do this off-by-one - // to be compatible with ParMetis. - for (site_t block = 0; block < geometry.GetBlockCount(); ++block) - { - if (procForEachBlock[block] >= 0) - { - vtxDistribn[1 + procForEachBlock[block]] += (idx_t) (fluidSitesPerBlock[block]); - } - } + } - // Now make the count cumulative, again off-by-one. - for (proc_t rank = 0; rank < comms.Size(); ++rank) - { - vtxDistribn[rank + 1] += vtxDistribn[rank]; - } - } - - void OptimisedDecomposition::PopulateFirstSiteIndexOnEachBlock() - { - // First calculate the lowest site index on each proc - relatively easy. - std::vector firstSiteOnProc(vtxDistribn); - // Now for each block (in ascending order), the smallest site index is the smallest site - // index on its processor, incremented by the number of sites observed from that processor - // so far. 
- for (site_t block = 0; block < geometry.GetBlockCount(); ++block) - { - proc_t proc = procForEachBlock[block]; - if (proc < 0) - { - firstSiteIndexPerBlock.push_back(-1); - } - else - { - firstSiteIndexPerBlock.push_back(firstSiteOnProc[proc]); - firstSiteOnProc[proc] += (idx_t) (fluidSitesPerBlock[block]); - } + void OptimisedDecomposition::PopulateSiteDistribution() + { + // Parmetis needs to know (on all processes) how + // vertices/sites are distributed across the processes. + // + // The `vtxdist` array has P+1 elements. + // + // `vtxdist[i]` gives the total number of vertices that are on + // processes with rank less than `i`. + // + // `vtxdist[0]` is always zero + // + // `vtxdist[P]` has the total number of vertices + // + // `vtxdist[i+1] - vtxdist[i]` gives the number of vertices on + // process i + // + // We also need to know what vertex indices to assign to each + // site. Do this contiguously for the sites within each block + // per-process. + // + // We will also need to be able to translate between a site's + // GMY index in a block and it's fluid-only index. + + auto& procForBlockOct = geometry.block_store->GetBlockOwnerRank(); + auto& tree = geometry.block_store->GetTree(); + auto& fluidSitesPerBlockOct = tree.levels[tree.n_levels].sites_per_node; + auto const NBLOCKS = geometry.block_store->GetBlockCount(); + + // First, count the sites per process and assign contiguous + // IDs to the sites in each block. 
+ U64 total_sites = 0; + vtxCountPerProc = std::vector(comms.Size(), 0); + firstSiteIndexPerBlockOct.resize(NBLOCKS + 1); + + for (site_t block = 0; block < NBLOCKS; ++block) { + auto block_rank = procForBlockOct[block]; + firstSiteIndexPerBlockOct[block] = total_sites; + vtxCountPerProc[block_rank] += fluidSitesPerBlockOct[block]; + total_sites += fluidSitesPerBlockOct[block]; + } + firstSiteIndexPerBlockOct[NBLOCKS] = total_sites; + + // Now do the scan to setup vtxdist + vtxDistribn = std::vector(comms.Size() + 1); + vtxDistribn[0] = 0; + std::inclusive_scan( + vtxCountPerProc.begin(), vtxCountPerProc.end(), + vtxDistribn.begin() + 1, + std::plus() + ); + + // Now, for every block we've got data for, create the mapping + // from block OCT id to a vector of the GMY local site ids of + // the fluid sites. + for (auto [block_idx, block_rank]: enumerate_with(procForBlockOct)) { + auto block_ijk = tree.GetLeafCoords(block_idx); + auto block_gmy = geometry.GetBlockIdFromBlockCoordinates(block_ijk); + auto& blockReadResult = geometry.Blocks[block_gmy]; + if (blockReadResult.Sites.empty()) + continue; + std::vector& block_fluid_site_gmy_ids = gmySiteIdForBlockOct[block_idx]; + for (auto const& [i, s]: util::enumerate_with(blockReadResult.Sites)) { + // ... only looking at non-solid sites... + if (s.targetProcessor != SITE_OR_BLOCK_SOLID) { + block_fluid_site_gmy_ids.push_back(i); + } + } } - } + } - void OptimisedDecomposition::PopulateAdjacencyData(idx_t localVertexCount) - { + void OptimisedDecomposition::PopulateAdjacencyData(idx_t localVertexCount) + { adjacenciesPerVertex.push_back(0); - // TODO: Reimplement using traversers or iterators. - // For each block (counting up by lowest site id)... 
auto const& block_dims = geometry.GetBlockDimensions(); - for (site_t blockI = 0; blockI < block_dims.x(); blockI++) - { - for (site_t blockJ = 0; blockJ < block_dims.y(); blockJ++) - { - for (site_t blockK = 0; blockK < block_dims.z(); blockK++) - { - const site_t blockNumber = geometry.GetBlockIdFromBlockCoordinates(blockI, - blockJ, - blockK); - // ... considering only the ones which live on this proc... - if (procForEachBlock[blockNumber] != comms.Rank()) - { + auto const BS = geometry.GetBlockSize(); + // For range check below + auto const lo_site = Vector3D::Zero(); + auto const hi_site = block_dims.as() * BS - Vector3D::Ones(); + + // For each block (counting up by lowest site id)... + for (auto [block_idx, block_rank]: enumerate_with(procForBlockOct)) { + if (block_rank != comms.Rank()) + // ... considering only the ones which live on this proc... continue; - } - const BlockReadResult& blockReadResult = geometry.Blocks[blockNumber]; - site_t m = -1; - // ... iterate over sites within the block... - for (site_t localSiteI = 0; localSiteI < geometry.GetBlockSize(); localSiteI++) - { - for (site_t localSiteJ = 0; localSiteJ < geometry.GetBlockSize(); localSiteJ++) - { - for (site_t localSiteK = 0; localSiteK < geometry.GetBlockSize(); localSiteK++) - { - ++m; - // ... only looking at non-solid sites... - if (blockReadResult.Sites[m].targetProcessor == SITE_OR_BLOCK_SOLID) - { - continue; - } - // ... for each lattice direction... - for (unsigned int l = 1; l < latticeInfo.GetNumVectors(); l++) - { - // ... which leads to a valid neighbouring site... 
- site_t neighbourI = blockI * geometry.GetBlockSize() + localSiteI - + latticeInfo.GetVector(l).x(); - site_t neighbourJ = blockJ * geometry.GetBlockSize() + localSiteJ - + latticeInfo.GetVector(l).y(); - site_t neighbourK = blockK * geometry.GetBlockSize() + localSiteK - + latticeInfo.GetVector(l).z(); - if (neighbourI < 0 || neighbourJ < 0 || neighbourK < 0 - || neighbourI - >= (geometry.GetBlockSize() * block_dims.x()) - || neighbourJ - >= (geometry.GetBlockSize() * block_dims.y()) - || neighbourK - >= (geometry.GetBlockSize() * block_dims.z())) - { + + auto block_ijk = tree.GetLeafCoords(block_idx); + auto block_gmy = geometry.GetBlockIdFromBlockCoordinates(block_ijk); + + auto& blockReadResult = geometry.Blocks[block_gmy]; + for (auto [local_site_ijk, m]: IterSitesInBlock(geometry)) { + // ... only looking at non-solid sites... + if (blockReadResult.Sites[m].targetProcessor == SITE_OR_BLOCK_SOLID) + continue; + + auto global_site_ijk = block_ijk.as() * BS + local_site_ijk; + // ... for each lattice direction (recalling vector[0] == {0,0,0}) ... + for (unsigned int l = 1; l < latticeInfo.GetNumVectors(); l++) { + // ... which leads to a valid neighbouring site... + auto neigh_ijk = global_site_ijk + latticeInfo.GetVector(l); + if (!neigh_ijk.IsInRange(lo_site, hi_site)) continue; - } - // ... (that is actually being simulated and not a solid)... 
- site_t neighbourBlockI = neighbourI / geometry.GetBlockSize(); - site_t neighbourBlockJ = neighbourJ / geometry.GetBlockSize(); - site_t neighbourBlockK = neighbourK / geometry.GetBlockSize(); - site_t neighbourSiteI = neighbourI % geometry.GetBlockSize(); - site_t neighbourSiteJ = neighbourJ % geometry.GetBlockSize(); - site_t neighbourSiteK = neighbourK % geometry.GetBlockSize(); - site_t neighbourBlockId = - geometry.GetBlockIdFromBlockCoordinates(neighbourBlockI, - neighbourBlockJ, - neighbourBlockK); - const BlockReadResult& neighbourBlock = geometry.Blocks[neighbourBlockId]; - site_t neighbourSiteId = - geometry.GetSiteIdFromSiteCoordinates(neighbourSiteI, - neighbourSiteJ, - neighbourSiteK); - if (neighbourBlock.Sites.empty() - || neighbourBlock.Sites[neighbourSiteId].targetProcessor == SITE_OR_BLOCK_SOLID) - { + + // ... (that is actually being simulated and not a solid)... + auto neigh_block = Vec16(neigh_ijk / BS); + auto neigh_idx = tree.GetPath(neigh_block).leaf(); + // Recall that the octree will return a path + // with "no child" for all levels where the + // requested node doesn't exist. + if (neigh_idx == octree::Level::NC) continue; - } - site_t neighGlobalSiteId = firstSiteIndexPerBlock[neighbourBlockId] - + geometry.FindFluidSiteIndexInBlock(neighbourBlockId, neighbourSiteId); + auto [ni, nj, nk] = neigh_ijk % BS; + U16 neighbourSiteId = geometry.GetSiteIdFromSiteCoordinates(ni, nj, nk); - // then add this to the list of adjacencies. 
- localAdjacencies.push_back((idx_t) (neighGlobalSiteId)); - } + // This is the GMY local site IDs for only the + // fluid sites on the neighbour block + auto const& neigh_gmy_sites = gmySiteIdForBlockOct[neigh_idx]; + auto lb = std::lower_bound(neigh_gmy_sites.begin(), neigh_gmy_sites.end(), neighbourSiteId); + // lb is either end or the first elem greater than or equal to what we want + // end => SOLID; greater than or equal => SOLID + if (lb == neigh_gmy_sites.end() || *lb != neighbourSiteId) + continue; + // have equal + auto local_contig_idx = std::distance(neigh_gmy_sites.begin(), lb); + U64 neighGlobalSiteId = firstSiteIndexPerBlockOct[neigh_idx] + local_contig_idx; - // The cumulative count of adjacencies for this vertex is equal to the total - // number of adjacencies we've entered. - // NOTE: The prefix operator is correct here because - // the array has a leading 0 not relating to any site. - adjacenciesPerVertex.push_back(localAdjacencies.size()); - } + // then add this to the list of adjacencies. + localAdjacencies.push_back(idx_t(neighGlobalSiteId)); } - } + + // The cumulative count of adjacencies for this vertex is equal to the total + // number of adjacencies we've entered. + // NOTE: The prefix operator is correct here because + // the array has a leading 0 not relating to any site. + adjacenciesPerVertex.push_back(localAdjacencies.size()); } - } } - } + } - std::vector OptimisedDecomposition::CompileMoveData( - std::map& blockIdLookupByLastSiteIndex) - { - // Right. Let's count how many sites we're going to have to move. Count the local number of - // sites to be moved, and collect the site id and the destination processor. - std::vector moveData; - const idx_t myLowest = vtxDistribn[comms.Rank()]; - const idx_t myHighest = vtxDistribn[comms.Rank() + 1] - 1; + // We are returning a map (keyed by the process rank) of where + // each site that starts on this process should end up. 
Every site + // is here and the values of the map (vectors) will be sorted, + // first by block ID and then by site ID (note this for use in the + // geometry reader!) + auto OptimisedDecomposition::CompileMoveData() -> MovesMap + { + // Key is the destination rank, value is the list of [block, site id] pairs + std::map>> movesToRank; + std::vector moveData; + auto const myLowest = vtxDistribn[comms.Rank()]; + auto const mySiteCount = vtxDistribn[comms.Rank() + 1] - myLowest; // For each local fluid site... - for (idx_t ii = 0; ii <= (myHighest - myLowest); ++ii) - { - // ... if it's going elsewhere... - if (partitionVector[ii] != comms.Rank()) - { - // ... get its id on the local processor... - idx_t localFluidSiteId = myLowest + ii; - - // ... find out which block it's on, using our lookup map... - - // A feature of std::map::equal_range is that if there's no equal key, both iterators - // returned will point to the entry with the next greatest key. Since we store block - // ids by last fluid site number, this immediately gives us the block id. 
- std::pair::iterator, std::map::iterator> rangeMatch = - blockIdLookupByLastSiteIndex.equal_range(localFluidSiteId); - - site_t fluidSiteBlock = rangeMatch.first->second; - - // Check the block id is correct - if (ShouldValidate()) - { - if (procForEachBlock[fluidSiteBlock] < 0) - { - log::Logger::Log("Found block %i for site %i but this block has a processor of %i assigned", - fluidSiteBlock, - localFluidSiteId, - procForEachBlock[fluidSiteBlock]); - } - if (firstSiteIndexPerBlock[fluidSiteBlock] > localFluidSiteId) - { - log::Logger::Log("Found block %i for site %i but sites on this block start at number %i", - fluidSiteBlock, - localFluidSiteId, - firstSiteIndexPerBlock[fluidSiteBlock]); - } - if (firstSiteIndexPerBlock[fluidSiteBlock] + fluidSitesPerBlock[fluidSiteBlock] - 1 - < localFluidSiteId) - { - log::Logger::Log("Found block %i for site %i but there are %i sites on this block starting at %i", - fluidSiteBlock, - localFluidSiteId, - fluidSitesPerBlock[fluidSiteBlock], - firstSiteIndexPerBlock[fluidSiteBlock]); + // (NB: since the vertices/sites are sorted by block and then + // site, the results are also so sorted.) + for (idx_t ii = 0; ii < mySiteCount; ++ii) + { + // Where should it be? Catalogue even sites staying here for simplicity + auto dest_rank = partitionVector[ii]; + + idx_t global_site_id = myLowest + ii; + // Find out which block it's on. + + // Upper bound gives the iterator to the first one after + // (recalling that we have an extra entry with the total + // number of the fluid sites at the end). 
+ auto after = std::upper_bound( + firstSiteIndexPerBlockOct.begin(), firstSiteIndexPerBlockOct.end(), + global_site_id + ); + U64 const block_idx = std::distance(firstSiteIndexPerBlockOct.begin(), after) - 1; + + if constexpr (build_info::VALIDATE_GEOMETRY) { + // Check the block id is correct + if ( + global_site_id < firstSiteIndexPerBlockOct[block_idx] || + global_site_id >= firstSiteIndexPerBlockOct[block_idx+1]) { + log::Logger::Log( + "Found block %i for site %i but sites on this block start at number %i and finish before %i", + block_idx, + global_site_id , + firstSiteIndexPerBlockOct[block_idx], + firstSiteIndexPerBlockOct[block_idx + 1] + ); } } // ... and find its site id within that block. Start by working out how many fluid sites // we have to pass before we arrive at the fluid site we're after... - site_t fluidSitesToPass = localFluidSiteId - firstSiteIndexPerBlock[fluidSiteBlock]; - site_t siteIndex = 0; - - siteIndex = geometry.FindSiteIndexInBlock(fluidSiteBlock, fluidSitesToPass); - - // The above code could go wrong, so in debug logging mode, we do some extra tests. - if (ShouldValidate()) - { - // If we've ended up on an impossible block, or one that doesn't live on this rank, - // inform the user. - if (fluidSiteBlock >= geometry.GetBlockCount() - || procForEachBlock[fluidSiteBlock] != comms.Rank()) - { - log::Logger::Log("Partition element %i wrongly assigned to block %u of %i (block on processor %i)", - ii, - fluidSiteBlock, - geometry.GetBlockCount(), - procForEachBlock[fluidSiteBlock]); - } - // Similarly, if we've ended up with an impossible site index, or a solid site, - // print an error message. 
- if (siteIndex >= geometry.GetSitesPerBlock() - || geometry.Blocks[fluidSiteBlock].Sites[siteIndex].targetProcessor - == SITE_OR_BLOCK_SOLID) - { - log::Logger::Log("Partition element %i wrongly assigned to site %u of %i (block %i%s)", - ii, - siteIndex, - fluidSitesPerBlock[fluidSiteBlock], - fluidSiteBlock, - geometry.Blocks[fluidSiteBlock].Sites[siteIndex].targetProcessor - == SITE_OR_BLOCK_SOLID ? - " and site is solid" : - ""); - } - } + site_t block_site_id = global_site_id - firstSiteIndexPerBlockOct[block_idx]; + auto siteIndexGmy = gmySiteIdForBlockOct[block_idx][block_site_id]; // Add the block, site and destination rank to our move list. - moveData.push_back(fluidSiteBlock); - moveData.push_back(siteIndex); - moveData.push_back(partitionVector[ii]); - } - - } - - return moveData; - } - - void OptimisedDecomposition::ForceSomeBlocksOnOtherCores( - std::vector& moveData, - std::map >& blockIdsIRequireFromX) - { - timers[hemelb::reporting::Timers::moveForcingNumbers].Start(); - - net::Net netForMoveSending(comms); - - // We also need to force some data upon blocks, i.e. when they're receiving data from a new - // block they didn't previously want to know about. - std::map > blockForcedUponX; - std::vector numberOfBlocksIForceUponX(comms.Size(), 0); - for (idx_t moveNumber = 0; moveNumber < (idx_t) (moveData.size()); moveNumber += 3) - { - proc_t target_proc = moveData[moveNumber + 2]; - site_t blockId = moveData[moveNumber]; - if (std::count(blockForcedUponX[target_proc].begin(), - blockForcedUponX[target_proc].end(), - blockId) == 0) - { - blockForcedUponX[target_proc].push_back(blockId); - ++numberOfBlocksIForceUponX[target_proc]; - log::Logger::Log("I'm ensuring proc %i takes data about block %i", - target_proc, - blockId); - } - } - - // Now find how many blocks are being forced upon us from every other core. 
- log::Logger::Log("Moving forcing block numbers"); - std::vector blocksForcedOnMe = comms.AllToAll(numberOfBlocksIForceUponX); - timers[hemelb::reporting::Timers::moveForcingNumbers].Stop(); - - timers[hemelb::reporting::Timers::moveForcingData].Start(); - // Now get all the blocks being forced upon me. - std::map > blocksForcedOnMeByEachProc; - for (proc_t otherProc = 0; otherProc < (proc_t) ( ( ( ( (comms.Size()))))); ++otherProc) - { - if (blocksForcedOnMe[otherProc] > 0) - { - blocksForcedOnMeByEachProc[otherProc] = - std::vector(blocksForcedOnMe[otherProc]); - netForMoveSending.RequestReceiveV(std::span(blocksForcedOnMeByEachProc[otherProc]), otherProc); - } - if (numberOfBlocksIForceUponX[otherProc] > 0) - { - netForMoveSending.RequestSendV(std::span(blockForcedUponX[otherProc]), otherProc); - } - log::Logger::Log("I'm forcing %i blocks on proc %i.", - numberOfBlocksIForceUponX[otherProc], - otherProc); - } - - log::Logger::Log("Moving forcing block ids"); - netForMoveSending.Dispatch(); - // Now go through every block forced upon me and add it to the list of ones I want. - for (proc_t otherProc = 0; otherProc < (proc_t) ( ( ( ( (comms.Size()))))); ++otherProc) - { - if (blocksForcedOnMe[otherProc] > 0) - { - for (auto it = blocksForcedOnMeByEachProc[otherProc].begin(); - it != blocksForcedOnMeByEachProc[otherProc].end(); ++it) - { - if (std::count(blockIdsIRequireFromX[otherProc].begin(), - blockIdsIRequireFromX[otherProc].end(), - *it) == 0) - { - blockIdsIRequireFromX[otherProc].push_back(*it); - log::Logger::Log("I'm being forced to take block %i from proc %i", - *it, - otherProc); - } - // We also need to take all neighbours of the forced block from their processors. - auto blockCoords = geometry.GetBlockCoordinatesFromBlockId(*it).as(); - // Iterate over every direction we might need (except 0 as we obviously already have - // that block in the list). 
- for (Direction direction = 1; direction < lb::D3Q27::NUMVECTORS; - ++direction) - { - // Calculate the putative neighbour's coordinates... - auto neighbourCoords = (blockCoords + lb::D3Q27::VECTORS[direction]).as(); - // If the neighbour is a real block... - if (geometry.AreBlockCoordinatesValid(neighbourCoords)) - { - // Get the block id, and check whether it has any fluid sites... - site_t neighbourBlockId = - geometry.GetBlockIdFromBlockCoordinates(neighbourCoords.x(), - neighbourCoords.y(), - neighbourCoords.z()); - proc_t neighbourBlockProc = procForEachBlock[neighbourBlockId]; - if (neighbourBlockProc >= 0) - { - // Check whether this is a block we're already interested in from that neighbour. - if (std::count(blockIdsIRequireFromX[neighbourBlockProc].begin(), - blockIdsIRequireFromX[neighbourBlockProc].end(), - neighbourBlockId) == 0) - { - // Then add it to the list of blocks we're getting from that neighbour. - blockIdsIRequireFromX[neighbourBlockProc].push_back(neighbourBlockId); - log::Logger::Log("I need to also take block %i from proc %i", - neighbourBlockId, - neighbourBlockProc); - } - } - - } - - } - - } - - } - - } - - timers[hemelb::reporting::Timers::moveForcingData].Stop(); - } - - void OptimisedDecomposition::GetBlockRequirements( - std::vector& numberOfBlocksRequiredFrom, - std::map >& blockIdsIRequireFromX, - std::vector& numberOfBlocksXRequiresFromMe, - std::map >& blockIdsXRequiresFromMe) - { - log::Logger::Log("Calculating block requirements"); - timers[hemelb::reporting::Timers::blockRequirements].Start(); - // Populate numberOfBlocksRequiredFrom - for (proc_t otherProc = 0; otherProc < (proc_t) ( ( ( ( (comms.Size()))))); ++otherProc) - { - numberOfBlocksRequiredFrom[otherProc] = blockIdsIRequireFromX.count(otherProc) == 0 ? - 0 : - blockIdsIRequireFromX[otherProc].size(); - log::Logger::Log("I require a total of %i blocks from proc %i", - numberOfBlocksRequiredFrom[otherProc], - otherProc); - } - // Now perform the exchange s.t. 
each core knows how many blocks are required of it from - // each other core. - numberOfBlocksXRequiresFromMe = comms.AllToAll(numberOfBlocksRequiredFrom); - - // Awesome. Now we need to get a list of all the blocks wanted from each core by each other - // core. - net::Net netForMoveSending(comms); - for (proc_t otherProc = 0; otherProc < (proc_t) ( ( ( ( (comms.Size()))))); ++otherProc) - { - blockIdsXRequiresFromMe[otherProc] = - std::vector(numberOfBlocksXRequiresFromMe[otherProc]); - log::Logger::Log("Proc %i requires %i blocks from me", - otherProc, - blockIdsXRequiresFromMe[otherProc].size()); - netForMoveSending.RequestReceiveV(std::span(blockIdsXRequiresFromMe[otherProc]), otherProc); - netForMoveSending.RequestSendV(std::span(blockIdsIRequireFromX[otherProc]), otherProc); - } - netForMoveSending.Dispatch(); - timers[hemelb::reporting::Timers::blockRequirements].Stop(); - } - - void OptimisedDecomposition::ShareMoveCounts( - std::map& movesForEachLocalBlock, - std::map >& blockIdsXRequiresFromMe, - std::map >& coresInterestedInEachBlock, - std::vector& moveData, std::map >& moveDataForEachBlock, - std::map >& blockIdsIRequireFromX, - std::vector& movesForEachBlockWeCareAbout) - { - timers[hemelb::reporting::Timers::moveCountsSending].Start(); - // Initialise the moves for each local block to 0. This handles an edge case where a local - // block has no moves. 
- for (site_t blockId = 0; blockId < geometry.GetBlockCount(); ++blockId) - { - if (procForEachBlock[blockId] == comms.Rank()) - { - movesForEachLocalBlock[blockId] = 0; - } - } - - for (proc_t otherProc = 0; otherProc < (proc_t) ( ( ( ( (comms.Size()))))); ++otherProc) - { - for (site_t blockNum = 0; - blockNum < (site_t) ( ( ( ( (blockIdsXRequiresFromMe[otherProc].size()))))); - ++blockNum) - { - site_t blockId = blockIdsXRequiresFromMe[otherProc][blockNum]; - log::Logger::Log("Proc %i requires block %i from me", - otherProc, - blockId); - if (coresInterestedInEachBlock.count(blockId) == 0) - { - coresInterestedInEachBlock[blockId] = std::vector(); - } - coresInterestedInEachBlock[blockId].push_back(otherProc); - } - - } - - for (site_t moveNumber = 0; moveNumber < (site_t) ( ( ( ( (moveData.size()))))); - moveNumber += 3) - { - site_t blockId = moveData[moveNumber]; - if (moveDataForEachBlock.count(blockId) == 0) - { - moveDataForEachBlock[blockId] = std::vector(); - } - moveDataForEachBlock[blockId].push_back(blockId); - moveDataForEachBlock[blockId].push_back(moveData[moveNumber + 1]); - moveDataForEachBlock[blockId].push_back(moveData[moveNumber + 2]); - movesForEachLocalBlock[blockId]++; + movesToRank[dest_rank].push_back({block_idx, siteIndexGmy}); } - net::Net netForMoveSending(comms); - for (proc_t otherProc = 0; otherProc < (proc_t) ( ( ( ( (comms.Size()))))); ++otherProc) - { - for (auto it = blockIdsIRequireFromX[otherProc].begin(); - it != blockIdsIRequireFromX[otherProc].end(); ++it) - { - netForMoveSending.RequestReceiveR(movesForEachBlockWeCareAbout[*it], otherProc); - log::Logger::Log("I want the move count for block %i from proc %i", - *it, - otherProc); - } - for (auto it = blockIdsXRequiresFromMe[otherProc].begin(); - it != blockIdsXRequiresFromMe[otherProc].end(); ++it) - { - netForMoveSending.RequestSendR(movesForEachLocalBlock[*it], otherProc); - log::Logger::Log("I'm sending move count for block %i to proc %i", - *it, - otherProc); - } - } 
- - log::Logger::Log("Sending move counts"); - netForMoveSending.Dispatch(); - timers[hemelb::reporting::Timers::moveCountsSending].Stop(); + return movesToRank; } - void OptimisedDecomposition::ShareMoveData( - std::vector movesForEachBlockWeCareAbout, - std::map > blockIdsIRequireFromX, - std::map > blockIdsXRequiresFromMe, - std::map > moveDataForEachBlock) - { - timers[hemelb::reporting::Timers::moveDataSending].Start(); - idx_t totalMovesToReceive = 0; - for (site_t blockId = 0; blockId < geometry.GetBlockCount(); ++blockId) - { - totalMovesToReceive += movesForEachBlockWeCareAbout[blockId]; - } - log::Logger::Log("I'm expecting a total of %i moves", - totalMovesToReceive); - // Gather the moves to the places they need to go to. - // Moves list has block, site id, destination proc - movesList.resize(totalMovesToReceive * 3); - idx_t localMoveId = 0; - - net::Net netForMoveSending(comms); + // Organise out the moves: those sites stay, leaving and arriving. + void OptimisedDecomposition::PopulateMovesList() { + auto rank = comms.Rank(); - for (proc_t otherProc = 0; otherProc < (proc_t) ( ( ( ( (comms.Size()))))); ++otherProc) - { - allMoves[otherProc] = 0; - for (auto it = blockIdsIRequireFromX[otherProc].begin(); - it != blockIdsIRequireFromX[otherProc].end(); ++it) - { - if (movesForEachBlockWeCareAbout[*it] > 0) - { - netForMoveSending.RequestReceive(&movesList[localMoveId * 3], - 3 * movesForEachBlockWeCareAbout[*it], - otherProc); - localMoveId += movesForEachBlockWeCareAbout[*it]; - allMoves[otherProc] += movesForEachBlockWeCareAbout[*it]; - log::Logger::Log("Expect %i moves from from proc %i about block %i", - movesForEachBlockWeCareAbout[*it], - otherProc, - *it); - } - } - - for (auto& bi: blockIdsXRequiresFromMe[otherProc]) - { - if (!moveDataForEachBlock[bi].empty()) - { - netForMoveSending.RequestSendV(std::span(moveDataForEachBlock[bi]), otherProc); - log::Logger::Log("Sending %i moves from to proc %i about block %i", - 
moveDataForEachBlock[bi].size() / 3, - otherProc, - bi); - } - } + // Work out where *all* sites should be + leaving = CompileMoveData(); - log::Logger::Log("%i moves from proc %i", - allMoves[otherProc], - otherProc); - } + // Separate out those staying + staying = std::move(leaving[rank]); + leaving.erase(rank); - log::Logger::Log("Sending move data"); - netForMoveSending.Dispatch(); - timers[hemelb::reporting::Timers::moveDataSending].Stop(); - } - - /** - * Returns a list of the fluid sites to be moved. - * - * NOTE: This function's return value is a dynamically-allocated array of all the moves to be - * performed, ordered by (origin processor [with a count described by the content of the first - * parameter], site id on the origin processor). The contents of the array are contiguous - * triplets of ints: (block id, site id on block, destination rank). - * - * @return - */ - void OptimisedDecomposition::PopulateMovesList() - { - allMoves = std::vector(comms.Size()); - - // Create a map for looking up block Ids: the map is from the contiguous site index - // of the last fluid site on the block, to the block id. - std::map blockIdLookupByLastSiteIndex; - for (site_t blockId = 0; blockId < geometry.GetBlockCount(); ++blockId) - { - if (procForEachBlock[blockId] >= 0 && procForEachBlock[blockId] != SITE_OR_BLOCK_SOLID) - { - site_t lastFluidSiteId = firstSiteIndexPerBlock[blockId] + fluidSitesPerBlock[blockId] - - 1; - blockIdLookupByLastSiteIndex[lastFluidSiteId] = blockId; - } - } - - // Right. Let's count how many sites we're going to have to move. Count the local number of - // sites to be moved, and collect the site id and the destination processor. - std::vector moveData = CompileMoveData(blockIdLookupByLastSiteIndex); // Spread the move data around log::Logger::Log("Starting to spread move data"); - // First, for each core, gather a list of which blocks the current core wants to - // know more data about. 
- // Handily, the blocks we want to know about are exactly those for which we already - // have some data. - std::map > blockIdsIRequireFromX; - for (site_t block = 0; block < geometry.GetBlockCount(); ++block) - { - if (!geometry.Blocks[block].Sites.empty()) - { - proc_t residentProc = procForEachBlock[block]; - blockIdsIRequireFromX[residentProc].push_back(block); - log::Logger::Log("I require block %i from proc %i (running total %i)", - block, - residentProc, - blockIdsIRequireFromX[residentProc].size()); - } - } - - // We also need to force some data upon blocks, i.e. when they're receiving data from a new - // block they didn't previously want to know about. - ForceSomeBlocksOnOtherCores(moveData, blockIdsIRequireFromX); - - // Now we want to spread this info around so that each core knows which blocks each other - // requires from it. - std::vector numberOfBlocksRequiredFrom(comms.Size(), 0); - std::vector numberOfBlocksXRequiresFromMe(comms.Size(), 0); - std::map > blockIdsXRequiresFromMe; - GetBlockRequirements(numberOfBlocksRequiredFrom, - blockIdsIRequireFromX, - numberOfBlocksXRequiresFromMe, - blockIdsXRequiresFromMe); - - // OK, now to get on with the actual sending of the data... - // Except first, it'll be helpful to organise things by blocks. - std::map > coresInterestedInEachBlock; - std::map > moveDataForEachBlock; - std::map movesForEachLocalBlock; - // And it'll also be super-handy to know how many moves we're going to have locally. 
- std::vector movesForEachBlockWeCareAbout(geometry.GetBlockCount(), 0); - ShareMoveCounts(movesForEachLocalBlock, - blockIdsXRequiresFromMe, - coresInterestedInEachBlock, - moveData, - moveDataForEachBlock, - blockIdsIRequireFromX, - movesForEachBlockWeCareAbout); - - ShareMoveData(movesForEachBlockWeCareAbout, - blockIdsIRequireFromX, - blockIdsXRequiresFromMe, - moveDataForEachBlock); - } - -#if HEMELB_NO_DECOMPOSITION - std::vector OptimisedDecomposition::CompileMoveDataFromFile( - std::map& blockIdLookupByLastSiteIndex, std::string decomposition_file) - { - // Right. Let's count how many sites we're going to have to move. Count the local number of - // sites to be moved, and collect the site id and the destination processor. - std::vector moveData; - const idx_t myLowest = vtxDistribn[comms.Rank()]; - const idx_t myHighest = vtxDistribn[comms.Rank() + 1] - 1; - - // For each local fluid site... - for (idx_t ii = 0; ii <= (myHighest - myLowest); ++ii) - { - // ... if it's going elsewhere... - if (partitionVector[ii] != comms.Rank()) - { - // ... get its id on the local processor... - idx_t localFluidSiteId = myLowest + ii; - - // ... find out which block it's on, using our lookup map... - - // A feature of std::map::equal_range is that if there's no equal key, both iterators - // returned will point to the entry with the next greatest key. Since we store block - // ids by last fluid site number, this immediately gives us the block id. - std::pair::iterator, std::map::iterator> rangeMatch = - blockIdLookupByLastSiteIndex.equal_range(localFluidSiteId); - - idx_t fluidSiteBlock = rangeMatch.first->second; - - // ... and find its site id within that block. Start by working out how many fluid sites - // we have to pass before we arrive at the fluid site we're after... 
- idx_t fluidSitesToPass = localFluidSiteId - firstSiteIndexPerBlock[fluidSiteBlock]; - - site_t siteIndex = geometry.FindSiteIndexInBlock(fluidSiteBlock, fluidSitesToPass); - //TODO: put back validation (it's in CompileMovesData). - - // Add the block, site and destination rank to our move list. - moveData.push_back(fluidSiteBlock); - moveData.push_back(siteIndex); - moveData.push_back(partitionVector[ii]); - } - - } - - return moveData; - } -#endif - -#if HEMELB_NO_DECOMPOSITION - /* Below is a modified version of PopulateMovesList(). */ - void OptimisedDecomposition::LoadDecomposition() - { - allMoves = std::vector(comms.Size()); - - // Create a map for looking up block Ids: the map is from the contiguous site index - // of the last fluid site on the block, to the block id. - std::map blockIdLookupByLastSiteIndex; - for (site_t blockId = 0; blockId < geometry.GetBlockCount(); ++blockId) - { - if (procForEachBlock[blockId] >= 0 && procForEachBlock[blockId] != SITE_OR_BLOCK_SOLID) - { - site_t lastFluidSiteId = firstSiteIndexPerBlock[blockId] + fluidSitesPerBlock[blockId] - - 1; - blockIdLookupByLastSiteIndex[lastFluidSiteId] = blockId; - } + net::sparse_exchange xchg(comms, 444); + for (auto const& [dest, moves]: leaving) { + xchg.send(to_span(moves), dest); } - // Right. Let's count how many sites we're going to have to move. Count the local number of - // sites to be moved, and collect the site id and the destination processor. - std::vector moveData = CompileMoveDataFromFile(blockIdLookupByLastSiteIndex); - // Spread the move data around - log::Logger::Log("Starting to spread move data"); - // First, for each core, gather a list of which blocks the current core wants to - // know more data about. - // Handily, the blocks we want to know about are exactly those for which we already - // have some data. 
- std::map > blockIdsIRequireFromX; - for (site_t block = 0; block < geometry.GetBlockCount(); ++block) - { - if (!geometry.Blocks[block].Sites.empty()) - { - proc_t residentProc = procForEachBlock[block]; - blockIdsIRequireFromX[residentProc].push_back(block); - log::Logger::Log("I require block %i from proc %i (running total %i)", - block, - residentProc, - blockIdsIRequireFromX[residentProc].size()); - } - } - - // We also need to force some data upon blocks, i.e. when they're receiving data from a new - // block they didn't previously want to know about. - ForceSomeBlocksOnOtherCores(moveData, blockIdsIRequireFromX); - - // Now we want to spread this info around so that each core knows which blocks each other - // requires from it. - std::vector numberOfBlocksRequiredFrom(comms.Size(), 0); - std::vector numberOfBlocksXRequiresFromMe(comms.Size(), 0); - std::map > blockIdsXRequiresFromMe; - GetBlockRequirements(numberOfBlocksRequiredFrom, - blockIdsIRequireFromX, - numberOfBlocksXRequiresFromMe, - blockIdsXRequiresFromMe); - - // OK, now to get on with the actual sending of the data... - // Except first, it'll be helpful to organise things by blocks. - std::map > coresInterestedInEachBlock; - std::map > moveDataForEachBlock; - std::map movesForEachLocalBlock; - // And it'll also be super-handy to know how many moves we're going to have locally. 
- std::vector movesForEachBlockWeCareAbout(geometry.GetBlockCount(), 0); - ShareMoveCounts(movesForEachLocalBlock, - blockIdsXRequiresFromMe, - coresInterestedInEachBlock, - moveData, - moveDataForEachBlock, - blockIdsIRequireFromX, - movesForEachBlockWeCareAbout); - - ShareMoveData(movesForEachBlockWeCareAbout, - blockIdsIRequireFromX, - blockIdsXRequiresFromMe, - moveDataForEachBlock); - } -#endif - - bool OptimisedDecomposition::ShouldValidate() const - { -#ifdef HEMELB_VALIDATE_GEOMETRY - return true; -#else - return false; -#endif - } + xchg.receive( + [&](int src, int count) { + auto& rbuf = arriving[src]; + rbuf.resize(count); + return rbuf.data(); + }, + [&](int src, auto* buf) { + // no-op + } + ); + } - void OptimisedDecomposition::ValidateVertexDistribution() - { + void OptimisedDecomposition::ValidateVertexDistribution() { log::Logger::Log("Validating the vertex distribution."); // vtxDistribn should be the same on all cores. std::vector vtxDistribnRecv = comms.AllReduce(vtxDistribn, MPI_MIN); - for (proc_t rank = 0; rank < comms.Size() + 1; ++rank) - { - if (vtxDistribn[rank] != vtxDistribnRecv[rank]) - { - log::Logger::Log("vtxDistribn[%i] was %li but at least one other core had it as %li.", - rank, - vtxDistribn[rank], - vtxDistribnRecv[rank]); - } + for (proc_t rank = 0; rank < comms.Size() + 1; ++rank) { + if (vtxDistribn[rank] != vtxDistribnRecv[rank]) { + log::Logger::Log( + "vtxDistribn[%i] was %li but at least one other core had it as %li.", + rank, + vtxDistribn[rank], + vtxDistribnRecv[rank] + ); + } } + } - } - - void OptimisedDecomposition::ValidateAdjacencyData(idx_t localVertexCount) - { + void OptimisedDecomposition::ValidateAdjacencyData(idx_t localVertexCount) { // If we're using debugging logs, check that the arguments are consistent across all cores. 
// To verify: vtxDistribn, adjacenciesPerVertex, adjacencies - if (ShouldValidate()) - { - log::Logger::Log("Validating the graph adjacency structure"); - // Create an array of lists to store all of this node's adjacencies, arranged by the - // proc the adjacent vertex is on. - std::vector > adjByNeighProc(comms.Size(), - std::multimap()); - // The adjacency data should correspond across all cores. - for (idx_t index = 0; index < localVertexCount; ++index) - { + log::Logger::Log("Validating the graph adjacency structure"); + // Create an array of lists to store all of this node's adjacencies, arranged by the + // proc the adjacent vertex is on. + using edge = std::array; + std::map> adjByNeighProc; + auto check_edge = [this] (idx_t v, idx_t w) { + idx_t w_local = w - vtxDistribn[comms.Rank()]; + bool found = false; + for ( + idx_t edge_i = adjacenciesPerVertex[w_local]; + edge_i < adjacenciesPerVertex[w_local + 1]; + ++edge_i + ) { + if (localAdjacencies[edge_i] == v) { + found = true; + break; + } + } + if (!found) + log::Logger::Log( + "Could not find reverse for edge from vertex %li to %li", + v, w + ); + }; + + // The adjacency data should correspond across all cores. + for (idx_t index = 0; index < localVertexCount; ++index) { idx_t vertex = vtxDistribn[comms.Rank()] + index; // Iterate over each adjacency (of each vertex). - for (idx_t adjNumber = 0; - adjNumber < (adjacenciesPerVertex[index + 1] - adjacenciesPerVertex[index]); - ++adjNumber) - { - idx_t adjacentVertex = localAdjacencies[adjacenciesPerVertex[index] + adjNumber]; - proc_t adjacentProc = -1; - // Calculate the proc of the neighbouring vertex. 
- for (proc_t proc = 0; proc < comms.Size(); ++proc) - { - if (vtxDistribn[proc] <= adjacentVertex && vtxDistribn[proc + 1] > adjacentVertex) - { - adjacentProc = proc; - break; + for ( + idx_t adj_idx = adjacenciesPerVertex[index]; + adj_idx < adjacenciesPerVertex[index + 1]; + ++adj_idx + ) { + idx_t adjacentVertex = localAdjacencies[adj_idx]; + // Find the proc of the neighbouring vertex. + // This iterator should point to the next process's start pos + auto ub = std::upper_bound(vtxDistribn.begin(), vtxDistribn.end(), adjacentVertex); + if (ub == vtxDistribn.end()) { + log::Logger::Log("The vertex %li has a neighbour %li which doesn\'t appear to live on any processor.", + vertex, + adjacentVertex); + continue; + } + int adj_proc = std::distance(vtxDistribn.begin(), ub) - 1; + + // Most edges should be local - check these in place to speed it up. + if (adj_proc == comms.Rank()) { + check_edge(vertex, adjacentVertex); + } else { + // Store the edge for later checking + adjByNeighProc[adj_proc].push_back({adjacentVertex, vertex}); } - } - - // If it doesn't appear to belong to any proc, something's wrong. - if (adjacentProc == -1) - { - log::Logger::Log("The vertex %li has a neighbour %li which doesn\'t appear to live on any processor.", - vertex, - adjacentVertex); - continue; - } - // Store the data if it does belong to a proc. - adjByNeighProc[adjacentProc].insert(std::pair(adjacentVertex, vertex)); - } - - } - - // Create variables for the neighbour data to go into. - std::vector counts(comms.Size()); - std::vector > data(comms.Size()); - log::Logger::Log("Validating neighbour data"); - // Now spread and compare the adjacency information. Larger ranks send data to smaller - // ranks which receive the data and compare it. 
- for (proc_t neigh = 0; neigh < (proc_t) ( ( ( ( (comms.Size()))))); ++neigh) - { - SendAdjacencyDataToLowerRankedProc(neigh, - counts[neigh], - data[neigh], - adjByNeighProc[neigh]); - if (neigh < comms.Rank()) - { - // Sending arrays don't perform comparison. - continue; } - CompareAdjacencyData(neigh, counts[neigh], data[neigh], adjByNeighProc[neigh]); - } - - } - - } - - void OptimisedDecomposition::SendAdjacencyDataToLowerRankedProc( - proc_t neighbouringProc, idx_t& neighboursAdjacencyCount, - std::vector& neighboursAdjacencyData, - std::multimap& expectedAdjacencyData) - { - if (neighbouringProc < comms.Rank()) - { - // Send the array length. - neighboursAdjacencyCount = 2 * expectedAdjacencyData.size(); - comms.Send(neighboursAdjacencyCount, neighbouringProc, 42); - // Create a sendable array (std::lists aren't organised in a sendable format). - neighboursAdjacencyData.resize(neighboursAdjacencyCount); - unsigned int adjacencyIndex = 0; - for (auto & it : expectedAdjacencyData) - { - neighboursAdjacencyData[2 * adjacencyIndex] = it.first; - neighboursAdjacencyData[2 * adjacencyIndex + 1] = it.second; - ++adjacencyIndex; - } - // Send the data to the neighbouringProc. - comms.Send(neighboursAdjacencyData, neighbouringProc, 43); - } - else - // If this is a greater rank number than the neighbouringProc, receive the data. - if (neighbouringProc > comms.Rank()) - { - comms.Receive(neighboursAdjacencyCount, neighbouringProc, 42); - neighboursAdjacencyData.resize(neighboursAdjacencyCount); - comms.Receive(neighboursAdjacencyData, neighbouringProc, 43); - } - else // Neigh == mTopologyRank, i.e. neighbouring vertices on the same proc - // Duplicate the data. 
- { - neighboursAdjacencyCount = 2 * expectedAdjacencyData.size(); - neighboursAdjacencyData.resize(neighboursAdjacencyCount); - int adjacencyIndex = 0; - for (auto & it : expectedAdjacencyData) - { - neighboursAdjacencyData[2 * adjacencyIndex] = it.first; - neighboursAdjacencyData[2 * adjacencyIndex + 1] = it.second; - ++adjacencyIndex; - } } - } - - void OptimisedDecomposition::CompareAdjacencyData( - proc_t neighbouringProc, idx_t neighboursAdjacencyCount, - const std::vector& neighboursAdjacencyData, - std::multimap& expectedAdjacencyData) - { - // Now we compare. First go through the received data which is ordered as (adjacent - // vertex, vertex) wrt the neighbouring proc. - for (idx_t ii = 0; ii < neighboursAdjacencyCount; ii += 2) - { - bool found = false; - // Go through each neighbour we know about on this proc, and check whether it - // matches the current received neighbour-data. - for (auto it = - expectedAdjacencyData.find(neighboursAdjacencyData[ii + 1]); - it != expectedAdjacencyData.end(); ++it) - { - idx_t recvAdj = it->first; - idx_t recvAdj2 = it->second; - if (neighboursAdjacencyData[ii] == recvAdj2 - && neighboursAdjacencyData[ii + 1] == recvAdj) - { - expectedAdjacencyData.erase(it); - found = true; - break; + // Now sort the cross-process edges + for (auto& [_, edges]: adjByNeighProc) { + std::sort( + edges.begin(), edges.end(), [](edge const& a, edge const& b) { + return a[0] < b[0]; + } + ); + } + + std::vector neigh_edge_counts(comms.Size()); + for (int root = 0; root < comms.Size(); ++root) { + // Send any edges we've got to the current root process + auto maybe_edges = adjByNeighProc.find(root); + std::size_t const nedges = (maybe_edges == adjByNeighProc.end()) ? 
+ 0 // no edges to that process + : maybe_edges->second.size(); + + net::MpiCall{MPI_Gather}(&nedges, 1, net::MpiDataType(), neigh_edge_counts.data(), 1, net::MpiDataType(), root, comms); + + if (root == comms.Rank()) { + // Root receives and checks non-zeros + std::vector neigh_edges; + for (int src = 0; src < comms.Size(); ++src) { + if (neigh_edge_counts[src]) { + neigh_edges.resize(neigh_edge_counts[src]); + comms.Receive(neigh_edges, src, 43); + // Check! + for (auto [w, v]: neigh_edges) { + check_edge(v, w); + } + } } - } - - // No neighbour data on this proc matched the data received. - if (!found) - { - log::Logger::Log("Neighbour proc %i had adjacency (%li,%li) that wasn't present on this processor.", - neighbouringProc, - neighboursAdjacencyData[ii], - neighboursAdjacencyData[ii + 1]); + } else { + if (nedges) + comms.Send(maybe_edges->second, root, 43); } } + } - // The local store of adjacencies should now be empty, if there was complete matching. - auto it = expectedAdjacencyData.begin(); - while (it != expectedAdjacencyData.end()) - { - idx_t adj1 = it->first; - idx_t adj2 = it->second; - ++it; - // We had neighbour-data on this proc that didn't match that received. - log::Logger::Log("The local processor has adjacency (%li,%li) that isn't present on neighbouring processor %i.", - adj1, - adj2, - neighbouringProc); - } - } + void OptimisedDecomposition::ValidateFirstSiteIndexOnEachBlock() { + // Check that values are + // a) the same across all processes + // b) increase from zero to the the total number of fluid + // sites by at least one per block and at most by sites per + // block - void OptimisedDecomposition::ValidateFirstSiteIndexOnEachBlock() - { log::Logger::Log("Validating the firstSiteIndexPerBlock values."); - // Reduce finding the maximum across all nodes. Note that we have to use the maximum - // because some cores will have -1 for a block (indicating that it has no neighbours on - // that block. 
- std::vector firstSiteIndexPerBlockRecv = comms.AllReduce(firstSiteIndexPerBlock, - MPI_MAX); - - for (site_t block = 0; block < geometry.GetBlockCount(); ++block) - { - if (firstSiteIndexPerBlock[block] >= 0 - && firstSiteIndexPerBlock[block] != firstSiteIndexPerBlockRecv[block]) - { - log::Logger::Log("This core had the first site index on block %li as %li but at least one other core had it as %li.", - block, - firstSiteIndexPerBlock[block], - firstSiteIndexPerBlockRecv[block]); - } + auto const n_non_solid = tree.levels[tree.n_levels].sites_per_node.size(); + auto const total_fluid = tree.levels[0].sites_per_node[0]; + + HASSERT(firstSiteIndexPerBlockOct.size() == n_non_solid + 1); + HASSERT(firstSiteIndexPerBlockOct[0] == 0); + for (auto i = 0UL; i < n_non_solid; ++i) { + auto d = firstSiteIndexPerBlockOct[i+1] - firstSiteIndexPerBlockOct[i]; + HASSERT(d > 0); + HASSERT(d <= geometry.GetSitesPerBlock()); + } + HASSERT(firstSiteIndexPerBlockOct[n_non_solid] == total_fluid); + + auto firstSiteIndexPerBlockRecv = comms.AllReduce(firstSiteIndexPerBlockOct, MPI_MAX); + + for (auto i = 0UL; i < n_non_solid; ++i) { + if (firstSiteIndexPerBlockOct[i] != firstSiteIndexPerBlockRecv[i]) { + log::Logger::Log( + "This process had the first site index on block %li as %li but at least one other core had it as %li.", + i, + firstSiteIndexPerBlockOct[i], + firstSiteIndexPerBlockRecv[i] + ); + } } - } + } } diff --git a/Code/geometry/decomposition/OptimisedDecomposition.h b/Code/geometry/decomposition/OptimisedDecomposition.h index a1fbc395d..442d3a37f 100644 --- a/Code/geometry/decomposition/OptimisedDecomposition.h +++ b/Code/geometry/decomposition/OptimisedDecomposition.h @@ -16,40 +16,45 @@ #include "geometry/SiteData.h" #include "geometry/GeometryBlock.h" -namespace hemelb -{ - namespace geometry - { +namespace hemelb::geometry { + namespace octree { class LookupTree; } + namespace decomposition { + // Given an initial basic decomposition done at the block level, + // with 
all blocks on a process (plus halo) read into the + // GmyReadResult, use ParMETIS to optimise this. + // + // The result of the optimisation is a description of how to + // change the sites on this rank: those staying, leaving and + // arriving (see below for details). class OptimisedDecomposition { - public: - OptimisedDecomposition(reporting::Timers& timers, net::MpiCommunicator& comms, + public: + // Constructor actually does the optimisation - collective over comm. + OptimisedDecomposition(reporting::Timers& timers, net::MpiCommunicator comms, const GmyReadResult& geometry, - const lb::LatticeInfo& latticeInfo, - const std::vector& procForEachBlock, - const std::vector& fluidSitesPerBlock); + const lb::LatticeInfo& latticeInfo); - /** - * Returns a vector with the number of moves coming from each core - * @return - */ - inline const std::vector& GetMovesCountPerCore() const - { - return allMoves; + // NOTE! All the sites in staying, leaving and arriving are + // sorted first by block ID and then by intra-block site ID. + + // Get the sites that aren't moved + inline SiteVec const& GetStaying() const { + return staying; } - /** - * Returns a vector with the list of moves - * @return - */ - inline const std::vector& GetMovesList() const - { - return movesList; + // Get a map (by destination rank) of sites leaving + inline MovesMap const& GetLeaving() const { + return leaving; } - private: + // Get a map (by source rank) of sites arriving + inline MovesMap const& GetArriving() const { + return arriving; + } + + private: /** * Populates the vector of vertex weights with different values for each local site type. * This allows ParMETIS to more efficiently decompose the system. 
@@ -65,11 +70,6 @@ namespace hemelb */ void PopulateSiteDistribution(); - /** - * Calculate the array of contiguous indices of the first fluid site on each block - */ - void PopulateFirstSiteIndexOnEachBlock(); - /** * Gets the list of adjacencies and the count of adjacencies per local fluid site * in a format suitable for use with ParMetis. @@ -92,12 +92,6 @@ namespace hemelb */ void PopulateMovesList(); - /** - * Return true if we should validate. - * @return - */ - bool ShouldValidate() const; - /** * Validates the vertex distribution array. */ @@ -113,115 +107,34 @@ namespace hemelb */ void ValidateAdjacencyData(idx_t localVertexCount); - /** - * Sends the adjacency data to the process of lower rank of the two. THIS IS INEFFICIENT. - * We only do it for validation purposes. - * - * @param neighbouringProc - * @param neighboursAdjacencyCount - * @param neighboursAdjacencyData Array to receive neighbour's expectations about adjacencies - * @param expectedAdjacencyData Adjacency data as this core expects it to be - */ - void SendAdjacencyDataToLowerRankedProc( - proc_t neighbouringProc, idx_t& neighboursAdjacencyCount, - std::vector& neighboursAdjacencyData, - std::multimap& expectedAdjacencyData); - - /** - * Compares this core's and a neighbouring core's version of the adjacency data between - * them. - * - * @param neighbouringProc - * @param neighboursAdjacencyCount - * @param neighboursAdjacencyData Array to receive neighbour's expectations about adjacencies - * @param expectedAdjacencyData Adjacency data as this core expects it to be - */ - void CompareAdjacencyData(proc_t neighbouringProc, idx_t neighboursAdjacencyCount, - const std::vector& neighboursAdjacencyData, - std::multimap& expectedAdjacencyData); - /** * Compile a list of all the moves that need to be made from this processor. 
- * @param blockIdLookupByLastSiteIndex - * @return - */ - std::vector CompileMoveData( - std::map& blockIdLookupByLastSiteIndex); - - /** - * Force some other cores to take info on blocks they might not know they need to know - * about. - * @param moveData - * @param blockIdsIRequireFromX - */ - void ForceSomeBlocksOnOtherCores( - std::vector& moveData, - std::map >& blockIdsIRequireFromX); - - /** - * Get the blocks required from every other processor. - * - * @param numberOfBlocksRequiredFrom - * @param blockIdsIRequireFromX - * @param numberOfBlocksXRequiresFromMe - * @param blockIdsXRequiresFromMe - */ - void GetBlockRequirements( - std::vector& numberOfBlocksRequiredFrom, - std::map >& blockIdsIRequireFromX, - std::vector& numberOfBlocksXRequiresFromMe, - std::map >& blockIdsXRequiresFromMe); - - /** - * Share the number of moves to be made between each pair of processors that need to move - * data. - * @param movesForEachLocalBlock - * @param blockIdsXRequiresFromMe - * @param coresInterestedInEachBlock - * @param moveData - * @param moveDataForEachBlock - * @param blockIdsIRequireFromX - * @param movesForEachBlockWeCareAbout */ - void ShareMoveCounts(std::map& movesForEachLocalBlock, - std::map >& blockIdsXRequiresFromMe, - std::map >& coresInterestedInEachBlock, - std::vector& moveData, - std::map >& moveDataForEachBlock, - std::map >& blockIdsIRequireFromX, - std::vector& movesForEachBlockWeCareAbout); - - /** - * Share the move data between cores - * - * @param movesForEachBlockWeCareAbout - * @param blockIdsIRequireFromX - * @param blockIdsXRequiresFromMe - * @param moveDataForEachBlock - */ - void ShareMoveData(std::vector movesForEachBlockWeCareAbout, - std::map > blockIdsIRequireFromX, - std::map > blockIdsXRequiresFromMe, - std::map > moveDataForEachBlock); + MovesMap CompileMoveData(); reporting::Timers& timers; //! Timers for reporting. - net::MpiCommunicator& comms; //! Communicator + net::MpiCommunicator comms; //! 
Communicator const GmyReadResult& geometry; //! The geometry being optimised. + octree::LookupTree const& tree; const lb::LatticeInfo& latticeInfo; //! The lattice info to optimise for. - const std::vector& procForEachBlock; //! The processor assigned to each block at the moment - const std::vector& fluidSitesPerBlock; //! The number of fluid sites on each block. + const std::vector& procForBlockOct; //! The initial MPI process for each block, in OCT layout + const std::vector& fluidSitesPerBlockOct; //! The number of fluid sites per block, in OCT layout + std::vector vtxCountPerProc; //! The number of vertices on each process. std::vector vtxDistribn; //! The vertex distribution across participating cores. - std::vector firstSiteIndexPerBlock; //! The global contiguous index of the first fluid site on each block. + std::vector firstSiteIndexPerBlockOct; //! The global contiguous index of the first fluid site on each block. + std::map> gmySiteIdForBlockOct; //! A map keyed on block OCT id to an array of all the fluid sites' GMY local site ID. std::vector adjacenciesPerVertex; //! The number of adjacencies for each local fluid site std::vector vertexWeights; //! The weight of each local fluid site std::vector vertexCoordinates; //! The coordinates of each local fluid site std::vector localAdjacencies; //! The list of adjacent vertex numbers for each local fluid site std::vector partitionVector; //! The results of the optimisation -- which core each fluid site should go to. - std::vector allMoves; //! The list of move counts from each core - std::vector movesList; + // The result of the optimisation: the sites staying, + // leaving and arriving, the latter two organised by the key + // being the src/dest rank. 
+ SiteVec staying; + MovesMap leaving; + MovesMap arriving; }; } - } } - #endif /* HEMELB_GEOMETRY_DECOMPOSITION_OPTIMISEDDECOMPOSITION_H */ diff --git a/Code/reporting/BuildInfo.cc b/Code/reporting/BuildInfo.cc index 23d2555c3..d1ae5acbf 100644 --- a/Code/reporting/BuildInfo.cc +++ b/Code/reporting/BuildInfo.cc @@ -15,7 +15,6 @@ namespace hemelb::reporting { build.SetValue("OPTIMISATION", build_info::OPTIMISATION); build.SetBoolValue("USE_SSE3", build_info::USE_SSE3); build.SetValue("TIME", build_info::BUILD_TIME); - build.SetValue("READING_GROUP_SIZE", build_info::READING_GROUP_SIZE); build.SetValue("LATTICE_TYPE", build_info::LATTICE); build.SetValue("KERNEL_TYPE", build_info::KERNEL); build.SetValue("WALL_BOUNDARY_CONDITION", build_info::WALL_BOUNDARY); @@ -27,4 +26,4 @@ namespace hemelb::reporting { build.SetValue("POINTPOINT_IMPLEMENTATION", build_info::POINTPOINT_IMPLEMENTATION); build.SetValue("STENCIL", build_info::STENCIL); } -} \ No newline at end of file +} diff --git a/Code/resources/report.txt.ctp b/Code/resources/report.txt.ctp index b9a28e9d3..46dbddea9 100644 --- a/Code/resources/report.txt.ctp +++ b/Code/resources/report.txt.ctp @@ -30,7 +30,6 @@ Build type: {{TYPE}} Optimisation level: {{OPTIMISATION}} Use SSE3: {{USE_SSE3}} Built at: {{TIME}} -Reading group size: {{READING_GROUP_SIZE}} Lattice: {{LATTICE_TYPE}} Kernel: {{KERNEL_TYPE}} Wall boundary condition: {{WALL_BOUNDARY_CONDITION}} diff --git a/Code/resources/report.xml.ctp b/Code/resources/report.xml.ctp index 257352ded..136d8e9d5 100644 --- a/Code/resources/report.xml.ctp +++ b/Code/resources/report.xml.ctp @@ -14,7 +14,6 @@ {{OPTIMISATION}} {{USE_SSE3}} {{TIME}} - {{READING_GROUP_SIZE}} {{LATTICE_TYPE}} {{KERNEL_TYPE}} {{WALL_BOUNDARY_CONDITION}} @@ -76,4 +75,4 @@ {{/TIMER}} - \ No newline at end of file + diff --git a/Code/tests/geometry/LookupTreeTests.cc b/Code/tests/geometry/LookupTreeTests.cc index efb8e49fc..02b65a9b1 100644 --- a/Code/tests/geometry/LookupTreeTests.cc +++ 
b/Code/tests/geometry/LookupTreeTests.cc @@ -179,7 +179,7 @@ namespace hemelb::tests template void check_vec(std::vector const& actual, std::vector const& expected) { REQUIRE(actual.size() == expected.size()); - for (int i = 0; i < actual.size(); ++i) { + for (int i = 0; i < std::ssize(actual); ++i) { REQUIRE(actual[i] == expected[i]); } } @@ -271,10 +271,10 @@ namespace hemelb::tests REQUIRE(b334.path[3] == Level::NC); // fluid sites per block, ordered by GMY index - auto spb = std::vector{ + auto spb = std::vector{ 287, 328, 328, 328, 123, 287, 328, 328, 328, 123, 287, 328, 328, 328, 123, 287, 328, 328, 328, 123 }; - std::map oct2spb; + std::map oct2spb; for (U64 block = 0U; block < 20; ++block) { auto b_ijk = Vec16(block / 10U, (block / 5U) % 2U, block % 5U); auto oct = ijk_to_oct(b_ijk); diff --git a/Code/tests/redblood/parallel/GraphCommsTests.cc b/Code/tests/redblood/parallel/GraphCommsTests.cc index 57281cc82..acff074a3 100644 --- a/Code/tests/redblood/parallel/GraphCommsTests.cc +++ b/Code/tests/redblood/parallel/GraphCommsTests.cc @@ -38,7 +38,7 @@ namespace hemelb::tests //! Creates a master simulation template - [[nodiscard]] auto CreateMasterSim(net::MpiCommunicator const &comm) const + [[nodiscard]] auto CreateMasterSim(net::IOCommunicator const &comm) const { return std::make_shared>(*options, comm); } diff --git a/Code/tests/redblood/parallel/MPIIntegrateVelocities.cc b/Code/tests/redblood/parallel/MPIIntegrateVelocities.cc index 538291e3e..37fc57e04 100644 --- a/Code/tests/redblood/parallel/MPIIntegrateVelocities.cc +++ b/Code/tests/redblood/parallel/MPIIntegrateVelocities.cc @@ -60,7 +60,7 @@ namespace hemelb::tests //! 
Creates a master simulation template - auto CreateMasterSim(net::MpiCommunicator const &comm) const + auto CreateMasterSim(net::IOCommunicator const &comm) const { return std::make_shared>(*options, comm); } @@ -102,7 +102,7 @@ namespace hemelb::tests auto const world = net::MpiCommunicator::World(); auto const color = world.Rank() == 0; - auto const split = world.Split(color); + auto const split = net::IOCommunicator(world.Split(color)); auto master = CreateMasterSim(split); auto& fieldData = master->GetFieldData(); auto& dom = fieldData.GetDomain(); diff --git a/Code/tests/redblood/parallel/MPILockStepTests.cc b/Code/tests/redblood/parallel/MPILockStepTests.cc index 3ae75f6cc..fdcafd36c 100644 --- a/Code/tests/redblood/parallel/MPILockStepTests.cc +++ b/Code/tests/redblood/parallel/MPILockStepTests.cc @@ -47,7 +47,7 @@ namespace hemelb::tests //! Creates a master simulation template - [[nodiscard]] auto CreateMasterSim(net::MpiCommunicator const &comm) const + [[nodiscard]] auto CreateMasterSim(net::IOCommunicator const &comm) const { return std::make_shared>(*options, comm); } @@ -148,7 +148,7 @@ namespace hemelb::tests auto const world = Comms(); auto const color = world.Rank() == 0; - auto const split = world.Split(color); + auto const split = net::IOCommunicator(world.Split(color)); if(world.Size() < 3) { log::Logger::Log( "Lock step tests of no interest if fewer than 3 processors"); diff --git a/Code/tests/redblood/parallel/MPIParallelIntegrationTests.cc b/Code/tests/redblood/parallel/MPIParallelIntegrationTests.cc index b2d8ff478..b406317cd 100644 --- a/Code/tests/redblood/parallel/MPIParallelIntegrationTests.cc +++ b/Code/tests/redblood/parallel/MPIParallelIntegrationTests.cc @@ -47,7 +47,7 @@ namespace hemelb::tests //! 
Creates a master simulation template - [[nodiscard]] auto CreateMasterSim(net::MpiCommunicator const &comm) const + [[nodiscard]] auto CreateMasterSim(net::IOCommunicator const &comm) const { return std::make_shared>(*options, comm); } @@ -95,7 +95,7 @@ namespace hemelb::tests auto const world = net::MpiCommunicator::World(); auto const color = world.Rank() == 0; - auto const split = world.Split(color); + auto const split = net::IOCommunicator(world.Split(color)); auto master = CreateMasterSim(split); REQUIRE(master); diff --git a/Code/tests/redblood/parallel/MPISpreadForcesTests.cc b/Code/tests/redblood/parallel/MPISpreadForcesTests.cc index b1716f68c..09bf3cac5 100644 --- a/Code/tests/redblood/parallel/MPISpreadForcesTests.cc +++ b/Code/tests/redblood/parallel/MPISpreadForcesTests.cc @@ -54,7 +54,7 @@ namespace hemelb::tests //! Creates a master simulation template - auto CreateMasterSim(net::MpiCommunicator const &comm) const + auto CreateMasterSim(net::IOCommunicator const &comm) const { return std::make_shared>(*options, comm); } @@ -96,7 +96,7 @@ namespace hemelb::tests return; } auto const color = world.Rank() == 0; - auto const split = world.Split(color); + auto const split = net::IOCommunicator(world.Split(color)); auto master = CreateMasterSim(split); auto& fieldData = master->GetFieldData(); auto& dom = fieldData.GetDomain(); diff --git a/Code/tests/redblood/parallel/ParallelFixtureTests.cc b/Code/tests/redblood/parallel/ParallelFixtureTests.cc index 413874399..fcca45547 100644 --- a/Code/tests/redblood/parallel/ParallelFixtureTests.cc +++ b/Code/tests/redblood/parallel/ParallelFixtureTests.cc @@ -8,6 +8,7 @@ #include +#include "util/span.h" #include "redblood/parallel/NodeCharacterizer.h" #include "configuration/CommandLine.h" #include "tests/redblood/Fixtures.h" @@ -41,7 +42,7 @@ namespace hemelb::tests //! 
Creates a master simulation template - auto CreateMasterSim(net::MpiCommunicator const &comm) const + auto CreateMasterSim(net::IOCommunicator const &comm) const { return std::make_shared>(*options, comm); } @@ -109,7 +110,7 @@ namespace hemelb::tests { std::copy(procs.begin(), procs.end(), expected.begin()); } - world.Broadcast(expected, positions_are_from_this_proc); + world.Broadcast(to_span(expected), positions_are_from_this_proc); std::set expected_set(expected.begin(), expected.end());