Major simplification of geometry reading and optimisation

I could not understand how the OptimisedDecomposition class worked while trying to track down a bug. This lead to a complete re-write of how the distributed graph optimised by ParMETIS was constructed and how the "moves" (i.e. which vertices need to go to which process) were calculated. This has been rewritten in a much simpler and slightly faster way, using the net::SparseExchange class. The GeometryReader is also much simpler and leans more on the octree to avoid unnecessary work. The IO pattern is much simpler also now: the GMY block data is now collectively streamed onto node "leader" processes and then individual MPI processes copy out the parts they care about.
hemelb-codes · May 10, 2024 · efbe76c · efbe76c
1 parent 46f45e3
commit efbe76c
Show file tree

Hide file tree

Showing 22 changed files with 1,320 additions and 2,004 deletions.
diff --git a/CMake/HemeLbOptions.cmake b/CMake/HemeLbOptions.cmake
@@ -47,8 +47,6 @@ endif()
 #
 pass_cachevar(HEMELB HEMELB_EXECUTABLE "hemelb"
   STRING "File name of executable to produce")
-pass_cachevar(HEMELB HEMELB_READING_GROUP_SIZE 5
-  STRING "Number of cores to use to read geometry file.")
 pass_cachevar_choice(HEMELB HEMELB_LOG_LEVEL Info
   STRING "Log level"
   Critical Error Warning Info Debug Trace)

diff --git a/Code/CMakeLists.txt b/Code/CMakeLists.txt
@@ -28,7 +28,6 @@ if (HEMELB_USE_ALL_WARNINGS_GNU)
 endif()
 
 add_definitions(-DHEMELB_CODE)
-add_definitions(-DHEMELB_READING_GROUP_SIZE=${HEMELB_READING_GROUP_SIZE})
 add_definitions(-DHEMELB_COMPUTE_ARCHITECTURE=${HEMELB_COMPUTE_ARCHITECTURE})
 
 if(HEMELB_VALIDATE_GEOMETRY)

diff --git a/Code/geometry/Domain.cc b/Code/geometry/Domain.cc
diff --git a/Code/geometry/GeometryBlock.h b/Code/geometry/GeometryBlock.h
@@ -9,20 +9,16 @@
 #include <vector>
 #include "geometry/GeometrySite.h"
 
-namespace hemelb
+namespace hemelb::geometry
 {
-  namespace geometry
-  {
     /***
      * Model of the information stored for a block in a geometry file.
      * Just gives the array of sites
      */
     struct BlockReadResult
     {
-      public:
         std::vector<GeometrySite> Sites;
     };
-  }
 }
 
 #endif /* HEMELB_GEOMETRY_GEOMETRYBLOCK_H */
diff --git a/Code/geometry/GeometryReader.cc b/Code/geometry/GeometryReader.cc
diff --git a/Code/geometry/GeometryReader.h b/Code/geometry/GeometryReader.h
@@ -37,139 +37,101 @@ namespace hemelb::geometry
         GmyReadResult LoadAndDecompose(const std::string& dataFilePath);
 
     private:
-        /**
-         * Read from the file into a buffer. We read this on a single core then broadcast it.
-         * This has proven to be more efficient than reading in on every core (even using a collective
-         * read).
-         *
-         * Note this allocates memory and returns the pointer to you.
-         *
-         * @param nBytes
-         * @return
-         */
-        std::vector<char> ReadOnAllTasks(unsigned nBytes);
+        // Read from the file into a buffer on all processes.
+        // This is collective and start and nBytes must be the same on all ranks.
+        std::vector<char> ReadAllProcesses(std::size_t startBytes, unsigned nBytes);
 
+        // Read the preamble and create the empty read result.
         GmyReadResult ReadPreamble();
 
+        // Read the block header with basic size data for each block in
+        // the domain.
         void ReadHeader(site_t blockCount);
 
-        void ReadInBlocksWithHalo(GmyReadResult& geometry, const std::vector<proc_t>& unitForEachBlock,
-                                  const proc_t localRank);
-
-        /**
-         * Compile a list of blocks to be read onto this core, including all the ones we perform
-         * LB on, and also any of their neighbouring blocks.
-         *
-         * NOTE: that the skipping-over of blocks without any fluid sites is dealt with by other
-         * code.
-         *
-         * @param geometry [in] GmyReadResult object as it has been read so far
-         * @param unitForEachBlock [in] The initial processor assigned to each block
-         * @param localRank [in] Local rank number
-         * @return Vector with true for each block we should read in.
-         */
-        std::vector<bool> DecideWhichBlocksToReadIncludingHalo(
-                const GmyReadResult& geometry, const std::vector<proc_t>& unitForEachBlock,
-                const proc_t localRank);
-
-        /**
-         * Reads in a single block and ensures it is distributed to all cores that need it.
-         *
-         * @param offsetSoFar [in] The offset into the file to read from to get the block.
-         * @param geometry [out] The geometry object to populate with info about the block.
-         * @param procsWantingThisBlock [in] A list of proc ids where info about this block is required.
-         * @param blockNumber [in] The id of the block we're reading.
-         * @param neededOnThisRank [in] A boolean indicating whether the block is required locally.
-         */
-        void ReadInBlock(MPI_Offset offsetSoFar, GmyReadResult& geometry,
-                         const std::vector<proc_t>& procsWantingThisBlock, const site_t blockNumber,
-                         const bool neededOnThisRank);
-
-        /**
-         * Decompress the block data. Uses the known number of sites to get an
-         * upper bound on the uncompressed data to simplify the code and avoid
-         * reallocation.
-         * @param compressed
-         * @param sites
-         * @return
-         */
+        // map from block GMY index to vect of data
+        using block_cache = std::map<U64, std::vector<char>>;
+
+        // Load compressed data from the file if the predicate is true for that block's GMY index.
+        template <std::predicate<std::size_t> PredT>
+        block_cache ReadCompressedBlockData(GmyReadResult const& geometry, PredT&& p);
+
+
+        // Given a vector of the block OCT ids that we want, add a
+        // one-block halo and read those blocks into the
+        // GmyReadResult.
+        void ReadInBlocksWithHalo(GmyReadResult& geometry,
+                                  const std::vector<U64>& blocksWanted);
+
+        // Given a vector of the block OCT ids wanted, add a one-block
+        // halo and return a vector of those block OCT ids.
+        std::vector<U64> DecideWhichBlocksToReadIncludingHalo(
+            const GmyReadResult& geometry, const std::vector<U64>& blocks_wanted
+        ) const;
+
+        // Parse a compressed block of data into the geometry at the given GMY index
+        void DeserialiseBlock(GmyReadResult& geometry,
+                              std::vector<char> const& compressed_data, site_t block_gmy);
+
+        // Decompress the block data
         std::vector<char> DecompressBlockData(const std::vector<char>& compressed,
                                               const unsigned int uncompressedBytes);
 
+        // Given a reader for a block's data, parse that into the
+        // GmyReadResult at the given index.
         void ParseBlock(GmyReadResult& geometry, const site_t block,
                         io::XdrReader& reader);
 
-        /**
-         * Parse the next site from the XDR reader. Note that we return by copy here.
-         * @param reader
-         * @return
-         */
+        // Parse the next site from the XDR reader
         GeometrySite ParseSite(io::XdrReader& reader);
 
-        /**
-         * Calculates the number of the rank used to read in a given block.
-         * Intent is to move this into Decomposition class, which will also handle knowledge of which procs to use for reading, and own the decomposition topology.
-         *
-         * @param blockNumber
-         * @return
-         */
-        proc_t GetReadingCoreForBlock(site_t blockNumber);
-
-        /**
-         * Optimise the domain decomposition using ParMetis. We take this approach because ParMetis
-         * is more efficient when given an initial decomposition to start with.
-         * @param geometry
-         * @param procForEachBlock
-         */
+        // Use the OptimisedDecomposition class to refine a simple,
+        // block-level initial decomposition.
         void OptimiseDomainDecomposition(GmyReadResult& geometry,
                                          const std::vector<proc_t>& procForEachBlock);
 
+        // Check for self-consistency
         void ValidateGeometry(const GmyReadResult& geometry);
 
-        /**
-         * Get the length of the header section, given the number of blocks.
-         *
-         * @param blockCount The number of blocks.
-         * @return
-         */
+        // Get the length of the header section, given the number of blocks.
         site_t GetHeaderLength(site_t blockCount) const;
 
-        void RereadBlocks(GmyReadResult& geometry, const std::vector<idx_t>& movesPerProc,
-                          const std::vector<idx_t>& movesList,
-                          const std::vector<int>& procForEachBlock);
-
-        void ImplementMoves(GmyReadResult& geometry, const std::vector<proc_t>& procForEachBlock,
-                            const std::vector<idx_t>& movesFromEachProc,
-                            const std::vector<idx_t>& movesList) const;
+        // Given knowledge of which sites are needed, read the blocks
+        // required (i.e. the blocks the sites belong to plus a
+        // one-block halo) into the GmyReadResult.
+        void RereadBlocks(GmyReadResult& geometry, SiteVec const& staying_sites,
+                          MovesMap const& arriving_sites);
 
-        /**
-         * True if we should validate the geometry.
-         * @return
-         */
-        bool ShouldValidate() const;
-
-        //! The rank which reads in the header information.
-        static const proc_t HEADER_READING_RANK = 0;
-        //! The number of cores (0-READING_GROUP_SIZE-1) that read files in parallel
-        static const proc_t READING_GROUP_SIZE = HEMELB_READING_GROUP_SIZE;
+        // Set the rank of sites assigned to this process in the GmyReadResult.
+        void ImplementMoves(GmyReadResult& geometry, SiteVec const& staying,
+                            MovesMap const& arriving, MovesMap const& leaving) const;
 
         //! Info about the connectivity of the lattice.
         const lb::LatticeInfo& latticeInfo;
-        //! File accessed to read in the geometry data.
+
+        // File accessed to read in the geometry data.
+        //
+        // We are going
+        // to read the whole file collectively and sequentially in
+        // chunks. The node leaders will read data either into private
+        // memory and broadcast or into node-shared memory and do a
+        // barrier/fence.
         net::MpiFile file;
-        //! Information about the file, to give cues and hints to MPI.
 
-        //! Communication info for all ranks that will need a slice of the geometry
-        net::MpiCommunicator computeComms;
+        //! Communicator for all ranks that will need a slice of the geometry
+        net::IOCommunicator computeComms;
 
-        //! The number of fluid sites on each block in the geometry
+        //! How many blocks with at least one fluid site
+        U64 nFluidBlocks;
+        //! The number of fluid sites on each block in the file.
         std::vector<site_t> fluidSitesOnEachBlock;
-        //! The number of bytes each block takes up while still compressed.
+        //! The number of bytes each block in the file takes up while still compressed.
         std::vector<unsigned int> bytesPerCompressedBlock;
-        //! The number of bytes each block takes up when uncompressed.
+        //! The number of bytes each block in the file takes up when uncompressed.
         std::vector<unsigned int> bytesPerUncompressedBlock;
-        //! The processor assigned to each block.
+        //! The process assigned to each block.
         std::vector<proc_t> principalProcForEachBlock;
+        //! The process for fluid-containing blocks in octree order
+        std::vector<proc_t> procForBlockOct;
 
         //! Timings object for recording the time taken for each step of the domain decomposition.
         hemelb::reporting::Timers &timings;

diff --git a/Code/geometry/GmyReadResult.cc b/Code/geometry/GmyReadResult.cc
@@ -39,20 +39,11 @@ namespace hemelb::geometry {
 
     site_t GmyReadResult::FindFluidSiteIndexInBlock(site_t fluidSiteBlock, site_t neighbourSiteId) const
     {
-        site_t SiteId = 0;
-        // Calculate the site's id over the whole geometry,
-        for (site_t neighSite = 0; neighSite < GetSitesPerBlock(); neighSite++)
-        {
-            if (neighSite == neighbourSiteId)
-            {
-                break;
+        auto& sites = Blocks[fluidSiteBlock].Sites;
+        return std::count_if(
+            &sites[0], &sites[neighbourSiteId], [](GeometrySite const& s) {
+                return s.isFluid;
             }
-            else if (Blocks[fluidSiteBlock].Sites[neighSite].targetProcessor != SITE_OR_BLOCK_SOLID)
-            {
-                SiteId++;
-            }
-        }
-
-        return SiteId;
+        );
     }
-}
+}
diff --git a/Code/geometry/GmyReadResult.h b/Code/geometry/GmyReadResult.h
@@ -7,6 +7,7 @@
 #define HEMELB_GEOMETRY_GMYREADRESULT_H
 
 #include <memory>
+#include <map>
 #include <vector>
 
 #include "units.h"
@@ -18,6 +19,15 @@
 namespace hemelb::geometry
 {
     namespace octree { class DistributedStore; }
+
+    // A site description is the block's OCT index (hence U64 to
+    // match the tree's choice) and the index of the site within
+    // the block (could be U16, but use 64 as padding will force
+    // that anyway).
+    using SiteDesc = std::array<U64,2>;
+    using SiteVec = std::vector<SiteDesc>;
+    using MovesMap = std::map<int, SiteVec>;
+
     /***
      * Model of the information in a geometry file
      */
@@ -61,6 +71,11 @@ namespace hemelb::geometry
           return (blockI * dimensionsInBlocks.y() + blockJ) * dimensionsInBlocks.z() + blockK;
         }
 
+        inline U64 GetBlockIdFromBlockCoordinates(Vec16 ijk) const {
+            auto IJK = ijk.as<U64>();
+            return (IJK[0] * dimensionsInBlocks[1] + IJK[1]) * dimensionsInBlocks[2] + IJK[2];
+        }
+
         /***
          * Get the coordinates of a block from a block id.
          */
@@ -136,5 +151,53 @@ namespace hemelb::geometry
         std::unique_ptr<octree::DistributedStore> block_store;
     };
 
+    // This iterator will move through the sites that make up a
+    // block. On dereference it returns a pair of the current 3D
+    // position and the 1D index in geometry file order.
+    struct BlockSiteIterator {
+        GmyReadResult const* gmy;
+        Vec16 pos;
+        site_t idx;
+
+        using value_type = std::pair<Vec16, site_t>;
+        inline value_type operator*() const {
+            return {pos, idx};
+        }
+
+        inline BlockSiteIterator& operator++() {
+            auto B = gmy->GetBlockSize();
+            idx++;
+            if (++pos[2] == B) {
+                pos[2] = 0;
+                if (++pos[1] == B) {
+                    pos[1] = 0;
+                    ++pos[0];
+                }
+            }
+            return *this;
+        }
+
+        inline friend bool operator==(BlockSiteIterator const& l, BlockSiteIterator const& r) {
+          return l.idx == r.idx;
+        }
+    };
+
+    // Represent the range of valid sites in a block that belongs to
+    // the GmyReadResult.
+    struct BlockSiteRange {
+        GmyReadResult const* gmy;
+        auto begin() const {
+            return BlockSiteIterator{gmy, {0,0,0}, 0};
+        }
+        auto end() const {
+            auto B = gmy->GetBlockSize();
+            return BlockSiteIterator{gmy, {B, B, B}, B*B*B};
+        }
+    };
+
+    // Helper for the above.
+    inline auto IterSitesInBlock(GmyReadResult const& gmy) {
+        return BlockSiteRange{&gmy};
+    }
 }
 #endif // HEMELB_GEOMETRY_GMYREADRESULT_H
diff --git a/Code/geometry/LookupTree.h b/Code/geometry/LookupTree.h
@@ -297,6 +297,10 @@ namespace hemelb::geometry::octree
             return storage_rank.size();
         }
 
+        [[nodiscard]] inline auto& GetBlockOwnerRank() const {
+            return storage_rank;
+        }
+
         // Allow many writes to the same block, wherever it is.
         // Constructed by a WriteSession below
         struct BlockWriteChunk {

diff --git a/Code/geometry/decomposition/BasicDecomposition.cc b/Code/geometry/decomposition/BasicDecomposition.cc
@@ -79,8 +79,8 @@ namespace hemelb::geometry::decomposition
                      rank_for_block.begin(), rank_for_block.end(), comm_size);
 
         // We need to return data organised in GMY file order
-        // Initialise output to -1 => SOLID, we will overwrite the non-solid below
-        std::fill(procAssignedToEachBlock.begin(), procAssignedToEachBlock.end(), -1);
+        // Initialise output to SOLID, we will overwrite the non-solid below
+        std::fill(procAssignedToEachBlock.begin(), procAssignedToEachBlock.end(), SITE_OR_BLOCK_SOLID);
 
         // We need to know the octree ID to map to 3D coords.
         auto& block_oct_ids = tree.levels[tree.n_levels].node_ids;
@@ -95,7 +95,7 @@ namespace hemelb::geometry::decomposition
         return rank_for_block;
     }
 
-  void BasicDecomposition::Validate(std::vector<proc_t>& procAssignedToEachBlock, net::MpiCommunicator const& communicator) const
+    void BasicDecomposition::Validate(std::vector<proc_t>& procAssignedToEachBlock, net::MpiCommunicator const& communicator) const
     {
         log::Logger::Log<log::Debug, log::OnePerCore>("Validating procForEachBlock");