diff --git a/CMake/HemeLbOptions.cmake b/CMake/HemeLbOptions.cmake index 9b76e6191..08b03c2bb 100644 --- a/CMake/HemeLbOptions.cmake +++ b/CMake/HemeLbOptions.cmake @@ -40,8 +40,6 @@ endif() # pass_cachevar(HEMELB HEMELB_EXECUTABLE "hemelb" STRING "File name of executable to produce") -pass_cachevar(HEMELB HEMELB_READING_GROUP_SIZE 5 - STRING "Number of cores to use to read geometry file.") pass_cachevar_choice(HEMELB HEMELB_LOG_LEVEL Info STRING "Log level" Critical Error Warning Info Debug Trace) diff --git a/Code/CMakeLists.txt b/Code/CMakeLists.txt index 9fed5225b..d1ec98a37 100644 --- a/Code/CMakeLists.txt +++ b/Code/CMakeLists.txt @@ -28,7 +28,6 @@ if (HEMELB_USE_ALL_WARNINGS_GNU) endif() add_definitions(-DHEMELB_CODE) -add_definitions(-DHEMELB_READING_GROUP_SIZE=${HEMELB_READING_GROUP_SIZE}) add_definitions(-DHEMELB_COMPUTE_ARCHITECTURE=${HEMELB_COMPUTE_ARCHITECTURE}) if(HEMELB_VALIDATE_GEOMETRY) diff --git a/Code/geometry/Domain.cc b/Code/geometry/Domain.cc index 868996a58..bc7f36c19 100644 --- a/Code/geometry/Domain.cc +++ b/Code/geometry/Domain.cc @@ -66,206 +66,224 @@ namespace hemelb::geometry blockCount = blockCounts.x() * blockCounts.y() * blockCounts.z(); } - void Domain::ProcessReadSites(const GmyReadResult & readResult) - { - log::Logger::Log("Processing sites assigned to each MPI process"); - const auto max_site_index = sites - util::Vector3D::Ones(); - blocks.resize(rank_for_site_store->GetBlockCount()); - - totalSharedFs = 0; - - std::vector domainEdgeSiteData[COLLISION_TYPES]; - std::vector midDomainSiteData[COLLISION_TYPES]; - std::vector domainEdgeBlockNumber[COLLISION_TYPES]; - std::vector midDomainBlockNumber[COLLISION_TYPES]; - std::vector domainEdgeSiteNumber[COLLISION_TYPES]; - std::vector midDomainSiteNumber[COLLISION_TYPES]; - std::vector > domainEdgeWallNormals[COLLISION_TYPES]; - std::vector > midDomainWallNormals[COLLISION_TYPES]; - std::vector domainEdgeWallDistance[COLLISION_TYPES]; - std::vector midDomainWallDistance[COLLISION_TYPES]; - - proc_t localRank = comms.Rank(); - - // Iterate over all blocks in site units - for (auto leaf: rank_for_site_store->GetTree().IterLeaves()) - { - auto block_ijk = leaf.coords(); - site_t blockGmyIdx = GetBlockGmyIdxFromBlockCoords(block_ijk); - auto const& blockReadIn = readResult.Blocks[blockGmyIdx]; - - if (blockReadIn.Sites.empty()) + void Domain::ProcessReadSites(const GmyReadResult & readResult) + { + log::Logger::Log("Processing sites assigned to each MPI process"); + const auto max_site_index = sites - util::Vector3D::Ones(); + blocks.resize(rank_for_site_store->GetBlockCount()); + + totalSharedFs = 0; + + std::vector domainEdgeSiteData[COLLISION_TYPES]; + std::vector midDomainSiteData[COLLISION_TYPES]; + std::vector domainEdgeBlockNumber[COLLISION_TYPES]; + std::vector midDomainBlockNumber[COLLISION_TYPES]; + std::vector domainEdgeSiteNumber[COLLISION_TYPES]; + std::vector midDomainSiteNumber[COLLISION_TYPES]; + std::vector > domainEdgeWallNormals[COLLISION_TYPES]; + std::vector > midDomainWallNormals[COLLISION_TYPES]; + std::vector domainEdgeWallDistance[COLLISION_TYPES]; + std::vector midDomainWallDistance[COLLISION_TYPES]; + + proc_t localRank = comms.Rank(); + + // We need to know the coordinates of all the domain edge + // sites below so we can loop over them to figure out which + // processes they live on once the RMA data structures are + // initialised. 
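+        // In outline: a first pass over the blocks we own classifies
+        // each local fluid site as mid-domain or domain-edge and
+        // records the edge sites; a second pass, run after
+        // PopulateWithReadData has built the distributed store,
+        // resolves the owning rank of each edge site's neighbours to
+        // fill in neighbouringProcs.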
+ std::vector> edge_sites; + + // This closure returns which rank a site (given by global + // 3D coordinate) lives on, according to the + // GmyReadResult. Since that is only guaranteed to know + // the rank of sites which are assigned to this rank, it + // may well return UNKNOWN_PROCESS + auto read_result_rank_for_site = [this, &max_site_index, &readResult]( + util::Vector3D const& global_site_coords, + Vec16& block_coords, site_t& site_gmy_idx + ) { + // If outside the bounding box it is definitely non-fluid + if (!global_site_coords.IsInRange( + util::Vector3D::Zero(), + max_site_index + )) + return SITE_OR_BLOCK_SOLID; + + // ... (that is actually being simulated and not a solid)... + block_coords = (global_site_coords / readResult.GetBlockSize()).as(); + site_t block_gmy_idx = readResult.GetBlockIdFromBlockCoordinates(block_coords.x(), + block_coords.y(), + block_coords.z()); + + // Move on if the neighbour is in a block of solids + // in which case the block will contain zero sites + // Or on if the neighbour site is solid + // in which case the targetProcessor is SITE_OR_BLOCK_SOLID + // Or the neighbour is also on this processor + // in which case the targetProcessor is localRank + if (readResult.Blocks[block_gmy_idx].Sites.empty()) + return SITE_OR_BLOCK_SOLID; + + auto site_local_coords = global_site_coords % readResult.GetBlockSize(); + site_gmy_idx = readResult.GetSiteIdFromSiteCoordinates(site_local_coords.x(), + site_local_coords.y(), + site_local_coords.z()); + return readResult.Blocks[block_gmy_idx].Sites[site_gmy_idx].targetProcessor; + }; + + for (auto leaf: rank_for_site_store->GetTree().IterLeaves()) { + auto block_ijk = leaf.coords(); + site_t blockGmyIdx = GetBlockGmyIdxFromBlockCoords(block_ijk); + auto const& blockReadIn = readResult.Blocks[blockGmyIdx]; + + if (blockReadIn.Sites.empty()) + continue; + + auto blockOctIdx = leaf.index(); + auto lowest_site_in_block = block_ijk.as() * GetBlockSize(); + + // Iterate over all sites within the current block. + for ( + auto siteTraverser = SiteTraverser(*this); + siteTraverser.CurrentLocationValid(); + siteTraverser.TraverseOne() + ) { + site_t localSiteId = siteTraverser.GetCurrentIndex(); + // Create block if required + if (blocks[blockOctIdx].IsEmpty()) + blocks[blockOctIdx] = Block(GetSitesPerBlockVolumeUnit()); + + auto assignedRank = blockReadIn.Sites[localSiteId].targetProcessor; + blocks[blockOctIdx].SetProcessorRankForSite(localSiteId, assignedRank); + + // If the site is not on this processor, continue. + if (assignedRank != localRank) continue; - auto blockOctIdx = leaf.index(); - auto lowest_site_in_block = block_ijk.as() * GetBlockSize(); - - // Iterate over all sites within the current block. - for (auto siteTraverser = SiteTraverser(*this); - siteTraverser.CurrentLocationValid(); siteTraverser.TraverseOne()) - { - site_t localSiteId = siteTraverser.GetCurrentIndex(); - // Create block if required - if (blocks[blockOctIdx].IsEmpty()) - blocks[blockOctIdx] = Block(GetSitesPerBlockVolumeUnit()); - - auto assignedRank = blockReadIn.Sites[localSiteId].targetProcessor; - blocks[blockOctIdx].SetProcessorRankForSite(localSiteId, assignedRank); - - // If the site is not on this processor, continue. - if (assignedRank != localRank) + bool isMidDomainSite = true; + util::Vector3D siteGlobalCoords = lowest_site_in_block + siteTraverser.GetCurrentLocation(); + // Iterate over all non-zero direction vectors. 
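+            // (Direction 0 is the rest vector, so on e.g. a D3Q19
+            // lattice this loops over the 18 propagating directions.)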
+ for (unsigned int l = 1; l < latticeInfo.GetNumVectors(); l++) { + // Find the neighbour site co-ords in this direction. + util::Vector3D neighbourGlobalCoords = siteGlobalCoords + + latticeInfo.GetVector(l).as(); + Vec16 neighbourBlock; + site_t neighbourSiteId; + site_t neighbourProc = read_result_rank_for_site( + neighbourGlobalCoords, neighbourBlock, neighbourSiteId + ); + if (neighbourProc == SITE_OR_BLOCK_SOLID || localRank == neighbourProc) continue; + isMidDomainSite = false; + totalSharedFs++; + } - bool isMidDomainSite = true; - // Iterate over all non-zero direction vectors. - for (unsigned int l = 1; l < latticeInfo.GetNumVectors(); l++) - { - // Find the neighbour site co-ords in this direction. - util::Vector3D neighbourGlobalCoords = - lowest_site_in_block - + siteTraverser.GetCurrentLocation() - + latticeInfo.GetVector(l).as(); - - // If outside the bounding box it is definitely non-fluid - if (!neighbourGlobalCoords.IsInRange( - util::Vector3D::Zero(), - max_site_index - )) - continue; - - - // ... (that is actually being simulated and not a solid)... - auto neighbourBlock = neighbourGlobalCoords / readResult.GetBlockSize(); - auto neighbourSite = neighbourGlobalCoords % readResult.GetBlockSize(); - site_t neighBlockGmyIdx = readResult.GetBlockIdFromBlockCoordinates(neighbourBlock.x(), - neighbourBlock.y(), - neighbourBlock.z()); - - // Move on if the neighbour is in a block of solids - // in which case the block will contain zero sites - // Or on if the neighbour site is solid - // in which case the targetProcessor is SITE_OR_BLOCK_SOLID - // Or the neighbour is also on this processor - // in which case the targetProcessor is localRank - if (readResult.Blocks[neighBlockGmyIdx].Sites.empty()) - continue; - - site_t neighbourSiteId = readResult.GetSiteIdFromSiteCoordinates(neighbourSite.x(), - neighbourSite.y(), - neighbourSite.z()); - - proc_t neighbourProc = readResult.Blocks[neighBlockGmyIdx].Sites[neighbourSiteId].targetProcessor; - if (neighbourProc == SITE_OR_BLOCK_SOLID || localRank == neighbourProc) - continue; + if (!isMidDomainSite) + edge_sites.push_back(siteGlobalCoords); + + // Set the collision type data. map_block site data is renumbered according to + // fluid site numbers within a particular collision type. + SiteData siteData(blockReadIn.Sites[localSiteId]); + int l = -1; + switch (siteData.GetCollisionType()) { + case FLUID: + l = 0; + break; + case WALL: + l = 1; + break; + case INLET: + l = 2; + break; + case OUTLET: + l = 3; + break; + case (INLET | WALL): + l = 4; + break; + case (OUTLET | WALL): + l = 5; + break; + } - isMidDomainSite = false; - totalSharedFs++; - - // Iterate over neighbouring processors until we find the one with the - // neighbouring site on it. - auto neighProcWithSite = std::find_if( - neighbouringProcs.begin(), neighbouringProcs.end(), - [&](NeighbouringProcessor const& np) { - return np.Rank == neighbourProc; - }); - if (neighProcWithSite == neighbouringProcs.end()) { - // We didn't find a neighbour-proc with the - // neighbour-site on it, so we need a new - // neighbouring processor. 
- - // Store rank of neighbour in >neigh_proc[neigh_procs] - NeighbouringProcessor lNewNeighbour; - lNewNeighbour.SharedDistributionCount = 1; - lNewNeighbour.Rank = neighbourProc; - neighbouringProcs.push_back(lNewNeighbour); - - // if debugging then output decisions with reasoning for all neighbour processors - log::Logger::Log("domain_type: added %i as neighbour for %i because site %i in block %i is neighbour to site %i in block %i in direction (%i,%i,%i)\n", - (int) neighbourProc, - (int) localRank, - (int) neighbourSiteId, - (int) neighBlockGmyIdx, - (int) localSiteId, - (int) blockGmyIdx, - latticeInfo.GetVector(l).x(), - latticeInfo.GetVector(l).y(), - latticeInfo.GetVector(l).z()); - } else { - // Did find it, increment the shared count - ++neighProcWithSite->SharedDistributionCount; - } + const util::Vector3D& normal = blockReadIn.Sites[localSiteId].wallNormalAvailable ? + blockReadIn.Sites[localSiteId].wallNormal : + util::Vector3D(NO_VALUE); + + if (isMidDomainSite) { + midDomainBlockNumber[l].push_back(blockOctIdx); + midDomainSiteNumber[l].push_back(localSiteId); + midDomainSiteData[l].push_back(siteData); + midDomainWallNormals[l].push_back(normal); + for (Direction direction = 1; direction < latticeInfo.GetNumVectors(); direction++) { + midDomainWallDistance[l].push_back(blockReadIn.Sites[localSiteId].links[direction - 1].distanceToIntersection); } - - // Set the collision type data. map_block site data is renumbered according to - // fluid site numbers within a particular collision type. - SiteData siteData(blockReadIn.Sites[localSiteId]); - int l = -1; - switch (siteData.GetCollisionType()) - { - case FLUID: - l = 0; - break; - case WALL: - l = 1; - break; - case INLET: - l = 2; - break; - case OUTLET: - l = 3; - break; - case (INLET | WALL): - l = 4; - break; - case (OUTLET | WALL): - l = 5; - break; + } else { + domainEdgeBlockNumber[l].push_back(blockOctIdx); + domainEdgeSiteNumber[l].push_back(localSiteId); + domainEdgeSiteData[l].push_back(siteData); + domainEdgeWallNormals[l].push_back(normal); + for (Direction direction = 1; direction < latticeInfo.GetNumVectors(); direction++) { + domainEdgeWallDistance[l].push_back(blockReadIn.Sites[localSiteId].links[direction - 1].distanceToIntersection); } + } + } + } - const util::Vector3D& normal = blockReadIn.Sites[localSiteId].wallNormalAvailable ? - blockReadIn.Sites[localSiteId].wallNormal : - util::Vector3D(NO_VALUE); - - //SiteData siteData(blockReadIn.Sites[localSiteId]); + PopulateWithReadData(midDomainBlockNumber, + midDomainSiteNumber, + midDomainSiteData, + midDomainWallNormals, + midDomainWallDistance, + domainEdgeBlockNumber, + domainEdgeSiteNumber, + domainEdgeSiteData, + domainEdgeWallNormals, + domainEdgeWallDistance); + + // We now have the distributed store setup, so can find which + // process any site lives one. Set up the neighbouring + // processes for the edge sites. + for (auto const& site_global_coords: edge_sites) { + for (unsigned int l = 1; l < latticeInfo.GetNumVectors(); l++) { + // Find the neighbour site co-ords in this direction. 
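+          // e.g. a site at (19, 3, 42) in direction (1, 0, -1) has its
+          // neighbour in that direction at (20, 3, 41).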
+ util::Vector3D neigh_global_coords = site_global_coords + latticeInfo.GetVector(l).as(); + Vec16 neigh_block; + site_t neigh_site_id; + site_t rrProc = read_result_rank_for_site( + neigh_global_coords, neigh_block, neigh_site_id + ); + + if (rrProc == SITE_OR_BLOCK_SOLID || rrProc == localRank) + continue; - if (isMidDomainSite) - { - midDomainBlockNumber[l].push_back(blockOctIdx); - midDomainSiteNumber[l].push_back(localSiteId); - midDomainSiteData[l].push_back(siteData); - midDomainWallNormals[l].push_back(normal); - for (Direction direction = 1; direction < latticeInfo.GetNumVectors(); direction++) - { - midDomainWallDistance[l].push_back(blockReadIn.Sites[localSiteId].links[direction - 1].distanceToIntersection); - } + auto [neighbourProc, remoteSiteIdx] = rank_for_site_store->GetSiteData( + neigh_block, neigh_site_id + ); + auto neighProcWithSite = std::find_if( + neighbouringProcs.begin(), neighbouringProcs.end(), + [&](NeighbouringProcessor const& np) { + return np.Rank == neighbourProc; } - else - { - domainEdgeBlockNumber[l].push_back(blockOctIdx); - domainEdgeSiteNumber[l].push_back(localSiteId); - domainEdgeSiteData[l].push_back(siteData); - domainEdgeWallNormals[l].push_back(normal); - for (Direction direction = 1; direction < latticeInfo.GetNumVectors(); direction++) - { - domainEdgeWallDistance[l].push_back(blockReadIn.Sites[localSiteId].links[direction - 1].distanceToIntersection); - } - } - + ); + + if (neighProcWithSite == neighbouringProcs.end()) { + // We didn't find a neighbour-proc with the + // neighbour-site on it, so we need a new + // neighbouring processor. + + // Store rank of neighbour in >neigh_proc[neigh_procs] + NeighbouringProcessor lNewNeighbour; + lNewNeighbour.SharedDistributionCount = 1; + lNewNeighbour.Rank = neighbourProc; + neighbouringProcs.push_back(lNewNeighbour); + } else { + // Did find it, increment the shared count + ++neighProcWithSite->SharedDistributionCount; } - } - - PopulateWithReadData(midDomainBlockNumber, - midDomainSiteNumber, - midDomainSiteData, - midDomainWallNormals, - midDomainWallDistance, - domainEdgeBlockNumber, - domainEdgeSiteNumber, - domainEdgeSiteData, - domainEdgeWallNormals, - domainEdgeWallDistance); } + } void Domain::PopulateWithReadData( const std::vector midDomainBlockNumbers[COLLISION_TYPES], diff --git a/Code/geometry/GeometryBlock.h b/Code/geometry/GeometryBlock.h index fcbb739f4..c89f4c729 100644 --- a/Code/geometry/GeometryBlock.h +++ b/Code/geometry/GeometryBlock.h @@ -9,20 +9,16 @@ #include #include "geometry/GeometrySite.h" -namespace hemelb +namespace hemelb::geometry { - namespace geometry - { /*** * Model of the information stored for a block in a geometry file. 
* Just gives the array of sites */ struct BlockReadResult { - public: std::vector Sites; }; - } } #endif /* HEMELB_GEOMETRY_GEOMETRYBLOCK_H */ diff --git a/Code/geometry/GeometryReader.cc b/Code/geometry/GeometryReader.cc index e98535bd2..ce5bc3fe6 100644 --- a/Code/geometry/GeometryReader.cc +++ b/Code/geometry/GeometryReader.cc @@ -16,9 +16,12 @@ #include "geometry/GeometryReader.h" #include "geometry/LookupTree.h" #include "net/net.h" +#include "net/SparseExchange.h" #include "net/IOCommunicator.h" #include "log/Logger.h" +#include "util/span.h" #include "util/utilityFunctions.h" +#include "util/Iterator.h" #include "constants.h" namespace hemelb::geometry @@ -76,43 +79,28 @@ namespace hemelb::geometry GmyReadResult GeometryReader::LoadAndDecompose(const std::string& dataFilePath) { - log::Logger::Log("Starting file read timer"); - timings[hemelb::reporting::Timers::fileRead].Start(); - - // Create hints about how we'll read the file. See Chapter 13, page 400 of the MPI 2.2 spec. - MPI_Info fileInfo; - HEMELB_MPI_CALL(MPI_Info_create, (&fileInfo)); - std::string accessStyle = "access_style"; - std::string accessStyleValue = "sequential"; - std::string buffering = "collective_buffering"; - std::string bufferingValue = "true"; - - HEMELB_MPI_CALL(MPI_Info_set, - (fileInfo, const_cast (accessStyle.c_str()), const_cast (accessStyleValue.c_str()))); - HEMELB_MPI_CALL(MPI_Info_set, - (fileInfo, const_cast (buffering.c_str()), const_cast (bufferingValue.c_str()))); - - // Open the file. - file = net::MpiFile::Open(computeComms, dataFilePath, MPI_MODE_RDONLY, fileInfo); - log::Logger::Log("Opened config file %s", dataFilePath.c_str()); + timings[reporting::Timers::fileRead].Start(); + + // Open the file for read on node leaders + if (computeComms.AmNodeLeader()) { + file = net::MpiFile::Open( + computeComms.GetLeadersComm(), dataFilePath, MPI_MODE_RDONLY, MPI_INFO_NULL + ); + } + + log::Logger::Log("Opened geometry file %s", dataFilePath.c_str()); // TODO: Why is there this fflush? fflush(nullptr); - // Set the view to the file. - file.SetView(0, MPI_CHAR, MPI_CHAR, "native", fileInfo); - log::Logger::Log("Reading file preamble"); GmyReadResult geometry = ReadPreamble(); log::Logger::Log("Reading file header"); ReadHeader(geometry.GetBlockCount()); - - // Close the file - only the ranks participating in the topology need to read it again. - file.Close(); - + timings[reporting::Timers::fileRead].Stop(); { - timings[hemelb::reporting::Timers::initialDecomposition].Start(); + timings[reporting::Timers::initialDecomposition].Start(); principalProcForEachBlock.resize(geometry.GetBlockCount()); log::Logger::Log("Creating block-level octree"); @@ -120,46 +108,53 @@ namespace hemelb::geometry geometry.GetBlockDimensions().as(), fluidSitesOnEachBlock ); + nFluidBlocks = blockTree.levels.back().node_ids.size(); + log::Logger::Log( + "Geometry has %lu / %ld active blocks, total %ld sites", + nFluidBlocks, + geometry.GetBlockCount(), + blockTree.levels[0].sites_per_node[0] + ); // Get an initial base-level decomposition of the domain macro-blocks over processors. // This will later be improved upon by ParMetis. 
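+      // (Note the decomposition is block-granular: each fluid-containing
+      // block is assigned to exactly one rank, so no block is split
+      // between ranks at this stage - see BasicDecomposition::Decompose
+      // for how the blocks are shared out.)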
log::Logger::Log("Beginning initial decomposition"); decomposition::BasicDecomposition basicDecomposer(geometry, - computeComms); + computeComms.Size()); - auto owner_for_block = basicDecomposer.Decompose(blockTree, principalProcForEachBlock); + // This vector only has entries for blocks that have a least one fluid site + procForBlockOct = basicDecomposer.Decompose(blockTree, principalProcForEachBlock); geometry.block_store = std::make_unique( geometry.GetSitesPerBlock(), std::move(blockTree), - std::move(owner_for_block), + procForBlockOct, computeComms ); - if (ShouldValidate()) { + if constexpr (build_info::VALIDATE_GEOMETRY) { log::Logger::Log("Validating initial decomposition"); - basicDecomposer.Validate(principalProcForEachBlock); + basicDecomposer.Validate(principalProcForEachBlock, computeComms); } - timings[hemelb::reporting::Timers::initialDecomposition].Stop(); + timings[reporting::Timers::initialDecomposition].Stop(); } - // Perform the initial read-in. - log::Logger::Log("Reading in my blocks"); - - // Reopen in the file just between the nodes in the topology decomposition. Read in blocks - // local to this node. - file = net::MpiFile::Open(computeComms, dataFilePath, MPI_MODE_RDONLY, fileInfo); - - ReadInBlocksWithHalo(geometry, principalProcForEachBlock, computeComms.Rank()); - - if (ShouldValidate()) + timings[reporting::Timers::fileRead].Start(); { + std::vector blocks_wanted; + blocks_wanted.reserve((2*nFluidBlocks) / computeComms.Size()); + for (U64 i = 0; i < nFluidBlocks; ++i) { + if (procForBlockOct[i] == computeComms.Rank()) + blocks_wanted.push_back(i); + } + ReadInBlocksWithHalo(geometry, blocks_wanted); + } + if constexpr (build_info::VALIDATE_GEOMETRY) { ValidateGeometry(geometry); } + timings[reporting::Timers::fileRead].Stop(); - timings[hemelb::reporting::Timers::fileRead].Stop(); - - hemelb::log::Logger::Log("Optimising the domain decomposition."); - timings[hemelb::reporting::Timers::domainDecomposition].Start(); + log::Logger::Log("Optimising the domain decomposition."); + timings[reporting::Timers::domainDecomposition].Start(); // Having done an initial decomposition of the geometry, and read in the data, we optimise the // domain decomposition. @@ -167,31 +162,28 @@ namespace hemelb::geometry OptimiseDomainDecomposition(geometry, principalProcForEachBlock); log::Logger::Log("Ending domain decomposition optimisation"); - if (ShouldValidate()) - { + if constexpr (build_info::VALIDATE_GEOMETRY) { log::Logger::Log("Validating optimised decomposition"); ValidateGeometry(geometry); } - file.Close(); - // Finish up - close the file, set the timings, deallocate memory. 
- HEMELB_MPI_CALL(MPI_Info_free, (&fileInfo)); - - timings[hemelb::reporting::Timers::domainDecomposition].Stop(); + timings[reporting::Timers::domainDecomposition].Stop(); return geometry; } - std::vector GeometryReader::ReadOnAllTasks(unsigned nBytes) + std::vector GeometryReader::ReadAllProcesses(std::size_t start, unsigned nBytes) { - std::vector buffer(nBytes); - const net::MpiCommunicator& comm = file.GetCommunicator(); - if (comm.Rank() == HEADER_READING_RANK) - { - file.Read(buffer); - } - comm.Broadcast(buffer, HEADER_READING_RANK); - return buffer; + // result + std::vector buffer(nBytes); + auto sp = to_span(buffer); + if (computeComms.AmNodeLeader()) { + // file is opened by the leaders comm only + file.ReadAtAll(start, sp); + } + // Broadcast from node leader to others + computeComms.GetNodeComm().Broadcast(sp, 0); + return buffer; } /** @@ -199,11 +191,11 @@ namespace hemelb::geometry */ GmyReadResult GeometryReader::ReadPreamble() { - std::vector preambleBuffer = ReadOnAllTasks(gmy::PreambleLength); + std::vector preambleBuffer = ReadAllProcesses(0, gmy::PreambleLength); // Create an Xdr translator based on the read-in data. auto preambleReader = io::XdrMemReader(preambleBuffer.data(), - gmy::PreambleLength); + gmy::PreambleLength); uint32_t hlbMagicNumber, gmyMagicNumber, version; // Read in housekeeping values @@ -267,11 +259,11 @@ namespace hemelb::geometry void GeometryReader::ReadHeader(site_t blockCount) { site_t headerByteCount = GetHeaderLength(blockCount); - std::vector headerBuffer = ReadOnAllTasks(headerByteCount); + std::vector headerBuffer = ReadAllProcesses(gmy::PreambleLength, headerByteCount); // Create a Xdr translation object to translate from binary - auto preambleReader = hemelb::io::XdrMemReader(headerBuffer.data(), - headerByteCount); + auto preambleReader = io::XdrMemReader(headerBuffer.data(), + headerByteCount); fluidSitesOnEachBlock.reserve(blockCount); bytesPerCompressedBlock.reserve(blockCount); @@ -291,169 +283,232 @@ namespace hemelb::geometry } } + // Args are vectors with index representing a block ID. + // First arg is which blocks this rank wants + // Second arg is which rank "owns" them + // Returns a pair with: + // - a map from source rank to vector of block IDs wanted from it + // - the total number of blocks wanted. + auto ComputeSources(std::vector const& wanted, std::vector const& ranks_they_are_on) { + std::map> ans; + auto const N = wanted.size(); + unsigned total = 0; + for (std::size_t i = 0; i < N; ++i) { + if (wanted[i]) { + // This will default construct an empty vec if this is the first time + auto& blocks = ans[ranks_they_are_on[i]]; + blocks.push_back(i); + total += 1; + } + } + return std::make_pair(std::move(ans), total); + } + + + // Return a map from block gmy index to compressed data, for + // those blocks that where the predicate returns true + template PredT> + auto GeometryReader::ReadCompressedBlockData(GmyReadResult const& geometry, PredT&& want) -> block_cache { + // Strategy: read the entire file once and filter out those + // blocks we want. + + // Going to read the file in large chunks (but note using + // whole blocks) and only on the node leaders (using + // collective MPI IO). Going to use the node communicator to + // set up a shared memory allocation to hold this and then + // filter in parallel. 
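+    // For example (illustrative sizes only): a 200 MiB site-data section
+    // read through a 64 MiB buffer takes at least four passes; each pass
+    // reads the longest run of whole blocks whose compressed bytes fit
+    // in the buffer, so a pass may read somewhat less than 64 MiB.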
+ constexpr auto MiB = std::size_t(1) << 20; + constexpr std::size_t MAX_GMY_BUFFER_SIZE = 64U * MiB; + + log::Logger::Log("Buffering compressed site data onto owning processes."); + log::Logger::Log("Maximum buffer size %lu B", MAX_GMY_BUFFER_SIZE); + + block_cache ans; + + // Work out where blocks live in the gmy **file** + std::size_t const nBlocksGmy = bytesPerCompressedBlock.size(); + log::Logger::Log("Number of GMY blocks %lu", nBlocksGmy); + std::size_t const dataStart = gmy::PreambleLength + GetHeaderLength(geometry.GetBlockCount()); + + // N + 1 elements, elem i holds the start of block i, elem i+1 holds the end + auto const blockBoundsGmy = [&] () { + std::vector ans(nBlocksGmy + 1); + ans[0] = dataStart; + std::inclusive_scan( + bytesPerCompressedBlock.begin(), bytesPerCompressedBlock.end(), + ans.begin() + 1, std::plus(), dataStart + ); + return ans; + }(); + + log::Logger::Log("Setup node level shared memory"); + + MPI_Win win; + char* local_buf = nullptr; + + // Full size of buffer across node communicator + auto total_buf_size = std::min(blockBoundsGmy[nBlocksGmy] - blockBoundsGmy[0], + MAX_GMY_BUFFER_SIZE); + + auto&& nodeComm = computeComms.GetNodeComm(); + auto local_buf_size = (total_buf_size - 1) / nodeComm.Size() + 1; + + // Need contiguous memory. + // Note local_buf has pointer into that process's "bit" of memory + net::MpiCall{MPI_Win_allocate_shared}( + local_buf_size, 1, MPI_INFO_NULL, nodeComm, &local_buf, &win + ); + // Get the whole thing's start address + char* buf = local_buf - nodeComm.Rank() * local_buf_size; + + // Open a passive access epoch to the shared buffer + net::MpiCall{MPI_Win_lock_all}(MPI_MODE_NOCHECK, win); + + // Get to work reading chunks + std::size_t const* blockBoundsGmy_end = &*blockBoundsGmy.end(); + for (std::size_t i_first_block = 0; i_first_block < nBlocksGmy; /* end of loop */) { + // Given we know which block we're starting at, figure out + // the most whole blocks we can fit in the buffer. + auto first_block_ptr = &blockBoundsGmy[i_first_block]; + auto max_read_pos = *first_block_ptr + total_buf_size; + // upper bound gives the first elem after max_read_pos or _end if none + auto end_ptr = std::upper_bound(first_block_ptr, blockBoundsGmy_end, max_read_pos) - 1; + std::size_t n_blocks = end_ptr - first_block_ptr; + auto read_size = *end_ptr - *first_block_ptr; + log::Logger::Log("Reading blocks from %lu count %lu", i_first_block, n_blocks); + + // Only read on node leader + if (computeComms.AmNodeLeader()) { + auto sp = std::span(buf, read_size); + // Collective on leaders comm + file.ReadAtAll(*first_block_ptr, sp); + } + // Need to wait for leader to read + nodeComm.Barrier(); + // Sync the memory + net::MpiCall{MPI_Win_sync}(win); + + // Now we've read a chunk of the file. Go through it, + // copying out the blocks we want. + for (std::size_t i = 0; i < n_blocks; ++i) { + auto block_gmy = i_first_block + i; + if (want(block_gmy)) { + // Recall std::map::operator[] will create the element. 
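+          // (It is value-initialised, i.e. an empty vector here, which
+          // the resize() below grows to the block's compressed size.)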
+          auto& block_data = ans[block_gmy];
+          block_data.resize(bytesPerCompressedBlock[block_gmy]);
+
+          auto buf_pos = blockBoundsGmy[block_gmy] - blockBoundsGmy[i_first_block];
+          std::memcpy(block_data.data(), &buf[buf_pos], bytesPerCompressedBlock[block_gmy]);
+        }
+      }
+
+      i_first_block += n_blocks;
+    }
+    // Close the access epoch
+    net::MpiCall{MPI_Win_unlock_all}(win);
+    // and free the window & buffer
+    net::MpiCall{MPI_Win_free}(&win);
+
+    log::Logger::Log("Finished caching blocks");
+    return ans;
+  }
+
    /**
-     * Read in the necessary blocks from the file.
+     * Read and deserialise the necessary blocks from the file into
+     * `geometry`. Collective.
+     *
+     * Initially, we have procForBlock, which gives the rank where each
+     * block is wanted (or -1 if unknown).
+     *
+     * - Decide which blocks this rank wants (i.e. those with
+     *   procForBlock == Rank(), plus their neighbours).
+     *
+     * - Collectively read the file, filtering blocks to the ranks that
+     *   want them.
+     *
+     * - Deserialise those blocks into the geometry.
     */
    void GeometryReader::ReadInBlocksWithHalo(GmyReadResult& geometry,
-                                              const std::vector<proc_t>& unitForEachBlock,
-                                              const proc_t localRank)
+                                              const std::vector<U64>& blocksWanted)
    {
      // Create a list of which blocks to read in.
-      timings[hemelb::reporting::Timers::readBlocksPrelim].Start();
+      timings[reporting::Timers::readBlocksPrelim].Start();

      // Populate the list of blocks to read (including a halo one block wide around all
      // local blocks).
      log::Logger::Log("Determining blocks to read");
-      std::vector<bool> readBlock = DecideWhichBlocksToReadIncludingHalo(geometry,
-                                                                         unitForEachBlock,
-                                                                         localRank);
-      if (ShouldValidate())
-      {
-        log::Logger::Log("Validating block sizes");
+      auto halo_wanted = DecideWhichBlocksToReadIncludingHalo(geometry, blocksWanted);

-        // Validate the uncompressed length of the block on disk fits out expectations.
-        for (site_t block = 0; block < geometry.GetBlockCount(); ++block)
-        {
-          if (bytesPerUncompressedBlock[block]
-              > gmy::GetMaxBlockRecordLength(geometry.GetBlockSize(),
-                                             fluidSitesOnEachBlock[block]))
-          {
-            log::Logger::Log("Block %i is %i bytes when the longest possible block should be %i bytes",
-                             block,
-                             bytesPerUncompressedBlock[block],
-                             gmy::GetMaxBlockRecordLength(geometry.GetBlockSize(),
-                                                          fluidSitesOnEachBlock[block]));
-          }
-        }
+      // GMY file indices of blocks we want.
+      std::vector<U64> wanted_gmys;
+      wanted_gmys.reserve(halo_wanted.size());
+      auto&& tree = geometry.block_store->GetTree();
+      for (auto idx: halo_wanted) {
+        auto ijk = tree.GetLeafCoords(idx);
+        wanted_gmys.push_back(geometry.GetBlockIdFromBlockCoordinates(ijk));
      }
+      // Recall we hit every block in GMY order, so sort this. We now
+      // only have to test one value at a time and bump the iterator
+      // forward when it matches.
+      std::sort(wanted_gmys.begin(), wanted_gmys.end());
+
+      auto compressed_block_data = ReadCompressedBlockData(
+          geometry,
+          [lower=wanted_gmys.cbegin(), upper=wanted_gmys.cend()] (std::size_t gmy) mutable {
+            if (lower != upper && *lower == gmy) {
+              ++lower;
+              return true;
+            } else {
+              return false;
+            }
+          }
+      );

-      // Next we spread round the lists of which blocks each core needs access to.
- log::Logger::Log("Informing reading cores of block needs"); - net::Net net = net::Net(computeComms); - Needs needs(geometry.GetBlockCount(), - readBlock, - util::NumericalFunctions::min(READING_GROUP_SIZE, computeComms.Size()), - net, - ShouldValidate()); - - timings[hemelb::reporting::Timers::readBlocksPrelim].Stop(); - log::Logger::Log("Reading blocks"); - timings[hemelb::reporting::Timers::readBlocksAll].Start(); - - // Set the initial offset to the first block, which will be updated as we progress - // through the blocks. - MPI_Offset offset = gmy::PreambleLength - + GetHeaderLength(geometry.GetBlockCount()); - - // Iterate over each block. - for (site_t nextBlockToRead = 0; nextBlockToRead < geometry.GetBlockCount(); - ++nextBlockToRead) - { - // Read in the block on all cores (nothing will be done if this core doesn't need the block). - ReadInBlock(offset, - geometry, - needs.ProcessorsNeedingBlock(nextBlockToRead), - nextBlockToRead, - readBlock[nextBlockToRead]); - - // Update the offset to be ready for the next block. - offset += bytesPerCompressedBlock[nextBlockToRead]; + for (auto& [gmy_idx, data]: compressed_block_data) { + DeserialiseBlock(geometry, data, gmy_idx); } - - timings[hemelb::reporting::Timers::readBlocksAll].Stop(); } - void GeometryReader::ReadInBlock(MPI_Offset offsetSoFar, GmyReadResult& geometry, - const std::vector& procsWantingThisBlock, - const site_t blockNumber, const bool neededOnThisRank) - { - // Easy case if there are no sites on the block. - if (fluidSitesOnEachBlock[blockNumber] <= 0) - { - return; - } - std::vector compressedBlockData; - proc_t readingCore = GetReadingCoreForBlock(blockNumber); - - net::Net net = net::Net(computeComms); - - if (readingCore == computeComms.Rank()) - { - timings[hemelb::reporting::Timers::readBlock].Start(); - // Read the data. - compressedBlockData.resize(bytesPerCompressedBlock[blockNumber]); - file.ReadAt(offsetSoFar, compressedBlockData); - - // Spread it. - for (std::vector::const_iterator receiver = procsWantingThisBlock.begin(); - receiver != procsWantingThisBlock.end(); receiver++) - { - if (*receiver != computeComms.Rank()) - { - - net.RequestSendV(std::span(compressedBlockData), *receiver); - } - } - timings[hemelb::reporting::Timers::readBlock].Stop(); - } - else if (neededOnThisRank) - { - compressedBlockData.resize(bytesPerCompressedBlock[blockNumber]); - - net.RequestReceiveV(std::span(compressedBlockData), readingCore); - - } - else - { - return; - } - timings[hemelb::reporting::Timers::readNet].Start(); - net.Dispatch(); - timings[hemelb::reporting::Timers::readNet].Stop(); - timings[hemelb::reporting::Timers::readParse].Start(); - if (neededOnThisRank) - { + void GeometryReader::DeserialiseBlock( + GmyReadResult& geometry, std::vector const& compressedBlockData, + site_t block_gmy + ) { + timings[reporting::Timers::readParse].Start(); // Create an Xdr interpreter. - std::vector blockData = DecompressBlockData(compressedBlockData, - bytesPerUncompressedBlock[blockNumber]); + auto blockData = DecompressBlockData(compressedBlockData, + bytesPerUncompressedBlock[block_gmy]); io::XdrMemReader lReader(&blockData.front(), blockData.size()); - ParseBlock(geometry, blockNumber, lReader); + ParseBlock(geometry, block_gmy, lReader); // If debug-level logging, check that we've read in as many sites as anticipated. 
- if (ShouldValidate()) - { + if constexpr (build_info::VALIDATE_GEOMETRY) { // Count the sites read, site_t numSitesRead = 0; for (site_t site = 0; site < geometry.GetSitesPerBlock(); ++site) { - if (geometry.Blocks[blockNumber].Sites[site].targetProcessor != SITE_OR_BLOCK_SOLID) + if (geometry.Blocks[block_gmy].Sites[site].targetProcessor != SITE_OR_BLOCK_SOLID) { ++numSitesRead; } } // Compare with the sites we expected to read. - if (numSitesRead != fluidSitesOnEachBlock[blockNumber]) + if (numSitesRead != fluidSitesOnEachBlock[block_gmy]) { log::Logger::Log("Was expecting %i fluid sites on block %i but actually read %i", - fluidSitesOnEachBlock[blockNumber], - blockNumber, + fluidSitesOnEachBlock[block_gmy], + block_gmy, numSitesRead); } } - } - else if (!geometry.Blocks[blockNumber].Sites.empty()) - { - geometry.Blocks[blockNumber].Sites = std::vector(0, GeometrySite(false)); - } - timings[hemelb::reporting::Timers::readParse].Stop(); + timings[reporting::Timers::readParse].Stop(); } std::vector GeometryReader::DecompressBlockData(const std::vector& compressed, const unsigned int uncompressedBytes) { - timings[hemelb::reporting::Timers::unzip].Start(); + timings[reporting::Timers::unzip].Start(); // For zlib return codes. int ret; @@ -484,7 +539,7 @@ namespace hemelb::geometry if (ret != Z_OK) throw Exception() << "Decompression error for block"; - timings[hemelb::reporting::Timers::unzip].Stop(); + timings[reporting::Timers::unzip].Stop(); return uncompressed; } @@ -529,11 +584,11 @@ namespace hemelb::geometry for (auto&& dir: gmy::Neighbourhood) { // read the type of the intersection and create a link... - auto intersectionType = [&]() { - unsigned readType; - reader.read(readType); - return CutTypeValidator::Run(readType); - } (); + auto intersectionType = [&]() { + unsigned readType; + reader.read(readType); + return CutTypeValidator::Run(readType); + } (); GeometrySiteLink link; link.type = intersectionType; @@ -575,9 +630,9 @@ namespace hemelb::geometry } auto normalAvailable = [&]() { - unsigned normalAvailable; - reader.read(normalAvailable); - return WallNormalAvailabilityValidator::Run(normalAvailable); + unsigned normalAvailable; + reader.read(normalAvailable); + return WallNormalAvailabilityValidator::Run(normalAvailable); }(); readInSite.wallNormalAvailable = (normalAvailable == gmy::WallNormalAvailability::AVAILABLE); @@ -600,12 +655,6 @@ namespace hemelb::geometry return readInSite; } - proc_t GeometryReader::GetReadingCoreForBlock(site_t blockNumber) - { - return proc_t(blockNumber - % util::NumericalFunctions::min(READING_GROUP_SIZE, computeComms.Size())); - } - /** * This function is only called if in geometry-validation mode. * @param geometry @@ -614,156 +663,128 @@ namespace hemelb::geometry { log::Logger::Log("Validating the GlobalLatticeData"); - // We check the isFluid property and the link type for each direction + auto const SPB = geometry.GetSitesPerBlock(); + auto const NV = latticeInfo.GetNumVectors(); + constexpr auto UMAX = std::numeric_limits::max(); - std::vector myProcForSite; - std::vector dummySiteData; + std::vector myProcForSite(SPB); + std::vector dummySiteData(SPB * NV); + // We check the isFluid property and the link type for each direction // We also validate that each processor has the same beliefs about each site. 
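+      // Layout of dummySiteData: NV entries per site, entry 0 holding
+      // the isFluid flag and entries 1..NV-1 the link types, with UMAX
+      // marking "no information" so that the MIN-reduction below lets
+      // any rank holding real data win.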
-      for (site_t block = 0; block < geometry.GetBlockCount(); ++block)
-      {
-        // Clear vectors
-        myProcForSite.clear();
-        dummySiteData.clear();
-
-        if (geometry.Blocks[block].Sites.empty())
-        {
-          for (site_t localSite = 0; localSite < geometry.GetSitesPerBlock(); ++localSite)
-          {
-            myProcForSite.push_back(SITE_OR_BLOCK_SOLID);
-            dummySiteData.push_back(std::numeric_limits<unsigned>::max());
-            for (Direction direction = 1; direction < latticeInfo.GetNumVectors(); ++direction)
-            {
-              dummySiteData.push_back(std::numeric_limits<unsigned>::max());
-            }
-          }
-        }
-        else
-        {
-          for (site_t localSite = 0; localSite < geometry.GetSitesPerBlock(); ++localSite)
-          {
-            myProcForSite.push_back(geometry.Blocks[block].Sites[localSite].targetProcessor);
-
-            dummySiteData.push_back(geometry.Blocks[block].Sites[localSite].isFluid);
-            for (Direction direction = 1; direction < latticeInfo.GetNumVectors(); ++direction)
-            {
-              if (geometry.Blocks[block].Sites[localSite].isFluid)
-              {
-                dummySiteData.push_back(static_cast<unsigned>(geometry.Blocks[block].Sites[localSite].links[direction - 1].type));
-              }
-              else
-              {
-                dummySiteData.push_back(std::numeric_limits<unsigned>::max());
-              }
-            }
-          }
-        }
-
-        // Reduce using a minimum to find the actual processor for each site (ignoring the
-        // invalid entries).
-        std::vector<proc_t> procForSiteRecv = computeComms.AllReduce(myProcForSite, MPI_MIN);
-        std::vector<unsigned> siteDataRecv = computeComms.AllReduce(dummySiteData, MPI_MIN);
-
-        for (site_t site = 0; site < geometry.GetSitesPerBlock(); ++site)
-        {
-          if (procForSiteRecv[site] == computeComms.Rank()
-              && (myProcForSite[site] != computeComms.Rank()))
-          {
-            log::Logger::Log("Other cores think this core has site %li on block %li but it disagrees.",
-                             site,
-                             block);
-          }
-          else if (myProcForSite[site] != SITE_OR_BLOCK_SOLID && procForSiteRecv[site]
-              != myProcForSite[site])
-          {
-            log::Logger::Log("This core thought that core %li has site %li on block %li but others think it's on core %li.",
-                             myProcForSite[site],
-                             site,
-                             block,
-                             procForSiteRecv[site]);
-          }
-
-          if (!geometry.Blocks[block].Sites.empty())
-          {
-            if (dummySiteData[site * latticeInfo.GetNumVectors()]
-                != siteDataRecv[site * latticeInfo.GetNumVectors()])
-            {
-              log::Logger::Log("Different fluid state was found for site %li on block %li. One: %li, Two: %li .",
-                               site,
-                               block,
-                               dummySiteData[site * latticeInfo.GetNumVectors()],
-                               siteDataRecv[site * latticeInfo.GetNumVectors()]);
-            }
-
-            for (Direction direction = 1; direction < latticeInfo.GetNumVectors(); ++direction)
-            {
-              if (dummySiteData[site * latticeInfo.GetNumVectors() + direction]
-                  != siteDataRecv[site * latticeInfo.GetNumVectors() + direction])
-              {
-                log::Logger::Log("Different link type was found for site %li, link %i on block %li. One: %li, Two: %li .",
-                                 site,
-                                 direction,
-                                 block,
-                                 dummySiteData[site * latticeInfo.GetNumVectors() + direction],
-                                 siteDataRecv[site * latticeInfo.GetNumVectors() + direction]);
-              }
-            }
-          }
-        }
-      }
+      for (site_t block_gmy = 0; block_gmy < geometry.GetBlockCount(); ++block_gmy) {
+        auto const& block = geometry.Blocks[block_gmy];
+
+        if (block.Sites.empty()) {
+          std::fill(myProcForSite.begin(), myProcForSite.end(), SITE_OR_BLOCK_SOLID);
+          std::fill(dummySiteData.begin(), dummySiteData.end(), UMAX);
+        } else {
+          for (site_t localSite = 0; localSite < SPB; ++localSite) {
+            auto const& site = block.Sites[localSite];
+            myProcForSite[localSite] = site.targetProcessor;
+
+            auto dsd_start = localSite * NV;
+            dummySiteData[dsd_start] = site.isFluid;
+            for (Direction direction = 1; direction < NV; ++direction) {
+              dummySiteData[dsd_start + direction] = site.isFluid ? unsigned(site.links[direction - 1].type) : UMAX;
+            }
+          }
+        }
+
+        // Reduce using a maximum to find the actual processor for each site (ignoring the
+        // invalid entries).
+        std::vector<proc_t> procForSiteRecv = computeComms.AllReduce(myProcForSite, MPI_MAX);
+        std::vector<unsigned> siteDataRecv = computeComms.AllReduce(dummySiteData, MPI_MIN);
+
+        for (site_t site = 0; site < SPB; ++site) {
+          if (myProcForSite[site] < 0)
+            continue;
+
+          if (myProcForSite[site] != procForSiteRecv[site]) {
+            log::Logger::Log(
+                "Site %li of block %li believed to be on rank %i but another process thinks rank %i",
+                site, block_gmy, (int) myProcForSite[site], (int) procForSiteRecv[site]
+            );
+          }
+
+          if (dummySiteData[site * NV] != siteDataRecv[site * NV]) {
+            log::Logger::Log("Different fluid state was found for site %li on block %li. One: %u, Two: %u.",
+                             site,
+                             block_gmy,
+                             dummySiteData[site * NV],
+                             siteDataRecv[site * NV]);
+          }
+
+          for (Direction dir = 1; dir < NV; ++dir) {
+            if (dummySiteData[site * NV + dir] != siteDataRecv[site * NV + dir]) {
+              log::Logger::Log("Different link type was found for site %li, link %i on block %li. One: %u, Two: %u.",
+                               site,
+                               dir,
+                               block_gmy,
+                               dummySiteData[site * NV + dir],
+                               siteDataRecv[site * NV + dir]);
+            }
+          }
+        }
+      }
    }

-    std::vector<bool> GeometryReader::DecideWhichBlocksToReadIncludingHalo(
-        const GmyReadResult& geometry, const std::vector<proc_t>& unitForEachBlock, proc_t localRank)
-    {
-      std::vector<bool> shouldReadBlock(geometry.GetBlockCount(), false);
-
-      // Read a block in if it has fluid sites and is to live on the current processor. Also read
-      // in any neighbours with fluid sites.
+    // Go through blocks_wanted and add any 26-neighbouring blocks that are non-solid, using OCT ids.
+    std::vector<U64> GeometryReader::DecideWhichBlocksToReadIncludingHalo(
+        const GmyReadResult& geometry, const std::vector<U64>& blocks_wanted
+    ) const {
+      // Going to have to go from "compressed" octree order index
+      // (idx) -> 3D grid coordinate (ijk) to compute neighbours. Get
+      // set up for this.
      auto const block_dims = geometry.GetBlockDimensions();
-      for (site_t blockI = 0; blockI < block_dims.x(); ++blockI)
-      {
-        for (site_t blockJ = 0; blockJ < block_dims.y(); ++blockJ)
-        {
-          for (site_t blockK = 0; blockK < block_dims.z(); ++blockK)
-          {
-            site_t lBlockId = geometry.GetBlockIdFromBlockCoordinates(blockI, blockJ, blockK);
-
-            if (unitForEachBlock[lBlockId] != localRank)
-            {
-              continue;
-            }
-
-            // Read in all neighbouring blocks.
-            for (site_t neighI = std::max(0, blockI - 1);
-                 (neighI <= (blockI + 1)) && (neighI < block_dims.x()); ++neighI)
-            {
-              for (site_t neighJ = std::max(0, blockJ - 1);
-                   (neighJ <= (blockJ + 1)) && (neighJ < block_dims.y()); ++neighJ)
-              {
-                for (site_t neighK = std::max(0, blockK - 1);
-                     (neighK <= (blockK + 1)) && (neighK < block_dims.z());
-                     ++neighK)
-                {
-                  site_t lNeighId = geometry.GetBlockIdFromBlockCoordinates(neighI, neighJ, neighK);
-
-                  shouldReadBlock[lNeighId] = true;
-                }
-              }
-            }
-          }
-        }
-      }
+      constexpr auto U16_MAX = std::numeric_limits<U16>::max();
+      auto&& tree = geometry.block_store->GetTree();
+
+      // Start with no blocks wanted.
+ std::vector want_block_on_this_rank(nFluidBlocks, false); + + // Main loop + for (auto block_idx: blocks_wanted) { + // Could set this here, but will cover in loop over halo below + // want_block_on_this_rank[block_idx] = true; + + auto block_ijk = tree.GetLeafCoords(block_idx); + + // Compute the bounds of neighbours to consider + Vec16 lo, hi; + for (int d = 0; d < 3; ++d) { + // Beware the "usual arithmetic conversions"! + U16 const v = block_ijk[d]; + lo[d] = v > 0U ? v - 1U : 0U; + // Note, we will use inclusive hi limit below so max + // allowed value is block_dims[d] - 1 + U16 const w = v < U16_MAX ? v + 1U : v; + hi[d] = w < block_dims[d] ? w : block_dims[d] - 1; } - } + + // 3D loop + for (U16 ni = lo[0]; ni <= hi[0]; ++ni) + for (U16 nj = lo[1]; nj <= hi[1]; ++nj) + for (U16 nk = lo[2]; nk <= hi[2]; ++nk) { + auto neigh_ijk = Vec16{ni, nj, nk}; + auto neigh_idx = tree.GetPath(neigh_ijk).leaf(); + // Recall that the octree will return a path + // with "no child" for all levels where the + // requested node doesn't exist. + if (neigh_idx != octree::Level::NC) + want_block_on_this_rank[neigh_idx] = true; + } } - return shouldReadBlock; + std::vector ans; + for (unsigned i = 0; i < nFluidBlocks; ++i) { + if (want_block_on_this_rank[i]) + ans.push_back(i); + } + return ans; } void GeometryReader::OptimiseDomainDecomposition(GmyReadResult& geometry, @@ -772,27 +793,24 @@ namespace hemelb::geometry decomposition::OptimisedDecomposition optimiser(timings, computeComms, geometry, - latticeInfo, - procForEachBlock, - fluidSitesOnEachBlock); + latticeInfo); - timings[hemelb::reporting::Timers::reRead].Start(); + timings[reporting::Timers::reRead].Start(); log::Logger::Log("Rereading blocks"); // Reread the blocks based on the ParMetis decomposition. RereadBlocks(geometry, - optimiser.GetMovesCountPerCore(), - optimiser.GetMovesList(), - procForEachBlock); - timings[hemelb::reporting::Timers::reRead].Stop(); + optimiser.GetStaying(), + optimiser.GetArriving()); + timings[reporting::Timers::reRead].Stop(); - timings[hemelb::reporting::Timers::moves].Start(); + timings[reporting::Timers::moves].Start(); // Implement the decomposition now that we have read the necessary data. log::Logger::Log("Implementing moves"); ImplementMoves(geometry, - procForEachBlock, - optimiser.GetMovesCountPerCore(), - optimiser.GetMovesList()); - timings[hemelb::reporting::Timers::moves].Stop(); + optimiser.GetStaying(), + optimiser.GetArriving(), + optimiser.GetLeaving()); + timings[reporting::Timers::moves].Stop(); } // The header section of the config file contains a number of records. @@ -801,117 +819,131 @@ namespace hemelb::geometry return gmy::HeaderRecordLength * blockCount; } - void GeometryReader::RereadBlocks(GmyReadResult& geometry, const std::vector& movesPerProc, - const std::vector& movesList, - const std::vector& procForEachBlock) - { - // Initialise the array (of which proc each block belongs to) to what it was before. - std::vector newProcForEachBlock(geometry.GetBlockCount()); - - for (site_t blockNumber = 0; blockNumber < geometry.GetBlockCount(); ++blockNumber) - { - newProcForEachBlock[blockNumber] = procForEachBlock[blockNumber]; - } + // Iterator to advance through the moves vector to the first move + // in the next block, recalling that moves are sorted by block and + // there's a max number of sites per block. 
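+    // e.g. iterating over {{2,0}, {2,5}, {7,1}} (block, site) yields
+    // block 2 then block 7; upper_bound skips the remainder of each
+    // block's run in at most O(log max_step) comparisons.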
+ struct only_block_id_iterator { + using Iter = SiteVec::const_iterator; - // Set the proc for each block to be the current proc whenever a site on that block is - // going to be moved to the current proc. - idx_t moveIndex = 0; + site_t max_step; + Iter pos; + Iter end; - for (proc_t fromProc = 0; fromProc < computeComms.Size(); ++fromProc) - { - for (idx_t moveNumber = 0; moveNumber < movesPerProc[fromProc]; ++moveNumber) - { - idx_t block = movesList[3 * moveIndex]; - idx_t toProc = movesList[3 * moveIndex + 2]; - ++moveIndex; + // Only care about block ID + auto operator*() const { + return (*pos)[0]; + } - if (toProc == (idx_t) computeComms.Rank()) - { - newProcForEachBlock[block] = computeComms.Rank(); - } + only_block_id_iterator& operator++() { + auto block = **this; + ++pos; // Guard the case where have a fully fluid block + auto limit = std::min(pos + max_step, end); + pos = std::upper_bound( + pos, limit, + block, + [](U64 l, SiteDesc const& r) {return l < r[0]; } + ); + return *this; } - } - // Reread the blocks into the GlobalLatticeData now. - ReadInBlocksWithHalo(geometry, newProcForEachBlock, computeComms.Rank()); - } + operator bool() const { + return pos < end; + } + }; - void GeometryReader::ImplementMoves(GmyReadResult& geometry, - const std::vector& procForEachBlock, - const std::vector& movesFromEachProc, - const std::vector& movesList) const + void GeometryReader::RereadBlocks(GmyReadResult& geometry, SiteVec const& staying, + MovesMap const& arriving) { - // First all, set the proc rank for each site to what it originally was before - // domain decomposition optimisation. Go through each block... - for (site_t block = 0; block < geometry.GetBlockCount(); ++block) - { - // If this proc has owned a fluid site on this block either before or after optimisation, - // the following will be non-null. - if (!geometry.Blocks[block].Sites.empty()) - { - // Get the original proc for that block. - proc_t originalProc = procForEachBlock[block]; + // Go through the sites that we will end up with, and compute + // the unique list of blocks (by OCT index) that we need. Recall + // that the moves lists are sorted by block and then site id + // (tho we only care about block). + std::vector optimal_blocks; + + // We can be simple for the staying blocks as opt is empty at + // start and know: + // - that the blocks monotonically increase thru the array + // - that there are at most SPB sites per block + auto const SPB = geometry.GetSitesPerBlock(); + for (auto it = only_block_id_iterator{SPB, staying.begin(), staying.end()}; it; ++it) { + optimal_blocks.push_back(*it); + } - // For each site on that block... - for (site_t siteIndex = 0; siteIndex < geometry.GetSitesPerBlock(); ++siteIndex) - { - // ... if the site is non-solid... - if (geometry.Blocks[block].Sites[siteIndex].targetProcessor != SITE_OR_BLOCK_SOLID) - { - // ... set its rank to be the rank it had before optimisation. - geometry.Blocks[block].Sites[siteIndex].targetProcessor = originalProc; - } + // For the arriving sites, need to deal with insertion + for (auto& [src_rank, data]: arriving) { + // Recall that each rank's data is sorted by block ID, thus we + // can search for the insert point more efficiently. 
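+        // e.g. merging arriving blocks {3, 9} into {1, 3, 5} leaves
+        // {1, 3, 5, 9}: 3 is found already present, and the search for 9
+        // resumes part-way through instead of from the beginning.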
+ auto opt_pos = optimal_blocks.begin(); + + for (auto it = only_block_id_iterator{SPB, data.begin(), data.end()}; it; ++it) { + auto block = *it; + + auto lb = std::lower_bound(opt_pos, optimal_blocks.end(), block); + // BUT adding a block may invalidate the opt_pos iterator + if (lb == optimal_blocks.end()) { + // Not found and greater than any value + optimal_blocks.push_back(block); + opt_pos = optimal_blocks.end(); + } else if (*lb == block) { + // already there + ++opt_pos; + } else { + // Not found and need to insert and move past the inserted one + opt_pos = ++optimal_blocks.insert(lb, block); } } } - // Now implement the moves suggested by parmetis. - idx_t moveIndex = 0; + // Reread the blocks into the GlobalLatticeData now. + ReadInBlocksWithHalo(geometry, optimal_blocks); + } - // For each source proc, go through as many moves as it had. - for (proc_t fromProc = 0; fromProc < computeComms.Size(); ++fromProc) - { - for (idx_t moveNumber = 0; moveNumber < movesFromEachProc[fromProc]; ++moveNumber) - { - // For each move, get the block, site and destination proc. - idx_t block = movesList[3 * moveIndex]; - idx_t site = movesList[3 * moveIndex + 1]; - idx_t toProc = movesList[3 * moveIndex + 2]; + void GeometryReader::ImplementMoves(GmyReadResult& geometry, + SiteVec const& staying, + MovesMap const& arriving, + MovesMap const& leaving) const + { + // Given a vector of sites (sorted by block, then site index + // within the block), assign the process to that site in the + // GmyReadResult. + auto set_rank_for_sites = [&] (SiteVec const& sites, int rank) { + + auto block_start = sites.begin(); + auto end = sites.end(); + auto&& tree = geometry.block_store->GetTree(); + + while (block_start != end) { + auto const block_oct = (*block_start)[0]; + auto const nsites = tree.levels[tree.n_levels].sites_per_node[block_oct]; + auto const block_end = (++only_block_id_iterator{nsites, block_start, end}).pos; + + auto const block_ijk = tree.GetLeafCoords(block_oct); + auto const block_gmy = geometry.GetBlockIdFromBlockCoordinates(block_ijk); + + auto& block_sites = geometry.Blocks[block_gmy].Sites; + for (auto it = block_start; it < block_end; ++it) { + auto [block, site_idx] = *it; + HASSERT(block == block_oct); + HASSERT(!block_sites.empty()); + + auto& site = block_sites[site_idx]; + HASSERT(site.isFluid); + site.targetProcessor = rank; + } - // Only implement the move if we have read that block's data. - if (!geometry.Blocks[block].Sites.empty()) - { - // Some logging code - the unmodified rank for each move's site should equal - // lFromProc. - if (ShouldValidate()) - { - if (geometry.Blocks[block].Sites[site].targetProcessor - != fromProc) - { - log::Logger::Log("Block %ld, site %ld from move %u was originally on proc %i, not proc %u.", - block, - site, - moveIndex, - geometry.Blocks[block].Sites[site].targetProcessor, - fromProc); - } + block_start = block_end; } + }; - // Implement the move. - geometry.Blocks[block].Sites[site].targetProcessor = toProc; - } - - ++moveIndex; + // First set proc for staying sites + set_rank_for_sites(staying, computeComms.Rank()); + // Then arriving sites + for (auto const& [_, sites]: arriving) { + set_rank_for_sites(sites, computeComms.Rank()); } - } + // We could set the leaving sites, but can't guarantee we get + // all the neighbours of our sites, so no saving of time + // really. 
} - bool GeometryReader::ShouldValidate() const - { -#ifdef HEMELB_VALIDATE_GEOMETRY - return true; -#else - return false; -#endif - } } diff --git a/Code/geometry/GeometryReader.h b/Code/geometry/GeometryReader.h index 08a18587a..c12b135de 100644 --- a/Code/geometry/GeometryReader.h +++ b/Code/geometry/GeometryReader.h @@ -37,139 +37,101 @@ namespace hemelb::geometry GmyReadResult LoadAndDecompose(const std::string& dataFilePath); private: - /** - * Read from the file into a buffer. We read this on a single core then broadcast it. - * This has proven to be more efficient than reading in on every core (even using a collective - * read). - * - * Note this allocates memory and returns the pointer to you. - * - * @param nBytes - * @return - */ - std::vector ReadOnAllTasks(unsigned nBytes); + // Read from the file into a buffer on all processes. + // This is collective and start and nBytes must be the same on all ranks. + std::vector ReadAllProcesses(std::size_t startBytes, unsigned nBytes); + // Read the preamble and create the empty read result. GmyReadResult ReadPreamble(); + // Read the block header with basic size data for each block in + // the domain. void ReadHeader(site_t blockCount); - void ReadInBlocksWithHalo(GmyReadResult& geometry, const std::vector& unitForEachBlock, - const proc_t localRank); - - /** - * Compile a list of blocks to be read onto this core, including all the ones we perform - * LB on, and also any of their neighbouring blocks. - * - * NOTE: that the skipping-over of blocks without any fluid sites is dealt with by other - * code. - * - * @param geometry [in] GmyReadResult object as it has been read so far - * @param unitForEachBlock [in] The initial processor assigned to each block - * @param localRank [in] Local rank number - * @return Vector with true for each block we should read in. - */ - std::vector DecideWhichBlocksToReadIncludingHalo( - const GmyReadResult& geometry, const std::vector& unitForEachBlock, - const proc_t localRank); - - /** - * Reads in a single block and ensures it is distributed to all cores that need it. - * - * @param offsetSoFar [in] The offset into the file to read from to get the block. - * @param geometry [out] The geometry object to populate with info about the block. - * @param procsWantingThisBlock [in] A list of proc ids where info about this block is required. - * @param blockNumber [in] The id of the block we're reading. - * @param neededOnThisRank [in] A boolean indicating whether the block is required locally. - */ - void ReadInBlock(MPI_Offset offsetSoFar, GmyReadResult& geometry, - const std::vector& procsWantingThisBlock, const site_t blockNumber, - const bool neededOnThisRank); - - /** - * Decompress the block data. Uses the known number of sites to get an - * upper bound on the uncompressed data to simplify the code and avoid - * reallocation. - * @param compressed - * @param sites - * @return - */ + // map from block GMY index to vect of data + using block_cache = std::map>; + + // Load compressed data from the file if the predicate is true for that block's GMY index. + template PredT> + block_cache ReadCompressedBlockData(GmyReadResult const& geometry, PredT&& p); + + + // Given a vector of the block OCT ids that we want, add a + // one-block halo and read those blocks into the + // GmyReadResult. + void ReadInBlocksWithHalo(GmyReadResult& geometry, + const std::vector& blocksWanted); + + // Given a vector of the block OCT ids wanted, add a one-block + // halo and return a vector of those block OCT ids. 
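+      // The result is sorted and duplicate-free, as it is built by
+      // scanning a per-block flag array in index order.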
+ std::vector DecideWhichBlocksToReadIncludingHalo( + const GmyReadResult& geometry, const std::vector& blocks_wanted + ) const; + + // Parse a compressed block of data into the geometry at the given GMY index + void DeserialiseBlock(GmyReadResult& geometry, + std::vector const& compressed_data, site_t block_gmy); + + // Decompress the block data std::vector DecompressBlockData(const std::vector& compressed, const unsigned int uncompressedBytes); + // Given a reader for a block's data, parse that into the + // GmyReadResult at the given index. void ParseBlock(GmyReadResult& geometry, const site_t block, io::XdrReader& reader); - /** - * Parse the next site from the XDR reader. Note that we return by copy here. - * @param reader - * @return - */ + // Parse the next site from the XDR reader GeometrySite ParseSite(io::XdrReader& reader); - /** - * Calculates the number of the rank used to read in a given block. - * Intent is to move this into Decomposition class, which will also handle knowledge of which procs to use for reading, and own the decomposition topology. - * - * @param blockNumber - * @return - */ - proc_t GetReadingCoreForBlock(site_t blockNumber); - - /** - * Optimise the domain decomposition using ParMetis. We take this approach because ParMetis - * is more efficient when given an initial decomposition to start with. - * @param geometry - * @param procForEachBlock - */ + // Use the OptimisedDecomposition class to refine a simple, + // block-level initial decomposition. void OptimiseDomainDecomposition(GmyReadResult& geometry, const std::vector& procForEachBlock); + // Check for self-consistency void ValidateGeometry(const GmyReadResult& geometry); - /** - * Get the length of the header section, given the number of blocks. - * - * @param blockCount The number of blocks. - * @return - */ + // Get the length of the header section, given the number of blocks. site_t GetHeaderLength(site_t blockCount) const; - void RereadBlocks(GmyReadResult& geometry, const std::vector& movesPerProc, - const std::vector& movesList, - const std::vector& procForEachBlock); - - void ImplementMoves(GmyReadResult& geometry, const std::vector& procForEachBlock, - const std::vector& movesFromEachProc, - const std::vector& movesList) const; + // Given knowledge of which sites are needed, read the blocks + // required (i.e. the blocks the sites belong to plus a + // one-block halo) into the GmyReadResult. + void RereadBlocks(GmyReadResult& geometry, SiteVec const& staying_sites, + MovesMap const& arriving_sites); - /** - * True if we should validate the geometry. - * @return - */ - bool ShouldValidate() const; - - //! The rank which reads in the header information. - static const proc_t HEADER_READING_RANK = 0; - //! The number of cores (0-READING_GROUP_SIZE-1) that read files in parallel - static const proc_t READING_GROUP_SIZE = HEMELB_READING_GROUP_SIZE; + // Set the rank of sites assigned to this process in the GmyReadResult. + void ImplementMoves(GmyReadResult& geometry, SiteVec const& staying, + MovesMap const& arriving, MovesMap const& leaving) const; //! Info about the connectivity of the lattice. const lb::LatticeInfo& latticeInfo; - //! File accessed to read in the geometry data. + + // File accessed to read in the geometry data. + // + // We are going + // to read the whole file collectively and sequentially in + // chunks. The node leaders will read data either into private + // memory and broadcast or into node-shared memory and do a + // barrier/fence. net::MpiFile file; - //! 
Information about the file, to give cues and hints to MPI. - //! Communication info for all ranks that will need a slice of the geometry - net::MpiCommunicator computeComms; + //! Communicator for all ranks that will need a slice of the geometry + net::IOCommunicator computeComms; - //! The number of fluid sites on each block in the geometry + //! How many blocks with at least one fluid site + U64 nFluidBlocks; + //! The number of fluid sites on each block in the file. std::vector fluidSitesOnEachBlock; - //! The number of bytes each block takes up while still compressed. + //! The number of bytes each block in the file takes up while still compressed. std::vector bytesPerCompressedBlock; - //! The number of bytes each block takes up when uncompressed. + //! The number of bytes each block in the file takes up when uncompressed. std::vector bytesPerUncompressedBlock; - //! The processor assigned to each block. + //! The process assigned to each block. std::vector principalProcForEachBlock; + //! The process for fluid-containing blocks in octree order + std::vector procForBlockOct; //! Timings object for recording the time taken for each step of the domain decomposition. hemelb::reporting::Timers &timings; diff --git a/Code/geometry/GmyReadResult.cc b/Code/geometry/GmyReadResult.cc index 0c4bcd13b..0a4f54b4b 100644 --- a/Code/geometry/GmyReadResult.cc +++ b/Code/geometry/GmyReadResult.cc @@ -39,20 +39,11 @@ namespace hemelb::geometry { site_t GmyReadResult::FindFluidSiteIndexInBlock(site_t fluidSiteBlock, site_t neighbourSiteId) const { - site_t SiteId = 0; - // Calculate the site's id over the whole geometry, - for (site_t neighSite = 0; neighSite < GetSitesPerBlock(); neighSite++) - { - if (neighSite == neighbourSiteId) - { - break; + auto& sites = Blocks[fluidSiteBlock].Sites; + return std::count_if( + &sites[0], &sites[neighbourSiteId], [](GeometrySite const& s) { + return s.isFluid; } - else if (Blocks[fluidSiteBlock].Sites[neighSite].targetProcessor != SITE_OR_BLOCK_SOLID) - { - SiteId++; - } - } - - return SiteId; + ); } -} \ No newline at end of file +} diff --git a/Code/geometry/GmyReadResult.h b/Code/geometry/GmyReadResult.h index c8b4bc7b9..77da7c4d7 100644 --- a/Code/geometry/GmyReadResult.h +++ b/Code/geometry/GmyReadResult.h @@ -7,6 +7,7 @@ #define HEMELB_GEOMETRY_GMYREADRESULT_H #include +#include #include #include "units.h" @@ -18,6 +19,15 @@ namespace hemelb::geometry { namespace octree { class DistributedStore; } + + // A site description is the block's OCT index (hence U64 to + // match the tree's choice) and the index of the site within + // the block (could be U16, but use 64 as padding will force + // that anyway). + using SiteDesc = std::array; + using SiteVec = std::vector; + using MovesMap = std::map; + /*** * Model of the information in a geometry file */ @@ -61,6 +71,11 @@ namespace hemelb::geometry return (blockI * dimensionsInBlocks.y() + blockJ) * dimensionsInBlocks.z() + blockK; } + inline U64 GetBlockIdFromBlockCoordinates(Vec16 ijk) const { + auto IJK = ijk.as(); + return (IJK[0] * dimensionsInBlocks[1] + IJK[1]) * dimensionsInBlocks[2] + IJK[2]; + } + /*** * Get the coordinates of a block from a block id. */ @@ -136,5 +151,53 @@ namespace hemelb::geometry std::unique_ptr block_store; }; + // This iterator will move through the sites that make up a + // block. On dereference it returns a pair of the current 3D + // position and the 1D index in geometry file order. 
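+  // (The z coordinate varies fastest, so the index produced agrees
+  // with GetSiteIdFromSiteCoordinates for sites within one block.)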
+ struct BlockSiteIterator { + GmyReadResult const* gmy; + Vec16 pos; + site_t idx; + + using value_type = std::pair; + inline value_type operator*() const { + return {pos, idx}; + } + + inline BlockSiteIterator& operator++() { + auto B = gmy->GetBlockSize(); + idx++; + if (++pos[2] == B) { + pos[2] = 0; + if (++pos[1] == B) { + pos[1] = 0; + ++pos[0]; + } + } + return *this; + } + + inline friend bool operator==(BlockSiteIterator const& l, BlockSiteIterator const& r) { + return l.idx == r.idx; + } + }; + + // Represent the range of valid sites in a block that belongs to + // the GmyReadResult. + struct BlockSiteRange { + GmyReadResult const* gmy; + auto begin() const { + return BlockSiteIterator{gmy, {0,0,0}, 0}; + } + auto end() const { + auto B = gmy->GetBlockSize(); + return BlockSiteIterator{gmy, {B, B, B}, B*B*B}; + } + }; + + // Helper for the above. + inline auto IterSitesInBlock(GmyReadResult const& gmy) { + return BlockSiteRange{&gmy}; + } } #endif // HEMELB_GEOMETRY_GMYREADRESULT_H diff --git a/Code/geometry/LookupTree.h b/Code/geometry/LookupTree.h index a57d5d962..9b116e1f2 100644 --- a/Code/geometry/LookupTree.h +++ b/Code/geometry/LookupTree.h @@ -297,6 +297,10 @@ namespace hemelb::geometry::octree return storage_rank.size(); } + [[nodiscard]] inline auto& GetBlockOwnerRank() const { + return storage_rank; + } + // Allow many writes to the same block, wherever it is. // Constructed by a WriteSession below struct BlockWriteChunk { diff --git a/Code/geometry/decomposition/BasicDecomposition.cc b/Code/geometry/decomposition/BasicDecomposition.cc index 91ce68ee0..dd8828162 100644 --- a/Code/geometry/decomposition/BasicDecomposition.cc +++ b/Code/geometry/decomposition/BasicDecomposition.cc @@ -79,8 +79,8 @@ namespace hemelb::geometry::decomposition rank_for_block.begin(), rank_for_block.end(), comm_size); // We need to return data organised in GMY file order - // Initialise output to -1 => SOLID, we will overwrite the non-solid below - std::fill(procAssignedToEachBlock.begin(), procAssignedToEachBlock.end(), -1); + // Initialise output to SOLID, we will overwrite the non-solid below + std::fill(procAssignedToEachBlock.begin(), procAssignedToEachBlock.end(), SITE_OR_BLOCK_SOLID); // We need to know the octree ID to map to 3D coords. 
auto& block_oct_ids = tree.levels[tree.n_levels].node_ids; @@ -95,7 +95,7 @@ namespace hemelb::geometry::decomposition return rank_for_block; } - void BasicDecomposition::Validate(std::vector& procAssignedToEachBlock, net::MpiCommunicator const& communicator) const + void BasicDecomposition::Validate(std::vector& procAssignedToEachBlock, net::MpiCommunicator const& communicator) const { log::Logger::Log("Validating procForEachBlock"); diff --git a/Code/geometry/decomposition/OptimisedDecomposition.cc b/Code/geometry/decomposition/OptimisedDecomposition.cc index 70c7f1aa8..960fe3b62 100644 --- a/Code/geometry/decomposition/OptimisedDecomposition.cc +++ b/Code/geometry/decomposition/OptimisedDecomposition.cc @@ -6,34 +6,35 @@ #include "geometry/ParmetisHeader.h" #include "geometry/decomposition/OptimisedDecomposition.h" #include "geometry/decomposition/DecompositionWeights.h" +#include "geometry/LookupTree.h" + #include "lb/lattices/D3Q27.h" #include "log/Logger.h" -#include "net/net.h" +#include "net/SparseExchange.h" +#include "util/Iterator.h" +#include "util/span.h" namespace hemelb::geometry::decomposition { - OptimisedDecomposition::OptimisedDecomposition( - reporting::Timers& timers, net::MpiCommunicator& comms, const GmyReadResult& geometry, - const lb::LatticeInfo& latticeInfo, const std::vector& procForEachBlock, - const std::vector& fluidSitesOnEachBlock) : - timers(timers), comms(comms), geometry(geometry), latticeInfo(latticeInfo), - procForEachBlock(procForEachBlock), fluidSitesPerBlock(fluidSitesOnEachBlock) - { - timers[hemelb::reporting::Timers::InitialGeometryRead].Start(); //overall dbg timing + using namespace util; + + OptimisedDecomposition::OptimisedDecomposition( + reporting::Timers& timers, + net::MpiCommunicator c, + const GmyReadResult& geometry, + const lb::LatticeInfo& latticeInfo + ) : timers(timers), comms(std::move(c)), geometry(geometry), + tree(geometry.block_store->GetTree()), latticeInfo(latticeInfo), + procForBlockOct(geometry.block_store->GetBlockOwnerRank()), + fluidSitesPerBlockOct(tree.levels.back().sites_per_node) + { + timers[reporting::Timers::InitialGeometryRead].Start(); //overall dbg timing // Calculate the site distribution and validate if appropriate. PopulateSiteDistribution(); - if (ShouldValidate()) - { + if constexpr (build_info::VALIDATE_GEOMETRY) { ValidateVertexDistribution(); - } - - // Calculate the firstSiteIndexOnEachBlock and validate if appropriate - PopulateFirstSiteIndexOnEachBlock(); - - if (ShouldValidate()) - { ValidateFirstSiteIndexOnEachBlock(); } @@ -42,45 +43,29 @@ namespace hemelb::geometry::decomposition PopulateAdjacencyData(localVertexCount); - if (ShouldValidate()) - { + if constexpr (build_info::VALIDATE_GEOMETRY) { ValidateAdjacencyData(localVertexCount); } log::Logger::Log("Adj length %i", localAdjacencies.size()); - timers[hemelb::reporting::Timers::InitialGeometryRead].Stop(); + timers[reporting::Timers::InitialGeometryRead].Stop(); // Call parmetis. - timers[hemelb::reporting::Timers::parmetis].Start(); + timers[reporting::Timers::parmetis].Start(); log::Logger::Log("Making the call to Parmetis"); - - bool do_decomposition = true; -#ifdef HEMELB_NO_DECOMPOSITION - do_decomposition = false; -#endif - - if (do_decomposition) - { - CallParmetis(localVertexCount); - timers[hemelb::reporting::Timers::parmetis].Stop(); - log::Logger::Log("Parmetis has finished."); - - // Convert the ParMetis results into a nice format. 
- timers[hemelb::reporting::Timers::PopulateOptimisationMovesList].Start();
- log::Logger::Log("Getting moves lists for this core.");
- PopulateMovesList();
- }
-#ifdef HEMELB_NO_DECOMPOSITION
- else
- {
- LoadDecomposition(); //this is a modified version of PopulateMovesList();
- }
-#endif
-
+ CallParmetis(localVertexCount);
+ timers[reporting::Timers::parmetis].Stop();
+ log::Logger::Log("Parmetis has finished.");
+
+ // Now each process knows which rank all its sites belong
+ // on. Tell the destination ranks which sites they need.
+ timers[reporting::Timers::PopulateOptimisationMovesList].Start();
+ log::Logger::Log("Getting moves lists for this core.");
+ PopulateMovesList();
 log::Logger::Log("Done getting moves lists for this core");
- timers[hemelb::reporting::Timers::PopulateOptimisationMovesList].Stop();
- }
+ timers[reporting::Timers::PopulateOptimisationMovesList].Stop();
+ }

 void OptimisedDecomposition::CallParmetis(idx_t localVertexCount)
 {
@@ -112,22 +97,24 @@
 // Weight all vertices evenly.
 //std::vector < idx_t > vertexWeight(localVertexCount, 1);

- // Populate the vertex weight data arrays (for ParMetis) and print out the number of different fluid sites on each core
+ // Populate the vertex weight data arrays (for ParMetis) and
+ // print out the number of different fluid sites on each core
 PopulateVertexWeightData(localVertexCount);

- // Set the weights of each partition to be even, and to sum to 1.
- idx_t desiredPartitionSize = comms.Size();
+ // Going to follow the ParMETIS docs for naming parameters; comments make them less cryptic!

- std::vector domainWeights(desiredPartitionSize,
- (real_t) (1.0) / ((real_t) (desiredPartitionSize)));
+ idx_t ncon = 1; // Number of constraints
+ idx_t wgtflag = 2; // Indicate weighting 2 => vertices only
+ idx_t numflag = 0; // Numbering scheme 0 => C style from 0
+ idx_t nparts = comms.Size(); // Desired number of sub domains
+
+ // Set the weights of each partition to be even, and to sum to 1.
+ std::vector tpwgts(nparts, real_t(1.0) / real_t(nparts));

 // A bunch of values ParMetis needs.
- idx_t noConstraints = 1;
- idx_t weightFlag = 2;
- idx_t numberingFlag = 0;
 idx_t edgesCut = 0;
 idx_t options[4] = { 0, 0, 0, 0 };

- if (ShouldValidate())
- {
+ if constexpr (build_info::VALIDATE_GEOMETRY) {
 // Specify that some options are set and that we should
 // debug everything.
 // Specify that we have set some options
@@ -142,32 +129,33 @@
 // info on remappining (64)
 options[1] = 1 | 2 | 4 | 8 | 32 | 64;
 }

- real_t tolerance = 1.001F;
- log::Logger::Log("Calling ParMetis");
- // Reserve 1 on these vectors so that the reference to their first element
- // exists (even if it's unused).
- // Reserve on the vectors to be certain they're at least 1 in capacity (so &vector[0] works)
- partitionVector.reserve(1);
- vtxDistribn.reserve(1);
- adjacenciesPerVertex.reserve(1);
- localAdjacencies.reserve(1);
- vertexWeights.reserve(1);
+ // Tolerance. 1 => perfect balance, nparts => perfect imbalance.
+ // Docs recommend 1.05 so we are being quite strict here.
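+ // (Strictly, ubvec is an array with one entry per weight constraint;
+ // ncon == 1 here, so the address of a single value suffices.)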
+ real_t ubvec = 1.001F;

 MPI_Comm communicator = comms;
- ParMETIS_V3_PartKway(&vtxDistribn[0],
- &adjacenciesPerVertex[0],
- &localAdjacencies[0],
- &vertexWeights[0],
- nullptr,
- &weightFlag,
- &numberingFlag,
- &noConstraints,
- &desiredPartitionSize,
- &domainWeights[0],
- &tolerance,
- options,
- &edgesCut,
- &partitionVector[0],
- &communicator);
+
+ log::Logger::Log("Calling ParMetis");
+ int err = ParMETIS_V3_RefineKway(
+ vtxDistribn.data(),
+ adjacenciesPerVertex.data(),
+ localAdjacencies.data(),
+ vertexWeights.data(),
+ nullptr, // adjwgt, adjacency weights
+ &wgtflag,
+ &numflag,
+ &ncon,
+ &nparts,
+ tpwgts.data(),
+ &ubvec,
+ options,
+ &edgesCut,
+ partitionVector.data(),
+ &communicator
+ );
+
+ if (err != METIS_OK) {
+ throw Exception() << "ParMETIS error";
+ }

 log::Logger::Log("ParMetis returned.");
 if (comms.Rank() == comms.Size() - 1)
@@ -182,1097 +170,451 @@
 }
 }

- void OptimisedDecomposition::PopulateVertexWeightData(idx_t localVertexCount)
- {
+ void OptimisedDecomposition::PopulateVertexWeightData(idx_t localVertexCount)
+ {
 // These counters will be used later on to count the number of each type of vertex site
- int FluidSiteCounter = 0, WallSiteCounter = 0, IOSiteCounter = 0, WallIOSiteCounter = 0;
- int localweight = 1;
+ std::array siteCounters;
+ std::fill(begin(siteCounters), end(siteCounters), 0);
+ vertexWeights.resize(localVertexCount);
+ idx_t i_wgt = 0;

 // For each block (counting up by lowest site id)...
- auto const& block_dims = geometry.GetBlockDimensions();
- for (site_t blockI = 0; blockI < block_dims.x(); blockI++)
- {
- for (site_t blockJ = 0; blockJ < block_dims.y(); blockJ++)
- {
- for (site_t blockK = 0; blockK < block_dims.z(); blockK++)
- {
- const site_t blockNumber = geometry.GetBlockIdFromBlockCoordinates(blockI,
- blockJ,
- blockK);
-
- // Only consider sites on this processor.
- if (procForEachBlock[blockNumber] != comms.Rank())
- {
+ for (auto [block_idx, block_rank]: enumerate_with(procForBlockOct)) {
+ if (block_rank != comms.Rank())
+ // Only consider sites on this processor.
 continue;
- }
- const BlockReadResult& blockReadResult = geometry.Blocks[blockNumber];
-
- site_t m = -1;
- const int block_size = geometry.GetBlockSize();
- const int blockXCoord = blockI * block_size;
- const int blockYCoord = blockJ * block_size;
- const int blockZCoord = blockK * block_size;
-
- // ... iterate over sites within the block...
- for (site_t localSiteI = 0; localSiteI < block_size; localSiteI++)
- {
- for (site_t localSiteJ = 0; localSiteJ < block_size; localSiteJ++)
- {
- for (site_t localSiteK = 0; localSiteK < block_size; localSiteK++)
- {
- ++m;
-
- // ... only looking at non-solid sites...
- if (blockReadResult.Sites[m].targetProcessor == SITE_OR_BLOCK_SOLID)
- {
- continue;
+ auto block_ijk = tree.GetLeafCoords(block_idx);
+ auto block_gmy = geometry.GetBlockIdFromBlockCoordinates(block_ijk);
+
+ const BlockReadResult& blockReadResult = geometry.Blocks[block_gmy];
+
+ // util::Vector3D block_coord = BS * block_ijk;
+ // ... iterate over sites within the block...
+ for (auto [local_site_ijk, m]: IterSitesInBlock(geometry)) {
+ // ... only looking at non-solid sites...
+ if (blockReadResult.Sites[m].targetProcessor == SITE_OR_BLOCK_SOLID) + continue; + + SiteData siteData(blockReadResult.Sites[m]); + int site_type_i = [&] () { + switch (siteData.GetCollisionType()) { + case FLUID: + return 0; + case WALL: + return 1; + case INLET: + return 2; + case OUTLET: + return 3; + case (INLET | WALL): + return 4; + case (OUTLET | WALL): + return 5; + default: + throw Exception() << "Bad collision type"; } - - //Getting Site ID to be able to identify site type - site_t localSiteId = geometry.GetSiteIdFromSiteCoordinates(localSiteI, - localSiteJ, - localSiteK); - - //Switch structure which identifies site type and assigns the proper weight to each vertex - - SiteData siteData(blockReadResult.Sites[localSiteId]); - - switch (siteData.GetCollisionType()) - { - case FLUID: - localweight = hemelbSiteWeights[0]; - ++FluidSiteCounter; - break; - - case WALL: - localweight = hemelbSiteWeights[1]; - ++WallSiteCounter; - break; - - case INLET: - localweight = hemelbSiteWeights[2]; - ++IOSiteCounter; - break; - - case OUTLET: - localweight = hemelbSiteWeights[3]; - ++IOSiteCounter; - break; - - case (INLET | WALL): - localweight = hemelbSiteWeights[4]; - ++WallIOSiteCounter; - break; - - case (OUTLET | WALL): - localweight = hemelbSiteWeights[5]; - ++WallIOSiteCounter; - break; - } - - vertexWeights.push_back(localweight); - vertexCoordinates.push_back(blockXCoord + localSiteI); - vertexCoordinates.push_back(blockYCoord + localSiteJ); - vertexCoordinates.push_back(blockZCoord + localSiteK); - - } - - } - - } - + }(); + ++siteCounters[site_type_i]; + vertexWeights[i_wgt++] = hemelbSiteWeights[site_type_i]; } - - } } - int TotalCoreWeight = ( (FluidSiteCounter * hemelbSiteWeights[0]) - + (WallSiteCounter * hemelbSiteWeights[1]) + (IOSiteCounter * hemelbSiteWeights[2]) - + (WallIOSiteCounter * hemelbSiteWeights[4])) / hemelbSiteWeights[0]; - int TotalSites = FluidSiteCounter + WallSiteCounter + WallIOSiteCounter; + if (i_wgt != localVertexCount) + throw Exception() << "Wrong number of vertices: expected " << localVertexCount << " got " << i_wgt; + + int TotalCoreWeight = std::inner_product(begin(siteCounters), end(siteCounters), + hemelbSiteWeights, 0); + int TotalSites = std::reduce(begin(siteCounters), end(siteCounters), 0); log::Logger::Log("There are %u Bulk Flow Sites, %u Wall Sites, %u IO Sites, %u WallIO Sites on core %u. Total: %u (Weighted %u Points)", - FluidSiteCounter, - WallSiteCounter, - IOSiteCounter, - WallIOSiteCounter, + siteCounters[0], + siteCounters[1], + siteCounters[2] + siteCounters[3], + siteCounters[4] + siteCounters[5], comms.Rank(), TotalSites, TotalCoreWeight); - } - - void OptimisedDecomposition::PopulateSiteDistribution() - { - vtxDistribn.resize(comms.Size() + 1, 0); - // Firstly, count the sites per processor. Do this off-by-one - // to be compatible with ParMetis. - for (site_t block = 0; block < geometry.GetBlockCount(); ++block) - { - if (procForEachBlock[block] >= 0) - { - vtxDistribn[1 + procForEachBlock[block]] += (idx_t) (fluidSitesPerBlock[block]); - } - } + } - // Now make the count cumulative, again off-by-one. - for (proc_t rank = 0; rank < comms.Size(); ++rank) - { - vtxDistribn[rank + 1] += vtxDistribn[rank]; - } - } - - void OptimisedDecomposition::PopulateFirstSiteIndexOnEachBlock() - { - // First calculate the lowest site index on each proc - relatively easy. 
- std::vector firstSiteOnProc(vtxDistribn);
- // Now for each block (in ascending order), the smallest site index is the smallest site
- // index on its processor, incremented by the number of sites observed from that processor
- // so far.
- for (site_t block = 0; block < geometry.GetBlockCount(); ++block)
- {
- proc_t proc = procForEachBlock[block];
- if (proc < 0)
- {
- firstSiteIndexPerBlock.push_back(-1);
- }
- else
- {
- firstSiteIndexPerBlock.push_back(firstSiteOnProc[proc]);
- firstSiteOnProc[proc] += (idx_t) (fluidSitesPerBlock[block]);
- }

+ void OptimisedDecomposition::PopulateSiteDistribution()
+ {
+ // Parmetis needs to know (on all processes) how
+ // vertices/sites are distributed across the processes.
+ //
+ // The `vtxdist` array has P+1 elements.
+ //
+ // `vtxdist[i]` gives the total number of vertices that are on
+ // processes with rank less than `i`.
+ //
+ // `vtxdist[0]` is always zero
+ //
+ // `vtxdist[P]` has the total number of vertices
+ //
+ // `vtxdist[i+1] - vtxdist[i]` gives the number of vertices on
+ // process i
+ //
+ // We also need to know what vertex indices to assign to each
+ // site. Do this contiguously for the sites within each block
+ // per-process.
+ //
+ // We will also need to be able to translate between a site's
+ // GMY index in a block and its fluid-only index.

+ auto& procForBlockOct = geometry.block_store->GetBlockOwnerRank();
+ auto& tree = geometry.block_store->GetTree();
+ auto& fluidSitesPerBlockOct = tree.levels[tree.n_levels].sites_per_node;
+ auto const NBLOCKS = geometry.block_store->GetBlockCount();
+
+ // First, count the sites per process and assign contiguous
+ // IDs to the sites in each block.
+ U64 total_sites = 0;
+ vtxCountPerProc = std::vector(comms.Size(), 0);
+ firstSiteIndexPerBlockOct.resize(NBLOCKS + 1);
+
+ for (site_t block = 0; block < NBLOCKS; ++block) {
+ auto block_rank = procForBlockOct[block];
+ firstSiteIndexPerBlockOct[block] = total_sites;
+ vtxCountPerProc[block_rank] += fluidSitesPerBlockOct[block];
+ total_sites += fluidSitesPerBlockOct[block];
+ }
+ firstSiteIndexPerBlockOct[NBLOCKS] = total_sites;
+
+ // Now do the scan to setup vtxdist
+ vtxDistribn = std::vector(comms.Size() + 1);
+ vtxDistribn[0] = 0;
+ std::inclusive_scan(
+ vtxCountPerProc.begin(), vtxCountPerProc.end(),
+ vtxDistribn.begin() + 1,
+ std::plus()
+ );
+
+ // Now, for every block we've got data for, create the mapping
+ // from block OCT id to a vector of the GMY local site ids of
+ // the fluid sites.
+ for (auto [block_idx, block_rank]: enumerate_with(procForBlockOct)) {
+ auto block_ijk = tree.GetLeafCoords(block_idx);
+ auto block_gmy = geometry.GetBlockIdFromBlockCoordinates(block_ijk);
+ auto& blockReadResult = geometry.Blocks[block_gmy];
+ if (blockReadResult.Sites.empty())
+ continue;
+ std::vector& block_fluid_site_gmy_ids = gmySiteIdForBlockOct[block_idx];
+ for (auto const& [i, s]: util::enumerate_with(blockReadResult.Sites)) {
+ // ... only looking at non-solid sites...
+ if (s.targetProcessor != SITE_OR_BLOCK_SOLID) {
+ block_fluid_site_gmy_ids.push_back(i);
+ }
+ }
+ }
 }

- void OptimisedDecomposition::PopulateAdjacencyData(idx_t localVertexCount)
- {
+ void OptimisedDecomposition::PopulateAdjacencyData(idx_t localVertexCount)
+ {
 adjacenciesPerVertex.push_back(0);
- // TODO: Reimplement using traversers or iterators.
- // For each block (counting up by lowest site id)...
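+ // (The leading zero means adjacenciesPerVertex[v + 1] always holds
+ // the end offset of vertex v's adjacency list.)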
auto const& block_dims = geometry.GetBlockDimensions(); - for (site_t blockI = 0; blockI < block_dims.x(); blockI++) - { - for (site_t blockJ = 0; blockJ < block_dims.y(); blockJ++) - { - for (site_t blockK = 0; blockK < block_dims.z(); blockK++) - { - const site_t blockNumber = geometry.GetBlockIdFromBlockCoordinates(blockI, - blockJ, - blockK); - // ... considering only the ones which live on this proc... - if (procForEachBlock[blockNumber] != comms.Rank()) - { + auto const BS = geometry.GetBlockSize(); + // For range check below + auto const lo_site = Vector3D::Zero(); + auto const hi_site = block_dims.as() * BS - Vector3D::Ones(); + + // For each block (counting up by lowest site id)... + for (auto [block_idx, block_rank]: enumerate_with(procForBlockOct)) { + if (block_rank != comms.Rank()) + // ... considering only the ones which live on this proc... continue; - } - const BlockReadResult& blockReadResult = geometry.Blocks[blockNumber]; - site_t m = -1; - // ... iterate over sites within the block... - for (site_t localSiteI = 0; localSiteI < geometry.GetBlockSize(); localSiteI++) - { - for (site_t localSiteJ = 0; localSiteJ < geometry.GetBlockSize(); localSiteJ++) - { - for (site_t localSiteK = 0; localSiteK < geometry.GetBlockSize(); localSiteK++) - { - ++m; - // ... only looking at non-solid sites... - if (blockReadResult.Sites[m].targetProcessor == SITE_OR_BLOCK_SOLID) - { - continue; - } - // ... for each lattice direction... - for (unsigned int l = 1; l < latticeInfo.GetNumVectors(); l++) - { - // ... which leads to a valid neighbouring site... - site_t neighbourI = blockI * geometry.GetBlockSize() + localSiteI - + latticeInfo.GetVector(l).x(); - site_t neighbourJ = blockJ * geometry.GetBlockSize() + localSiteJ - + latticeInfo.GetVector(l).y(); - site_t neighbourK = blockK * geometry.GetBlockSize() + localSiteK - + latticeInfo.GetVector(l).z(); - if (neighbourI < 0 || neighbourJ < 0 || neighbourK < 0 - || neighbourI - >= (geometry.GetBlockSize() * block_dims.x()) - || neighbourJ - >= (geometry.GetBlockSize() * block_dims.y()) - || neighbourK - >= (geometry.GetBlockSize() * block_dims.z())) - { + + auto block_ijk = tree.GetLeafCoords(block_idx); + auto block_gmy = geometry.GetBlockIdFromBlockCoordinates(block_ijk); + + auto& blockReadResult = geometry.Blocks[block_gmy]; + for (auto [local_site_ijk, m]: IterSitesInBlock(geometry)) { + // ... only looking at non-solid sites... + if (blockReadResult.Sites[m].targetProcessor == SITE_OR_BLOCK_SOLID) + continue; + + auto global_site_ijk = block_ijk.as() * BS + local_site_ijk; + // ... for each lattice direction (recalling vector[0] == {0,0,0}) ... + for (unsigned int l = 1; l < latticeInfo.GetNumVectors(); l++) { + // ... which leads to a valid neighbouring site... + auto neigh_ijk = global_site_ijk + latticeInfo.GetVector(l); + if (!neigh_ijk.IsInRange(lo_site, hi_site)) continue; - } - // ... (that is actually being simulated and not a solid)... 
- site_t neighbourBlockI = neighbourI / geometry.GetBlockSize();
- site_t neighbourBlockJ = neighbourJ / geometry.GetBlockSize();
- site_t neighbourBlockK = neighbourK / geometry.GetBlockSize();
- site_t neighbourSiteI = neighbourI % geometry.GetBlockSize();
- site_t neighbourSiteJ = neighbourJ % geometry.GetBlockSize();
- site_t neighbourSiteK = neighbourK % geometry.GetBlockSize();
- site_t neighbourBlockId =
- geometry.GetBlockIdFromBlockCoordinates(neighbourBlockI,
- neighbourBlockJ,
- neighbourBlockK);
- const BlockReadResult& neighbourBlock = geometry.Blocks[neighbourBlockId];
- site_t neighbourSiteId =
- geometry.GetSiteIdFromSiteCoordinates(neighbourSiteI,
- neighbourSiteJ,
- neighbourSiteK);
- if (neighbourBlock.Sites.empty()
- || neighbourBlock.Sites[neighbourSiteId].targetProcessor == SITE_OR_BLOCK_SOLID)
- {
+
+ // ... (that is actually being simulated and not a solid)...
+ auto neigh_block = Vec16(neigh_ijk / BS);
+ auto neigh_idx = tree.GetPath(neigh_block).leaf();
+ // Recall that the octree will return a path
+ // with "no child" for all levels where the
+ // requested node doesn't exist.
+ if (neigh_idx == octree::Level::NC)
 continue;
- }
- site_t neighGlobalSiteId = firstSiteIndexPerBlock[neighbourBlockId]
- + geometry.FindFluidSiteIndexInBlock(neighbourBlockId, neighbourSiteId);
+ auto [ni, nj, nk] = neigh_ijk % BS;
+ U16 neighbourSiteId = geometry.GetSiteIdFromSiteCoordinates(ni, nj, nk);

- // then add this to the list of adjacencies.
- localAdjacencies.push_back((idx_t) (neighGlobalSiteId));
- }
+ // These are the GMY local site IDs for only the
+ // fluid sites on the neighbour block.
+ auto const& neigh_gmy_sites = gmySiteIdForBlockOct[neigh_idx];
+ auto lb = std::lower_bound(neigh_gmy_sites.begin(), neigh_gmy_sites.end(), neighbourSiteId);
+ // lb is either end or the first element greater than or equal to what we want:
+ // end => SOLID; strictly greater (i.e. not equal) => SOLID
+ if (lb == neigh_gmy_sites.end() || *lb != neighbourSiteId)
+ continue;
+ // have equal
+ auto local_contig_idx = std::distance(neigh_gmy_sites.begin(), lb);
+ U64 neighGlobalSiteId = firstSiteIndexPerBlockOct[neigh_idx] + local_contig_idx;

- // The cumulative count of adjacencies for this vertex is equal to the total
- // number of adjacencies we've entered.
- // NOTE: The prefix operator is correct here because
- // the array has a leading 0 not relating to any site.
- adjacenciesPerVertex.push_back(localAdjacencies.size());
- }
+ // then add this to the list of adjacencies.
+ localAdjacencies.push_back(idx_t(neighGlobalSiteId));
 }
+
+ // The cumulative count of adjacencies for this vertex is equal to the total
+ // number of adjacencies we've entered.
+ // NOTE: The prefix operator is correct here because
+ // the array has a leading 0 not relating to any site.
+ adjacenciesPerVertex.push_back(localAdjacencies.size());
 }
- }
 }
- }

- std::vector OptimisedDecomposition::CompileMoveData(
- std::map& blockIdLookupByLastSiteIndex)
- {
- // Right. Let's count how many sites we're going to have to move. Count the local number of
- // sites to be moved, and collect the site id and the destination processor.
- std::vector moveData;
- const idx_t myLowest = vtxDistribn[comms.Rank()];
- const idx_t myHighest = vtxDistribn[comms.Rank() + 1] - 1;
+ // We are returning a map (keyed by the process rank) of where
+ // each site that starts on this process should end up. Every site
+ // is here and the values of the map (vectors) will be sorted,
+ // first by block ID and then by site ID (note this for use in the
+ // geometry reader!)
+ auto OptimisedDecomposition::CompileMoveData() -> MovesMap
+ {
+ // Key is the destination rank, value is the list of [block, site id] pairs
+ std::map>> movesToRank;
+ auto const myLowest = vtxDistribn[comms.Rank()];
+ auto const mySiteCount = vtxDistribn[comms.Rank() + 1] - myLowest;

 // For each local fluid site...
- for (idx_t ii = 0; ii <= (myHighest - myLowest); ++ii)
- {
- // ... if it's going elsewhere...
- if (partitionVector[ii] != comms.Rank())
- {
- // ... get its id on the local processor...
- idx_t localFluidSiteId = myLowest + ii;
-
- // ... find out which block it's on, using our lookup map...
-
- // A feature of std::map::equal_range is that if there's no equal key, both iterators
- // returned will point to the entry with the next greatest key. Since we store block
- // ids by last fluid site number, this immediately gives us the block id.
- std::pair::iterator, std::map::iterator> rangeMatch =
- blockIdLookupByLastSiteIndex.equal_range(localFluidSiteId);
-
- site_t fluidSiteBlock = rangeMatch.first->second;
-
- // Check the block id is correct
- if (ShouldValidate())
- {
- if (procForEachBlock[fluidSiteBlock] < 0)
- {
- log::Logger::Log("Found block %i for site %i but this block has a processor of %i assigned",
- fluidSiteBlock,
- localFluidSiteId,
- procForEachBlock[fluidSiteBlock]);
- }
- if (firstSiteIndexPerBlock[fluidSiteBlock] > localFluidSiteId)
- {
- log::Logger::Log("Found block %i for site %i but sites on this block start at number %i",
- fluidSiteBlock,
- localFluidSiteId,
- firstSiteIndexPerBlock[fluidSiteBlock]);
- }
- if (firstSiteIndexPerBlock[fluidSiteBlock] + fluidSitesPerBlock[fluidSiteBlock] - 1
- < localFluidSiteId)
- {
- log::Logger::Log("Found block %i for site %i but there are %i sites on this block starting at %i",
- fluidSiteBlock,
- localFluidSiteId,
- fluidSitesPerBlock[fluidSiteBlock],
- firstSiteIndexPerBlock[fluidSiteBlock]);
+ // (NB: since the vertices/sites are sorted by block and then
+ // site, the results are also so sorted.)
+ for (idx_t ii = 0; ii < mySiteCount; ++ii)
+ {
+ // Where should it be? For simplicity, catalogue even the sites that stay here.
+ auto dest_rank = partitionVector[ii];
+
+ idx_t global_site_id = myLowest + ii;
+ // Find out which block it's on.
+
+ // Upper bound gives the iterator to the first one after
+ // (recalling that we have an extra entry with the total
+ // number of the fluid sites at the end).
+ auto after = std::upper_bound(
+ firstSiteIndexPerBlockOct.begin(), firstSiteIndexPerBlockOct.end(),
+ global_site_id
+ );
+ U64 const block_idx = std::distance(firstSiteIndexPerBlockOct.begin(), after) - 1;
+
+ if constexpr (build_info::VALIDATE_GEOMETRY) {
+ // Check the block id is correct
+ if (
+ global_site_id < firstSiteIndexPerBlockOct[block_idx] ||
+ global_site_id >= firstSiteIndexPerBlockOct[block_idx+1]) {
+ log::Logger::Log(
+ "Found block %i for site %i but sites on this block start at number %i and finish before %i",
+ block_idx,
+ global_site_id,
+ firstSiteIndexPerBlockOct[block_idx],
+ firstSiteIndexPerBlockOct[block_idx + 1]
+ );
 }
 }
 // ... and find its site id within that block. Start by working out how many fluid sites
 // we have to pass before we arrive at the fluid site we're after...
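+ // (gmySiteIdForBlockOct[block] holds the GMY-local ids of that
+ // block's fluid sites in ascending order, so the fluid-only offset
+ // below indexes it directly.)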
- site_t fluidSitesToPass = localFluidSiteId - firstSiteIndexPerBlock[fluidSiteBlock]; - site_t siteIndex = 0; - - siteIndex = geometry.FindSiteIndexInBlock(fluidSiteBlock, fluidSitesToPass); - - // The above code could go wrong, so in debug logging mode, we do some extra tests. - if (ShouldValidate()) - { - // If we've ended up on an impossible block, or one that doesn't live on this rank, - // inform the user. - if (fluidSiteBlock >= geometry.GetBlockCount() - || procForEachBlock[fluidSiteBlock] != comms.Rank()) - { - log::Logger::Log("Partition element %i wrongly assigned to block %u of %i (block on processor %i)", - ii, - fluidSiteBlock, - geometry.GetBlockCount(), - procForEachBlock[fluidSiteBlock]); - } - // Similarly, if we've ended up with an impossible site index, or a solid site, - // print an error message. - if (siteIndex >= geometry.GetSitesPerBlock() - || geometry.Blocks[fluidSiteBlock].Sites[siteIndex].targetProcessor - == SITE_OR_BLOCK_SOLID) - { - log::Logger::Log("Partition element %i wrongly assigned to site %u of %i (block %i%s)", - ii, - siteIndex, - fluidSitesPerBlock[fluidSiteBlock], - fluidSiteBlock, - geometry.Blocks[fluidSiteBlock].Sites[siteIndex].targetProcessor - == SITE_OR_BLOCK_SOLID ? - " and site is solid" : - ""); - } - } + site_t block_site_id = global_site_id - firstSiteIndexPerBlockOct[block_idx]; + auto siteIndexGmy = gmySiteIdForBlockOct[block_idx][block_site_id]; // Add the block, site and destination rank to our move list. - moveData.push_back(fluidSiteBlock); - moveData.push_back(siteIndex); - moveData.push_back(partitionVector[ii]); - } - - } - - return moveData; - } - - void OptimisedDecomposition::ForceSomeBlocksOnOtherCores( - std::vector& moveData, - std::map >& blockIdsIRequireFromX) - { - timers[hemelb::reporting::Timers::moveForcingNumbers].Start(); - - net::Net netForMoveSending(comms); - - // We also need to force some data upon blocks, i.e. when they're receiving data from a new - // block they didn't previously want to know about. - std::map > blockForcedUponX; - std::vector numberOfBlocksIForceUponX(comms.Size(), 0); - for (idx_t moveNumber = 0; moveNumber < (idx_t) (moveData.size()); moveNumber += 3) - { - proc_t target_proc = moveData[moveNumber + 2]; - site_t blockId = moveData[moveNumber]; - if (std::count(blockForcedUponX[target_proc].begin(), - blockForcedUponX[target_proc].end(), - blockId) == 0) - { - blockForcedUponX[target_proc].push_back(blockId); - ++numberOfBlocksIForceUponX[target_proc]; - log::Logger::Log("I'm ensuring proc %i takes data about block %i", - target_proc, - blockId); - } - } - - // Now find how many blocks are being forced upon us from every other core. - log::Logger::Log("Moving forcing block numbers"); - std::vector blocksForcedOnMe = comms.AllToAll(numberOfBlocksIForceUponX); - timers[hemelb::reporting::Timers::moveForcingNumbers].Stop(); - - timers[hemelb::reporting::Timers::moveForcingData].Start(); - // Now get all the blocks being forced upon me. 
- std::map > blocksForcedOnMeByEachProc; - for (proc_t otherProc = 0; otherProc < (proc_t) ( ( ( ( (comms.Size()))))); ++otherProc) - { - if (blocksForcedOnMe[otherProc] > 0) - { - blocksForcedOnMeByEachProc[otherProc] = - std::vector(blocksForcedOnMe[otherProc]); - netForMoveSending.RequestReceiveV(std::span(blocksForcedOnMeByEachProc[otherProc]), otherProc); - } - if (numberOfBlocksIForceUponX[otherProc] > 0) - { - netForMoveSending.RequestSendV(std::span(blockForcedUponX[otherProc]), otherProc); - } - log::Logger::Log("I'm forcing %i blocks on proc %i.", - numberOfBlocksIForceUponX[otherProc], - otherProc); - } - - log::Logger::Log("Moving forcing block ids"); - netForMoveSending.Dispatch(); - // Now go through every block forced upon me and add it to the list of ones I want. - for (proc_t otherProc = 0; otherProc < (proc_t) ( ( ( ( (comms.Size()))))); ++otherProc) - { - if (blocksForcedOnMe[otherProc] > 0) - { - for (auto it = blocksForcedOnMeByEachProc[otherProc].begin(); - it != blocksForcedOnMeByEachProc[otherProc].end(); ++it) - { - if (std::count(blockIdsIRequireFromX[otherProc].begin(), - blockIdsIRequireFromX[otherProc].end(), - *it) == 0) - { - blockIdsIRequireFromX[otherProc].push_back(*it); - log::Logger::Log("I'm being forced to take block %i from proc %i", - *it, - otherProc); - } - // We also need to take all neighbours of the forced block from their processors. - auto blockCoords = geometry.GetBlockCoordinatesFromBlockId(*it).as(); - // Iterate over every direction we might need (except 0 as we obviously already have - // that block in the list). - for (Direction direction = 1; direction < lb::D3Q27::NUMVECTORS; - ++direction) - { - // Calculate the putative neighbour's coordinates... - auto neighbourCoords = (blockCoords + lb::D3Q27::VECTORS[direction]).as(); - // If the neighbour is a real block... - if (geometry.AreBlockCoordinatesValid(neighbourCoords)) - { - // Get the block id, and check whether it has any fluid sites... - site_t neighbourBlockId = - geometry.GetBlockIdFromBlockCoordinates(neighbourCoords.x(), - neighbourCoords.y(), - neighbourCoords.z()); - proc_t neighbourBlockProc = procForEachBlock[neighbourBlockId]; - if (neighbourBlockProc >= 0) - { - // Check whether this is a block we're already interested in from that neighbour. - if (std::count(blockIdsIRequireFromX[neighbourBlockProc].begin(), - blockIdsIRequireFromX[neighbourBlockProc].end(), - neighbourBlockId) == 0) - { - // Then add it to the list of blocks we're getting from that neighbour. - blockIdsIRequireFromX[neighbourBlockProc].push_back(neighbourBlockId); - log::Logger::Log("I need to also take block %i from proc %i", - neighbourBlockId, - neighbourBlockProc); - } - } - - } - - } - - } - - } - - } - - timers[hemelb::reporting::Timers::moveForcingData].Stop(); - } - - void OptimisedDecomposition::GetBlockRequirements( - std::vector& numberOfBlocksRequiredFrom, - std::map >& blockIdsIRequireFromX, - std::vector& numberOfBlocksXRequiresFromMe, - std::map >& blockIdsXRequiresFromMe) - { - log::Logger::Log("Calculating block requirements"); - timers[hemelb::reporting::Timers::blockRequirements].Start(); - // Populate numberOfBlocksRequiredFrom - for (proc_t otherProc = 0; otherProc < (proc_t) ( ( ( ( (comms.Size()))))); ++otherProc) - { - numberOfBlocksRequiredFrom[otherProc] = blockIdsIRequireFromX.count(otherProc) == 0 ? 
- 0 : - blockIdsIRequireFromX[otherProc].size(); - log::Logger::Log("I require a total of %i blocks from proc %i", - numberOfBlocksRequiredFrom[otherProc], - otherProc); - } - // Now perform the exchange s.t. each core knows how many blocks are required of it from - // each other core. - numberOfBlocksXRequiresFromMe = comms.AllToAll(numberOfBlocksRequiredFrom); - - // Awesome. Now we need to get a list of all the blocks wanted from each core by each other - // core. - net::Net netForMoveSending(comms); - for (proc_t otherProc = 0; otherProc < (proc_t) ( ( ( ( (comms.Size()))))); ++otherProc) - { - blockIdsXRequiresFromMe[otherProc] = - std::vector(numberOfBlocksXRequiresFromMe[otherProc]); - log::Logger::Log("Proc %i requires %i blocks from me", - otherProc, - blockIdsXRequiresFromMe[otherProc].size()); - netForMoveSending.RequestReceiveV(std::span(blockIdsXRequiresFromMe[otherProc]), otherProc); - netForMoveSending.RequestSendV(std::span(blockIdsIRequireFromX[otherProc]), otherProc); - } - netForMoveSending.Dispatch(); - timers[hemelb::reporting::Timers::blockRequirements].Stop(); - } - - void OptimisedDecomposition::ShareMoveCounts( - std::map& movesForEachLocalBlock, - std::map >& blockIdsXRequiresFromMe, - std::map >& coresInterestedInEachBlock, - std::vector& moveData, std::map >& moveDataForEachBlock, - std::map >& blockIdsIRequireFromX, - std::vector& movesForEachBlockWeCareAbout) - { - timers[hemelb::reporting::Timers::moveCountsSending].Start(); - // Initialise the moves for each local block to 0. This handles an edge case where a local - // block has no moves. - for (site_t blockId = 0; blockId < geometry.GetBlockCount(); ++blockId) - { - if (procForEachBlock[blockId] == comms.Rank()) - { - movesForEachLocalBlock[blockId] = 0; - } - } - - for (proc_t otherProc = 0; otherProc < (proc_t) ( ( ( ( (comms.Size()))))); ++otherProc) - { - for (site_t blockNum = 0; - blockNum < (site_t) ( ( ( ( (blockIdsXRequiresFromMe[otherProc].size()))))); - ++blockNum) - { - site_t blockId = blockIdsXRequiresFromMe[otherProc][blockNum]; - log::Logger::Log("Proc %i requires block %i from me", - otherProc, - blockId); - if (coresInterestedInEachBlock.count(blockId) == 0) - { - coresInterestedInEachBlock[blockId] = std::vector(); - } - coresInterestedInEachBlock[blockId].push_back(otherProc); - } - - } - - for (site_t moveNumber = 0; moveNumber < (site_t) ( ( ( ( (moveData.size()))))); - moveNumber += 3) - { - site_t blockId = moveData[moveNumber]; - if (moveDataForEachBlock.count(blockId) == 0) - { - moveDataForEachBlock[blockId] = std::vector(); - } - moveDataForEachBlock[blockId].push_back(blockId); - moveDataForEachBlock[blockId].push_back(moveData[moveNumber + 1]); - moveDataForEachBlock[blockId].push_back(moveData[moveNumber + 2]); - movesForEachLocalBlock[blockId]++; + movesToRank[dest_rank].push_back({block_idx, siteIndexGmy}); } - net::Net netForMoveSending(comms); - for (proc_t otherProc = 0; otherProc < (proc_t) ( ( ( ( (comms.Size()))))); ++otherProc) - { - for (auto it = blockIdsIRequireFromX[otherProc].begin(); - it != blockIdsIRequireFromX[otherProc].end(); ++it) - { - netForMoveSending.RequestReceiveR(movesForEachBlockWeCareAbout[*it], otherProc); - log::Logger::Log("I want the move count for block %i from proc %i", - *it, - otherProc); - } - for (auto it = blockIdsXRequiresFromMe[otherProc].begin(); - it != blockIdsXRequiresFromMe[otherProc].end(); ++it) - { - netForMoveSending.RequestSendR(movesForEachLocalBlock[*it], otherProc); - log::Logger::Log("I'm sending move count for block %i 
to proc %i", - *it, - otherProc); - } - } - - log::Logger::Log("Sending move counts"); - netForMoveSending.Dispatch(); - timers[hemelb::reporting::Timers::moveCountsSending].Stop(); + return movesToRank; } - void OptimisedDecomposition::ShareMoveData( - std::vector movesForEachBlockWeCareAbout, - std::map > blockIdsIRequireFromX, - std::map > blockIdsXRequiresFromMe, - std::map > moveDataForEachBlock) - { - timers[hemelb::reporting::Timers::moveDataSending].Start(); - idx_t totalMovesToReceive = 0; - for (site_t blockId = 0; blockId < geometry.GetBlockCount(); ++blockId) - { - totalMovesToReceive += movesForEachBlockWeCareAbout[blockId]; - } - log::Logger::Log("I'm expecting a total of %i moves", - totalMovesToReceive); - // Gather the moves to the places they need to go to. - // Moves list has block, site id, destination proc - movesList.resize(totalMovesToReceive * 3); - idx_t localMoveId = 0; - - net::Net netForMoveSending(comms); + // Organise out the moves: those sites stay, leaving and arriving. + void OptimisedDecomposition::PopulateMovesList() { + auto rank = comms.Rank(); - for (proc_t otherProc = 0; otherProc < (proc_t) ( ( ( ( (comms.Size()))))); ++otherProc) - { - allMoves[otherProc] = 0; - for (auto it = blockIdsIRequireFromX[otherProc].begin(); - it != blockIdsIRequireFromX[otherProc].end(); ++it) - { - if (movesForEachBlockWeCareAbout[*it] > 0) - { - netForMoveSending.RequestReceive(&movesList[localMoveId * 3], - 3 * movesForEachBlockWeCareAbout[*it], - otherProc); - localMoveId += movesForEachBlockWeCareAbout[*it]; - allMoves[otherProc] += movesForEachBlockWeCareAbout[*it]; - log::Logger::Log("Expect %i moves from from proc %i about block %i", - movesForEachBlockWeCareAbout[*it], - otherProc, - *it); - } - } - - for (auto& bi: blockIdsXRequiresFromMe[otherProc]) - { - if (!moveDataForEachBlock[bi].empty()) - { - netForMoveSending.RequestSendV(std::span(moveDataForEachBlock[bi]), otherProc); - log::Logger::Log("Sending %i moves from to proc %i about block %i", - moveDataForEachBlock[bi].size() / 3, - otherProc, - bi); - } - } + // Work out where *all* sites should be + leaving = CompileMoveData(); - log::Logger::Log("%i moves from proc %i", - allMoves[otherProc], - otherProc); - } + // Separate out those staying + staying = std::move(leaving[rank]); + leaving.erase(rank); - log::Logger::Log("Sending move data"); - netForMoveSending.Dispatch(); - timers[hemelb::reporting::Timers::moveDataSending].Stop(); - } - - /** - * Returns a list of the fluid sites to be moved. - * - * NOTE: This function's return value is a dynamically-allocated array of all the moves to be - * performed, ordered by (origin processor [with a count described by the content of the first - * parameter], site id on the origin processor). The contents of the array are contiguous - * triplets of ints: (block id, site id on block, destination rank). - * - * @return - */ - void OptimisedDecomposition::PopulateMovesList() - { - allMoves = std::vector(comms.Size()); - - // Create a map for looking up block Ids: the map is from the contiguous site index - // of the last fluid site on the block, to the block id. - std::map blockIdLookupByLastSiteIndex; - for (site_t blockId = 0; blockId < geometry.GetBlockCount(); ++blockId) - { - if (procForEachBlock[blockId] >= 0 && procForEachBlock[blockId] != SITE_OR_BLOCK_SOLID) - { - site_t lastFluidSiteId = firstSiteIndexPerBlock[blockId] + fluidSitesPerBlock[blockId] - - 1; - blockIdLookupByLastSiteIndex[lastFluidSiteId] = blockId; - } - } - - // Right. 
Let's count how many sites we're going to have to move. Count the local number of - // sites to be moved, and collect the site id and the destination processor. - std::vector moveData = CompileMoveData(blockIdLookupByLastSiteIndex); // Spread the move data around log::Logger::Log("Starting to spread move data"); - // First, for each core, gather a list of which blocks the current core wants to - // know more data about. - // Handily, the blocks we want to know about are exactly those for which we already - // have some data. - std::map > blockIdsIRequireFromX; - for (site_t block = 0; block < geometry.GetBlockCount(); ++block) - { - if (!geometry.Blocks[block].Sites.empty()) - { - proc_t residentProc = procForEachBlock[block]; - blockIdsIRequireFromX[residentProc].push_back(block); - log::Logger::Log("I require block %i from proc %i (running total %i)", - block, - residentProc, - blockIdsIRequireFromX[residentProc].size()); - } - } - - // We also need to force some data upon blocks, i.e. when they're receiving data from a new - // block they didn't previously want to know about. - ForceSomeBlocksOnOtherCores(moveData, blockIdsIRequireFromX); - - // Now we want to spread this info around so that each core knows which blocks each other - // requires from it. - std::vector numberOfBlocksRequiredFrom(comms.Size(), 0); - std::vector numberOfBlocksXRequiresFromMe(comms.Size(), 0); - std::map > blockIdsXRequiresFromMe; - GetBlockRequirements(numberOfBlocksRequiredFrom, - blockIdsIRequireFromX, - numberOfBlocksXRequiresFromMe, - blockIdsXRequiresFromMe); - - // OK, now to get on with the actual sending of the data... - // Except first, it'll be helpful to organise things by blocks. - std::map > coresInterestedInEachBlock; - std::map > moveDataForEachBlock; - std::map movesForEachLocalBlock; - // And it'll also be super-handy to know how many moves we're going to have locally. - std::vector movesForEachBlockWeCareAbout(geometry.GetBlockCount(), 0); - ShareMoveCounts(movesForEachLocalBlock, - blockIdsXRequiresFromMe, - coresInterestedInEachBlock, - moveData, - moveDataForEachBlock, - blockIdsIRequireFromX, - movesForEachBlockWeCareAbout); - - ShareMoveData(movesForEachBlockWeCareAbout, - blockIdsIRequireFromX, - blockIdsXRequiresFromMe, - moveDataForEachBlock); - } - -#if HEMELB_NO_DECOMPOSITION - std::vector OptimisedDecomposition::CompileMoveDataFromFile( - std::map& blockIdLookupByLastSiteIndex, std::string decomposition_file) - { - // Right. Let's count how many sites we're going to have to move. Count the local number of - // sites to be moved, and collect the site id and the destination processor. - std::vector moveData; - const idx_t myLowest = vtxDistribn[comms.Rank()]; - const idx_t myHighest = vtxDistribn[comms.Rank() + 1] - 1; - - // For each local fluid site... - for (idx_t ii = 0; ii <= (myHighest - myLowest); ++ii) - { - // ... if it's going elsewhere... - if (partitionVector[ii] != comms.Rank()) - { - // ... get its id on the local processor... - idx_t localFluidSiteId = myLowest + ii; - - // ... find out which block it's on, using our lookup map... - - // A feature of std::map::equal_range is that if there's no equal key, both iterators - // returned will point to the entry with the next greatest key. Since we store block - // ids by last fluid site number, this immediately gives us the block id. - std::pair::iterator, std::map::iterator> rangeMatch = - blockIdLookupByLastSiteIndex.equal_range(localFluidSiteId); - - idx_t fluidSiteBlock = rangeMatch.first->second; - - // ... 
and find its site id within that block. Start by working out how many fluid sites - // we have to pass before we arrive at the fluid site we're after... - idx_t fluidSitesToPass = localFluidSiteId - firstSiteIndexPerBlock[fluidSiteBlock]; - - site_t siteIndex = geometry.FindSiteIndexInBlock(fluidSiteBlock, fluidSitesToPass); - //TODO: put back validation (it's in CompileMovesData). - - // Add the block, site and destination rank to our move list. - moveData.push_back(fluidSiteBlock); - moveData.push_back(siteIndex); - moveData.push_back(partitionVector[ii]); - } - - } - - return moveData; - } -#endif - -#if HEMELB_NO_DECOMPOSITION - /* Below is a modified version of PopulateMovesList(). */ - void OptimisedDecomposition::LoadDecomposition() - { - allMoves = std::vector(comms.Size()); - - // Create a map for looking up block Ids: the map is from the contiguous site index - // of the last fluid site on the block, to the block id. - std::map blockIdLookupByLastSiteIndex; - for (site_t blockId = 0; blockId < geometry.GetBlockCount(); ++blockId) - { - if (procForEachBlock[blockId] >= 0 && procForEachBlock[blockId] != SITE_OR_BLOCK_SOLID) - { - site_t lastFluidSiteId = firstSiteIndexPerBlock[blockId] + fluidSitesPerBlock[blockId] - - 1; - blockIdLookupByLastSiteIndex[lastFluidSiteId] = blockId; - } + net::sparse_exchange xchg(comms, 444); + for (auto const& [dest, moves]: leaving) { + xchg.send(to_span(moves), dest); } - // Right. Let's count how many sites we're going to have to move. Count the local number of - // sites to be moved, and collect the site id and the destination processor. - std::vector moveData = CompileMoveDataFromFile(blockIdLookupByLastSiteIndex); - // Spread the move data around - log::Logger::Log("Starting to spread move data"); - // First, for each core, gather a list of which blocks the current core wants to - // know more data about. - // Handily, the blocks we want to know about are exactly those for which we already - // have some data. - std::map > blockIdsIRequireFromX; - for (site_t block = 0; block < geometry.GetBlockCount(); ++block) - { - if (!geometry.Blocks[block].Sites.empty()) - { - proc_t residentProc = procForEachBlock[block]; - blockIdsIRequireFromX[residentProc].push_back(block); - log::Logger::Log("I require block %i from proc %i (running total %i)", - block, - residentProc, - blockIdsIRequireFromX[residentProc].size()); - } - } - - // We also need to force some data upon blocks, i.e. when they're receiving data from a new - // block they didn't previously want to know about. - ForceSomeBlocksOnOtherCores(moveData, blockIdsIRequireFromX); - - // Now we want to spread this info around so that each core knows which blocks each other - // requires from it. - std::vector numberOfBlocksRequiredFrom(comms.Size(), 0); - std::vector numberOfBlocksXRequiresFromMe(comms.Size(), 0); - std::map > blockIdsXRequiresFromMe; - GetBlockRequirements(numberOfBlocksRequiredFrom, - blockIdsIRequireFromX, - numberOfBlocksXRequiresFromMe, - blockIdsXRequiresFromMe); - - // OK, now to get on with the actual sending of the data... - // Except first, it'll be helpful to organise things by blocks. - std::map > coresInterestedInEachBlock; - std::map > moveDataForEachBlock; - std::map movesForEachLocalBlock; - // And it'll also be super-handy to know how many moves we're going to have locally. 
- std::vector movesForEachBlockWeCareAbout(geometry.GetBlockCount(), 0); - ShareMoveCounts(movesForEachLocalBlock, - blockIdsXRequiresFromMe, - coresInterestedInEachBlock, - moveData, - moveDataForEachBlock, - blockIdsIRequireFromX, - movesForEachBlockWeCareAbout); - - ShareMoveData(movesForEachBlockWeCareAbout, - blockIdsIRequireFromX, - blockIdsXRequiresFromMe, - moveDataForEachBlock); - } -#endif - - bool OptimisedDecomposition::ShouldValidate() const - { -#ifdef HEMELB_VALIDATE_GEOMETRY - return true; -#else - return false; -#endif - } + xchg.receive( + [&](int src, int count) { + auto& rbuf = arriving[src]; + rbuf.resize(count); + return rbuf.data(); + }, + [&](int src, auto* buf) { + // no-op + } + ); + } - void OptimisedDecomposition::ValidateVertexDistribution() - { + void OptimisedDecomposition::ValidateVertexDistribution() { log::Logger::Log("Validating the vertex distribution."); // vtxDistribn should be the same on all cores. std::vector vtxDistribnRecv = comms.AllReduce(vtxDistribn, MPI_MIN); - for (proc_t rank = 0; rank < comms.Size() + 1; ++rank) - { - if (vtxDistribn[rank] != vtxDistribnRecv[rank]) - { - log::Logger::Log("vtxDistribn[%i] was %li but at least one other core had it as %li.", - rank, - vtxDistribn[rank], - vtxDistribnRecv[rank]); - } + for (proc_t rank = 0; rank < comms.Size() + 1; ++rank) { + if (vtxDistribn[rank] != vtxDistribnRecv[rank]) { + log::Logger::Log( + "vtxDistribn[%i] was %li but at least one other core had it as %li.", + rank, + vtxDistribn[rank], + vtxDistribnRecv[rank] + ); + } } + } - } - - void OptimisedDecomposition::ValidateAdjacencyData(idx_t localVertexCount) - { + void OptimisedDecomposition::ValidateAdjacencyData(idx_t localVertexCount) { // If we're using debugging logs, check that the arguments are consistent across all cores. // To verify: vtxDistribn, adjacenciesPerVertex, adjacencies - if (ShouldValidate()) - { - log::Logger::Log("Validating the graph adjacency structure"); - // Create an array of lists to store all of this node's adjacencies, arranged by the - // proc the adjacent vertex is on. - std::vector > adjByNeighProc(comms.Size(), - std::multimap()); - // The adjacency data should correspond across all cores. - for (idx_t index = 0; index < localVertexCount; ++index) - { + log::Logger::Log("Validating the graph adjacency structure"); + // Create an array of lists to store all of this node's adjacencies, arranged by the + // proc the adjacent vertex is on. + using edge = std::array; + std::map> adjByNeighProc; + auto check_edge = [this] (idx_t v, idx_t w) { + idx_t w_local = w - vtxDistribn[comms.Rank()]; + bool found = false; + for ( + idx_t edge_i = adjacenciesPerVertex[w_local]; + edge_i < adjacenciesPerVertex[w_local + 1]; + ++edge_i + ) { + if (localAdjacencies[edge_i] == v) { + found = true; + break; + } + } + if (!found) + log::Logger::Log( + "Could not find reverse for edge from vertex %li to %li", + v, w + ); + }; + + // The adjacency data should correspond across all cores. + for (idx_t index = 0; index < localVertexCount; ++index) { idx_t vertex = vtxDistribn[comms.Rank()] + index; // Iterate over each adjacency (of each vertex). - for (idx_t adjNumber = 0; - adjNumber < (adjacenciesPerVertex[index + 1] - adjacenciesPerVertex[index]); - ++adjNumber) - { - idx_t adjacentVertex = localAdjacencies[adjacenciesPerVertex[index] + adjNumber]; - proc_t adjacentProc = -1; - // Calculate the proc of the neighbouring vertex. 
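+ // (vtxDistribn is a cumulative count, hence sorted, so the owning
+ // rank of any global vertex id can be found by binary search below.)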

-    void OptimisedDecomposition::ValidateAdjacencyData(idx_t localVertexCount)
-    {
+    void OptimisedDecomposition::ValidateAdjacencyData(idx_t localVertexCount) {
       // If we're using debugging logs, check that the arguments are consistent across all cores.
       // To verify: vtxDistribn, adjacenciesPerVertex, adjacencies
-      if (ShouldValidate())
-      {
-        log::Logger::Log("Validating the graph adjacency structure");
-        // Create an array of lists to store all of this node's adjacencies, arranged by the
-        // proc the adjacent vertex is on.
-        std::vector<std::multimap<idx_t, idx_t> > adjByNeighProc(comms.Size(),
-                                                                 std::multimap<idx_t, idx_t>());
-        // The adjacency data should correspond across all cores.
-        for (idx_t index = 0; index < localVertexCount; ++index)
-        {
+      log::Logger::Log("Validating the graph adjacency structure");
+      // Store this node's cross-process adjacencies, keyed by the
+      // proc the adjacent vertex is on.
+      using edge = std::array<idx_t, 2>;
+      std::map<int, std::vector<edge>> adjByNeighProc;
+      // Check that the reverse of the edge (v, w) exists locally, i.e.
+      // that w (which must be a local vertex) lists v among its
+      // adjacencies.
+      auto check_edge = [this] (idx_t v, idx_t w) {
+        idx_t w_local = w - vtxDistribn[comms.Rank()];
+        bool found = false;
+        for (
+            idx_t edge_i = adjacenciesPerVertex[w_local];
+            edge_i < adjacenciesPerVertex[w_local + 1];
+            ++edge_i
+        ) {
+          if (localAdjacencies[edge_i] == v) {
+            found = true;
+            break;
+          }
+        }
+        if (!found)
+          log::Logger::Log(
+              "Could not find reverse for edge from vertex %li to %li",
+              v, w
+          );
+      };
+
+      // The adjacency data should correspond across all cores.
+      for (idx_t index = 0; index < localVertexCount; ++index) {
         idx_t vertex = vtxDistribn[comms.Rank()] + index;
         // Iterate over each adjacency (of each vertex).
-        for (idx_t adjNumber = 0;
-             adjNumber < (adjacenciesPerVertex[index + 1] - adjacenciesPerVertex[index]);
-             ++adjNumber)
-        {
-          idx_t adjacentVertex = localAdjacencies[adjacenciesPerVertex[index] + adjNumber];
-          proc_t adjacentProc = -1;
-          // Calculate the proc of the neighbouring vertex.
-          for (proc_t proc = 0; proc < comms.Size(); ++proc)
-          {
-            if (vtxDistribn[proc] <= adjacentVertex && vtxDistribn[proc + 1] > adjacentVertex)
-            {
-              adjacentProc = proc;
-              break;
+        for (
+            idx_t adj_idx = adjacenciesPerVertex[index];
+            adj_idx < adjacenciesPerVertex[index + 1];
+            ++adj_idx
+        ) {
+          idx_t adjacentVertex = localAdjacencies[adj_idx];
+          // Find the proc of the neighbouring vertex.
+          // This iterator should point to the next process's start pos
+          auto ub = std::upper_bound(vtxDistribn.begin(), vtxDistribn.end(), adjacentVertex);
+          if (ub == vtxDistribn.end()) {
+            log::Logger::Log("The vertex %li has a neighbour %li which doesn\'t appear to live on any processor.",
+                             vertex,
+                             adjacentVertex);
+            continue;
+          }
+          int adj_proc = std::distance(vtxDistribn.begin(), ub) - 1;
+
+          // Most edges should be local - check these in place to speed it up.
+          if (adj_proc == comms.Rank()) {
+            check_edge(vertex, adjacentVertex);
+          } else {
+            // Store the edge for later checking
+            adjByNeighProc[adj_proc].push_back({adjacentVertex, vertex});
+          }
+        }
+      }
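+
+      // A worked example of the upper_bound lookup above: with
+      // vtxDistribn = {0, 4, 9, 15}, vertex 10 is owned by process 2,
+      // since upper_bound returns the iterator to 15 (index 3) and the
+      // owning half-open range is [vtxDistribn[2], vtxDistribn[3]) = [9, 15).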
-            }
-          }
-
-          // If it doesn't appear to belong to any proc, something's wrong.
-          if (adjacentProc == -1)
-          {
-            log::Logger::Log("The vertex %li has a neighbour %li which doesn\'t appear to live on any processor.",
-                             vertex,
-                             adjacentVertex);
-            continue;
-          }
-          // Store the data if it does belong to a proc.
-          adjByNeighProc[adjacentProc].insert(std::pair<idx_t, idx_t>(adjacentVertex, vertex));
-        }
-
-      }
-
-      // Create variables for the neighbour data to go into.
-      std::vector<idx_t> counts(comms.Size());
-      std::vector<std::vector<idx_t> > data(comms.Size());
-      log::Logger::Log("Validating neighbour data");
-      // Now spread and compare the adjacency information. Larger ranks send data to smaller
-      // ranks which receive the data and compare it.
-      for (proc_t neigh = 0; neigh < (proc_t) (comms.Size()); ++neigh)
-      {
-        SendAdjacencyDataToLowerRankedProc(neigh,
-                                           counts[neigh],
-                                           data[neigh],
-                                           adjByNeighProc[neigh]);
-        if (neigh < comms.Rank())
-        {
-          // Sending arrays don't perform comparison.
-          continue;
-        }
-        CompareAdjacencyData(neigh, counts[neigh], data[neigh], adjByNeighProc[neigh]);
-      }
-
-    }
-
-    void OptimisedDecomposition::SendAdjacencyDataToLowerRankedProc(
-        proc_t neighbouringProc, idx_t& neighboursAdjacencyCount,
-        std::vector<idx_t>& neighboursAdjacencyData,
-        std::multimap<idx_t, idx_t>& expectedAdjacencyData)
-    {
-      if (neighbouringProc < comms.Rank())
-      {
-        // Send the array length.
-        neighboursAdjacencyCount = 2 * expectedAdjacencyData.size();
-        comms.Send(neighboursAdjacencyCount, neighbouringProc, 42);
-        // Create a sendable array (std::lists aren't organised in a sendable format).
-        neighboursAdjacencyData.resize(neighboursAdjacencyCount);
-        unsigned int adjacencyIndex = 0;
-        for (auto & it : expectedAdjacencyData)
-        {
-          neighboursAdjacencyData[2 * adjacencyIndex] = it.first;
-          neighboursAdjacencyData[2 * adjacencyIndex + 1] = it.second;
-          ++adjacencyIndex;
-        }
-        // Send the data to the neighbouringProc.
-        comms.Send(neighboursAdjacencyData, neighbouringProc, 43);
-      }
-      else
-      // If this is a greater rank number than the neighbouringProc, receive the data.
-      if (neighbouringProc > comms.Rank())
-      {
-        comms.Receive(neighboursAdjacencyCount, neighbouringProc, 42);
-        neighboursAdjacencyData.resize(neighboursAdjacencyCount);
-        comms.Receive(neighboursAdjacencyData, neighbouringProc, 43);
-      }
-      else // Neigh == mTopologyRank, i.e. neighbouring vertices on the same proc
-      // Duplicate the data.
-      {
-        neighboursAdjacencyCount = 2 * expectedAdjacencyData.size();
-        neighboursAdjacencyData.resize(neighboursAdjacencyCount);
-        int adjacencyIndex = 0;
-        for (auto & it : expectedAdjacencyData)
-        {
-          neighboursAdjacencyData[2 * adjacencyIndex] = it.first;
-          neighboursAdjacencyData[2 * adjacencyIndex + 1] = it.second;
-          ++adjacencyIndex;
-        }
-      }
-    }
-
-    void OptimisedDecomposition::CompareAdjacencyData(
-        proc_t neighbouringProc, idx_t neighboursAdjacencyCount,
-        const std::vector<idx_t>& neighboursAdjacencyData,
-        std::multimap<idx_t, idx_t>& expectedAdjacencyData)
-    {
-      // Now we compare. First go through the received data which is ordered as (adjacent
-      // vertex, vertex) wrt the neighbouring proc.
-      for (idx_t ii = 0; ii < neighboursAdjacencyCount; ii += 2)
-      {
-        bool found = false;
-        // Go through each neighbour we know about on this proc, and check whether it
-        // matches the current received neighbour-data.
-        for (auto it = expectedAdjacencyData.find(neighboursAdjacencyData[ii + 1]);
-             it != expectedAdjacencyData.end(); ++it)
-        {
-          idx_t recvAdj = it->first;
-          idx_t recvAdj2 = it->second;
-          if (neighboursAdjacencyData[ii] == recvAdj2
-              && neighboursAdjacencyData[ii + 1] == recvAdj)
-          {
-            expectedAdjacencyData.erase(it);
-            found = true;
-            break;
-          }
-        }
-
-        // No neighbour data on this proc matched the data received.
-        if (!found)
-        {
-          log::Logger::Log("Neighbour proc %i had adjacency (%li,%li) that wasn't present on this processor.",
-                           neighbouringProc,
-                           neighboursAdjacencyData[ii],
-                           neighboursAdjacencyData[ii + 1]);
-        }
-      }
-
-      // The local store of adjacencies should now be empty, if there was complete matching.
-      auto it = expectedAdjacencyData.begin();
-      while (it != expectedAdjacencyData.end())
-      {
-        idx_t adj1 = it->first;
-        idx_t adj2 = it->second;
-        ++it;
-        // We had neighbour-data on this proc that didn't match that received.
-        log::Logger::Log("The local processor has adjacency (%li,%li) that isn't present on neighbouring processor %i.",
-                         adj1,
-                         adj2,
-                         neighbouringProc);
-      }
-    }
+      // Now sort the cross-process edges
+      for (auto& [_, edges]: adjByNeighProc) {
+        std::sort(
+            edges.begin(), edges.end(), [](edge const& a, edge const& b) {
+              return a[0] < b[0];
+            }
+        );
+      }
+
+      std::vector<std::size_t> neigh_edge_counts(comms.Size());
+      for (int root = 0; root < comms.Size(); ++root) {
+        // Send any edges we've got to the current root process
+        auto maybe_edges = adjByNeighProc.find(root);
+        std::size_t const nedges = (maybe_edges == adjByNeighProc.end()) ?
+            0 // no edges to that process
+            : maybe_edges->second.size();
+
+        net::MpiCall{MPI_Gather}(&nedges, 1, net::MpiDataType<std::size_t>(), neigh_edge_counts.data(), 1, net::MpiDataType<std::size_t>(), root, comms);
+
+        if (root == comms.Rank()) {
+          // Root receives and checks the non-empty contributions
+          std::vector<edge> neigh_edges;
+          for (int src = 0; src < comms.Size(); ++src) {
+            if (neigh_edge_counts[src]) {
+              neigh_edges.resize(neigh_edge_counts[src]);
+              comms.Receive(neigh_edges, src, 43);
+              // Check each received edge against the local adjacencies
+              for (auto [w, v]: neigh_edges) {
+                check_edge(v, w);
+              }
+            }
+          }
+        } else {
+          if (nedges)
+            comms.Send(maybe_edges->second, root, 43);
+        }
+      }
+    }
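+
+    // Aside: the verification above uses the common "counts, then
+    // payloads" pattern - every rank reports its message size to the
+    // current root with MPI_Gather, then only the non-empty senders
+    // follow up with a point-to-point message that the root receives
+    // and checks. In sketch form (mine and counts are illustrative
+    // names only):
+    //
+    //   std::size_t n = mine.size();
+    //   net::MpiCall{MPI_Gather}(&n, 1, net::MpiDataType<std::size_t>(),
+    //                            counts.data(), 1, net::MpiDataType<std::size_t>(),
+    //                            root, comms);
+    //   if (n && root != comms.Rank())
+    //       comms.Send(mine, root, 43);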

+    void OptimisedDecomposition::ValidateFirstSiteIndexOnEachBlock() {
+      // Check that the values are
+      // a) the same across all processes, and
+      // b) increasing from zero to the total number of fluid sites,
+      // by at least one and at most GetSitesPerBlock() per block.
-    void OptimisedDecomposition::ValidateFirstSiteIndexOnEachBlock()
-    {
       log::Logger::Log("Validating the firstSiteIndexPerBlock values.");
-      // Reduce finding the maximum across all nodes. Note that we have to use the maximum
-      // because some cores will have -1 for a block (indicating that it has no neighbours on
-      // that block.
-      std::vector<idx_t> firstSiteIndexPerBlockRecv = comms.AllReduce(firstSiteIndexPerBlock,
-                                                                      MPI_MAX);
-
-      for (site_t block = 0; block < geometry.GetBlockCount(); ++block)
-      {
-        if (firstSiteIndexPerBlock[block] >= 0
-            && firstSiteIndexPerBlock[block] != firstSiteIndexPerBlockRecv[block])
-        {
-          log::Logger::Log("This core had the first site index on block %li as %li but at least one other core had it as %li.",
-                           block,
-                           firstSiteIndexPerBlock[block],
-                           firstSiteIndexPerBlockRecv[block]);
-        }
-      }
-    }
+      auto const n_non_solid = tree.levels[tree.n_levels].sites_per_node.size();
+      auto const total_fluid = tree.levels[0].sites_per_node[0];
+
+      HASSERT(firstSiteIndexPerBlockOct.size() == n_non_solid + 1);
+      HASSERT(firstSiteIndexPerBlockOct[0] == 0);
+      for (auto i = 0UL; i < n_non_solid; ++i) {
+        auto d = firstSiteIndexPerBlockOct[i+1] - firstSiteIndexPerBlockOct[i];
+        HASSERT(d > 0);
+        HASSERT(d <= geometry.GetSitesPerBlock());
+      }
+      HASSERT(firstSiteIndexPerBlockOct[n_non_solid] == total_fluid);
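+
+      // To illustrate the asserted invariant: three non-solid blocks
+      // holding {287, 328, 123} fluid sites give
+      // firstSiteIndexPerBlockOct = {0, 287, 615, 738} - an exclusive
+      // prefix sum whose last entry is the total fluid site count.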
+
+      auto firstSiteIndexPerBlockRecv = comms.AllReduce(firstSiteIndexPerBlockOct, MPI_MAX);
+
+      for (auto i = 0UL; i < n_non_solid; ++i) {
+        if (firstSiteIndexPerBlockOct[i] != firstSiteIndexPerBlockRecv[i]) {
+          log::Logger::Log(
+              "This process had the first site index on block %li as %li but at least one other core had it as %li.",
+              i,
+              firstSiteIndexPerBlockOct[i],
+              firstSiteIndexPerBlockRecv[i]
+          );
+        }
+      }
+    }
 }
diff --git a/Code/geometry/decomposition/OptimisedDecomposition.h b/Code/geometry/decomposition/OptimisedDecomposition.h
index a1fbc395d..442d3a37f 100644
--- a/Code/geometry/decomposition/OptimisedDecomposition.h
+++ b/Code/geometry/decomposition/OptimisedDecomposition.h
@@ -16,40 +16,45 @@
 #include "geometry/SiteData.h"
 #include "geometry/GeometryBlock.h"
 
-namespace hemelb
-{
-  namespace geometry
-  {
+namespace hemelb::geometry {
+  namespace octree { class LookupTree; }
+
+  namespace decomposition {
+    // Given an initial basic decomposition done at the block level,
+    // with all blocks on a process (plus halo) read into the
+    // GmyReadResult, use ParMETIS to optimise this.
+    //
+    // The result of the optimisation is a description of how to
+    // change the sites on this rank: those staying, leaving and
+    // arriving (see below for details).
     class OptimisedDecomposition
     {
-      public:
-        OptimisedDecomposition(reporting::Timers& timers, net::MpiCommunicator& comms,
+    public:
+        // Constructor actually does the optimisation - collective over comm.
+        OptimisedDecomposition(reporting::Timers& timers, net::MpiCommunicator comms,
                                const GmyReadResult& geometry,
-                               const lb::LatticeInfo& latticeInfo,
-                               const std::vector<proc_t>& procForEachBlock,
-                               const std::vector<site_t>& fluidSitesPerBlock);
+                               const lb::LatticeInfo& latticeInfo);
 
-        /**
-         * Returns a vector with the number of moves coming from each core
-         * @return
-         */
-        inline const std::vector<idx_t>& GetMovesCountPerCore() const
-        {
-          return allMoves;
+        // NOTE! All the sites in staying, leaving and arriving are
+        // sorted first by block ID and then by intra-block site ID.
+
+        // Get the sites that aren't moved
+        inline SiteVec const& GetStaying() const {
+            return staying;
         }
 
-        /**
-         * Returns a vector with the list of moves
-         * @return
-         */
-        inline const std::vector<idx_t>& GetMovesList() const
-        {
-          return movesList;
+        // Get a map (by destination rank) of sites leaving
+        inline MovesMap const& GetLeaving() const {
+            return leaving;
         }
 
-      private:
+        // Get a map (by source rank) of sites arriving
+        inline MovesMap const& GetArriving() const {
+            return arriving;
+        }
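+
+        // For illustration, a consumer might apply the result like
+        // this (expect_sites_from is a hypothetical helper):
+        //
+        //   OptimisedDecomposition od(timers, comms, geometry, latticeInfo);
+        //   for (auto const& [src, sites]: od.GetArriving())
+        //       expect_sites_from(src, sites);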
+
+    private:
        /**
         * Populates the vector of vertex weights with different values for each local site type.
         * This allows ParMETIS to more efficiently decompose the system.
@@ -65,11 +70,6 @@ namespace hemelb
         */
        void PopulateSiteDistribution();
 
-       /**
-        * Calculate the array of contiguous indices of the first fluid site on each block
-        */
-       void PopulateFirstSiteIndexOnEachBlock();
-
        /**
        * Gets the list of adjacencies and the count of adjacencies per local fluid site
        * in a format suitable for use with ParMetis.
@@ -92,12 +92,6 @@ namespace hemelb
        */
       void PopulateMovesList();
 
-      /**
-       * Return true if we should validate.
-       * @return
-       */
-      bool ShouldValidate() const;
-
       /**
       * Validates the vertex distribution array.
       */
@@ -113,115 +107,34 @@ namespace hemelb
       */
      void ValidateAdjacencyData(idx_t localVertexCount);
 
-     /**
-      * Sends the adjacency data to the process of lower rank of the two. THIS IS INEFFICIENT.
-      * We only do it for validation purposes.
-      *
-      * @param neighbouringProc
-      * @param neighboursAdjacencyCount
-      * @param neighboursAdjacencyData Array to receive neighbour's expectations about adjacencies
-      * @param expectedAdjacencyData Adjacency data as this core expects it to be
-      */
-     void SendAdjacencyDataToLowerRankedProc(
-         proc_t neighbouringProc, idx_t& neighboursAdjacencyCount,
-         std::vector<idx_t>& neighboursAdjacencyData,
-         std::multimap<idx_t, idx_t>& expectedAdjacencyData);
-
-     /**
-      * Compares this core's and a neighbouring core's version of the adjacency data between
-      * them.
-      *
-      * @param neighbouringProc
-      * @param neighboursAdjacencyCount
-      * @param neighboursAdjacencyData Array to receive neighbour's expectations about adjacencies
-      * @param expectedAdjacencyData Adjacency data as this core expects it to be
-      */
-     void CompareAdjacencyData(proc_t neighbouringProc, idx_t neighboursAdjacencyCount,
-                               const std::vector<idx_t>& neighboursAdjacencyData,
-                               std::multimap<idx_t, idx_t>& expectedAdjacencyData);
-
      /**
       * Compile a list of all the moves that need to be made from this processor.
-      * @param blockIdLookupByLastSiteIndex
-      * @return
       */
-     std::vector<idx_t> CompileMoveData(
-         std::map<site_t, site_t>& blockIdLookupByLastSiteIndex);
-
-     /**
-      * Force some other cores to take info on blocks they might not know they need to know
-      * about.
-      * @param moveData
-      * @param blockIdsIRequireFromX
-      */
-     void ForceSomeBlocksOnOtherCores(
-         std::vector<idx_t>& moveData,
-         std::map<proc_t, std::vector<site_t> >& blockIdsIRequireFromX);
-
-     /**
-      * Get the blocks required from every other processor.
-      *
-      * @param numberOfBlocksRequiredFrom
-      * @param blockIdsIRequireFromX
-      * @param numberOfBlocksXRequiresFromMe
-      * @param blockIdsXRequiresFromMe
-      */
-     void GetBlockRequirements(
-         std::vector<site_t>& numberOfBlocksRequiredFrom,
-         std::map<proc_t, std::vector<site_t> >& blockIdsIRequireFromX,
-         std::vector<site_t>& numberOfBlocksXRequiresFromMe,
-         std::map<proc_t, std::vector<site_t> >& blockIdsXRequiresFromMe);
-
-     /**
-      * Share the number of moves to be made between each pair of processors that need to move
-      * data.
-      * @param movesForEachLocalBlock
-      * @param blockIdsXRequiresFromMe
-      * @param coresInterestedInEachBlock
-      * @param moveData
-      * @param moveDataForEachBlock
-      * @param blockIdsIRequireFromX
-      * @param movesForEachBlockWeCareAbout
-      */
-     void ShareMoveCounts(std::map<site_t, idx_t>& movesForEachLocalBlock,
-                          std::map<proc_t, std::vector<site_t> >& blockIdsXRequiresFromMe,
-                          std::map<site_t, std::vector<proc_t> >& coresInterestedInEachBlock,
-                          std::vector<idx_t>& moveData,
-                          std::map<site_t, std::vector<idx_t> >& moveDataForEachBlock,
-                          std::map<proc_t, std::vector<site_t> >& blockIdsIRequireFromX,
-                          std::vector<idx_t>& movesForEachBlockWeCareAbout);
-
-     /**
-      * Share the move data between cores
-      *
-      * @param movesForEachBlockWeCareAbout
-      * @param blockIdsIRequireFromX
-      * @param blockIdsXRequiresFromMe
-      * @param moveDataForEachBlock
-      */
-     void ShareMoveData(std::vector<idx_t> movesForEachBlockWeCareAbout,
-                        std::map<proc_t, std::vector<site_t> > blockIdsIRequireFromX,
-                        std::map<proc_t, std::vector<site_t> > blockIdsXRequiresFromMe,
-                        std::map<site_t, std::vector<idx_t> > moveDataForEachBlock);
+     MovesMap CompileMoveData();
 
      reporting::Timers& timers; //! Timers for reporting.
-     net::MpiCommunicator& comms; //! Communicator
+     net::MpiCommunicator comms; //! Communicator
      const GmyReadResult& geometry; //! The geometry being optimised.
+     octree::LookupTree const& tree;
      const lb::LatticeInfo& latticeInfo; //! The lattice info to optimise for.
-     const std::vector<proc_t>& procForEachBlock; //! The processor assigned to each block at the moment
-     const std::vector<site_t>& fluidSitesPerBlock; //! The number of fluid sites on each block.
+     const std::vector& procForBlockOct; //! The initial MPI process for each block, in OCT layout
+     const std::vector& fluidSitesPerBlockOct; //! The number of fluid sites per block, in OCT layout
+
+     std::vector<idx_t> vtxCountPerProc; //! The number of vertices on each process.
      std::vector<idx_t> vtxDistribn; //! The vertex distribution across participating cores.
-     std::vector<idx_t> firstSiteIndexPerBlock; //! The global contiguous index of the first fluid site on each block.
+     std::vector<idx_t> firstSiteIndexPerBlockOct; //! The global contiguous index of the first fluid site on each block.
+     std::map> gmySiteIdForBlockOct; //! A map keyed on block OCT id to an array of all the fluid sites' GMY local site ID.
     std::vector<idx_t> adjacenciesPerVertex; //! The number of adjacencies for each local fluid site
     std::vector<idx_t> vertexWeights; //! The weight of each local fluid site
     std::vector<real_t> vertexCoordinates; //! The coordinates of each local fluid site
     std::vector<idx_t> localAdjacencies; //! The list of adjacent vertex numbers for each local fluid site
 
     std::vector<idx_t> partitionVector; //! The results of the optimisation -- which core each fluid site should go to.
-    std::vector<idx_t> allMoves; //! The list of move counts from each core
-    std::vector<idx_t> movesList;
+    // The result of the optimisation: the sites staying,
+    // leaving and arriving, the latter two organised by the key
+    // being the src/dest rank.
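+    // For example (illustrative values): leaving[3] holds the sites
+    // this rank hands over to rank 3, while arriving[1] holds the
+    // sites that rank 1 sends to us.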
+    SiteVec staying;
+    MovesMap leaving;
+    MovesMap arriving;
    };
  }
-  }
 }
-
 #endif /* HEMELB_GEOMETRY_DECOMPOSITION_OPTIMISEDDECOMPOSITION_H */
diff --git a/Code/reporting/BuildInfo.cc b/Code/reporting/BuildInfo.cc
index 23d2555c3..d1ae5acbf 100644
--- a/Code/reporting/BuildInfo.cc
+++ b/Code/reporting/BuildInfo.cc
@@ -15,7 +15,6 @@ namespace hemelb::reporting {
     build.SetValue("OPTIMISATION", build_info::OPTIMISATION);
     build.SetBoolValue("USE_SSE3", build_info::USE_SSE3);
     build.SetValue("TIME", build_info::BUILD_TIME);
-    build.SetValue("READING_GROUP_SIZE", build_info::READING_GROUP_SIZE);
     build.SetValue("LATTICE_TYPE", build_info::LATTICE);
     build.SetValue("KERNEL_TYPE", build_info::KERNEL);
     build.SetValue("WALL_BOUNDARY_CONDITION", build_info::WALL_BOUNDARY);
@@ -27,4 +26,4 @@ namespace hemelb::reporting {
     build.SetValue("POINTPOINT_IMPLEMENTATION", build_info::POINTPOINT_IMPLEMENTATION);
     build.SetValue("STENCIL", build_info::STENCIL);
   }
-}
\ No newline at end of file
+}
diff --git a/Code/resources/report.txt.ctp b/Code/resources/report.txt.ctp
index b9a28e9d3..46dbddea9 100644
--- a/Code/resources/report.txt.ctp
+++ b/Code/resources/report.txt.ctp
@@ -30,7 +30,6 @@ Build type: {{TYPE}}
 Optimisation level: {{OPTIMISATION}}
 Use SSE3: {{USE_SSE3}}
 Built at: {{TIME}}
-Reading group size: {{READING_GROUP_SIZE}}
 Lattice: {{LATTICE_TYPE}}
 Kernel: {{KERNEL_TYPE}}
 Wall boundary condition: {{WALL_BOUNDARY_CONDITION}}
diff --git a/Code/resources/report.xml.ctp b/Code/resources/report.xml.ctp
index 257352ded..136d8e9d5 100644
--- a/Code/resources/report.xml.ctp
+++ b/Code/resources/report.xml.ctp
@@ -14,7 +14,6 @@
 {{OPTIMISATION}}
 {{USE_SSE3}}
 {{TIME}}
-{{READING_GROUP_SIZE}}
 {{LATTICE_TYPE}}
 {{KERNEL_TYPE}}
 {{WALL_BOUNDARY_CONDITION}}
@@ -76,4 +75,4 @@
 {{/TIMER}}
 
-
\ No newline at end of file
+
diff --git a/Code/tests/geometry/LookupTreeTests.cc b/Code/tests/geometry/LookupTreeTests.cc
index efb8e49fc..02b65a9b1 100644
--- a/Code/tests/geometry/LookupTreeTests.cc
+++ b/Code/tests/geometry/LookupTreeTests.cc
@@ -179,7 +179,7 @@ namespace hemelb::tests
     template <typename T> void check_vec(std::vector<T> const& actual, std::vector<T> const& expected) {
       REQUIRE(actual.size() == expected.size());
-      for (int i = 0; i < actual.size(); ++i) {
+      for (int i = 0; i < std::ssize(actual); ++i) {
         REQUIRE(actual[i] == expected[i]);
       }
     }
@@ -271,10 +271,10 @@ namespace hemelb::tests
       REQUIRE(b334.path[3] == Level::NC);
 
       // fluid sites per block, ordered by GMY index
-      auto spb = std::vector{
+      auto spb = std::vector{
         287, 328, 328, 328, 123, 287, 328, 328, 328, 123,
         287, 328, 328, 328, 123, 287, 328, 328, 328, 123
       };
-      std::map oct2spb;
+      std::map oct2spb;
       for (U64 block = 0U; block < 20; ++block) {
         auto b_ijk = Vec16(block / 10U, (block / 5U) % 2U, block % 5U);
         auto oct = ijk_to_oct(b_ijk);