Merge pull request #1327 from LLNL/bugfix/whitlock/point_query_tags
Fixes for MPI tags that affected PointQuery on some systems.
BradWhitlock authored Oct 22, 2024
2 parents fa0f8d3 + f3fa292 commit 0d8db6d
Showing 5 changed files with 262 additions and 60 deletions.
8 changes: 8 additions & 0 deletions CHANGELOG.md
@@ -20,6 +20,14 @@ and this project aspires to adhere to [Semantic Versioning](https://semver.org/s
#### Conduit
- Changed the MPI CMake target used by conduit from `MPI:MPI_CXX` to `MPI:MPI_C` to provide better compatibility with downstream tools.

+#### Blueprint
+- Certain algorithms that use MPI tags had their tag values lowered since some MPI implementations do not support large values.
+
+#### Relay
+- User-supplied warning and error handlers are suspended during `conduit::relay::communicate_using_schema::execute()` so exceptions will be thrown properly when there is an MPI error. The handlers are restored before the execute method returns.
+- `conduit::relay::communicate_using_schema::execute()` flushes logs as they are generated, in case of error. This is mostly to facilitate internal debugging.
+- Changes were made to how Relay queries the upper limit for MPI tags to work around problems on some systems.
+
## [0.9.2] - Released 2024-05-21

### Added
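The Blueprint and Relay entries above both deal with MPI tag limits: the largest usable tag is exposed through the MPI_TAG_UB attribute on a communicator, and the MPI standard only guarantees a value of at least 32767, so hard-coded bases such as 175000000 or 55000000 can exceed it on some implementations. The diffs below lower those bases and route every computed tag through conduit::relay::mpi::safe_tag(tag, comm). As a rough illustration of the mechanism, here is a minimal clamping helper built only on standard MPI calls; the name clamp_tag and the modulo policy are illustrative assumptions, not the actual safe_tag implementation.

```cpp
// Minimal sketch of clamping a tag against a communicator's MPI_TAG_UB
// attribute. clamp_tag is a hypothetical stand-in for the behavior the
// commit gets from conduit::relay::mpi::safe_tag(tag, comm).
#include <mpi.h>
#include <iostream>

// Return a tag that is valid on the given communicator.
static int clamp_tag(int tag, MPI_Comm comm)
{
    void *val = nullptr;
    int flag = 0;
    // MPI_TAG_UB is cached on the communicator; the standard guarantees it
    // is at least 32767, but it can be far smaller than 175000000.
    MPI_Comm_get_attr(comm, MPI_TAG_UB, &val, &flag);
    int upper = 32767; // guaranteed minimum from the MPI standard
    if(flag)
        upper = *static_cast<int *>(val);
    // Keep the tag within [0, upper]. Wrapping via modulo is one simple
    // policy; the real safe_tag may do something different.
    if(tag < 0)
        tag = 0;
    if(tag > upper)
        tag = tag % upper;
    return tag;
}

int main(int argc, char **argv)
{
    MPI_Init(&argc, &argv);
    std::cout << "175000000 -> " << clamp_tag(175000000, MPI_COMM_WORLD)
              << ", 175 -> "     << clamp_tag(175, MPI_COMM_WORLD) << std::endl;
    MPI_Finalize();
    return 0;
}
```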
6 changes: 3 additions & 3 deletions src/libs/blueprint/conduit_blueprint_mpi_mesh_parmetis.cpp
@@ -375,7 +375,7 @@ void generate_global_element_and_vertex_ids(conduit::Node &mesh,

if (adjset_name != "")
{
-const int TAG_SHARED_NODE_SYNC = 175000000;
+const int TAG_SHARED_NODE_SYNC = 175;
// map of groups -> global vtx ids
std::map<std::set<uint64>, std::vector<uint64>> groups_2_vids;
// map of rank -> sends to/recvs from that rank of global vtx ids for
@@ -464,7 +464,7 @@ void generate_global_element_and_vertex_ids(conduit::Node &mesh,
for (const std::set<uint64>& group : recv_groups)
{
index_t domid = *(group.begin());
-const int tag = conduit::relay::mpi::safe_tag(TAG_SHARED_NODE_SYNC + domid * 100 + group_idx);
+const int tag = conduit::relay::mpi::safe_tag(TAG_SHARED_NODE_SYNC + domid * 100 + group_idx, comm);
async_recvs.push_back(MPI_Request{});
group_idx++;
std::vector<uint64>& recvbuf = groups_2_vids[group];
@@ -484,7 +484,7 @@ void generate_global_element_and_vertex_ids(conduit::Node &mesh,
for (const std::set<uint64>& group : send_groups)
{
index_t domid = *(group.begin());
-const int tag = conduit::relay::mpi::safe_tag(TAG_SHARED_NODE_SYNC + domid * 100 + group_idx);
+const int tag = conduit::relay::mpi::safe_tag(TAG_SHARED_NODE_SYNC + domid * 100 + group_idx, comm);
async_sends.push_back(MPI_Request{});
group_idx++;
std::vector<uint64>& sendbuf = groups_2_vids[group];
23 changes: 12 additions & 11 deletions src/libs/blueprint/conduit_blueprint_mpi_mesh_utils.cpp
@@ -173,8 +173,8 @@ PointQuery::execute(const std::string &coordsetName)
// the results. The results get stored in m_domResults.
std::map<std::pair<int,int>, conduit::Node *> input_sends, result_sends,
input_recvs, result_recvs;
-int inputs_tag = 55000000;
-int results_tag = 66000000;
+const int inputs_tag = 550;
+const int results_tag = 660;
for(int pass = 0; pass < 2; pass++)
{
conduit::relay::mpi::communicate_using_schema C(m_comm);
@@ -187,10 +187,10 @@ PointQuery::execute(const std::string &coordsetName)
#endif
for(size_t i = 0; i < allqueries.size(); i += 3)
{
-int asker = allqueries[i];
-int domain = allqueries[i+1];
-int npts = allqueries[i+2];
-int owner = domain_to_rank[domain];
+const int asker = allqueries[i];
+const int domain = allqueries[i+1];
+const int npts = allqueries[i+2];
+const int owner = domain_to_rank[domain];

if(asker == rank)
{
@@ -278,6 +278,7 @@ PointQuery::execute(const std::string &coordsetName)
for(auto it = result_recvs.begin(); it != result_recvs.end(); it++)
{
int domain = it->first.second;
+
const conduit::Node &r = it->second->fetch_existing("results");
auto acc = r.as_int_accessor();
std::vector<int> &result = m_domResults[domain];
@@ -435,12 +436,12 @@ MatchQuery::execute()
C.set_logging_root("mpi_matchquery");
C.set_logging(true);
#endif
-int query_tag = 77000000;
+const int query_tag = 770;
for(size_t i = 0; i < allqueries.size(); i += ntuple_values)
{
-int owner = allqueries[i];
-int domain = allqueries[i + 1];
-int query_domain = allqueries[i + 2];
+const int owner = allqueries[i];
+const int domain = allqueries[i + 1];
+const int query_domain = allqueries[i + 2];

auto oppositeKey = std::make_pair(query_domain, domain);

@@ -667,7 +668,7 @@ compare_pointwise_impl(conduit::Node &mesh, const std::string &adjsetName,

// Iterate over each of the possible adjset relationships. Not all of these
// will have adjset groups.
-const int tag = 12211221;
+const int tag = 122;
for(int d0 = 0; d0 < maxDomains; d0++)
{
for(int d1 = d0 + 1; d1 < maxDomains; d1++)
(The remaining two of the five changed files are not shown here.)
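The Relay changelog entry above notes that user-supplied warning and error handlers are suspended during communicate_using_schema::execute() (the class used by PointQuery and MatchQuery in the diffs above) and restored before it returns. A scope guard is a natural way to express that save/suspend/restore pattern. The sketch below assumes the handler setters, accessors, and defaults declared in conduit_utils.hpp (conduit::utils::set_error_handler, error_handler, default_error_handler, and their warning counterparts); it is not the library's actual implementation.

```cpp
// Sketch of the save/suspend/restore pattern described in the Relay
// changelog entry, assuming the handler API from conduit_utils.hpp.
#include <string>
#include <conduit.hpp>

// Signature shared by Conduit's info/warning/error handlers.
using conduit_handler = void (*)(const std::string &, const std::string &, int);

class HandlerSuspender
{
public:
    HandlerSuspender()
        : saved_warning(conduit::utils::warning_handler()),
          saved_error(conduit::utils::error_handler())
    {
        // Fall back to Conduit's default handlers; the default error handler
        // throws conduit::Error, so an MPI failure inside execute() surfaces
        // as a C++ exception instead of going to an application callback.
        conduit::utils::set_warning_handler(conduit::utils::default_warning_handler);
        conduit::utils::set_error_handler(conduit::utils::default_error_handler);
    }

    ~HandlerSuspender()
    {
        // Restore the application's handlers before execute() returns.
        conduit::utils::set_warning_handler(saved_warning);
        conduit::utils::set_error_handler(saved_error);
    }

private:
    conduit_handler saved_warning;
    conduit_handler saved_error;
};
```

Constructed at the top of a method like execute(), such a guard restores the application's handlers on both the normal return path and any exception path.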
