From 5c0bc8a19fc3f9904541de6fb9bde95495298eb4 Mon Sep 17 00:00:00 2001 From: Alex Barghi <105237337+alexbarghi-nv@users.noreply.github.com> Date: Wed, 1 Nov 2023 15:41:03 -0400 Subject: [PATCH] [BUG] Check if Dask has quit to avoid throwing an exception and triggering a segfault on ddp exit (#3961) Currently, when training with ddp, if dask exits before the `CuGraphStore` is cleaned up, an exception is thrown, which causes ddp to quit with an error, which then causes a segfault, making users think that the workflow has failed when it has actually succeeded. This fix gracefully displays a warning if the dask dataset can't be deleted, which resolves this issue. Authors: - Alex Barghi (https://github.com/alexbarghi-nv) Approvers: - Vibhu Jawa (https://github.com/VibhuJawa) - Tingyu Wang (https://github.com/tingyu66) - Rick Ratzel (https://github.com/rlratzel) URL: https://github.com/rapidsai/cugraph/pull/3961 --- python/cugraph-pyg/cugraph_pyg/data/cugraph_store.py | 8 +++++++- 1 file changed, 7 insertions(+), 1 deletion(-) diff --git a/python/cugraph-pyg/cugraph_pyg/data/cugraph_store.py b/python/cugraph-pyg/cugraph_pyg/data/cugraph_store.py index fd2172e6ade..6192cd621d5 100644 --- a/python/cugraph-pyg/cugraph_pyg/data/cugraph_store.py +++ b/python/cugraph-pyg/cugraph_pyg/data/cugraph_store.py @@ -320,7 +320,13 @@ def __init__( def __del__(self): if self.__is_graph_owner: if isinstance(self.__graph._plc_graph, dict): - distributed.get_client().unpublish_dataset("cugraph_graph") + try: + distributed.get_client().unpublish_dataset("cugraph_graph") + except TypeError: + warnings.warn( + "Could not unpublish graph dataset, most likely because" + " dask has already shut down." + ) del self.__graph def __make_offsets(self, input_dict):