From 4ee227c7c84ef487828ecadd5fe86934f1fce4eb Mon Sep 17 00:00:00 2001
From: Joseph Nke <76006812+jnke2016@users.noreply.github.com>
Date: Thu, 7 Sep 2023 16:39:22 -0500
Subject: [PATCH 1/3] Remove the assumption made on the client data's keys (#3835)
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

When calling `client.has_what()`, which returns the data keys held in each
worker's memory, those keys used to be returned as strings, but a recent
change in `dask` changed their type to tuples:

From `{worker_ip_address: ("('from-delayed-190587f1b2318dc54d5f92a79e59b71a', 0)", "('from-delayed-190587f1b2318dc54d5f92a79e59b71a', 1)")}`
to `{worker_ip_address: (('from-delayed-c3d92b2cc9948634e82a0b2b62453a6c', 0), ('from-delayed-c3d92b2cc9948634e82a0b2b62453a6c', 1))}`

When mapping workers to persisted data in the function
`get_persisted_df_worker_map`, an assumption was made about the type of
those keys, thereby breaking our MG tests. This PR removes that assumption.

Closes #3834

Authors:
  - Joseph Nke (https://github.com/jnke2016)
  - Alex Barghi (https://github.com/alexbarghi-nv)

Approvers:
  - Alex Barghi (https://github.com/alexbarghi-nv)

URL: https://github.com/rapidsai/cugraph/pull/3835
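For illustration, here is a minimal, self-contained sketch of the comparison this patch moves to, assuming a local `dask.distributed` cluster; the collection, chunk size, and task names are placeholders and not part of the patch:

```python
import dask.array as da
from dask.distributed import Client, futures_of, wait

client = Client()  # small local cluster, for illustration only

# Persist a tiny collection so its chunks are materialized on workers.
arr = client.persist(da.ones(10, chunks=5))
futures = futures_of(arr)
wait(futures)  # make sure the chunks have actually landed in worker memory

# client.has_what() maps each worker address to the task keys it holds.
# Newer dask returns those keys as tuples such as ('ones-...', 0) rather
# than their str() form, so futures are matched by comparing future.key
# directly instead of str(future.key).
worker_map = {}
for worker, keys in client.has_what().items():
    worker_map[worker] = [f for f in futures if f.key in keys]
```

This is the same string-free membership test that `get_persisted_df_worker_map` performs after this patch.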
---
 .../cugraph/cugraph/dask/common/part_utils.py | 20 +++++++++----------
 1 file changed, 10 insertions(+), 10 deletions(-)

diff --git a/python/cugraph/cugraph/dask/common/part_utils.py b/python/cugraph/cugraph/dask/common/part_utils.py
index fda7e257367..7c0aad6c3ee 100644
--- a/python/cugraph/cugraph/dask/common/part_utils.py
+++ b/python/cugraph/cugraph/dask/common/part_utils.py
@@ -73,7 +73,7 @@ def persist_distributed_data(dask_df, client):
     _keys = dask_df.__dask_keys__()
     worker_dict = {}
     for i, key in enumerate(_keys):
-        worker_dict[str(key)] = tuple([worker_addresses[i]])
+        worker_dict[key] = tuple([worker_addresses[i]])
     persisted = client.persist(dask_df, workers=worker_dict)
     parts = futures_of(persisted)
     return parts
@@ -89,7 +89,7 @@ def get_persisted_df_worker_map(dask_df, client):
     ddf_keys = futures_of(dask_df)
     output_map = {}
     for w, w_keys in client.has_what().items():
-        output_map[w] = [ddf_k for ddf_k in ddf_keys if str(ddf_k.key) in w_keys]
+        output_map[w] = [ddf_k for ddf_k in ddf_keys if ddf_k.key in w_keys]
         if len(output_map[w]) == 0:
             output_map[w] = _create_empty_dask_df_future(dask_df._meta, client, w)
     return output_map
@@ -157,7 +157,7 @@ async def _extract_partitions(
         # NOTE: We colocate (X, y) here by zipping delayed
         # n partitions of them as (X1, y1), (X2, y2)...
         # and asking client to compute a single future for
-        # each tuple in the list
+        # each tuple in the list.
         dela = [np.asarray(d.to_delayed()) for d in dask_obj]
 
     # TODO: ravel() is causing strange behavior w/ delayed Arrays which are
@@ -167,7 +167,7 @@ async def _extract_partitions(
     parts = client.compute([p for p in zip(*raveled)])
 
     await wait(parts)
-    key_to_part = [(str(part.key), part) for part in parts]
+    key_to_part = [(part.key, part) for part in parts]
     who_has = await client.who_has(parts)
     return [(first(who_has[key]), part) for key, part in key_to_part]
 
@@ -229,7 +229,7 @@ def load_balance_func(ddf_, by, client=None):
     wait(parts)
 
     who_has = client.who_has(parts)
-    key_to_part = [(str(part.key), part) for part in parts]
+    key_to_part = [(part.key, part) for part in parts]
     gpu_fututres = [
         (first(who_has[key]), part.key[1], part) for key, part in key_to_part
     ]
@@ -245,7 +245,7 @@ def load_balance_func(ddf_, by, client=None):
     for cumsum in cumsum_parts:
         num_rows.append(cumsum.iloc[-1])
 
-    # Calculate current partition divisions
+    # Calculate current partition divisions.
     divisions = [sum(num_rows[0:x:1]) for x in range(0, len(num_rows) + 1)]
     divisions[-1] = divisions[-1] - 1
     divisions = tuple(divisions)
@@ -271,7 +271,7 @@ def load_balance_func(ddf_, by, client=None):
 
 def concat_dfs(df_list):
     """
-    Concat a list of cudf dataframes
+    Concat a list of cudf dataframes.
     """
     return cudf.concat(df_list)
 
@@ -279,17 +279,17 @@ def concat_dfs(df_list):
 def get_delayed_dict(ddf):
     """
     Returns a dicitionary with the dataframe tasks as keys and
-    the dataframe delayed objects as values
+    the dataframe delayed objects as values.
     """
     df_delayed = {}
     for delayed_obj in ddf.to_delayed():
-        df_delayed[str(delayed_obj.key)] = delayed_obj
+        df_delayed[delayed_obj.key] = delayed_obj
     return df_delayed
 
 
 def concat_within_workers(client, ddf):
     """
-    Concats all partitions within workers without transfers
+    Concats all partitions within workers without transfers.
     """
     df_delayed = get_delayed_dict(ddf)

From 6779e896edf310f5bcaad5acb8673995041c2801 Mon Sep 17 00:00:00 2001
From: ralph <137829296+nv-rliu@users.noreply.github.com>
Date: Fri, 8 Sep 2023 09:58:44 -0400
Subject: [PATCH 2/3] Adding metadata getter methods to datasets API (#3821)

Closes #3820

This PR adds simple getter methods to the `dataset` class, which allow users
to easily get information about a dataset without needing to access its
`metadata` dict or look in the directory.

```python
from cugraph.datasets import karate

# users now call
karate.number_of_nodes()
# instead of
karate.metadata['number_of_nodes']
```

Authors:
  - ralph (https://github.com/nv-rliu)

Approvers:
  - Alex Barghi (https://github.com/alexbarghi-nv)

URL: https://github.com/rapidsai/cugraph/pull/3821
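To make the new surface concrete, here is a short usage sketch that mirrors the test added below; it assumes only the bundled `karate` dataset and the `metadata` dict described above:

```python
from cugraph.datasets import karate

# Each getter is a thin wrapper over the dataset's metadata dict, so no
# download or graph construction is needed to answer these questions.
assert karate.is_directed() == karate.metadata["is_directed"]
assert karate.is_multigraph() == karate.metadata["is_multigraph"]
assert karate.is_symmetric() == karate.metadata["is_symmetric"]
assert karate.number_of_edges() == karate.metadata["number_of_edges"]

# number_of_nodes() is an alias of number_of_vertices(); both read the
# "number_of_nodes" metadata entry.
assert karate.number_of_nodes() == karate.number_of_vertices()
```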
+        """
+        return self.metadata["is_symmetric"]
+
+    def number_of_nodes(self):
+        """
+        An alias of number_of_vertices()
+        """
+        return self.number_of_vertices()
+
+    def number_of_vertices(self):
+        """
+        Get the number of vertices in the graph.
+        """
+        return self.metadata["number_of_nodes"]
+
+    def number_of_edges(self):
+        """
+        Get the number of edges in the graph.
+        """
+        return self.metadata["number_of_edges"]
+
 
 def download_all(force=False):
     """

diff --git a/python/cugraph/cugraph/tests/utils/test_dataset.py b/python/cugraph/cugraph/tests/utils/test_dataset.py
index 643d0468d46..c2a4f7c6072 100644
--- a/python/cugraph/cugraph/tests/utils/test_dataset.py
+++ b/python/cugraph/cugraph/tests/utils/test_dataset.py
@@ -328,6 +328,16 @@ def test_is_multigraph(dataset):
     assert G.is_multigraph() == dataset.metadata["is_multigraph"]
 
 
+@pytest.mark.parametrize("dataset", ALL_DATASETS)
+def test_object_getters(dataset):
+    assert dataset.is_directed() == dataset.metadata["is_directed"]
+    assert dataset.is_multigraph() == dataset.metadata["is_multigraph"]
+    assert dataset.is_symmetric() == dataset.metadata["is_symmetric"]
+    assert dataset.number_of_nodes() == dataset.metadata["number_of_nodes"]
+    assert dataset.number_of_vertices() == dataset.metadata["number_of_nodes"]
+    assert dataset.number_of_edges() == dataset.metadata["number_of_edges"]
+
+
 #
 # Test experimental for DeprecationWarnings
 #

From 17b34479094e42e1401d0e5354d8da98672ba291 Mon Sep 17 00:00:00 2001
From: Rick Ratzel <3039903+rlratzel@users.noreply.github.com>
Date: Fri, 8 Sep 2023 13:38:22 -0500
Subject: [PATCH 3/3] Uses `conda mambabuild` rather than `mamba mambabuild` (#3853)

Applies the same changes, for the same reasons, as cuDF PR
https://github.com/rapidsai/cudf/pull/14067 to cuGraph.

Authors:
  - Rick Ratzel (https://github.com/rlratzel)

Approvers:
  - Ray Douglass (https://github.com/raydouglass)

URL: https://github.com/rapidsai/cugraph/pull/3853
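The substitution itself is mechanical, shown here on the simplest invocation from `ci/build_cpp.sh`; the `rapids-*-retry` scripts are RAPIDS CI wrappers around the underlying tool, and this sketch only restates the one-word change made by the diffs that follow:

```sh
# Before: the retry wrapper drove `mamba mambabuild`
rapids-mamba-retry mambabuild conda/recipes/libcugraph

# After: the retry wrapper drives `conda mambabuild` instead
rapids-conda-retry mambabuild conda/recipes/libcugraph
```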
---
 ci/build_cpp.sh    |  2 +-
 ci/build_python.sh | 12 ++++++------
 2 files changed, 7 insertions(+), 7 deletions(-)

diff --git a/ci/build_cpp.sh b/ci/build_cpp.sh
index 3fd57f24c40..3fb72cac08b 100755
--- a/ci/build_cpp.sh
+++ b/ci/build_cpp.sh
@@ -11,6 +11,6 @@ rapids-print-env
 
 rapids-logger "Begin cpp build"
 
-rapids-mamba-retry mambabuild conda/recipes/libcugraph
+rapids-conda-retry mambabuild conda/recipes/libcugraph
 
 rapids-upload-conda-to-s3 cpp

diff --git a/ci/build_python.sh b/ci/build_python.sh
index 429ba649d1d..62eb6c2ccec 100755
--- a/ci/build_python.sh
+++ b/ci/build_python.sh
@@ -15,12 +15,12 @@ rapids-logger "Begin py build"
 # TODO: Remove `--no-test` flags once importing on a CPU
 # node works correctly
-rapids-mamba-retry mambabuild \
+rapids-conda-retry mambabuild \
   --no-test \
   --channel "${CPP_CHANNEL}" \
   conda/recipes/pylibcugraph
 
-rapids-mamba-retry mambabuild \
+rapids-conda-retry mambabuild \
   --no-test \
   --channel "${CPP_CHANNEL}" \
   --channel "${RAPIDS_CONDA_BLD_OUTPUT_DIR}" \
   conda/recipes/cugraph
 
 # platform to ensure it is included in each set of artifacts, since test
 # scripts only install from one set of artifacts based on the CUDA version used
 # for the test run.
-rapids-mamba-retry mambabuild \
+rapids-conda-retry mambabuild \
   --no-test \
   --channel "${CPP_CHANNEL}" \
   --channel "${RAPIDS_CONDA_BLD_OUTPUT_DIR}" \
 
@@ -40,7 +40,7 @@
 # built on each CUDA platform to ensure they are included in each set of
 # artifacts, since test scripts only install from one set of artifacts based on
 # the CUDA version used for the test run.
-rapids-mamba-retry mambabuild \
+rapids-conda-retry mambabuild \
   --no-test \
   --channel "${CPP_CHANNEL}" \
   --channel "${RAPIDS_CONDA_BLD_OUTPUT_DIR}" \
 
 RAPIDS_CUDA_MAJOR="${RAPIDS_CUDA_VERSION%%.*}"
 
 if [[ ${RAPIDS_CUDA_MAJOR} == "11" ]]; then
   # Only CUDA 11 is supported right now due to PyTorch requirement.
-  rapids-mamba-retry mambabuild \
+  rapids-conda-retry mambabuild \
     --no-test \
     --channel "${CPP_CHANNEL}" \
     --channel "${RAPIDS_CONDA_BLD_OUTPUT_DIR}" \
     conda/recipes/cugraph-pyg
 
   # Only CUDA 11 is supported right now due to PyTorch requirement.
-  rapids-mamba-retry mambabuild \
+  rapids-conda-retry mambabuild \
     --no-test \
     --channel "${CPP_CHANNEL}" \
     --channel "${RAPIDS_CONDA_BLD_OUTPUT_DIR}" \