Fixes for pandas 2, latest cudf, and wheel building (#4144)
This PR contains a number of different fixes currently required to get cugraph tests passing:
- There are two main changes for pandas 2 compatibility:
    - [pandas renamed `DataFrame.applymap` to `DataFrame.map`](https://pandas.pydata.org/docs/reference/api/pandas.DataFrame.map.html), so a column named `map` now collides with the new method and attribute-based access like `renumber_map.map` returns the bound method rather than the column. Those columns are now renamed to `renumber_map` (see the first sketch below this list).
    - Empty columns now default to str rather than float, so tests that accessed their values as cupy arrays failed: cudf's string columns cannot be converted to cupy arrays. The tests now cast these columns to float before the cupy conversion (see the second sketch below).
- The cugraph-dgl and cugraph-pyg wheel builds were not downloading the latest cugraph/pylibcugraph wheels before running tests, so the pandas 2 fixes above did not take effect in the dgl and pyg test runs. I updated the wheel test scripts to download and install those wheels first.
- rapidsai/cudf#14202 made a breaking change to how characters are encoded in strings columns in cudf, which broke cugraph_etl. This PR fixes the code that depended on the old APIs.

This PR also includes a small patch to the cugraph_etl CMake so that the build export uses the correct package name (previously it exported cugraph).
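The second bullet's dtype issue can be sketched the same way; this is a minimal illustration assuming current cudf behavior, and the exact exception type may vary between versions:

```python
import cudf

# An empty column that now defaults to a string dtype rather than float.
empty = cudf.Series([], dtype="str")

try:
    empty.values  # string columns have no cupy representation
except Exception as exc:  # exact exception type depends on the cudf version
    print(f"direct conversion failed: {exc}")

# Casting to float first (the fix applied in the tests) yields a cupy array.
as_float = empty.astype("float").values
print(type(as_float), as_float.shape)  # cupy.ndarray, shape (0,)
```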

Authors:
  - Vyas Ramasubramani (https://github.com/vyasr)

Approvers:
  - GALI PREM SAGAR (https://github.com/galipremsagar)
  - Bradley Dice (https://github.com/bdice)
  - Chuck Hastings (https://github.com/ChuckHastings)
  - Rick Ratzel (https://github.com/rlratzel)
  - Jake Awe (https://github.com/AyodeAwe)

URL: #4144
vyasr authored Feb 6, 2024
1 parent f8f4eae commit bf5aa60
Showing 12 changed files with 62 additions and 38 deletions.
5 changes: 5 additions & 0 deletions ci/test_wheel_cugraph-dgl.sh
@@ -11,6 +11,11 @@ python_package_name=$(echo ${package_name}|sed 's/-/_/g')
mkdir -p ./dist
RAPIDS_PY_CUDA_SUFFIX="$(rapids-wheel-ctk-name-gen ${RAPIDS_CUDA_VERSION})"

+# Download wheels built during this job.
+RAPIDS_PY_WHEEL_NAME="pylibcugraph_${RAPIDS_PY_CUDA_SUFFIX}" rapids-download-wheels-from-s3 ./local-deps
+RAPIDS_PY_WHEEL_NAME="cugraph_${RAPIDS_PY_CUDA_SUFFIX}" rapids-download-wheels-from-s3 ./local-deps
+python -m pip install ./local-deps/*.whl

# use 'ls' to expand wildcard before adding `[extra]` requires for pip
RAPIDS_PY_WHEEL_NAME="${package_name}_${RAPIDS_PY_CUDA_SUFFIX}" rapids-download-wheels-from-s3 ./dist
# pip creates wheels using python package names
5 changes: 5 additions & 0 deletions ci/test_wheel_cugraph-pyg.sh
@@ -11,6 +11,11 @@ python_package_name=$(echo ${package_name}|sed 's/-/_/g')
mkdir -p ./dist
RAPIDS_PY_CUDA_SUFFIX="$(rapids-wheel-ctk-name-gen ${RAPIDS_CUDA_VERSION})"

+# Download wheels built during this job.
+RAPIDS_PY_WHEEL_NAME="pylibcugraph_${RAPIDS_PY_CUDA_SUFFIX}" rapids-download-wheels-from-s3 ./local-deps
+RAPIDS_PY_WHEEL_NAME="cugraph_${RAPIDS_PY_CUDA_SUFFIX}" rapids-download-wheels-from-s3 ./local-deps
+python -m pip install ./local-deps/*.whl

# use 'ls' to expand wildcard before adding `[extra]` requires for pip
RAPIDS_PY_WHEEL_NAME="${package_name}_${RAPIDS_PY_CUDA_SUFFIX}" rapids-download-wheels-from-s3 ./dist
# pip creates wheels using python package names
4 changes: 2 additions & 2 deletions cpp/libcugraph_etl/CMakeLists.txt
@@ -1,5 +1,5 @@
#=============================================================================
-# Copyright (c) 2021-2023, NVIDIA CORPORATION.
+# Copyright (c) 2021-2024, NVIDIA CORPORATION.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
@@ -190,7 +190,7 @@ rapids_export(INSTALL cugraph_etl

################################################################################
# - build export ---------------------------------------------------------------
-rapids_export(BUILD cugraph
+rapids_export(BUILD cugraph_etl
EXPORT_SET cugraph_etl-exports
GLOBAL_TARGETS cugraph cugraph_c cugraph_etl
NAMESPACE cugraph::
16 changes: 9 additions & 7 deletions cpp/libcugraph_etl/src/renumbering.cu
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2022-2023, NVIDIA CORPORATION.
+ * Copyright (c) 2022-2024, NVIDIA CORPORATION.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
@@ -776,15 +776,15 @@ struct renumber_functor {
for (int i = 0; i < src_view.num_columns(); i++) {
auto str_col_view = cudf::strings_column_view(src_view.column(i));
src_vertex_chars_ptrs.push_back(
-const_cast<char_type*>(str_col_view.chars().data<char_type>()));
+const_cast<char_type*>(str_col_view.parent().data<char_type>()));
src_vertex_offset_ptrs.push_back(
const_cast<str_offset_type*>(str_col_view.offsets().data<str_offset_type>()));
}

for (int i = 0; i < dst_view.num_columns(); i++) {
auto str_col_view = cudf::strings_column_view(dst_view.column(i));
dst_vertex_chars_ptrs.push_back(
-const_cast<char_type*>(str_col_view.chars().data<char_type>()));
+const_cast<char_type*>(str_col_view.parent().data<char_type>()));
dst_vertex_offset_ptrs.push_back(
const_cast<str_offset_type*>(str_col_view.offsets().data<str_offset_type>()));
}
@@ -970,13 +970,14 @@ struct renumber_functor {
std::move(unrenumber_col1_chars),
rmm::device_buffer{},
0);
+auto str_col_1_contents = str_col_1->release();

renumber_table_vectors.push_back(
cudf::make_strings_column(size_type(key_value_count),
std::move(offset_col_1),
-std::move(str_col_1),
+std::move(*str_col_1_contents.data),
0,
-rmm::device_buffer(size_type(0), exec_strm)));
+std::move(*str_col_1_contents.null_mask)));

auto offset_col_2 =
std::make_unique<cudf::column>(cudf::data_type(cudf::type_id::INT32),
@@ -991,13 +992,14 @@
std::move(unrenumber_col2_chars),
rmm::device_buffer{},
0);
+auto str_col_2_contents = str_col_2->release();

renumber_table_vectors.push_back(
cudf::make_strings_column(size_type(key_value_count),
std::move(offset_col_2),
-std::move(str_col_2),
+std::move(*str_col_2_contents.data),
0,
-rmm::device_buffer(size_type(0), exec_strm)));
+std::move(*str_col_2_contents.null_mask)));

// make table from string columns - did at the end

@@ -1,4 +1,4 @@
-# Copyright (c) 2022-2023, NVIDIA CORPORATION.
+# Copyright (c) 2022-2024, NVIDIA CORPORATION.
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
@@ -446,7 +446,7 @@ def _process_sampled_df_csc(
major_offsets = cast_to_tensor(df.major_offsets.dropna())
label_hop_offsets = cast_to_tensor(df.label_hop_offsets.dropna())
renumber_map_offsets = cast_to_tensor(df.renumber_map_offsets.dropna())
-renumber_map = cast_to_tensor(df.map.dropna())
+renumber_map = cast_to_tensor(df["map"].dropna())
minors = cast_to_tensor(df.minors.dropna())

n_batches = len(renumber_map_offsets) - 1
8 changes: 5 additions & 3 deletions python/cugraph/cugraph/gnn/data_loading/bulk_sampler_io.py
@@ -1,4 +1,4 @@
-# Copyright (c) 2023, NVIDIA CORPORATION.
+# Copyright (c) 2023-2024, NVIDIA CORPORATION.
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
@@ -181,7 +181,9 @@ def _write_samples_to_parquet_csr(
[
cudf.Series(minors_array[results_start:results_end], name="minors"),
cudf.Series(
-renumber_map.map.values[renumber_map_start:renumber_map_end],
+renumber_map.renumber_map.values[
+    renumber_map_start:renumber_map_end
+],
name="map",
),
label_hop_offsets_current_partition,
@@ -299,7 +301,7 @@ def _write_samples_to_parquet_coo(
else:
renumber_map_end_ix = offsets_z.renumber_map_offsets.iloc[0]

-renumber_map_p = renumber_map.map.iloc[
+renumber_map_p = renumber_map.renumber_map.iloc[
renumber_map_start_ix:renumber_map_end_ix
]

14 changes: 9 additions & 5 deletions python/cugraph/cugraph/gnn/dgl_extensions/dgl_uniform_sampler.py
@@ -1,4 +1,4 @@
-# Copyright (c) 2023, NVIDIA CORPORATION.
+# Copyright (c) 2023-2024, NVIDIA CORPORATION.
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
@@ -120,9 +120,9 @@ def sample_neighbors(
return self._get_edgeid_type_d(sampled_df)
else:
return (
-sampled_df[src_n].values,
-sampled_df[dst_n].values,
-sampled_df["indices"].values,
+sampled_df[src_n].astype("float").values,
+sampled_df[dst_n].astype("float").values,
+sampled_df["indices"].astype("float").values,
)

def _get_edgeid_type_d(self, df):
@@ -134,7 +134,11 @@ def _get_edgeid_type_d(self, df):
for etype, etype_id in self.etype_id_dict.items()
}
return {
-etype: (df[src_n].values, df[dst_n].values, df["indices"].values)
+etype: (
+    df[src_n].astype("float").values,
+    df[dst_n].astype("float").values,
+    df["indices"].astype("float").values,
+)
for etype, df in result_d.items()
}

4 changes: 2 additions & 2 deletions python/cugraph/cugraph/sampling/sampling_utilities.py
@@ -1,4 +1,4 @@
-# Copyright (c) 2023, NVIDIA CORPORATION.
+# Copyright (c) 2023-2024, NVIDIA CORPORATION.
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
@@ -74,7 +74,7 @@ def sampling_results_from_cupy_array_dict(
if renumber:
renumber_df = cudf.DataFrame(
{
"map": cupy_array_dict["renumber_map"],
"renumber_map": cupy_array_dict["renumber_map"],
}
)

4 changes: 2 additions & 2 deletions python/cugraph/cugraph/tests/sampling/test_bulk_sampler.py
@@ -1,4 +1,4 @@
-# Copyright (c) 2023, NVIDIA CORPORATION.
+# Copyright (c) 2023-2024, NVIDIA CORPORATION.
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
@@ -222,7 +222,7 @@ def test_bulk_sampler_partitions(scratch_dir):
]

recovered_samples = cudf.read_parquet(os.path.join(samples_path, file))
-recovered_map = recovered_samples.map
+recovered_map = recovered_samples["map"]
recovered_samples = recovered_samples.drop("map", axis=1).dropna()

for current_batch_id in range(start_batch_id, end_batch_id + 1):
4 changes: 2 additions & 2 deletions python/cugraph/cugraph/tests/sampling/test_bulk_sampler_mg.py
@@ -1,4 +1,4 @@
-# Copyright (c) 2023, NVIDIA CORPORATION.
+# Copyright (c) 2023-2024, NVIDIA CORPORATION.
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
@@ -166,7 +166,7 @@ def test_bulk_sampler_partitions(dask_client, scratch_dir, mg_input):
]

recovered_samples = cudf.read_parquet(os.path.join(samples_path, file))
-recovered_map = recovered_samples.map
+recovered_map = recovered_samples["map"]
recovered_samples = recovered_samples.drop("map", axis=1).dropna()

for current_batch_id in range(start_batch_id, end_batch_id + 1):
@@ -1,4 +1,4 @@
-# Copyright (c) 2022-2023, NVIDIA CORPORATION.
+# Copyright (c) 2022-2024, NVIDIA CORPORATION.
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
@@ -796,7 +796,9 @@ def test_uniform_neighbor_sample_renumber(hops):
expected_renumber_map = cudf.concat([sources_hop_0, destinations_hop]).unique()

assert sorted(expected_renumber_map.values_host.tolist()) == sorted(
-renumber_map.map[0 : len(expected_renumber_map)].values_host.tolist()
+renumber_map.renumber_map[
+    0 : len(expected_renumber_map)
+].values_host.tolist()
)
assert (renumber_map.batch_id == 0).all()

@@ -854,7 +856,9 @@ def test_uniform_neighbor_sample_offset_renumber(hops):
expected_renumber_map = cudf.concat([sources_hop_0, destinations_hop]).unique()

assert sorted(expected_renumber_map.values_host.tolist()) == sorted(
-renumber_map.map[0 : len(expected_renumber_map)].values_host.tolist()
+renumber_map.renumber_map[
+    0 : len(expected_renumber_map)
+].values_host.tolist()
)

renumber_map_offsets = offsets_renumbered.renumber_map_offsets.dropna()
@@ -902,8 +906,8 @@ def test_uniform_neighbor_sample_csr_csc_global(hops, seed):
minors = sampling_results["minors"].dropna()
assert len(majors) == len(minors)

-majors = renumber_map.map.iloc[majors]
-minors = renumber_map.map.iloc[minors]
+majors = renumber_map.renumber_map.iloc[majors]
+minors = renumber_map.renumber_map.iloc[minors]

for i in range(len(majors)):
assert 1 == len(el[(el.src == majors.iloc[i]) & (el.dst == minors.iloc[i])])
@@ -952,8 +956,8 @@ def test_uniform_neighbor_sample_csr_csc_local(hops, seed):
majors = cudf.Series(cupy.arange(len(major_offsets) - 1))
majors = majors.repeat(cupy.diff(major_offsets))

-majors = renumber_map.map.iloc[majors]
-minors = renumber_map.map.iloc[minors]
+majors = renumber_map.renumber_map.iloc[majors]
+minors = renumber_map.renumber_map.iloc[minors]

for i in range(len(majors)):
assert 1 == len(el[(el.src == majors.iloc[i]) & (el.dst == minors.iloc[i])])
@@ -1015,7 +1015,7 @@ def test_uniform_neighbor_sample_renumber(dask_client, hops):

assert (renumber_map.batch_id == 0).all()
assert (
-renumber_map.map.nunique()
+renumber_map.renumber_map.nunique()
== cudf.concat(
[sources_hop_0, sampling_results_renumbered.destinations]
).nunique()
@@ -1091,7 +1091,9 @@ def test_uniform_neighbor_sample_offset_renumber(dask_client, hops):
expected_renumber_map = cudf.concat([sources_hop_0, destinations_hop]).unique()

assert sorted(expected_renumber_map.values_host.tolist()) == sorted(
-renumber_map.map[0 : len(expected_renumber_map)].values_host.tolist()
+renumber_map.renumber_map[
+    0 : len(expected_renumber_map)
+].values_host.tolist()
)

renumber_map_offsets = offsets_renumbered.renumber_map_offsets.dropna()
@@ -1153,8 +1155,8 @@ def test_uniform_neighbor_sample_csr_csc_global(dask_client, hops, seed):
minors = sampling_results["minors"].dropna()
assert len(majors) == len(minors)

-majors = renumber_map.map.iloc[majors]
-minors = renumber_map.map.iloc[minors]
+majors = renumber_map.renumber_map.iloc[majors]
+minors = renumber_map.renumber_map.iloc[minors]

for i in range(len(majors)):
assert 1 == len(el[(el.src == majors.iloc[i]) & (el.dst == minors.iloc[i])])
@@ -1221,8 +1223,8 @@ def test_uniform_neighbor_sample_csr_csc_local(dask_client, hops, seed):
majors = cudf.Series(cupy.arange(len(major_offsets) - 1))
majors = majors.repeat(cupy.diff(major_offsets))

-majors = renumber_map.map.iloc[majors]
-minors = renumber_map.map.iloc[minors]
+majors = renumber_map.renumber_map.iloc[majors]
+minors = renumber_map.renumber_map.iloc[minors]

for i in range(len(majors)):
assert 1 == len(el[(el.src == majors.iloc[i]) & (el.dst == minors.iloc[i])])
