From eb3aadce7f32020f06ed7db4a1630b5d66ad7ddb Mon Sep 17 00:00:00 2001 From: Alexandria Barghi Date: Mon, 25 Sep 2023 09:12:48 -0700 Subject: [PATCH] fix wrong index + off by 1 error, add check in test --- python/cugraph/cugraph/gnn/data_loading/bulk_sampler_io.py | 5 +++-- python/cugraph/cugraph/tests/sampling/test_bulk_sampler.py | 5 +++++ .../cugraph/cugraph/tests/sampling/test_bulk_sampler_mg.py | 5 +++++ 3 files changed, 13 insertions(+), 2 deletions(-) diff --git a/python/cugraph/cugraph/gnn/data_loading/bulk_sampler_io.py b/python/cugraph/cugraph/gnn/data_loading/bulk_sampler_io.py index 3783b696057..102bed8428c 100644 --- a/python/cugraph/cugraph/gnn/data_loading/bulk_sampler_io.py +++ b/python/cugraph/cugraph/gnn/data_loading/bulk_sampler_io.py @@ -157,7 +157,7 @@ def _write_samples_to_parquet_csr( [0, -1] ].values # legal since offsets has the 1 extra offset results_start, results_end = major_offsets_array[ - [major_offsets_start, major_offsets_end] + [major_offsets_start, major_offsets_end - 1] ] # avoid d2h copy # no need to use end batch id, just ensure the batch is labeled correctly @@ -186,7 +186,8 @@ def _write_samples_to_parquet_csr( ), label_hop_offsets_current_partition, cudf.Series( - major_offsets_array[results_start:results_end], name="major_offsets" + major_offsets_array[major_offsets_start:major_offsets_end], + name="major_offsets", ), cudf.Series(weight_array[results_start:results_end], name="weight"), cudf.Series(edge_id_array[results_start:results_end], name="edge_id"), diff --git a/python/cugraph/cugraph/tests/sampling/test_bulk_sampler.py b/python/cugraph/cugraph/tests/sampling/test_bulk_sampler.py index c1bac8b44c4..a945881394b 100644 --- a/python/cugraph/cugraph/tests/sampling/test_bulk_sampler.py +++ b/python/cugraph/cugraph/tests/sampling/test_bulk_sampler.py @@ -341,4 +341,9 @@ def test_bulk_sampler_csr(scratch_dir): assert len(os.listdir(samples_path)) == 21 + for file in os.listdir(samples_path): + df = cudf.read_parquet(os.path.join(samples_path, file)) + + assert df.major_offsets.dropna().iloc[-1] - df.major_offsets.iloc[0] == len(df) + shutil.rmtree(samples_path) diff --git a/python/cugraph/cugraph/tests/sampling/test_bulk_sampler_mg.py b/python/cugraph/cugraph/tests/sampling/test_bulk_sampler_mg.py index 37eecdec58e..aee81e5ffed 100644 --- a/python/cugraph/cugraph/tests/sampling/test_bulk_sampler_mg.py +++ b/python/cugraph/cugraph/tests/sampling/test_bulk_sampler_mg.py @@ -297,4 +297,9 @@ def test_bulk_sampler_csr(dask_client, scratch_dir, mg_input): assert len(os.listdir(samples_path)) == 21 + for file in os.listdir(samples_path): + df = cudf.read_parquet(os.path.join(samples_path, file)) + + assert df.major_offsets.dropna().iloc[-1] - df.major_offsets.iloc[0] == len(df) + shutil.rmtree(samples_path)