diff --git a/python/cugraph/cugraph/gnn/data_loading/bulk_sampler_io.py b/python/cugraph/cugraph/gnn/data_loading/bulk_sampler_io.py index 3783b696057..102bed8428c 100644 --- a/python/cugraph/cugraph/gnn/data_loading/bulk_sampler_io.py +++ b/python/cugraph/cugraph/gnn/data_loading/bulk_sampler_io.py @@ -157,7 +157,7 @@ def _write_samples_to_parquet_csr( [0, -1] ].values # legal since offsets has the 1 extra offset results_start, results_end = major_offsets_array[ - [major_offsets_start, major_offsets_end] + [major_offsets_start, major_offsets_end - 1] ] # avoid d2h copy # no need to use end batch id, just ensure the batch is labeled correctly @@ -186,7 +186,8 @@ def _write_samples_to_parquet_csr( ), label_hop_offsets_current_partition, cudf.Series( - major_offsets_array[results_start:results_end], name="major_offsets" + major_offsets_array[major_offsets_start:major_offsets_end], + name="major_offsets", ), cudf.Series(weight_array[results_start:results_end], name="weight"), cudf.Series(edge_id_array[results_start:results_end], name="edge_id"), diff --git a/python/cugraph/cugraph/tests/sampling/test_bulk_sampler.py b/python/cugraph/cugraph/tests/sampling/test_bulk_sampler.py index c1bac8b44c4..a945881394b 100644 --- a/python/cugraph/cugraph/tests/sampling/test_bulk_sampler.py +++ b/python/cugraph/cugraph/tests/sampling/test_bulk_sampler.py @@ -341,4 +341,9 @@ def test_bulk_sampler_csr(scratch_dir): assert len(os.listdir(samples_path)) == 21 + for file in os.listdir(samples_path): + df = cudf.read_parquet(os.path.join(samples_path, file)) + + assert df.major_offsets.dropna().iloc[-1] - df.major_offsets.iloc[0] == len(df) + shutil.rmtree(samples_path) diff --git a/python/cugraph/cugraph/tests/sampling/test_bulk_sampler_mg.py b/python/cugraph/cugraph/tests/sampling/test_bulk_sampler_mg.py index 37eecdec58e..aee81e5ffed 100644 --- a/python/cugraph/cugraph/tests/sampling/test_bulk_sampler_mg.py +++ b/python/cugraph/cugraph/tests/sampling/test_bulk_sampler_mg.py @@ -297,4 +297,9 @@ def test_bulk_sampler_csr(dask_client, scratch_dir, mg_input): assert len(os.listdir(samples_path)) == 21 + for file in os.listdir(samples_path): + df = cudf.read_parquet(os.path.join(samples_path, file)) + + assert df.major_offsets.dropna().iloc[-1] - df.major_offsets.iloc[0] == len(df) + shutil.rmtree(samples_path)