diff --git a/python/cugraph/cugraph/gnn/data_loading/bulk_sampler_io.py b/python/cugraph/cugraph/gnn/data_loading/bulk_sampler_io.py index 22999843b1f..7e2944c35ce 100644 --- a/python/cugraph/cugraph/gnn/data_loading/bulk_sampler_io.py +++ b/python/cugraph/cugraph/gnn/data_loading/bulk_sampler_io.py @@ -60,7 +60,6 @@ def _write_samples_to_parquet_coo( if partition_info != "sg" and (not isinstance(partition_info, dict)): raise ValueError("Invalid value of partition_info") - print('offsets:', offsets) offsets = offsets[:-1] # Offsets is always in order, so the last batch id is always the highest @@ -73,9 +72,6 @@ def _write_samples_to_parquet_coo( start_batch_id = offsets_p.batch_id.iloc[0] end_batch_id = offsets_p.batch_id.iloc[len(offsets_p) - 1] - print('partition:', start_batch_id, end_batch_id) - print('max batch id:', max_batch_id) - reached_end = end_batch_id == max_batch_id start_ix = offsets_p.offsets.iloc[0] diff --git a/python/cugraph/cugraph/tests/sampling/test_bulk_sampler_io_mg.py b/python/cugraph/cugraph/tests/sampling/test_bulk_sampler_io_mg.py index 41f68c08e5c..ca0b4a7ae35 100644 --- a/python/cugraph/cugraph/tests/sampling/test_bulk_sampler_io_mg.py +++ b/python/cugraph/cugraph/tests/sampling/test_bulk_sampler_io_mg.py @@ -38,8 +38,12 @@ def test_bulk_sampler_io(scratch_dir): divisions=[0, 8, 11] ) - offsets = cudf.DataFrame({"offsets": [0, 0], "batch_id": [0, 1]}) - offsets = dask_cudf.from_cudf(offsets, npartitions=2) + assert len(results) == 12 + + offsets = cudf.DataFrame({"offsets": [0, 8, 0, 4], "batch_id": [0, None, 1, None]}) + offsets = dask_cudf.from_cudf(offsets, npartitions=1).repartition( + divisions=[0, 2, 3] + ) samples_path = os.path.join(scratch_dir, "mg_test_bulk_sampler_io") create_directory_with_overwrite(samples_path) @@ -149,9 +153,9 @@ def test_bulk_sampler_io_empty_batch(scratch_dir): ) # some batches are missing - offsets = cudf.DataFrame({"offsets": [0, 8, 0, 4], "batch_id": [0, 3, 4, 10]}) + offsets = cudf.DataFrame({"offsets": [0, 8, 12, 0, 4, 8], "batch_id": [0, 3, None, 4, 10, None]}) offsets = dask_cudf.from_cudf(offsets, npartitions=1).repartition( - divisions=[0, 2, 3] + divisions=[0, 3, 5] ) samples_path = os.path.join(scratch_dir, "mg_test_bulk_sampler_io_empty_batch")