From eb3aadce7f32020f06ed7db4a1630b5d66ad7ddb Mon Sep 17 00:00:00 2001
From: Alexandria Barghi <abarghi@nvidia.com>
Date: Mon, 25 Sep 2023 09:12:48 -0700
Subject: [PATCH] Fix wrong major_offsets index and off-by-one error; add check in tests

---
 python/cugraph/cugraph/gnn/data_loading/bulk_sampler_io.py   | 5 +++--
 python/cugraph/cugraph/tests/sampling/test_bulk_sampler.py   | 5 +++++
 .../cugraph/cugraph/tests/sampling/test_bulk_sampler_mg.py   | 5 +++++
 3 files changed, 13 insertions(+), 2 deletions(-)

diff --git a/python/cugraph/cugraph/gnn/data_loading/bulk_sampler_io.py b/python/cugraph/cugraph/gnn/data_loading/bulk_sampler_io.py
index 3783b696057..102bed8428c 100644
--- a/python/cugraph/cugraph/gnn/data_loading/bulk_sampler_io.py
+++ b/python/cugraph/cugraph/gnn/data_loading/bulk_sampler_io.py
@@ -157,7 +157,7 @@ def _write_samples_to_parquet_csr(
             [0, -1]
         ].values  # legal since offsets has the 1 extra offset
         results_start, results_end = major_offsets_array[
-            [major_offsets_start, major_offsets_end]
+            [major_offsets_start, major_offsets_end - 1]
         ]  # avoid d2h copy
 
         # no need to use end batch id, just ensure the batch is labeled correctly
@@ -186,7 +186,8 @@ def _write_samples_to_parquet_csr(
                 ),
                 label_hop_offsets_current_partition,
                 cudf.Series(
-                    major_offsets_array[results_start:results_end], name="major_offsets"
+                    major_offsets_array[major_offsets_start:major_offsets_end],
+                    name="major_offsets",
                 ),
                 cudf.Series(weight_array[results_start:results_end], name="weight"),
                 cudf.Series(edge_id_array[results_start:results_end], name="edge_id"),
diff --git a/python/cugraph/cugraph/tests/sampling/test_bulk_sampler.py b/python/cugraph/cugraph/tests/sampling/test_bulk_sampler.py
index c1bac8b44c4..a945881394b 100644
--- a/python/cugraph/cugraph/tests/sampling/test_bulk_sampler.py
+++ b/python/cugraph/cugraph/tests/sampling/test_bulk_sampler.py
@@ -341,4 +341,9 @@ def test_bulk_sampler_csr(scratch_dir):
 
     assert len(os.listdir(samples_path)) == 21
 
+    for file in os.listdir(samples_path):
+        df = cudf.read_parquet(os.path.join(samples_path, file))
+
+        assert df.major_offsets.dropna().iloc[-1] - df.major_offsets.iloc[0] == len(df)
+
     shutil.rmtree(samples_path)
diff --git a/python/cugraph/cugraph/tests/sampling/test_bulk_sampler_mg.py b/python/cugraph/cugraph/tests/sampling/test_bulk_sampler_mg.py
index 37eecdec58e..aee81e5ffed 100644
--- a/python/cugraph/cugraph/tests/sampling/test_bulk_sampler_mg.py
+++ b/python/cugraph/cugraph/tests/sampling/test_bulk_sampler_mg.py
@@ -297,4 +297,9 @@ def test_bulk_sampler_csr(dask_client, scratch_dir, mg_input):
 
     assert len(os.listdir(samples_path)) == 21
 
+    for file in os.listdir(samples_path):
+        df = cudf.read_parquet(os.path.join(samples_path, file))
+
+        assert df.major_offsets.dropna().iloc[-1] - df.major_offsets.iloc[0] == len(df)
+
     shutil.rmtree(samples_path)