From 27b23a24f19904c7fe38d2cd61694db109f2d75a Mon Sep 17 00:00:00 2001 From: Divye Gala Date: Wed, 8 Nov 2023 17:43:25 -0500 Subject: [PATCH 1/2] Allow `raft-ann-bench/run` to continue after encountering bad YAML configs (#1980) Authors: - Divye Gala (https://github.com/divyegala) Approvers: - Corey J. Nolet (https://github.com/cjnolet) URL: https://github.com/rapidsai/raft/pull/1980 --- .../src/raft-ann-bench/data_export/__main__.py | 4 ++++ .../generate_groundtruth/__main__.py | 4 ++++ .../src/raft-ann-bench/get_dataset/__main__.py | 5 +++++ .../src/raft-ann-bench/plot/__main__.py | 4 ++++ .../src/raft-ann-bench/run/__main__.py | 14 +++++++++++++- .../raft-ann-bench/split_groundtruth/__main__.py | 5 +++++ 6 files changed, 35 insertions(+), 1 deletion(-) diff --git a/python/raft-ann-bench/src/raft-ann-bench/data_export/__main__.py b/python/raft-ann-bench/src/raft-ann-bench/data_export/__main__.py index e19ada2934..47da9f39fa 100644 --- a/python/raft-ann-bench/src/raft-ann-bench/data_export/__main__.py +++ b/python/raft-ann-bench/src/raft-ann-bench/data_export/__main__.py @@ -17,6 +17,7 @@ import argparse import json import os +import sys import warnings import pandas as pd @@ -147,6 +148,9 @@ def main(): default=default_dataset_path, ) + if len(sys.argv) == 1: + parser.print_help() + sys.exit(1) args = parser.parse_args() convert_json_to_csv_build(args.dataset, args.dataset_path) diff --git a/python/raft-ann-bench/src/raft-ann-bench/generate_groundtruth/__main__.py b/python/raft-ann-bench/src/raft-ann-bench/generate_groundtruth/__main__.py index 77a930f81e..f4d97edea5 100644 --- a/python/raft-ann-bench/src/raft-ann-bench/generate_groundtruth/__main__.py +++ b/python/raft-ann-bench/src/raft-ann-bench/generate_groundtruth/__main__.py @@ -16,6 +16,7 @@ # import argparse import os +import sys import cupy as cp import numpy as np @@ -178,6 +179,9 @@ def main(): " commonly used with RAFT ANN are 'sqeuclidean' and 'inner_product'", ) + if len(sys.argv) == 1: + 
parser.print_help() + sys.exit(1) args = parser.parse_args() if args.rows is not None: diff --git a/python/raft-ann-bench/src/raft-ann-bench/get_dataset/__main__.py b/python/raft-ann-bench/src/raft-ann-bench/get_dataset/__main__.py index 4e6a0119b4..0a6c37aabc 100644 --- a/python/raft-ann-bench/src/raft-ann-bench/get_dataset/__main__.py +++ b/python/raft-ann-bench/src/raft-ann-bench/get_dataset/__main__.py @@ -16,6 +16,7 @@ import argparse import os import subprocess +import sys from urllib.request import urlretrieve @@ -101,6 +102,10 @@ def main(): help="normalize cosine distance to inner product", action="store_true", ) + + if len(sys.argv) == 1: + parser.print_help() + sys.exit(1) args = parser.parse_args() download(args.dataset, args.normalize, args.dataset_path) diff --git a/python/raft-ann-bench/src/raft-ann-bench/plot/__main__.py b/python/raft-ann-bench/src/raft-ann-bench/plot/__main__.py index 78f8aea8b8..c45ff5b14e 100644 --- a/python/raft-ann-bench/src/raft-ann-bench/plot/__main__.py +++ b/python/raft-ann-bench/src/raft-ann-bench/plot/__main__.py @@ -22,6 +22,7 @@ import argparse import itertools import os +import sys from collections import OrderedDict import matplotlib as mpl @@ -486,6 +487,9 @@ def main(): action="store_true", ) + if len(sys.argv) == 1: + parser.print_help() + sys.exit(1) args = parser.parse_args() if args.algorithms: diff --git a/python/raft-ann-bench/src/raft-ann-bench/run/__main__.py b/python/raft-ann-bench/src/raft-ann-bench/run/__main__.py index 6b01263c27..c9fde6dd7e 100644 --- a/python/raft-ann-bench/src/raft-ann-bench/run/__main__.py +++ b/python/raft-ann-bench/src/raft-ann-bench/run/__main__.py @@ -18,7 +18,9 @@ import json import os import subprocess +import sys import uuid +import warnings from importlib import import_module import yaml @@ -292,6 +294,9 @@ def main(): action="store_true", ) + if len(sys.argv) == 1: + parser.print_help() + sys.exit(1) args = parser.parse_args() # If both build and search are not provided, @@ 
-368,7 +373,14 @@ def main(): algos_conf = dict() for algo_f in algos_conf_fs: with open(algo_f, "r") as f: - algo = yaml.safe_load(f) + try: + algo = yaml.safe_load(f) + except Exception as e: + warnings.warn( + f"Could not load YAML config {algo_f} due to " + + str(e) + ) + continue insert_algo = True insert_algo_group = False if filter_algos: diff --git a/python/raft-ann-bench/src/raft-ann-bench/split_groundtruth/__main__.py b/python/raft-ann-bench/src/raft-ann-bench/split_groundtruth/__main__.py index b886d40ea7..c65360ebb0 100644 --- a/python/raft-ann-bench/src/raft-ann-bench/split_groundtruth/__main__.py +++ b/python/raft-ann-bench/src/raft-ann-bench/split_groundtruth/__main__.py @@ -16,6 +16,7 @@ import argparse import os import subprocess +import sys def split_groundtruth(groundtruth_filepath): @@ -43,6 +44,10 @@ def main(): help="Path to billion-scale dataset groundtruth file", required=True, ) + + if len(sys.argv) == 1: + parser.print_help() + sys.exit(1) args = parser.parse_args() split_groundtruth(args.groundtruth) From 061c0cf7a50bff8600ca9ecef86edee81a354233 Mon Sep 17 00:00:00 2001 From: "Corey J. Nolet" Date: Thu, 9 Nov 2023 02:40:42 +0100 Subject: [PATCH 2/2] Catching conversion errors in data_export instead of fully failing (#1979) Authors: - Corey J. 
Nolet (https://github.com/cjnolet) Approvers: - Divye Gala (https://github.com/divyegala) URL: https://github.com/rapidsai/raft/pull/1979 --- .../raft-ann-bench/data_export/__main__.py | 142 ++++++++++-------- .../src/raft-ann-bench/run/__main__.py | 3 +- 2 files changed, 81 insertions(+), 64 deletions(-) diff --git a/python/raft-ann-bench/src/raft-ann-bench/data_export/__main__.py b/python/raft-ann-bench/src/raft-ann-bench/data_export/__main__.py index 47da9f39fa..fd6c2077e7 100644 --- a/python/raft-ann-bench/src/raft-ann-bench/data_export/__main__.py +++ b/python/raft-ann-bench/src/raft-ann-bench/data_export/__main__.py @@ -18,6 +18,7 @@ import json import os import sys +import traceback import warnings import pandas as pd @@ -58,74 +59,89 @@ def read_file(dataset, dataset_path, method): def convert_json_to_csv_build(dataset, dataset_path): for file, algo_name, df in read_file(dataset, dataset_path, "build"): - algo_name = algo_name.replace("_base", "") - df["name"] = df["name"].str.split("/").str[0] - write = pd.DataFrame( - { - "algo_name": [algo_name] * len(df), - "index_name": df["name"], - "time": df["real_time"], - } - ) - for name in df: - if name not in skip_build_cols: - write[name] = df[name] - filepath = os.path.normpath(file).split(os.sep) - filename = filepath[-1].split("-")[0] + ".csv" - write.to_csv( - os.path.join(f"{os.sep}".join(filepath[:-1]), filename), - index=False, - ) + try: + algo_name = algo_name.replace("_base", "") + df["name"] = df["name"].str.split("/").str[0] + write = pd.DataFrame( + { + "algo_name": [algo_name] * len(df), + "index_name": df["name"], + "time": df["real_time"], + } + ) + for name in df: + if name not in skip_build_cols: + write[name] = df[name] + filepath = os.path.normpath(file).split(os.sep) + filename = filepath[-1].split("-")[0] + ".csv" + write.to_csv( + os.path.join(f"{os.sep}".join(filepath[:-1]), filename), + index=False, + ) + except Exception as e: + print( + "An error occurred processing file %s (%s). 
Skipping..." + % (file, e) + ) + traceback.print_exc() def convert_json_to_csv_search(dataset, dataset_path): for file, algo_name, df in read_file(dataset, dataset_path, "search"): - build_file = os.path.join( - dataset_path, dataset, "result", "build", f"{algo_name}.csv" - ) - algo_name = algo_name.replace("_base", "") - df["name"] = df["name"].str.split("/").str[0] - write = pd.DataFrame( - { - "algo_name": [algo_name] * len(df), - "index_name": df["name"], - "recall": df["Recall"], - "qps": df["items_per_second"], - } - ) - for name in df: - if name not in skip_search_cols: - write[name] = df[name] - - if os.path.exists(build_file): - build_df = pd.read_csv(build_file) - write_ncols = len(write.columns) - write["build time"] = None - write["build threads"] = None - write["build cpu_time"] = None - write["build GPU"] = None - - for col_idx in range(5, len(build_df.columns)): - col_name = build_df.columns[col_idx] - write[col_name] = None - - for s_index, search_row in write.iterrows(): - for b_index, build_row in build_df.iterrows(): - if search_row["index_name"] == build_row["index_name"]: - write.iloc[s_index, write_ncols] = build_df.iloc[ - b_index, 2 - ] - write.iloc[s_index, write_ncols + 1 :] = build_df.iloc[ - b_index, 3: - ] - break - else: - warnings.warn( - f"Build CSV not found for {algo_name}, build params won't be " - "appended in the Search CSV" + try: + build_file = os.path.join( + dataset_path, dataset, "result", "build", f"{algo_name}.csv" ) - - write.to_csv(file.replace(".json", ".csv"), index=False) + algo_name = algo_name.replace("_base", "") + df["name"] = df["name"].str.split("/").str[0] + write = pd.DataFrame( + { + "algo_name": [algo_name] * len(df), + "index_name": df["name"], + "recall": df["Recall"], + "qps": df["items_per_second"], + } + ) + for name in df: + if name not in skip_search_cols: + write[name] = df[name] + + if os.path.exists(build_file): + build_df = pd.read_csv(build_file) + write_ncols = len(write.columns) + write["build 
time"] = None + write["build threads"] = None + write["build cpu_time"] = None + write["build GPU"] = None + + for col_idx in range(5, len(build_df.columns)): + col_name = build_df.columns[col_idx] + write[col_name] = None + + for s_index, search_row in write.iterrows(): + for b_index, build_row in build_df.iterrows(): + if search_row["index_name"] == build_row["index_name"]: + write.iloc[s_index, write_ncols] = build_df.iloc[ + b_index, 2 + ] + write.iloc[ + s_index, write_ncols + 1 : + ] = build_df.iloc[b_index, 3:] + break + else: + warnings.warn( + f"Build CSV not found for {algo_name}, " + f"build params won't be " + "appended in the Search CSV" + ) + + write.to_csv(file.replace(".json", ".csv"), index=False) + except Exception as e: + print( + "An error occurred processing file %s (%s). Skipping..." + % (file, e) + ) + traceback.print_exc() def main(): diff --git a/python/raft-ann-bench/src/raft-ann-bench/run/__main__.py b/python/raft-ann-bench/src/raft-ann-bench/run/__main__.py index c9fde6dd7e..a33467b554 100644 --- a/python/raft-ann-bench/src/raft-ann-bench/run/__main__.py +++ b/python/raft-ann-bench/src/raft-ann-bench/run/__main__.py @@ -132,7 +132,8 @@ def run_build_and_search( except Exception as e: print("Error occurred running benchmark: %s" % e) finally: - os.remove(temp_conf_filename) + if not search: + os.remove(temp_conf_filename) if search: search_folder = os.path.join(legacy_result_folder, "search")