From 27b23a24f19904c7fe38d2cd61694db109f2d75a Mon Sep 17 00:00:00 2001 From: Divye Gala Date: Wed, 8 Nov 2023 17:43:25 -0500 Subject: [PATCH 1/2] Allow `raft-ann-bench/run` to continue after encountering bad YAML configs (#1980) Authors: - Divye Gala (https://github.com/divyegala) Approvers: - Corey J. Nolet (https://github.com/cjnolet) URL: https://github.com/rapidsai/raft/pull/1980 --- .../src/raft-ann-bench/data_export/__main__.py | 4 ++++ .../generate_groundtruth/__main__.py | 4 ++++ .../src/raft-ann-bench/get_dataset/__main__.py | 5 +++++ .../src/raft-ann-bench/plot/__main__.py | 4 ++++ .../src/raft-ann-bench/run/__main__.py | 14 +++++++++++++- .../raft-ann-bench/split_groundtruth/__main__.py | 5 +++++ 6 files changed, 35 insertions(+), 1 deletion(-) diff --git a/python/raft-ann-bench/src/raft-ann-bench/data_export/__main__.py b/python/raft-ann-bench/src/raft-ann-bench/data_export/__main__.py index e19ada2934..47da9f39fa 100644 --- a/python/raft-ann-bench/src/raft-ann-bench/data_export/__main__.py +++ b/python/raft-ann-bench/src/raft-ann-bench/data_export/__main__.py @@ -17,6 +17,7 @@ import argparse import json import os +import sys import warnings import pandas as pd @@ -147,6 +148,9 @@ def main(): default=default_dataset_path, ) + if len(sys.argv) == 1: + parser.print_help() + sys.exit(1) args = parser.parse_args() convert_json_to_csv_build(args.dataset, args.dataset_path) diff --git a/python/raft-ann-bench/src/raft-ann-bench/generate_groundtruth/__main__.py b/python/raft-ann-bench/src/raft-ann-bench/generate_groundtruth/__main__.py index 77a930f81e..f4d97edea5 100644 --- a/python/raft-ann-bench/src/raft-ann-bench/generate_groundtruth/__main__.py +++ b/python/raft-ann-bench/src/raft-ann-bench/generate_groundtruth/__main__.py @@ -16,6 +16,7 @@ # import argparse import os +import sys import cupy as cp import numpy as np @@ -178,6 +179,9 @@ def main(): " commonly used with RAFT ANN are 'sqeuclidean' and 'inner_product'", ) + if len(sys.argv) == 1: + 
parser.print_help() + sys.exit(1) args = parser.parse_args() if args.rows is not None: diff --git a/python/raft-ann-bench/src/raft-ann-bench/get_dataset/__main__.py b/python/raft-ann-bench/src/raft-ann-bench/get_dataset/__main__.py index 4e6a0119b4..0a6c37aabc 100644 --- a/python/raft-ann-bench/src/raft-ann-bench/get_dataset/__main__.py +++ b/python/raft-ann-bench/src/raft-ann-bench/get_dataset/__main__.py @@ -16,6 +16,7 @@ import argparse import os import subprocess +import sys from urllib.request import urlretrieve @@ -101,6 +102,10 @@ def main(): help="normalize cosine distance to inner product", action="store_true", ) + + if len(sys.argv) == 1: + parser.print_help() + sys.exit(1) args = parser.parse_args() download(args.dataset, args.normalize, args.dataset_path) diff --git a/python/raft-ann-bench/src/raft-ann-bench/plot/__main__.py b/python/raft-ann-bench/src/raft-ann-bench/plot/__main__.py index 78f8aea8b8..c45ff5b14e 100644 --- a/python/raft-ann-bench/src/raft-ann-bench/plot/__main__.py +++ b/python/raft-ann-bench/src/raft-ann-bench/plot/__main__.py @@ -22,6 +22,7 @@ import argparse import itertools import os +import sys from collections import OrderedDict import matplotlib as mpl @@ -486,6 +487,9 @@ def main(): action="store_true", ) + if len(sys.argv) == 1: + parser.print_help() + sys.exit(1) args = parser.parse_args() if args.algorithms: diff --git a/python/raft-ann-bench/src/raft-ann-bench/run/__main__.py b/python/raft-ann-bench/src/raft-ann-bench/run/__main__.py index 6b01263c27..c9fde6dd7e 100644 --- a/python/raft-ann-bench/src/raft-ann-bench/run/__main__.py +++ b/python/raft-ann-bench/src/raft-ann-bench/run/__main__.py @@ -18,7 +18,9 @@ import json import os import subprocess +import sys import uuid +import warnings from importlib import import_module import yaml @@ -292,6 +294,9 @@ def main(): action="store_true", ) + if len(sys.argv) == 1: + parser.print_help() + sys.exit(1) args = parser.parse_args() # If both build and search are not provided, @@ 
-368,7 +373,14 @@ def main(): algos_conf = dict() for algo_f in algos_conf_fs: with open(algo_f, "r") as f: - algo = yaml.safe_load(f) + try: + algo = yaml.safe_load(f) + except Exception as e: + warnings.warn( + f"Could not load YAML config {algo_f} due to " + + str(e) + ) + continue insert_algo = True insert_algo_group = False if filter_algos: diff --git a/python/raft-ann-bench/src/raft-ann-bench/split_groundtruth/__main__.py b/python/raft-ann-bench/src/raft-ann-bench/split_groundtruth/__main__.py index b886d40ea7..c65360ebb0 100644 --- a/python/raft-ann-bench/src/raft-ann-bench/split_groundtruth/__main__.py +++ b/python/raft-ann-bench/src/raft-ann-bench/split_groundtruth/__main__.py @@ -16,6 +16,7 @@ import argparse import os import subprocess +import sys def split_groundtruth(groundtruth_filepath): @@ -43,6 +44,10 @@ def main(): help="Path to billion-scale dataset groundtruth file", required=True, ) + + if len(sys.argv) == 1: + parser.print_help() + sys.exit(1) args = parser.parse_args() split_groundtruth(args.groundtruth) From 061c0cf7a50bff8600ca9ecef86edee81a354233 Mon Sep 17 00:00:00 2001 From: "Corey J. Nolet" Date: Thu, 9 Nov 2023 02:40:42 +0100 Subject: [PATCH 2/2] Catching conversion errors in data_export instead of fully failing (#1979) Authors: - Corey J. 
Nolet (https://github.com/cjnolet) Approvers: - Divye Gala (https://github.com/divyegala) URL: https://github.com/rapidsai/raft/pull/1979 --- .../raft-ann-bench/data_export/__main__.py | 142 ++++++++++-------- .../src/raft-ann-bench/run/__main__.py | 3 +- 2 files changed, 81 insertions(+), 64 deletions(-) diff --git a/python/raft-ann-bench/src/raft-ann-bench/data_export/__main__.py b/python/raft-ann-bench/src/raft-ann-bench/data_export/__main__.py index 47da9f39fa..fd6c2077e7 100644 --- a/python/raft-ann-bench/src/raft-ann-bench/data_export/__main__.py +++ b/python/raft-ann-bench/src/raft-ann-bench/data_export/__main__.py @@ -18,6 +18,7 @@ import json import os import sys +import traceback import warnings import pandas as pd @@ -58,74 +59,89 @@ def read_file(dataset, dataset_path, method): def convert_json_to_csv_build(dataset, dataset_path): for file, algo_name, df in read_file(dataset, dataset_path, "build"): - algo_name = algo_name.replace("_base", "") - df["name"] = df["name"].str.split("/").str[0] - write = pd.DataFrame( - { - "algo_name": [algo_name] * len(df), - "index_name": df["name"], - "time": df["real_time"], - } - ) - for name in df: - if name not in skip_build_cols: - write[name] = df[name] - filepath = os.path.normpath(file).split(os.sep) - filename = filepath[-1].split("-")[0] + ".csv" - write.to_csv( - os.path.join(f"{os.sep}".join(filepath[:-1]), filename), - index=False, - ) + try: + algo_name = algo_name.replace("_base", "") + df["name"] = df["name"].str.split("/").str[0] + write = pd.DataFrame( + { + "algo_name": [algo_name] * len(df), + "index_name": df["name"], + "time": df["real_time"], + } + ) + for name in df: + if name not in skip_build_cols: + write[name] = df[name] + filepath = os.path.normpath(file).split(os.sep) + filename = filepath[-1].split("-")[0] + ".csv" + write.to_csv( + os.path.join(f"{os.sep}".join(filepath[:-1]), filename), + index=False, + ) + except Exception as e: + print( + "An error occurred processing file %s (%s). 
Skipping..." + % (file, e) + ) + traceback.print_exc() def convert_json_to_csv_search(dataset, dataset_path): for file, algo_name, df in read_file(dataset, dataset_path, "search"): - build_file = os.path.join( - dataset_path, dataset, "result", "build", f"{algo_name}.csv" - ) - algo_name = algo_name.replace("_base", "") - df["name"] = df["name"].str.split("/").str[0] - write = pd.DataFrame( - { - "algo_name": [algo_name] * len(df), - "index_name": df["name"], - "recall": df["Recall"], - "qps": df["items_per_second"], - } - ) - for name in df: - if name not in skip_search_cols: - write[name] = df[name] - - if os.path.exists(build_file): - build_df = pd.read_csv(build_file) - write_ncols = len(write.columns) - write["build time"] = None - write["build threads"] = None - write["build cpu_time"] = None - write["build GPU"] = None - - for col_idx in range(5, len(build_df.columns)): - col_name = build_df.columns[col_idx] - write[col_name] = None - - for s_index, search_row in write.iterrows(): - for b_index, build_row in build_df.iterrows(): - if search_row["index_name"] == build_row["index_name"]: - write.iloc[s_index, write_ncols] = build_df.iloc[ - b_index, 2 - ] - write.iloc[s_index, write_ncols + 1 :] = build_df.iloc[ - b_index, 3: - ] - break - else: - warnings.warn( - f"Build CSV not found for {algo_name}, build params won't be " - "appended in the Search CSV" + try: + build_file = os.path.join( + dataset_path, dataset, "result", "build", f"{algo_name}.csv" ) - - write.to_csv(file.replace(".json", ".csv"), index=False) + algo_name = algo_name.replace("_base", "") + df["name"] = df["name"].str.split("/").str[0] + write = pd.DataFrame( + { + "algo_name": [algo_name] * len(df), + "index_name": df["name"], + "recall": df["Recall"], + "qps": df["items_per_second"], + } + ) + for name in df: + if name not in skip_search_cols: + write[name] = df[name] + + if os.path.exists(build_file): + build_df = pd.read_csv(build_file) + write_ncols = len(write.columns) + write["build 
time"] = None + write["build threads"] = None + write["build cpu_time"] = None + write["build GPU"] = None + + for col_idx in range(5, len(build_df.columns)): + col_name = build_df.columns[col_idx] + write[col_name] = None + + for s_index, search_row in write.iterrows(): + for b_index, build_row in build_df.iterrows(): + if search_row["index_name"] == build_row["index_name"]: + write.iloc[s_index, write_ncols] = build_df.iloc[ + b_index, 2 + ] + write.iloc[ + s_index, write_ncols + 1 : + ] = build_df.iloc[b_index, 3:] + break + else: + warnings.warn( + f"Build CSV not found for {algo_name}, " + f"build params won't be " + "appended in the Search CSV" + ) + + write.to_csv(file.replace(".json", ".csv"), index=False) + except Exception as e: + print( + "An error occurred processing file %s (%s). Skipping..." + % (file, e) + ) + traceback.print_exc() def main(): diff --git a/python/raft-ann-bench/src/raft-ann-bench/run/__main__.py b/python/raft-ann-bench/src/raft-ann-bench/run/__main__.py index c9fde6dd7e..a33467b554 100644 --- a/python/raft-ann-bench/src/raft-ann-bench/run/__main__.py +++ b/python/raft-ann-bench/src/raft-ann-bench/run/__main__.py @@ -132,7 +132,8 @@ def run_build_and_search( except Exception as e: print("Error occurred running benchmark: %s" % e) finally: - os.remove(temp_conf_filename) + if not search: + os.remove(temp_conf_filename) if search: search_folder = os.path.join(legacy_result_folder, "search")