From bdc5117f883d482cfd70d6572d50a08d2acb51f2 Mon Sep 17 00:00:00 2001
From: Divye Gala
Date: Fri, 17 Nov 2023 20:57:49 -0500
Subject: [PATCH] Export Pareto frontier in `raft-ann-bench.data_export` (#2009)

This PR exports three CSVs for each search result JSON file, distinguished by suffix:
1. `raw`: all results
2. `throughput`: the Pareto frontier of throughput results
3. `latency`: the Pareto frontier of latency results

The Pareto frontier is no longer computed in `raft-ann-bench.plot`.

Authors:
  - Divye Gala (https://github.com/divyegala)

Approvers:
  - Corey J. Nolet (https://github.com/cjnolet)

URL: https://github.com/rapidsai/raft/pull/2009
---
 docs/source/raft_ann_benchmarks.md                 |  30 ++-
 .../src/raft-ann-bench/data_export/__main__.py     | 112 ++++++++---
 .../src/raft-ann-bench/plot/__main__.py            | 188 ++++++++++--------
 3 files changed, 210 insertions(+), 120 deletions(-)

diff --git a/docs/source/raft_ann_benchmarks.md b/docs/source/raft_ann_benchmarks.md
index e6c4eaedd0..dcdfc2cec9 100644
--- a/docs/source/raft_ann_benchmarks.md
+++ b/docs/source/raft_ann_benchmarks.md
@@ -198,27 +198,33 @@ options:
   --dataset-path DATASET_PATH
                         path to dataset folder (default: ${RAPIDS_DATASET_ROOT_DIR})
 ```
-Build statistics CSV file is stored in `<dataset-path>/<dataset>/result/build/`
-and index search statistics CSV file in `<dataset-path>/<dataset>/result/search/`.
+The build statistics CSV file is stored in `<dataset-path>/<dataset>/result/build/`
+and the index search statistics CSV files in `<dataset-path>/<dataset>/result/search/`,
+where each search file name ends in one of three suffixes:
+1. `raw`: all search results are exported
+2. `throughput`: the Pareto frontier of throughput results is exported
+3. `latency`: the Pareto frontier of latency results is exported
+
 ### Step 4: Plot Results
 The script `raft-ann-bench.plot` will plot results for all algorithms found in index search statistics
-CSV file in `<dataset-path>/<dataset>/result/search/<algo-group>-k{k}-batch_size{batch_size}.csv`.
+CSV files `<dataset-path>/<dataset>/result/search/*.csv`.
 The usage of this script is:
 ```bash
-usage: __main__.py [-h] [--dataset DATASET] [--dataset-path DATASET_PATH] [--output-filepath OUTPUT_FILEPATH] [--algorithms ALGORITHMS] [--groups GROUPS] [--algo-groups ALGO_GROUPS] [-k COUNT]
-                   [-bs BATCH_SIZE] [--build] [--search] [--x-scale X_SCALE] [--y-scale {linear,log,symlog,logit}] [--raw]
+usage: [-h] [--dataset DATASET] [--dataset-path DATASET_PATH] [--output-filepath OUTPUT_FILEPATH] [--algorithms ALGORITHMS] [--groups GROUPS] [--algo-groups ALGO_GROUPS]
+       [-k COUNT] [-bs BATCH_SIZE] [--build] [--search] [--x-scale X_SCALE] [--y-scale {linear,log,symlog,logit}] [--mode {throughput,latency}] [--time-unit {s,ms,us}]
+       [--raw]
 
 options:
   -h, --help            show this help message and exit
   --dataset DATASET     dataset to plot (default: glove-100-inner)
   --dataset-path DATASET_PATH
                         path to dataset folder (default: os.getcwd()/datasets/)
   --output-filepath OUTPUT_FILEPATH
                         directory for PNG to be saved (default: os.getcwd())
   --algorithms ALGORITHMS
-                        plot only comma separated list of named algorithms. If parameters `groups` and `algo-groups are both undefined, then group `base` is plot by default (default: None)
+                        plot only a comma separated list of named algorithms. If parameters `groups` and `algo-groups` are both undefined, then group `base` is plotted by default
+                        (default: None)
   --groups GROUPS       plot only comma separated groups of parameters (default: base)
   --algo-groups ALGO_GROUPS
                         add comma separated <algorithm>.<group> to plot.
                        Example usage: "--algo-groups=raft_cagra.large,hnswlib.large" (default: None)
@@ -231,8 +237,14 @@ options:
   --x-scale X_SCALE     Scale to use when drawing the X-axis. Typically linear, logit or a2 (default: linear)
   --y-scale {linear,log,symlog,logit}
                         Scale to use when drawing the Y-axis (default: linear)
-  --raw                 Show raw results (not just Pareto frontier) in faded colours (default: False)
+  --mode {throughput,latency}
+                        search mode whose Pareto frontier is used on the y-axis (default: throughput)
+  --time-unit {s,ms,us}
+                        time unit to plot when mode is latency (default: ms)
+  --raw                 Show raw results (not just the Pareto frontier) of the selected mode (default: False)
 ```
+`mode`: plots the Pareto frontier of the `throughput` or `latency` results exported in the previous step
+
 `algorithms`: plots all algorithms that it can find results for the specified `dataset`. By default, only the `base` group will be plotted.
 
 `groups`: plot only specific groups of parameter configurations for an algorithm. Groups are defined in YAML configs (see `configuration`), and by default the `base` group is run
diff --git a/python/raft-ann-bench/src/raft-ann-bench/data_export/__main__.py b/python/raft-ann-bench/src/raft-ann-bench/data_export/__main__.py
index 4978c99d60..572b81bbe2 100644
--- a/python/raft-ann-bench/src/raft-ann-bench/data_export/__main__.py
+++ b/python/raft-ann-bench/src/raft-ann-bench/data_export/__main__.py
@@ -43,9 +43,26 @@
 )
 
 skip_search_cols = (
-    set(["recall", "qps", "items_per_second", "Recall"]) | skip_build_cols
+    set(["recall", "qps", "latency", "items_per_second", "Recall", "Latency"])
+    | skip_build_cols
 )
 
+metrics = {
+    "k-nn": {
+        "description": "Recall",
+        "worst": float("-inf"),
+        "lim": [0.0, 1.03],
+    },
+    "throughput": {
+        "description": "Queries per second (1/s)",
+        "worst": float("-inf"),
+    },
+    "latency": {
+        "description": "Search Latency (s)",
+        "worst": float("inf"),
+    },
+}
+
 
 def read_file(dataset, dataset_path, method):
     dir = os.path.join(dataset_path, dataset, "result", method)
@@ -92,6 +109,31 @@ def convert_json_to_csv_build(dataset, dataset_path):
             traceback.print_exc()
 
 
+def create_pointset(data, xn, yn):
+    xm, ym = (metrics[xn], metrics[yn])
+    rev_y = -1 if ym["worst"] < 0 else 1
+    rev_x = -1 if xm["worst"] < 0 else 1
+
+    y_idx = 3 if yn == "throughput" else 4
+    data.sort(key=lambda t: (rev_y * t[y_idx], rev_x * t[2]))
+
+    lines = []
+    last_x = xm["worst"]
+    comparator = (
+        (lambda xv, lx: xv > lx) if last_x < 0 else (lambda xv, lx: xv < lx)
+    )
+    for d in data:
+        if comparator(d[2], last_x):
+            last_x = d[2]
+            lines.append(d)
+    return lines
+
+
+def get_frontier(df, metric):
+    lines = create_pointset(df.values.tolist(), "k-nn", metric)
+    return pd.DataFrame(lines, columns=df.columns)
+
+
 def convert_json_to_csv_search(dataset, dataset_path):
     for file, algo_name, df in read_file(dataset, dataset_path, "search"):
         try:
@@ -100,14 +142,22 @@ def convert_json_to_csv_search(dataset, dataset_path):
             )
             algo_name = algo_name.replace("_base", "")
             df["name"] = df["name"].str.split("/").str[0]
-            write = pd.DataFrame(
-                {
-                    "algo_name": [algo_name] * len(df),
-                    "index_name": df["name"],
-                    "recall": df["Recall"],
-                    "qps": df["items_per_second"],
-                }
-            )
+            try:
+                write = pd.DataFrame(
+                    {
+                        "algo_name": [algo_name] * len(df),
+                        "index_name": df["name"],
+                        "recall": df["Recall"],
+                        "throughput": df["items_per_second"],
+                        "latency": df["Latency"],
+                    }
+                )
+            except Exception as e:
+                print(
+                    "Search file %s (%s) missing a key. Skipping..."
+                    % (file, e)
+                )
+                continue
             for name in df:
                 if name not in skip_search_cols:
                     write[name] = df[name]
@@ -120,20 +170,29 @@ def convert_json_to_csv_search(dataset, dataset_path):
                 write["build cpu_time"] = None
                 write["build GPU"] = None
 
-                for col_idx in range(6, len(build_df.columns)):
-                    col_name = build_df.columns[col_idx]
-                    write[col_name] = None
-
-                for s_index, search_row in write.iterrows():
-                    for b_index, build_row in build_df.iterrows():
-                        if search_row["index_name"] == build_row["index_name"]:
-                            write.iloc[s_index, write_ncols] = build_df.iloc[
-                                b_index, 2
-                            ]
-                            write.iloc[
-                                s_index, write_ncols + 1 :
-                            ] = build_df.iloc[b_index, 3:]
-                            break
+                try:
+                    for col_idx in range(6, len(build_df.columns)):
+                        col_name = build_df.columns[col_idx]
+                        write[col_name] = None
+
+                    for s_index, search_row in write.iterrows():
+                        for b_index, build_row in build_df.iterrows():
+                            if (
+                                search_row["index_name"]
+                                == build_row["index_name"]
+                            ):
+                                write.iloc[
+                                    s_index, write_ncols
+                                ] = build_df.iloc[b_index, 2]
+                                write.iloc[
+                                    s_index, write_ncols + 1 :
+                                ] = build_df.iloc[b_index, 3:]
+                                break
+                except Exception as e:
+                    print(
+                        "Build file %s (%s) missing a key. Skipping..."
+                        % (build_file, e)
+                    )
             else:
                 warnings.warn(
                     f"Build CSV not found for {algo_name}, "
@@ -141,7 +200,13 @@ def convert_json_to_csv_search(dataset, dataset_path):
                     "appended in the Search CSV"
                 )
 
-            write.to_csv(file.replace(".json", ".csv"), index=False)
+            write.to_csv(file.replace(".json", "_raw.csv"), index=False)
+            throughput = get_frontier(write, "throughput")
+            throughput.to_csv(
+                file.replace(".json", "_throughput.csv"), index=False
+            )
+            latency = get_frontier(write, "latency")
+            latency.to_csv(file.replace(".json", "_latency.csv"), index=False)
         except Exception as e:
             print(
                 "An error occurred processing file %s (%s). Skipping..."
                 % (file, e)
            )
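The frontier rule that `create_pointset` and `get_frontier` implement is easiest to see in isolation. The following is a minimal, self-contained sketch (the toy data and the name `toy_frontier` are illustrative, not part of the module): sort rows from best to worst on the chosen metric, then keep a row only if it strictly improves recall over every row already kept.

```python
import pandas as pd

# Toy search results: four hypothetical index configurations of one algorithm.
rows = pd.DataFrame(
    {
        "algo_name": ["algo"] * 4,
        "index_name": ["i1", "i2", "i3", "i4"],
        "recall": [0.80, 0.78, 0.90, 0.95],
        "throughput": [5000.0, 4500.0, 4000.0, 1000.0],
        "latency": [0.002, 0.003, 0.004, 0.010],
    }
)


def toy_frontier(df, metric):
    # Higher is better for throughput; lower is better for latency.
    best_first = df.sort_values(metric, ascending=(metric == "latency"))
    kept, best_recall = [], float("-inf")
    for _, row in best_first.iterrows():
        # Keep a row only if it strictly improves recall over every row
        # kept so far (the `comparator` step in create_pointset).
        if row["recall"] > best_recall:
            best_recall = row["recall"]
            kept.append(row)
    return pd.DataFrame(kept)


print(toy_frontier(rows, "throughput"))  # keeps i1, i3, i4
```

Here `i2` is dominated (lower recall and lower throughput than `i1`), so it survives only into the `raw` CSV, while `i1`, `i3`, and `i4` appear in the `throughput` frontier CSV.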
diff --git a/python/raft-ann-bench/src/raft-ann-bench/plot/__main__.py b/python/raft-ann-bench/src/raft-ann-bench/plot/__main__.py
index c45ff5b14e..8bd54170c9 100644
--- a/python/raft-ann-bench/src/raft-ann-bench/plot/__main__.py
+++ b/python/raft-ann-bench/src/raft-ann-bench/plot/__main__.py
@@ -38,10 +38,14 @@
         "worst": float("-inf"),
         "lim": [0.0, 1.03],
     },
-    "qps": {
+    "throughput": {
         "description": "Queries per second (1/s)",
         "worst": float("-inf"),
     },
+    "latency": {
+        "description": "Search Latency (s)",
+        "worst": float("inf"),
+    },
 }
 
 
@@ -98,53 +102,20 @@ def create_linestyles(unique_algorithms):
     )
 
 
-def get_up_down(metric):
-    if metric["worst"] == float("inf"):
-        return "down"
-    return "up"
-
-
-def get_left_right(metric):
-    if metric["worst"] == float("inf"):
-        return "left"
-    return "right"
-
-
-def create_pointset(data, xn, yn):
-    xm, ym = (metrics[xn], metrics[yn])
-    rev_y = -1 if ym["worst"] < 0 else 1
-    rev_x = -1 if xm["worst"] < 0 else 1
-    data.sort(key=lambda t: (rev_y * t[-1], rev_x * t[-2]))
-
-    axs, ays, als, aidxs = [], [], [], []
-    # Generate Pareto frontier
-    xs, ys, ls, idxs = [], [], [], []
-    last_x = xm["worst"]
-    comparator = (
-        (lambda xv, lx: xv > lx) if last_x < 0 else (lambda xv, lx: xv < lx)
-    )
-    for algo_name, index_name, xv, yv in data:
-        if not xv or not yv:
-            continue
-        axs.append(xv)
-        ays.append(yv)
-        als.append(algo_name)
-        aidxs.append(algo_name)
-        if comparator(xv, last_x):
-            last_x = xv
-            xs.append(xv)
-            ys.append(yv)
-            ls.append(algo_name)
-            idxs.append(index_name)
-    return xs, ys, ls, idxs, axs, ays, als, aidxs
-
-
 def create_plot_search(
-    all_data, raw, x_scale, y_scale, fn_out, linestyles, dataset, k, batch_size
+    all_data,
+    x_scale,
+    y_scale,
+    fn_out,
+    linestyles,
+    dataset,
+    k,
+    batch_size,
+    mode,
+    time_unit,
 ):
     xn = "k-nn"
-    yn = "qps"
-    xm, ym = (metrics[xn], metrics[yn])
+    xm, ym = (metrics[xn], metrics[mode])
     # Now generate each plot
     handles = []
     labels = []
@@ -152,17 +123,15 @@ def create_plot_search(
 
     # Sorting by mean y-value helps aligning plots with labels
     def mean_y(algo):
-        xs, ys, ls, idxs, axs, ays, als, aidxs = create_pointset(
-            all_data[algo], xn, yn
-        )
-        return -np.log(np.array(ys)).mean()
+        points = np.array(all_data[algo], dtype=object)
+        return -np.log(np.array(points[:, 3], dtype=np.float32)).mean()
 
     # Find range for logit x-scale
     min_x, max_x = 1, 0
     for algo in sorted(all_data.keys(), key=mean_y):
-        xs, ys, ls, idxs, axs, ays, als, aidxs = create_pointset(
-            all_data[algo], xn, yn
-        )
+        points = np.array(all_data[algo], dtype=object)
+        xs = points[:, 2]
+        ys = points[:, 3]
         min_x = min([min_x] + [x for x in xs if x > 0])
         max_x = max([max_x] + [x for x in xs if x < 1])
         color, faded, linestyle, marker = linestyles[algo]
@@ -178,23 +147,15 @@ def mean_y(algo):
             marker=marker,
         )
         handles.append(handle)
-        if raw:
-            (handle2,) = plt.plot(
-                axs,
-                ays,
-                "-",
-                label=algo,
-                color=faded,
-                ms=5,
-                mew=2,
-                lw=2,
-                marker=marker,
-            )
+
         labels.append(algo)
 
     ax = plt.gca()
-    ax.set_ylabel(ym["description"])
-    ax.set_xlabel(xm["description"])
+    y_description = ym["description"]
+    if mode == "latency":
+        y_description = y_description.replace("(s)", f"({time_unit})")
+    ax.set_ylabel(y_description)
+    ax.set_xlabel("Recall")
     # Custom scales of the type --x-scale a3
     if x_scale[0] == "a":
         alpha = float(x_scale[1:])
@@ -250,10 +211,8 @@ def inv_fun(x):
 
 
 def create_plot_build(
-    build_results, search_results, linestyles, fn_out, dataset, k, batch_size
+    build_results, search_results, linestyles, fn_out, dataset
 ):
-    xn = "k-nn"
-    yn = "qps"
     qps_85 = [-1] * len(linestyles)
     bt_85 = [0] * len(linestyles)
 
@@ -271,16 +230,17 @@ def create_plot_build(
     colors = OrderedDict()
 
     # Sorting by mean y-value helps aligning plots with labels
+
     def mean_y(algo):
-        xs, ys, ls, idxs, axs, ays, als, aidxs = create_pointset(
-            search_results[algo], xn, yn
-        )
-        return -np.log(np.array(ys)).mean()
+        points = np.array(search_results[algo], dtype=object)
+        return -np.log(np.array(points[:, 3], dtype=np.float32)).mean()
 
     for pos, algo in enumerate(sorted(search_results.keys(), key=mean_y)):
-        xs, ys, ls, idxs, axs, ays, als, aidxs = create_pointset(
-            search_results[algo], xn, yn
-        )
+        points = np.array(search_results[algo], dtype=object)
+        xs = points[:, 2]
+        ys = points[:, 3]
+        ls = points[:, 0]
+        idxs = points[:, 1]
         # x is recall, y is qps, ls is algo_name, idxs is index_name
         for i in range(len(xs)):
             if xs[i] >= 0.85 and xs[i] < 0.9 and ys[i] > qps_85[pos]:
@@ -311,11 +271,11 @@ def mean_y(algo):
     fig.savefig(fn_out)
 
 
-def load_lines(results_path, result_files, method, index_key):
+def load_lines(results_path, result_files, method, index_key, mode, time_unit):
     results = dict()
 
     for result_filename in result_files:
-        if result_filename.endswith(".csv"):
+        try:
             with open(os.path.join(results_path, result_filename), "r") as f:
                 lines = f.readlines()
                 lines = lines[:-1] if lines[-1] == "\n" else lines
@@ -323,7 +283,8 @@ def load_lines(results_path, result_files, method, index_key):
                 if method == "build":
                     key_idx = [2]
                 elif method == "search":
-                    key_idx = [2, 3]
+                    y_idx = 3 if mode == "throughput" else 4
+                    key_idx = [2, y_idx]
 
                 for line in lines[1:]:
                     split_lines = line.split(",")
@@ -340,7 +301,22 @@ def load_lines(results_path, result_files, method, index_key):
                         to_add = [algo_name, index_name]
                         for key_i in key_idx:
                             to_add.append(float(split_lines[key_i]))
+                        if (
+                            mode == "latency"
+                            and time_unit != "s"
+                            and method == "search"
+                        ):
+                            to_add[-1] = (
+                                to_add[-1] * (10**3)
+                                if time_unit == "ms"
+                                else to_add[-1] * (10**6)
+                            )
                         results[dict_key].append(to_add)
+        except Exception:
+            print(
+                f"An error occurred processing file {result_filename}. "
+                "Skipping..."
+            )
 
     return results
 
 
@@ -354,12 +330,31 @@ def load_all_results(
     batch_size,
     method,
     index_key,
+    raw,
+    mode,
+    time_unit,
 ):
     results_path = os.path.join(dataset_path, "result", method)
     result_files = os.listdir(results_path)
-    result_files = [
-        result_file for result_file in result_files if ".csv" in result_file
-    ]
+    if method == "build":
+        result_files = [
+            result_file
+            for result_file in result_files
+            if ".csv" in result_file
+        ]
+    elif method == "search":
+        if raw:
+            suffix = "_raw"
+        else:
+            suffix = f"_{mode}"
+        result_files = [
+            result_file
+            for result_file in result_files
+            if f"{suffix}.csv" in result_file
+        ]
+    if len(result_files) == 0:
+        raise FileNotFoundError(f"No CSV result files found in {results_path}")
+
     if method == "search":
         result_files = [
             result_filename
@@ -407,7 +402,9 @@ def load_all_results(
         final_results = final_results + final_algo_groups
 
     final_results = set(final_results)
-    results = load_lines(results_path, final_results, method, index_key)
+    results = load_lines(
+        results_path, final_results, method, index_key, mode, time_unit
+    )
 
     return results
@@ -481,9 +478,21 @@ def main():
         choices=["linear", "log", "symlog", "logit"],
         default="linear",
     )
+    parser.add_argument(
+        "--mode",
+        help="search mode whose Pareto frontier is used on the y-axis",
+        choices=["throughput", "latency"],
+        default="throughput",
+    )
+    parser.add_argument(
+        "--time-unit",
+        help="time unit to plot when mode is latency",
+        choices=["s", "ms", "us"],
+        default="ms",
+    )
     parser.add_argument(
         "--raw",
-        help="Show raw results (not just Pareto frontier) in faded colours",
+        help="Show raw results (not just the Pareto frontier) of the selected mode",
         action="store_true",
     )
 
@@ -528,12 +537,14 @@ def main():
             batch_size,
             "search",
             "algo",
+            args.raw,
+            args.mode,
+            args.time_unit,
         )
         linestyles = create_linestyles(sorted(search_results.keys()))
     if search:
         create_plot_search(
             search_results,
-            args.raw,
             args.x_scale,
             args.y_scale,
             search_output_filepath,
@@ -541,6 +552,8 @@ def main():
             args.dataset,
             k,
             batch_size,
+            args.mode,
+            args.time_unit,
         )
     if build:
         build_results = load_all_results(
@@ -552,13 +565,14 @@ def main():
             batch_size,
             "build",
             "index",
+            args.raw,
+            args.mode,
+            args.time_unit,
         )
         create_plot_build(
             build_results,
             search_results,
             linestyles,
             build_output_filepath,
             args.dataset,
-            k,
-            batch_size,
         )
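Taken together, the export and plot sides compose through two small rules, restated below as a hedged sketch. The helper names `pick_suffix` and `scale_latency` are hypothetical, not functions in the modules: `load_all_results` keeps only the CSVs whose suffix matches `--raw`/`--mode`, and `load_lines` rescales latency (exported in seconds) to the requested `--time-unit`.

```python
def pick_suffix(raw, mode):
    # load_all_results keeps "_raw.csv" files when --raw is passed,
    # otherwise "_throughput.csv" or "_latency.csv" to match --mode.
    return "_raw" if raw else f"_{mode}"


def scale_latency(seconds, time_unit):
    # load_lines reads latency in seconds and multiplies by 10**3 (ms)
    # or 10**6 (us) per --time-unit; "s" is left unscaled.
    return seconds * {"s": 1, "ms": 10**3, "us": 10**6}[time_unit]


assert pick_suffix(raw=False, mode="latency") == "_latency"
assert pick_suffix(raw=True, mode="throughput") == "_raw"
assert scale_latency(0.004, "ms") == 4.0
```

So a plot invocation with `--mode latency --time-unit ms` reads only the `_latency.csv` frontier files produced by `raft-ann-bench.data_export` and labels the y-axis in milliseconds.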