Update raft-ann-bench output filenames and add features to plotting (#2043)

This PR:
1. Adds more clarity to filenames by using `,` as the separator instead of `_` (see the parsing sketch after this list)
2. Adds 80% and 99% recall bars to build plots
3. Does not plot a recall level in build plot if no data is present
4. Adds an `x-start` argument that controls the minimum recall level used on the x-axis of the search plot
5. Fixes an occasionally occurring multi-line issue in search plots
6. Build time plots now show the average build time of the indices corresponding to the search queries in each recall range
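As a quick illustration of the naming scheme from item 1, here is a minimal, hypothetical helper (not part of this PR) that recovers the fields of a search result filename. It assumes the `<algo>[,<group>],k<k>,bs<batch_size>[,<suffix>]` layout described in the updated docs below, with the `,<group>` field omitted for the `base` group:

```python
import os
import re

def parse_search_result_filename(path):
    """Best-effort parse of '<algo>[,<group>],k<k>,bs<batch_size>[,<suffix>]' filenames."""
    stem = os.path.splitext(os.path.basename(path))[0]
    parts = stem.split(",")
    algo = parts[0]
    # The group token is omitted for the "base" group, so sniff for the k field.
    group = "base" if re.fullmatch(r"k\d+", parts[1]) else parts[1]
    k = next(int(p[1:]) for p in parts if re.fullmatch(r"k\d+", p))
    batch_size = next(int(p[2:]) for p in parts if re.fullmatch(r"bs\d+", p))
    return algo, group, k, batch_size

# parse_search_result_filename("raft_cagra,large,k10,bs10000,throughput.csv")
# -> ("raft_cagra", "large", 10, 10000)
```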

Authors:
  - Divye Gala (https://github.com/divyegala)

Approvers:
  - Corey J. Nolet (https://github.com/cjnolet)

URL: #2043
divyegala authored Dec 21, 2023
1 parent 7e098b2 commit bae049b
Showing 4 changed files with 136 additions and 75 deletions.
13 changes: 7 additions & 6 deletions docs/source/raft_ann_benchmarks.md
@@ -171,8 +171,8 @@ options:
`algo-groups`: this parameter is helpful to append any specific algorithm+group combination to run the benchmark for in addition to all the arguments from `algorithms` and `groups`. It is of the format `<algorithm>.<group>`, or for example, `raft_cagra.large`
For every algorithm run by this script, it outputs an index build statistics JSON file in `<dataset-path/<dataset>/result/build/<algo_{group}-{k}-{batch_size}.json>`
and an index search statistics JSON file in `<dataset-path/<dataset>/result/search/<algo_{group}-{k}-{batch_size}.json>`. NOTE: The filenames will not have "_{group}" if `group = "base"`.
For every algorithm run by this script, it outputs an index build statistics JSON file in `<dataset-path/<dataset>/result/build/<{algo},{group}.json>`
and an index search statistics JSON file in `<dataset-path/<dataset>/result/search/<{algo},{group},k{k},bs{batch_size}.json>`. NOTE: The filenames will not have ",{group}" if `group = "base"`.
`dataset-path` :
1. data is read from `<dataset-path>/<dataset>`
@@ -198,8 +198,8 @@ options:
--dataset-path DATASET_PATH
path to dataset folder (default: ${RAPIDS_DATASET_ROOT_DIR})
```
Build statistics CSV file is stored in `<dataset-path/<dataset>/result/build/<algo_group.csv>`
and index search statistics CSV file in `<dataset-path/<dataset>/result/search/<algo_group-k{k}-batch_size{batch_size}_{suffix}.csv>`, where suffix has three values:
Build statistics CSV file is stored in `<dataset-path/<dataset>/result/build/<{algo},{group}.csv>`
and index search statistics CSV file in `<dataset-path/<dataset>/result/search/<{algo},{group},k{k},bs{batch_size},{suffix}.csv>`, where suffix has three values:
1. `raw`: All search results are exported
2. `throughput`: Pareto frontier of throughput results is exported
3. `latency`: Pareto frontier of latency results is exported
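As a rough illustration of the `throughput`/`latency` exports described above, a Pareto frontier over (recall, throughput) pairs can be computed as in the sketch below. This is illustrative only, not the `get_frontier` routine used by the export script, and the column names are assumptions:

```python
import pandas as pd

def pareto_frontier(df, x="recall", y="throughput"):
    """Keep only rows not dominated by any other row, i.e. no other point
    has both higher recall and higher throughput. Illustrative sketch only."""
    df = df.sort_values(x, ascending=False).reset_index(drop=True)
    best_y = float("-inf")
    keep = []
    for _, row in df.iterrows():
        if row[y] > best_y:  # the highest-recall point is always kept
            best_y = row[y]
            keep.append(row)
    return pd.DataFrame(keep)[::-1].reset_index(drop=True)

# pareto_frontier(pd.DataFrame({"recall": [0.80, 0.90, 0.95],
#                               "throughput": [5000, 7000, 3000]}))
# keeps (0.95, 3000) and (0.90, 7000); (0.80, 5000) is dominated by (0.90, 7000).
```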
@@ -212,8 +212,8 @@ CSV files `<dataset-path/<dataset>/result/search/*.csv`.
The usage of this script is:
```bash
usage: [-h] [--dataset DATASET] [--dataset-path DATASET_PATH] [--output-filepath OUTPUT_FILEPATH] [--algorithms ALGORITHMS] [--groups GROUPS] [--algo-groups ALGO_GROUPS]
[-k COUNT] [-bs BATCH_SIZE] [--build] [--search] [--x-scale X_SCALE] [--y-scale {linear,log,symlog,logit}] [--mode {throughput,latency}] [--time-unit {s,ms,us}]
[--raw]
[-k COUNT] [-bs BATCH_SIZE] [--build] [--search] [--x-scale X_SCALE] [--y-scale {linear,log,symlog,logit}] [--x-start X_START] [--mode {throughput,latency}]
[--time-unit {s,ms,us}] [--raw]
options:
-h, --help show this help message and exit
@@ -237,6 +237,7 @@ options:
--x-scale X_SCALE Scale to use when drawing the X-axis. Typically linear, logit or a2 (default: linear)
--y-scale {linear,log,symlog,logit}
Scale to use when drawing the Y-axis (default: linear)
--x-start X_START Recall values to start the x-axis from (default: 0.8)
--mode {throughput,latency}
search mode whose Pareto frontier is used on the y-axis (default: throughput)
--time-unit {s,ms,us}
34 changes: 21 additions & 13 deletions python/raft-ann-bench/src/raft-ann-bench/data_export/__main__.py
@@ -74,7 +74,9 @@ def read_file(dataset, dataset_path, method):
try:
data = json.load(f)
df = pd.DataFrame(data["benchmarks"])
yield os.path.join(dir, file), file.split("-")[0], df
filename_split = file.split(",")
algo_name = (filename_split[0], filename_split[1])
yield os.path.join(dir, file), algo_name, df
except Exception as e:
print(
"An error occurred processing file %s (%s). "
@@ -85,7 +87,10 @@ def convert_json_to_csv_build(dataset, dataset_path):
def convert_json_to_csv_build(dataset, dataset_path):
for file, algo_name, df in read_file(dataset, dataset_path, "build"):
try:
algo_name = algo_name.replace("_base", "")
if "base" in algo_name[1]:
algo_name = algo_name[0]
else:
algo_name = "_".join(algo_name)
df["name"] = df["name"].str.split("/").str[0]
write = pd.DataFrame(
{
@@ -97,12 +102,7 @@ def convert_json_to_csv_build(dataset, dataset_path):
for name in df:
if name not in skip_build_cols:
write[name] = df[name]
filepath = os.path.normpath(file).split(os.sep)
filename = filepath[-1].split("-")[0] + ".csv"
write.to_csv(
os.path.join(f"{os.sep}".join(filepath[:-1]), filename),
index=False,
)
write.to_csv(file.replace(".json", ".csv"), index=False)
except Exception as e:
print(
"An error occurred processing file %s (%s). Skipping..."
@@ -140,9 +140,17 @@ def convert_json_to_csv_search(dataset, dataset_path):
for file, algo_name, df in read_file(dataset, dataset_path, "search"):
try:
build_file = os.path.join(
dataset_path, dataset, "result", "build", f"{algo_name}.csv"
dataset_path,
dataset,
"result",
"build",
f"{','.join(algo_name)}.csv",
)
algo_name = algo_name.replace("_base", "")
print(build_file)
if "base" in algo_name[1]:
algo_name = algo_name[0]
else:
algo_name = "_".join(algo_name)
df["name"] = df["name"].str.split("/").str[0]
try:
write = pd.DataFrame(
Expand Down Expand Up @@ -201,13 +209,13 @@ def convert_json_to_csv_search(dataset, dataset_path):
"appended in the Search CSV"
)

write.to_csv(file.replace(".json", "_raw.csv"), index=False)
write.to_csv(file.replace(".json", ",raw.csv"), index=False)
throughput = get_frontier(write, "throughput")
throughput.to_csv(
file.replace(".json", "_throughput.csv"), index=False
file.replace(".json", ",throughput.csv"), index=False
)
latency = get_frontier(write, "latency")
latency.to_csv(file.replace(".json", "_latency.csv"), index=False)
latency.to_csv(file.replace(".json", ",latency.csv"), index=False)
except Exception as e:
print(
"An error occurred processing file %s (%s). Skipping..."
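The export changes above carry the `(algo, group)` pair taken from the filename and collapse the `base` group when building the displayed name. A condensed sketch of that mapping, with a hypothetical helper name:

```python
def display_name(algo: str, group: str) -> str:
    """Mirror of the naming logic in convert_json_to_csv_build/search:
    the 'base' group is dropped so labels read 'raft_cagra' rather than
    'raft_cagra_base'."""
    return algo if "base" in group else f"{algo}_{group}"

# display_name("raft_cagra", "base")  -> "raft_cagra"
# display_name("raft_cagra", "large") -> "raft_cagra_large"
```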
131 changes: 86 additions & 45 deletions python/raft-ann-bench/src/raft-ann-bench/plot/__main__.py
@@ -62,6 +62,19 @@ def positive_int(input_str: str) -> int:
return i


def positive_float(input_str: str) -> float:
try:
i = float(input_str)
if i < 0.0:
raise ValueError
except ValueError:
raise argparse.ArgumentTypeError(
f"{input_str} is not a positive float"
)

return i


def generate_n_colors(n):
vs = np.linspace(0.3, 0.9, 7)
colors = [(0.9, 0.4, 0.4, 1.0)]
@@ -113,9 +126,11 @@ def create_plot_search(
batch_size,
mode,
time_unit,
x_start,
):
xn = "k-nn"
xm, ym = (metrics[xn], metrics[mode])
xm["lim"][0] = x_start
# Now generate each plot
handles = []
labels = []
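Setting `xm["lim"][0] = x_start` above makes `--x-start` the lower bound of the recall axis. The effect is comparable to constraining the x-limit in matplotlib, as in this standalone sketch with made-up data (not the plot module's own code path):

```python
import matplotlib.pyplot as plt

recall = [0.75, 0.85, 0.92, 0.97]   # made-up search results
qps = [9000, 7000, 4000, 1500]

fig, ax = plt.subplots()
ax.plot(recall, qps, marker="o")
ax.set_xlim(left=0.8)               # like --x-start 0.8: hide points below 80% recall
ax.set_xlabel("Recall")
ax.set_ylabel("Queries per second")
fig.savefig("search_plot_sketch.png")
```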
@@ -211,20 +226,15 @@ def inv_fun(x):


def create_plot_build(
build_results, search_results, linestyles, fn_out, dataset
build_results, search_results, linestyles, fn_out, dataset, k, batch_size
):
bt_80 = [0] * len(linestyles)

qps_85 = [-1] * len(linestyles)
bt_85 = [0] * len(linestyles)
i_85 = [-1] * len(linestyles)

qps_90 = [-1] * len(linestyles)
bt_90 = [0] * len(linestyles)
i_90 = [-1] * len(linestyles)

qps_95 = [-1] * len(linestyles)
bt_95 = [0] * len(linestyles)
i_95 = [-1] * len(linestyles)

bt_99 = [0] * len(linestyles)

data = OrderedDict()
colors = OrderedDict()
@@ -237,35 +247,59 @@ def mean_y(algo):

for pos, algo in enumerate(sorted(search_results.keys(), key=mean_y)):
points = np.array(search_results[algo], dtype=object)
# x is recall, ls is algo_name, idxs is index_name
xs = points[:, 2]
ys = points[:, 3]
ls = points[:, 0]
idxs = points[:, 1]
# x is recall, y is qps, ls is algo_name, idxs is index_name

len_80, len_90, len_95, len_99 = 0, 0, 0, 0
for i in range(len(xs)):
if xs[i] >= 0.85 and xs[i] < 0.9 and ys[i] > qps_85[pos]:
qps_85[pos] = ys[i]
bt_85[pos] = build_results[(ls[i], idxs[i])][0][2]
i_85[pos] = idxs[i]
elif xs[i] >= 0.9 and xs[i] < 0.95 and ys[i] > qps_90[pos]:
qps_90[pos] = ys[i]
bt_90[pos] = build_results[(ls[i], idxs[i])][0][2]
i_90[pos] = idxs[i]
elif xs[i] >= 0.95 and ys[i] > qps_95[pos]:
qps_95[pos] = ys[i]
bt_95[pos] = build_results[(ls[i], idxs[i])][0][2]
i_95[pos] = idxs[i]
data[algo] = [bt_85[pos], bt_90[pos], bt_95[pos]]
if xs[i] >= 0.80 and xs[i] < 0.90:
bt_80[pos] = bt_80[pos] + build_results[(ls[i], idxs[i])][0][2]
len_80 = len_80 + 1
elif xs[i] >= 0.9 and xs[i] < 0.95:
bt_90[pos] = bt_90[pos] + build_results[(ls[i], idxs[i])][0][2]
len_90 = len_90 + 1
elif xs[i] >= 0.95 and xs[i] < 0.99:
bt_95[pos] = bt_95[pos] + build_results[(ls[i], idxs[i])][0][2]
len_95 = len_95 + 1
elif xs[i] >= 0.99:
bt_99[pos] = bt_99[pos] + build_results[(ls[i], idxs[i])][0][2]
len_99 = len_99 + 1
if len_80 > 0:
bt_80[pos] = bt_80[pos] / len_80
if len_90 > 0:
bt_90[pos] = bt_90[pos] / len_90
if len_95 > 0:
bt_95[pos] = bt_95[pos] / len_95
if len_99 > 0:
bt_99[pos] = bt_99[pos] / len_99
data[algo] = [
bt_80[pos],
bt_90[pos],
bt_95[pos],
bt_99[pos],
]
colors[algo] = linestyles[algo][0]

index = ["@85% Recall", "@90% Recall", "@95% Recall"]
index = [
"@80% Recall",
"@90% Recall",
"@95% Recall",
"@99% Recall",
]

df = pd.DataFrame(data, index=index)
df.replace(0.0, np.nan, inplace=True)
df = df.dropna(how="all")
plt.figure(figsize=(12, 9))
ax = df.plot.bar(rot=0, color=colors)
fig = ax.get_figure()
print(f"writing build output to {fn_out}")
plt.title("Build Time for Highest QPS")
plt.title(
"Average Build Time within Recall Range "
f"for k={k} batch_size={batch_size}"
)
plt.suptitle(f"{dataset}")
plt.ylabel("Build Time (s)")
fig.savefig(fn_out)
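The loop above buckets every search point by recall ([0.80, 0.90), [0.90, 0.95), [0.95, 0.99), ≥ 0.99), looks up the build time of the index that produced it, and averages per bucket, skipping empty buckets. A compact pandas sketch of the same idea, assuming a flat table with `recall` and `build_time` columns:

```python
import pandas as pd

def average_build_time_by_recall(points: pd.DataFrame) -> pd.Series:
    """Mean build time per recall bucket; empty buckets are dropped,
    matching the 'do not plot a recall level with no data' behaviour."""
    buckets = pd.cut(
        points["recall"],
        bins=[0.80, 0.90, 0.95, 0.99, 1.01],
        right=False,  # [0.80, 0.90), [0.90, 0.95), [0.95, 0.99), [0.99, 1.01)
        labels=["@80% Recall", "@90% Recall", "@95% Recall", "@99% Recall"],
    )
    return points.groupby(buckets, observed=True)["build_time"].mean()
```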
@@ -344,9 +378,9 @@ def load_all_results(
]
elif method == "search":
if raw:
suffix = "_raw"
suffix = ",raw"
else:
suffix = f"_{mode}"
suffix = f",{mode}"
result_files = [
result_file
for result_file in result_files
@@ -356,22 +390,20 @@
raise FileNotFoundError(f"No CSV result files found in {results_path}")

if method == "search":
result_files = [
result_filename
for result_filename in result_files
if f"{k}-{batch_size}" in result_filename
]
algo_group_files = [
result_filename.split("-")[0] for result_filename in result_files
]
else:
algo_group_files = [
result_filename for result_filename in result_files
]

for i in range(len(algo_group_files)):
algo_group = algo_group_files[i].replace(".csv", "").split("_")
algo_group_files[i] = ("_".join(algo_group[:-1]), algo_group[-1])
filter_k_bs = []
for result_filename in result_files:
filename_split = result_filename.split(",")
if (
int(filename_split[-3][1:]) == k
and int(filename_split[-2][2:]) == batch_size
):
filter_k_bs.append(result_filename)
result_files = filter_k_bs

algo_group_files = [
result_filename.replace(".csv", "").split(",")[:2]
for result_filename in result_files
]
algo_group_files = list(zip(*algo_group_files))

if len(algorithms) > 0:
@@ -478,6 +510,12 @@ def main():
choices=["linear", "log", "symlog", "logit"],
default="linear",
)
parser.add_argument(
"--x-start",
help="Recall values to start the x-axis from",
default=0.8,
type=positive_float,
)
parser.add_argument(
"--mode",
help="search mode whose Pareto frontier is used on the y-axis",
@@ -525,7 +563,7 @@
)
build_output_filepath = os.path.join(
args.output_filepath,
f"build-{args.dataset}.png",
f"build-{args.dataset}-k{k}-batch_size{batch_size}.png",
)

search_results = load_all_results(
@@ -554,6 +592,7 @@
batch_size,
args.mode,
args.time_unit,
args.x_start,
)
if build:
build_results = load_all_results(
@@ -575,6 +614,8 @@
linestyles,
build_output_filepath,
args.dataset,
k,
batch_size,
)


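Finally, the comma-separated fields also drive the new k/batch-size filter in `load_all_results` shown above. A standalone sketch of the same filter, with a hypothetical helper name:

```python
def filter_by_k_and_batch_size(result_files, k, batch_size):
    """Keep search CSVs whose trailing ',k<k>,bs<batch_size>,<suffix>' fields
    match the requested values, per the new comma-separated naming scheme."""
    kept = []
    for name in result_files:
        fields = name.replace(".csv", "").split(",")
        # fields[-3] is 'k<k>', fields[-2] is 'bs<batch_size>', fields[-1] is the suffix
        if int(fields[-3][1:]) == k and int(fields[-2][2:]) == batch_size:
            kept.append(name)
    return kept

# filter_by_k_and_batch_size(
#     ["raft_cagra,large,k10,bs10000,throughput.csv"], k=10, batch_size=10000
# ) -> ["raft_cagra,large,k10,bs10000,throughput.csv"]
```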