Commit c48476b

pareto frontier in data_export

divyegala committed Nov 17, 2023
1 parent 7754040 commit c48476b
Showing 3 changed files with 145 additions and 94 deletions.
27 changes: 18 additions & 9 deletions docs/source/raft_ann_benchmarks.md
@@ -198,27 +198,32 @@ options:
```bash
--dataset-path DATASET_PATH
path to dataset folder (default: ${RAPIDS_DATASET_ROOT_DIR})
```
Build statistics CSV file is stored in `<dataset-path>/<dataset>/result/build/<algo>-k{k}-batch_size{batch_size}.csv`
and index search statistics CSV file in `<dataset-path>/<dataset>/result/search/<algo>-k{k}-batch_size{batch_size}.csv`.
Build statistics CSV file is stored in `<dataset-path>/<dataset>/result/build/<algo_group>.csv`
and index search statistics CSV file in `<dataset-path>/<dataset>/result/search/<algo_group>-k{k}-batch_size{batch_size}_{suffix}.csv`, where `suffix` takes one of three values (see the example paths below):
1. `raw`: all search results are exported
2. `throughput`: the Pareto frontier of throughput results is exported
3. `latency`: the Pareto frontier of latency results is exported
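For a hypothetical run with `k=10` and `batch_size=10000`, the export step would therefore produce paths like:
```bash
# Illustrative paths only; <algo_group> depends on the algorithm and
# configuration group that was benchmarked.
<dataset-path>/<dataset>/result/search/<algo_group>-k10-batch_size10000_raw.csv
<dataset-path>/<dataset>/result/search/<algo_group>-k10-batch_size10000_throughput.csv
<dataset-path>/<dataset>/result/search/<algo_group>-k10-batch_size10000_latency.csv
```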
### Step 4: Plot Results
The script `raft-ann-bench.plot` will plot results for all algorithms found in index search statistics
CSV file in `<dataset-path>/<dataset>/result/search/<algo>-k{k}-batch_size{batch_size}.csv`.
CSV files `<dataset-path>/<dataset>/result/search/*.csv`.
The usage of this script is:
```bash
usage: __main__.py [-h] [--dataset DATASET] [--dataset-path DATASET_PATH] [--output-filepath OUTPUT_FILEPATH] [--algorithms ALGORITHMS] [--groups GROUPS] [--algo-groups ALGO_GROUPS] [-k COUNT]
[-bs BATCH_SIZE] [--build] [--search] [--x-scale X_SCALE] [--y-scale {linear,log,symlog,logit}] [--raw]
usage: [-h] [--dataset DATASET] [--dataset-path DATASET_PATH] [--output-filepath OUTPUT_FILEPATH] [--algorithms ALGORITHMS] [--groups GROUPS] [--algo-groups ALGO_GROUPS]
[-k COUNT] [-bs BATCH_SIZE] [--build] [--search] [--x-scale X_SCALE] [--y-scale {linear,log,symlog,logit}] [--mode {throughput,latency}] [--raw]
options:
-h, --help show this help message and exit
--dataset DATASET dataset to plot (default: glove-100-inner)
--dataset-path DATASET_PATH
path to dataset folder (default: os.getcwd()/datasets/)
path to dataset folder (default: /home/coder/raft/datasets/)
--output-filepath OUTPUT_FILEPATH
directory for PNG to be saved (default: os.getcwd())
directory for PNG to be saved (default: /home/coder/raft)
--algorithms ALGORITHMS
plot only comma separated list of named algorithms. If parameters `groups` and `algo-groups` are both undefined, then group `base` is plotted by default
(default: None)
--groups GROUPS plot only comma separated groups of parameters (default: base)
--algo-groups ALGO_GROUPS
add comma separated <algorithm>.<group> to plot. Example usage: "--algo-groups=raft_cagra.large,hnswlib.large" (default: None)
@@ -231,8 +236,12 @@ options:
--x-scale X_SCALE Scale to use when drawing the X-axis. Typically linear, logit or a2 (default: linear)
--y-scale {linear,log,symlog,logit}
Scale to use when drawing the Y-axis (default: linear)
--raw Show raw results (not just Pareto frontier) in faded colours (default: False)
--mode {throughput,latency}
metric whose Pareto frontier is used on the y-axis (default: throughput)
--raw Show raw results (not just Pareto frontier) of metric arg (default: False)
```
`mode`: plots the Pareto frontier of the `throughput` or `latency` results exported in the previous step.
`algorithms`: plots all algorithms for which results are found for the specified `dataset`. By default, only the `base` group will be plotted.
`groups`: plots only the specified groups of parameter configurations for an algorithm. Groups are defined in YAML configs (see `configuration`); by default the `base` group is run.
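For example, a hypothetical invocation that plots the latency Pareto frontier for two algorithms, with raw results shown in faded colours (the `python -m raft-ann-bench.plot` spelling is assumed from the script name above; the flags come from the usage text):
```bash
# Hypothetical example; all flag values are illustrative.
python -m raft-ann-bench.plot --dataset glove-100-inner \
    --algorithms raft_cagra,hnswlib --mode latency --raw
```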
55 changes: 52 additions & 3 deletions python/raft-ann-bench/src/raft-ann-bench/data_export/__main__.py
@@ -43,9 +43,26 @@
)

skip_search_cols = (
    set(["recall", "qps", "items_per_second", "Recall"]) | skip_build_cols
    set(["recall", "qps", "latency", "items_per_second", "Recall", "Latency"])
    | skip_build_cols
)

# Metric metadata: "worst" encodes the optimization direction
# (-inf means higher is better, +inf means lower is better).
metrics = {
    "k-nn": {
        "description": "Recall",
        "worst": float("-inf"),
        "lim": [0.0, 1.03],
    },
    "throughput": {
        "description": "Queries per second (1/s)",
        "worst": float("-inf"),
    },
    "latency": {
        "description": "Search Latency (s)",
        "worst": float("inf"),
    },
}


def read_file(dataset, dataset_path, method):
    dir = os.path.join(dataset_path, dataset, "result", method)
@@ -92,6 +109,31 @@ def convert_json_to_csv_build(dataset, dataset_path):
            traceback.print_exc()


def create_pointset(data, xn, yn):
    # Compute the Pareto frontier of (x, y) = (recall, throughput|latency)
    # points. A "worst" value below zero marks a metric that is maximized,
    # so its sort order is reversed.
    xm, ym = (metrics[xn], metrics[yn])
    rev_y = -1 if ym["worst"] < 0 else 1
    rev_x = -1 if xm["worst"] < 0 else 1

    # Column 2 holds recall; throughput and latency sit in columns 3 and 4
    # of the rows produced by convert_json_to_csv_search below.
    y_idx = 3 if yn == "throughput" else 4
    # Sort best-y first, breaking ties by better recall.
    data.sort(key=lambda t: (rev_y * t[y_idx], rev_x * t[2]))

    lines = []
    last_x = xm["worst"]
    comparator = (
        (lambda xv, lx: xv > lx) if last_x < 0 else (lambda xv, lx: xv < lx)
    )
    for d in data:
        # Keep a point only if it strictly improves recall over every
        # point with a better y metric, i.e. it is not dominated.
        if comparator(d[2], last_x):
            last_x = d[2]
            lines.append(d)
    return lines


def get_frontier(df, metric):
    # Reduce a search-results dataframe to its recall/metric Pareto frontier.
    lines = create_pointset(df.values.tolist(), "k-nn", metric)
    return pd.DataFrame(lines, columns=df.columns)
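A minimal sketch (not part of the commit) of how `get_frontier` behaves, assuming the functions above are in scope; the toy rows mimic the first five columns of the `write` DataFrame built in `convert_json_to_csv_search` below, and all values are invented:
```python
import pandas as pd

toy = pd.DataFrame(
    {
        "algo_name": ["algo"] * 4,  # hypothetical values throughout
        "index_name": ["a", "b", "c", "d"],
        "recall": [0.90, 0.95, 0.80, 0.85],
        "throughput": [1000.0, 800.0, 1200.0, 900.0],
        "latency": [0.0010, 0.0020, 0.0008, 0.0011],
    }
)

# Row "d" (recall 0.85, throughput 900) is dominated by row "a"
# (recall 0.90, throughput 1000), so the throughput frontier drops it.
print(get_frontier(toy, "throughput")["index_name"].tolist())
# -> ['c', 'a', 'b'], ordered from highest throughput to highest recall
```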


def convert_json_to_csv_search(dataset, dataset_path):
    for file, algo_name, df in read_file(dataset, dataset_path, "search"):
        try:
@@ -105,7 +147,8 @@ def convert_json_to_csv_search(dataset, dataset_path):
                    "algo_name": [algo_name] * len(df),
                    "index_name": df["name"],
                    "recall": df["Recall"],
                    "qps": df["items_per_second"],
                    "throughput": df["items_per_second"],
                    "latency": df["Latency"],
                }
            )
            for name in df:
@@ -141,7 +184,13 @@ def convert_json_to_csv_search(dataset, dataset_path):
                "appended in the Search CSV"
            )

            write.to_csv(file.replace(".json", ".csv"), index=False)
            write.to_csv(file.replace(".json", "_raw.csv"), index=False)
            throughput = get_frontier(write, "throughput")
            throughput.to_csv(
                file.replace(".json", "_throughput.csv"), index=False
            )
            latency = get_frontier(write, "latency")
            latency.to_csv(file.replace(".json", "_latency.csv"), index=False)
        except Exception as e:
            print(
                "An error occurred processing file %s (%s). Skipping..."