Export Pareto frontier in raft-ann-bench.data_export (#2009)
This PR now exports 3 CSVs from each search result JSON file, distinguished by suffix:
1. `raw`: All results
2. `throughput`: Pareto frontier of throughput results
3. `latency`: Pareto frontier of latency results

The Pareto frontier is no longer computed in `raft-ann-bench.plot`.
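
For illustration (hypothetical paths following the naming scheme documented below, with `k=10` and `batch_size=1`), a single search result JSON now yields:

```
<dataset-path>/<dataset>/result/search/<algo_group>-k10-batch_size1_raw.csv
<dataset-path>/<dataset>/result/search/<algo_group>-k10-batch_size1_throughput.csv
<dataset-path>/<dataset>/result/search/<algo_group>-k10-batch_size1_latency.csv
```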

Authors:
  - Divye Gala (https://github.com/divyegala)

Approvers:
  - Corey J. Nolet (https://github.com/cjnolet)

URL: #2009
divyegala authored Nov 18, 2023
1 parent bf16c50 commit bdc5117
Showing 3 changed files with 210 additions and 120 deletions.
30 changes: 21 additions & 9 deletions docs/source/raft_ann_benchmarks.md
@@ -198,27 +198,33 @@ options:
  --dataset-path DATASET_PATH
                        path to dataset folder (default: ${RAPIDS_DATASET_ROOT_DIR})
```
-Build statistics CSV file is stored in `<dataset-path/<dataset>/result/build/<algo-k{k}-batch_size{batch_size}.csv>`
-and index search statistics CSV file in `<dataset-path/<dataset>/result/search/<algo-k{k}-batch_size{batch_size}.csv>`.
+Build statistics CSV file is stored in `<dataset-path/<dataset>/result/build/<algo_group.csv>`
+and index search statistics CSV file in `<dataset-path/<dataset>/result/search/<algo_group-k{k}-batch_size{batch_size}_{suffix}.csv>`, where suffix has three values:
+1. `raw`: All search results are exported
+2. `throughput`: Pareto frontier of throughput results is exported
+3. `latency`: Pareto frontier of latency results is exported
### Step 4: Plot Results
The script `raft-ann-bench.plot` will plot results for all algorithms found in index search statistics
-CSV file in `<dataset-path/<dataset>/result/search/<-k{k}-batch_size{batch_size}>.csv`.
+CSV files `<dataset-path/<dataset>/result/search/*.csv`.
The usage of this script is:
```bash
-usage: __main__.py [-h] [--dataset DATASET] [--dataset-path DATASET_PATH] [--output-filepath OUTPUT_FILEPATH] [--algorithms ALGORITHMS] [--groups GROUPS] [--algo-groups ALGO_GROUPS] [-k COUNT]
-                   [-bs BATCH_SIZE] [--build] [--search] [--x-scale X_SCALE] [--y-scale {linear,log,symlog,logit}] [--raw]
+usage: [-h] [--dataset DATASET] [--dataset-path DATASET_PATH] [--output-filepath OUTPUT_FILEPATH] [--algorithms ALGORITHMS] [--groups GROUPS] [--algo-groups ALGO_GROUPS]
+       [-k COUNT] [-bs BATCH_SIZE] [--build] [--search] [--x-scale X_SCALE] [--y-scale {linear,log,symlog,logit}] [--mode {throughput,latency}] [--time-unit {s,ms,us}]
+       [--raw]
options:
  -h, --help            show this help message and exit
  --dataset DATASET     dataset to plot (default: glove-100-inner)
  --dataset-path DATASET_PATH
-                        path to dataset folder (default: os.getcwd()/datasets/)
+                        path to dataset folder (default: /home/coder/raft/datasets/)
  --output-filepath OUTPUT_FILEPATH
-                        directory for PNG to be saved (default: os.getcwd())
+                        directory for PNG to be saved (default: /home/coder/raft)
  --algorithms ALGORITHMS
-                        plot only comma separated list of named algorithms. If parameters `groups` and `algo-groups are both undefined, then group `base` is plot by default (default: None)
+                        plot only comma separated list of named algorithms. If parameters `groups` and `algo-groups are both undefined, then group `base` is plot by default
+                        (default: None)
  --groups GROUPS       plot only comma separated groups of parameters (default: base)
  --algo-groups ALGO_GROUPS
                        add comma separated <algorithm>.<group> to plot. Example usage: "--algo-groups=raft_cagra.large,hnswlib.large" (default: None)
@@ -231,8 +237,14 @@ options:
  --x-scale X_SCALE     Scale to use when drawing the X-axis. Typically linear, logit or a2 (default: linear)
  --y-scale {linear,log,symlog,logit}
                        Scale to use when drawing the Y-axis (default: linear)
-  --raw                 Show raw results (not just Pareto frontier) in faded colours (default: False)
+  --mode {throughput,latency}
+                        search mode whose Pareto frontier is used on the y-axis (default: throughput)
+  --time-unit {s,ms,us}
+                        time unit to plot when mode is latency (default: ms)
+  --raw                 Show raw results (not just Pareto frontier) of mode arg (default: False)
```
+`mode`: plots pareto frontier of `throughput` or `latency` results exported in the previous step
`algorithms`: plots all algorithms that it can find results for the specified `dataset`. By default, only `base` group will be plotted.
`groups`: plot only specific groups of parameters configurations for an algorithm. Groups are defined in YAML configs (see `configuration`), and by default run `base` group
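
As a usage illustration (a hypothetical invocation; the dataset name and paths are placeholders, and the flags follow the help text above), plotting the latency Pareto frontier in milliseconds would look like:

```bash
# Hypothetical example: plot the latency Pareto frontier exported in Step 3,
# with latencies shown in milliseconds.
python -m raft-ann-bench.plot --dataset glove-100-inner \
                              --dataset-path datasets/ \
                              --mode latency --time-unit ms
```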
112 changes: 88 additions & 24 deletions python/raft-ann-bench/src/raft-ann-bench/data_export/__main__.py
@@ -43,9 +43,26 @@
)

skip_search_cols = (
set(["recall", "qps", "items_per_second", "Recall"]) | skip_build_cols
set(["recall", "qps", "latency", "items_per_second", "Recall", "Latency"])
| skip_build_cols
)

+metrics = {
+    "k-nn": {
+        "description": "Recall",
+        "worst": float("-inf"),
+        "lim": [0.0, 1.03],
+    },
+    "throughput": {
+        "description": "Queries per second (1/s)",
+        "worst": float("-inf"),
+    },
+    "latency": {
+        "description": "Search Latency (s)",
+        "worst": float("inf"),
+    },
+}
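
An editorial aside, not part of the diff: the sign of each metric's `worst` sentinel encodes its optimization direction, which `create_pointset` below turns into a sort order and a dominance comparator. A minimal sketch of that convention:

```python
# Sketch only: -inf marks "larger is better" (recall, throughput),
# +inf marks "smaller is better" (latency).
worst = {
    "k-nn": float("-inf"),
    "throughput": float("-inf"),
    "latency": float("inf"),
}

def direction(metric: str) -> str:
    # Mirrors the rev_x / rev_y sign checks in create_pointset below.
    return "maximize" if worst[metric] < 0 else "minimize"

assert direction("throughput") == "maximize"
assert direction("latency") == "minimize"
```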


def read_file(dataset, dataset_path, method):
    dir = os.path.join(dataset_path, dataset, "result", method)
@@ -92,6 +109,31 @@ def convert_json_to_csv_build(dataset, dataset_path):
        traceback.print_exc()


+def create_pointset(data, xn, yn):
+    xm, ym = (metrics[xn], metrics[yn])
+    rev_y = -1 if ym["worst"] < 0 else 1
+    rev_x = -1 if xm["worst"] < 0 else 1
+
+    y_idx = 3 if yn == "throughput" else 4
+    data.sort(key=lambda t: (rev_y * t[y_idx], rev_x * t[2]))
+
+    lines = []
+    last_x = xm["worst"]
+    comparator = (
+        (lambda xv, lx: xv > lx) if last_x < 0 else (lambda xv, lx: xv < lx)
+    )
+    for d in data:
+        if comparator(d[2], last_x):
+            last_x = d[2]
+            lines.append(d)
+    return lines
+
+
+def get_frontier(df, metric):
+    lines = create_pointset(df.values.tolist(), "k-nn", metric)
+    return pd.DataFrame(lines, columns=df.columns)
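
To see what the frontier extraction keeps, here is an illustration with toy data (not part of the diff; `throughput_frontier` is a hypothetical, simplified equivalent of `get_frontier(df, "throughput")` that ignores tie-breaking on recall):

```python
import pandas as pd

def throughput_frontier(df: pd.DataFrame) -> pd.DataFrame:
    # Scan points in descending throughput order and keep each one that
    # strictly improves recall -- the same dominance rule create_pointset
    # applies via its sort key and comparator.
    best_recall = float("-inf")
    keep = []
    for row in df.sort_values("throughput", ascending=False).itertuples():
        if row.recall > best_recall:
            best_recall = row.recall
            keep.append(row.Index)
    return df.loc[keep]

df = pd.DataFrame(
    {
        "algo_name": ["algo"] * 3,
        "index_name": ["a", "b", "c"],
        "recall": [0.80, 0.90, 0.75],
        "throughput": [1000.0, 400.0, 700.0],
        "latency": [0.001, 0.004, 0.002],
    }
)

# "c" is dominated ("a" has both higher recall and higher throughput),
# so only "a" and "b" remain on the throughput frontier.
print(throughput_frontier(df))
```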


def convert_json_to_csv_search(dataset, dataset_path):
    for file, algo_name, df in read_file(dataset, dataset_path, "search"):
        try:
@@ -100,14 +142,21 @@ def convert_json_to_csv_search(dataset, dataset_path):
            )
            algo_name = algo_name.replace("_base", "")
            df["name"] = df["name"].str.split("/").str[0]
-            write = pd.DataFrame(
-                {
-                    "algo_name": [algo_name] * len(df),
-                    "index_name": df["name"],
-                    "recall": df["Recall"],
-                    "qps": df["items_per_second"],
-                }
-            )
+            try:
+                write = pd.DataFrame(
+                    {
+                        "algo_name": [algo_name] * len(df),
+                        "index_name": df["name"],
+                        "recall": df["Recall"],
+                        "throughput": df["items_per_second"],
+                        "latency": df["Latency"],
+                    }
+                )
+            except Exception as e:
+                print(
+                    "Search file %s (%s) missing a key. Skipping..."
+                    % (file, e)
+                )
            for name in df:
                if name not in skip_search_cols:
                    write[name] = df[name]
@@ -120,28 +169,43 @@ def convert_json_to_csv_search(dataset, dataset_path):
write["build cpu_time"] = None
write["build GPU"] = None

-                for col_idx in range(6, len(build_df.columns)):
-                    col_name = build_df.columns[col_idx]
-                    write[col_name] = None
-
-                for s_index, search_row in write.iterrows():
-                    for b_index, build_row in build_df.iterrows():
-                        if search_row["index_name"] == build_row["index_name"]:
-                            write.iloc[s_index, write_ncols] = build_df.iloc[
-                                b_index, 2
-                            ]
-                            write.iloc[
-                                s_index, write_ncols + 1 :
-                            ] = build_df.iloc[b_index, 3:]
-                            break
+                try:
+                    for col_idx in range(6, len(build_df.columns)):
+                        col_name = build_df.columns[col_idx]
+                        write[col_name] = None
+
+                    for s_index, search_row in write.iterrows():
+                        for b_index, build_row in build_df.iterrows():
+                            if (
+                                search_row["index_name"]
+                                == build_row["index_name"]
+                            ):
+                                write.iloc[
+                                    s_index, write_ncols
+                                ] = build_df.iloc[b_index, 2]
+                                write.iloc[
+                                    s_index, write_ncols + 1 :
+                                ] = build_df.iloc[b_index, 3:]
+                                break
+                except Exception as e:
+                    print(
+                        "Build file %s (%s) missing a key. Skipping..."
+                        % (build_file, e)
+                    )
            else:
                warnings.warn(
                    f"Build CSV not found for {algo_name}, "
                    f"build params won't be "
                    "appended in the Search CSV"
                )

write.to_csv(file.replace(".json", ".csv"), index=False)
write.to_csv(file.replace(".json", "_raw.csv"), index=False)
throughput = get_frontier(write, "throughput")
throughput.to_csv(
file.replace(".json", "_throughput.csv"), index=False
)
latency = get_frontier(write, "latency")
latency.to_csv(file.replace(".json", "_latency.csv"), index=False)
        except Exception as e:
            print(
                "An error occurred processing file %s (%s). Skipping..."
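Taken together, a hypothetical end-to-end flow (dataset name and paths are placeholders; flags as documented in the diff above) would be:

```bash
# Export the raw results plus the throughput and latency Pareto frontiers...
python -m raft-ann-bench.data_export --dataset glove-100-inner --dataset-path datasets/
# ...then plot against the throughput frontier (the default --mode).
python -m raft-ann-bench.plot --dataset glove-100-inner --dataset-path datasets/
```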
