Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Export Pareto frontier in raft-ann-bench.data_export #2009

Merged
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
30 changes: 21 additions & 9 deletions docs/source/raft_ann_benchmarks.md
Original file line number Diff line number Diff line change
Expand Up @@ -198,27 +198,33 @@ options:
--dataset-path DATASET_PATH
path to dataset folder (default: ${RAPIDS_DATASET_ROOT_DIR})
```
Build statistics CSV file is stored in `<dataset-path/<dataset>/result/build/<algo-k{k}-batch_size{batch_size}.csv>`
and index search statistics CSV file in `<dataset-path/<dataset>/result/search/<algo-k{k}-batch_size{batch_size}.csv>`.
The build statistics CSV file is stored in `<dataset-path>/<dataset>/result/build/<algo_group>.csv`
and the index search statistics CSV files in `<dataset-path>/<dataset>/result/search/<algo_group>-k{k}-batch_size{batch_size}_{suffix}.csv`, where `suffix` takes one of three values:
1. `raw`: All search results are exported
2. `throughput`: Pareto frontier of throughput results is exported
3. `latency`: Pareto frontier of latency results is exported


### Step 4: Plot Results
The script `raft-ann-bench.plot` will plot results for all algorithms found in index search statistics
CSV file in `<dataset-path/<dataset>/result/search/<-k{k}-batch_size{batch_size}>.csv`.
CSV files `<dataset-path>/<dataset>/result/search/*.csv`.

The usage of this script is:
```bash
usage: __main__.py [-h] [--dataset DATASET] [--dataset-path DATASET_PATH] [--output-filepath OUTPUT_FILEPATH] [--algorithms ALGORITHMS] [--groups GROUPS] [--algo-groups ALGO_GROUPS] [-k COUNT]
[-bs BATCH_SIZE] [--build] [--search] [--x-scale X_SCALE] [--y-scale {linear,log,symlog,logit}] [--raw]
usage: [-h] [--dataset DATASET] [--dataset-path DATASET_PATH] [--output-filepath OUTPUT_FILEPATH] [--algorithms ALGORITHMS] [--groups GROUPS] [--algo-groups ALGO_GROUPS]
[-k COUNT] [-bs BATCH_SIZE] [--build] [--search] [--x-scale X_SCALE] [--y-scale {linear,log,symlog,logit}] [--mode {throughput,latency}] [--time-unit {s,ms,us}]
[--raw]

options:
-h, --help show this help message and exit
--dataset DATASET dataset to plot (default: glove-100-inner)
--dataset-path DATASET_PATH
path to dataset folder (default: os.getcwd()/datasets/)
path to dataset folder (default: /home/coder/raft/datasets/)
--output-filepath OUTPUT_FILEPATH
directory for PNG to be saved (default: os.getcwd())
directory for PNG to be saved (default: /home/coder/raft)
--algorithms ALGORITHMS
plot only comma separated list of named algorithms. If parameters `groups` and `algo-groups are both undefined, then group `base` is plot by default (default: None)
plot only comma separated list of named algorithms. If parameters `groups` and `algo-groups are both undefined, then group `base` is plot by default
(default: None)
--groups GROUPS plot only comma separated groups of parameters (default: base)
--algo-groups ALGO_GROUPS, --algo-groups ALGO_GROUPS
add comma separated <algorithm>.<group> to plot. Example usage: "--algo-groups=raft_cagra.large,hnswlib.large" (default: None)
Expand All @@ -231,8 +237,14 @@ options:
--x-scale X_SCALE Scale to use when drawing the X-axis. Typically linear, logit or a2 (default: linear)
--y-scale {linear,log,symlog,logit}
Scale to use when drawing the Y-axis (default: linear)
--raw Show raw results (not just Pareto frontier) in faded colours (default: False)
--mode {throughput,latency}
search mode whose Pareto frontier is used on the y-axis (default: throughput)
--time-unit {s,ms,us}
time unit to plot when mode is latency (default: ms)
--raw Show raw results (not just Pareto frontier) of mode arg (default: False)
```
`mode`: plots the Pareto frontier of the `throughput` or `latency` results exported in the previous step.

`algorithms`: plots all algorithms for which results can be found for the specified `dataset`. By default, only the `base` group will be plotted.

`groups`: plots only specific groups of parameter configurations for an algorithm. Groups are defined in YAML configs (see `configuration`); by default, the `base` group is used.
Expand Down
112 changes: 88 additions & 24 deletions python/raft-ann-bench/src/raft-ann-bench/data_export/__main__.py
Original file line number Diff line number Diff line change
Expand Up @@ -43,9 +43,26 @@
)

skip_search_cols = (
set(["recall", "qps", "items_per_second", "Recall"]) | skip_build_cols
set(["recall", "qps", "latency", "items_per_second", "Recall", "Latency"])
| skip_build_cols
)

# Metadata table keyed by metric name ("k-nn", "throughput", "latency").
#
# "worst" is the least desirable value a metric can take: -inf for metrics
# where larger is better, +inf for metrics where smaller is better.
# create_pointset() reads its sign to pick the sort direction and the
# comparison used when walking the Pareto frontier.
#
# NOTE(review): "description" and "lim" are not read anywhere in this section;
# presumably they are an axis label and axis limits for plotting -- confirm
# against the plotting code before relying on that.
metrics = {
"k-nn": {
"description": "Recall",
"worst": float("-inf"),
"lim": [0.0, 1.03],
},
"throughput": {
"description": "Queries per second (1/s)",
"worst": float("-inf"),
},
"latency": {
"description": "Search Latency (s)",
"worst": float("inf"),
},
}


def read_file(dataset, dataset_path, method):
dir = os.path.join(dataset_path, dataset, "result", method)
Expand Down Expand Up @@ -92,6 +109,31 @@ def convert_json_to_csv_build(dataset, dataset_path):
traceback.print_exc()


def create_pointset(data, xn, yn):
    """Select the Pareto-frontier rows of ``data`` for the metrics ``xn``/``yn``.

    Parameters
    ----------
    data : list of sequences
        Result rows. Column 2 is read as the x value; column 3 is read as the
        y value when ``yn`` is ``"throughput"``, column 4 otherwise.
        The list is sorted IN PLACE as a side effect.
    xn, yn : str
        Keys into the module-level ``metrics`` table for the x and y metric.

    Returns
    -------
    list
        The rows (same objects as in ``data``) that strictly improve the x
        value when rows are visited from best to worst y value.

    Raises
    ------
    KeyError
        If ``xn`` or ``yn`` is not a key of ``metrics``.
    """
    x_metric = metrics[xn]
    y_metric = metrics[yn]

    # A negative "worst" (-inf) marks a larger-is-better metric; negating its
    # sort key puts the best values first in an ascending sort.
    y_sign = -1 if y_metric["worst"] < 0 else 1
    x_sign = -1 if x_metric["worst"] < 0 else 1

    y_col = 3 if yn == "throughput" else 4

    def best_first(row):
        # Primary order: best y first; ties broken by best x first.
        return (y_sign * row[y_col], x_sign * row[2])

    data.sort(key=best_first)

    # Start from the worst possible x so the first row is always accepted.
    best_x = x_metric["worst"]
    larger_is_better = best_x < 0

    frontier = []
    for row in data:
        x_val = row[2]
        if larger_is_better:
            improves = x_val > best_x
        else:
            improves = x_val < best_x
        if improves:
            best_x = x_val
            frontier.append(row)
    return frontier


def get_frontier(df, metric):
    """Return a DataFrame holding only the Pareto-frontier rows of ``df``.

    The frontier is computed by ``create_pointset`` with ``"k-nn"`` as the
    x metric and ``metric`` as the y metric. The result keeps the column
    labels of ``df``; ``df`` itself is not modified, because the rows are
    copied into a plain list first.
    """
    rows = df.values.tolist()
    frontier_rows = create_pointset(rows, "k-nn", metric)
    return pd.DataFrame(frontier_rows, columns=df.columns)


def convert_json_to_csv_search(dataset, dataset_path):
for file, algo_name, df in read_file(dataset, dataset_path, "search"):
try:
Expand All @@ -100,14 +142,21 @@ def convert_json_to_csv_search(dataset, dataset_path):
)
algo_name = algo_name.replace("_base", "")
df["name"] = df["name"].str.split("/").str[0]
write = pd.DataFrame(
{
"algo_name": [algo_name] * len(df),
"index_name": df["name"],
"recall": df["Recall"],
"qps": df["items_per_second"],
}
)
try:
write = pd.DataFrame(
{
"algo_name": [algo_name] * len(df),
"index_name": df["name"],
"recall": df["Recall"],
"throughput": df["items_per_second"],
"latency": df["Latency"],
}
)
except Exception as e:
print(
"Search file %s (%s) missing a key. Skipping..."
% (file, e)
)
for name in df:
if name not in skip_search_cols:
write[name] = df[name]
Expand All @@ -120,28 +169,43 @@ def convert_json_to_csv_search(dataset, dataset_path):
write["build cpu_time"] = None
write["build GPU"] = None

for col_idx in range(6, len(build_df.columns)):
col_name = build_df.columns[col_idx]
write[col_name] = None

for s_index, search_row in write.iterrows():
for b_index, build_row in build_df.iterrows():
if search_row["index_name"] == build_row["index_name"]:
write.iloc[s_index, write_ncols] = build_df.iloc[
b_index, 2
]
write.iloc[
s_index, write_ncols + 1 :
] = build_df.iloc[b_index, 3:]
break
try:
for col_idx in range(6, len(build_df.columns)):
col_name = build_df.columns[col_idx]
write[col_name] = None

for s_index, search_row in write.iterrows():
for b_index, build_row in build_df.iterrows():
if (
search_row["index_name"]
== build_row["index_name"]
):
write.iloc[
s_index, write_ncols
] = build_df.iloc[b_index, 2]
write.iloc[
s_index, write_ncols + 1 :
] = build_df.iloc[b_index, 3:]
break
except Exception as e:
print(
"Build file %s (%s) missing a key. Skipping..."
% (build_file, e)
)
else:
warnings.warn(
f"Build CSV not found for {algo_name}, "
f"build params won't be "
"appended in the Search CSV"
)

write.to_csv(file.replace(".json", ".csv"), index=False)
write.to_csv(file.replace(".json", "_raw.csv"), index=False)
throughput = get_frontier(write, "throughput")
throughput.to_csv(
file.replace(".json", "_throughput.csv"), index=False
)
latency = get_frontier(write, "latency")
latency.to_csv(file.replace(".json", "_latency.csv"), index=False)
except Exception as e:
print(
"An error occurred processing file %s (%s). Skipping..."
Expand Down
Loading