Commit c48476b

pareto frontier in data_export

divyegala committed Nov 17, 2023
1 parent 7754040 commit c48476b
Showing 3 changed files with 145 additions and 94 deletions.
27 changes: 18 additions & 9 deletions docs/source/raft_ann_benchmarks.md
@@ -198,27 +198,32 @@ options:
```bash
--dataset-path DATASET_PATH
path to dataset folder (default: ${RAPIDS_DATASET_ROOT_DIR})
```
Build statistics CSV file is stored in `<dataset-path>/<dataset>/result/build/<algo>-k{k}-batch_size{batch_size}.csv`
and index search statistics CSV file in `<dataset-path>/<dataset>/result/search/<algo>-k{k}-batch_size{batch_size}.csv`.
Build statistics CSV file is stored in `<dataset-path>/<dataset>/result/build/<algo_group>.csv`
and index search statistics CSV file in `<dataset-path>/<dataset>/result/search/<algo_group>-k{k}-batch_size{batch_size}_{suffix}.csv`, where `suffix` takes one of three values (see the example paths below):
1. `raw`: all search results are exported
2. `throughput`: the Pareto frontier of throughput results is exported
3. `latency`: the Pareto frontier of latency results is exported
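For a hypothetical run with `k=10` and `batch_size=10000`, the export step would therefore produce paths like:
```bash
# Illustrative paths only; <algo_group> depends on the algorithm and
# configuration group that was benchmarked.
<dataset-path>/<dataset>/result/search/<algo_group>-k10-batch_size10000_raw.csv
<dataset-path>/<dataset>/result/search/<algo_group>-k10-batch_size10000_throughput.csv
<dataset-path>/<dataset>/result/search/<algo_group>-k10-batch_size10000_latency.csv
```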
### Step 4: Plot Results
The script `raft-ann-bench.plot` will plot results for all algorithms found in index search statistics
CSV file in `<dataset-path>/<dataset>/result/search/<algo>-k{k}-batch_size{batch_size}.csv`.
CSV files `<dataset-path>/<dataset>/result/search/*.csv`.
The usage of this script is:
```bash
usage: __main__.py [-h] [--dataset DATASET] [--dataset-path DATASET_PATH] [--output-filepath OUTPUT_FILEPATH] [--algorithms ALGORITHMS] [--groups GROUPS] [--algo-groups ALGO_GROUPS] [-k COUNT]
[-bs BATCH_SIZE] [--build] [--search] [--x-scale X_SCALE] [--y-scale {linear,log,symlog,logit}] [--raw]
usage: [-h] [--dataset DATASET] [--dataset-path DATASET_PATH] [--output-filepath OUTPUT_FILEPATH] [--algorithms ALGORITHMS] [--groups GROUPS] [--algo-groups ALGO_GROUPS]
[-k COUNT] [-bs BATCH_SIZE] [--build] [--search] [--x-scale X_SCALE] [--y-scale {linear,log,symlog,logit}] [--mode {throughput,latency}] [--raw]
options:
-h, --help show this help message and exit
--dataset DATASET dataset to plot (default: glove-100-inner)
--dataset-path DATASET_PATH
path to dataset folder (default: os.getcwd()/datasets/)
path to dataset folder (default: /home/coder/raft/datasets/)
--output-filepath OUTPUT_FILEPATH
directory for PNG to be saved (default: os.getcwd())
directory for PNG to be saved (default: /home/coder/raft)
--algorithms ALGORITHMS
plot only comma separated list of named algorithms. If parameters `groups` and `algo-groups` are both undefined, then group `base` is plotted by default
(default: None)
--groups GROUPS plot only comma separated groups of parameters (default: base)
--algo-groups ALGO_GROUPS
add comma separated <algorithm>.<group> to plot. Example usage: "--algo-groups=raft_cagra.large,hnswlib.large" (default: None)
@@ -231,8 +236,12 @@ options:
--x-scale X_SCALE Scale to use when drawing the X-axis. Typically linear, logit or a2 (default: linear)
--y-scale {linear,log,symlog,logit}
Scale to use when drawing the Y-axis (default: linear)
--raw Show raw results (not just Pareto frontier) in faded colours (default: False)
--mode {throughput,latency}
metric whose Pareto frontier is used on the y-axis (default: throughput)
--raw Show raw results (not just Pareto frontier) of metric arg (default: False)
```
`mode`: plots the Pareto frontier of the `throughput` or `latency` results exported in the previous step.
`algorithms`: plots all algorithms for which results are found for the specified `dataset`. By default, only the `base` group will be plotted.
`groups`: plots only the specified groups of parameter configurations for an algorithm. Groups are defined in YAML configs (see `configuration`); by default the `base` group is run.
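For example, a hypothetical invocation that plots the latency Pareto frontier for two algorithms, with raw results shown in faded colours (the `python -m raft-ann-bench.plot` spelling is assumed from the script name above; the flags come from the usage text):
```bash
# Hypothetical example; all flag values are illustrative.
python -m raft-ann-bench.plot --dataset glove-100-inner \
    --algorithms raft_cagra,hnswlib --mode latency --raw
```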
55 changes: 52 additions & 3 deletions python/raft-ann-bench/src/raft-ann-bench/data_export/__main__.py
@@ -43,9 +43,26 @@
)

skip_search_cols = (
    set(["recall", "qps", "items_per_second", "Recall"]) | skip_build_cols
    set(["recall", "qps", "latency", "items_per_second", "Recall", "Latency"])
    | skip_build_cols
)

# Metric metadata: "worst" encodes the optimization direction
# (-inf means higher is better, +inf means lower is better).
metrics = {
    "k-nn": {
        "description": "Recall",
        "worst": float("-inf"),
        "lim": [0.0, 1.03],
    },
    "throughput": {
        "description": "Queries per second (1/s)",
        "worst": float("-inf"),
    },
    "latency": {
        "description": "Search Latency (s)",
        "worst": float("inf"),
    },
}


def read_file(dataset, dataset_path, method):
    dir = os.path.join(dataset_path, dataset, "result", method)
@@ -92,6 +109,31 @@ def convert_json_to_csv_build(dataset, dataset_path):
            traceback.print_exc()


def create_pointset(data, xn, yn):
    # Compute the Pareto frontier of (x, y) = (recall, throughput|latency)
    # points. A "worst" value below zero marks a metric that is maximized,
    # so its sort order is reversed.
    xm, ym = (metrics[xn], metrics[yn])
    rev_y = -1 if ym["worst"] < 0 else 1
    rev_x = -1 if xm["worst"] < 0 else 1

    # Column 2 holds recall; throughput and latency sit in columns 3 and 4
    # of the rows produced by convert_json_to_csv_search below.
    y_idx = 3 if yn == "throughput" else 4
    # Sort best-y first, breaking ties by better recall.
    data.sort(key=lambda t: (rev_y * t[y_idx], rev_x * t[2]))

    lines = []
    last_x = xm["worst"]
    comparator = (
        (lambda xv, lx: xv > lx) if last_x < 0 else (lambda xv, lx: xv < lx)
    )
    for d in data:
        # Keep a point only if it strictly improves recall over every
        # point with a better y metric, i.e. it is not dominated.
        if comparator(d[2], last_x):
            last_x = d[2]
            lines.append(d)
    return lines


def get_frontier(df, metric):
    # Reduce a search-results dataframe to its recall/metric Pareto frontier.
    lines = create_pointset(df.values.tolist(), "k-nn", metric)
    return pd.DataFrame(lines, columns=df.columns)
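A minimal sketch (not part of the commit) of how `get_frontier` behaves, assuming the functions above are in scope; the toy rows mimic the first five columns of the `write` DataFrame built in `convert_json_to_csv_search` below, and all values are invented:
```python
import pandas as pd

toy = pd.DataFrame(
    {
        "algo_name": ["algo"] * 4,  # hypothetical values throughout
        "index_name": ["a", "b", "c", "d"],
        "recall": [0.90, 0.95, 0.80, 0.85],
        "throughput": [1000.0, 800.0, 1200.0, 900.0],
        "latency": [0.0010, 0.0020, 0.0008, 0.0011],
    }
)

# Row "d" (recall 0.85, throughput 900) is dominated by row "a"
# (recall 0.90, throughput 1000), so the throughput frontier drops it.
print(get_frontier(toy, "throughput")["index_name"].tolist())
# -> ['c', 'a', 'b'], ordered from highest throughput to highest recall
```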


def convert_json_to_csv_search(dataset, dataset_path):
    for file, algo_name, df in read_file(dataset, dataset_path, "search"):
        try:
@@ -105,7 +147,8 @@ def convert_json_to_csv_search(dataset, dataset_path):
                    "algo_name": [algo_name] * len(df),
                    "index_name": df["name"],
                    "recall": df["Recall"],
                    "qps": df["items_per_second"],
                    "throughput": df["items_per_second"],
                    "latency": df["Latency"],
                }
            )
            for name in df:
@@ -141,7 +184,13 @@ def convert_json_to_csv_search(dataset, dataset_path):
                "appended in the Search CSV"
            )

            write.to_csv(file.replace(".json", ".csv"), index=False)
            write.to_csv(file.replace(".json", "_raw.csv"), index=False)
            throughput = get_frontier(write, "throughput")
            throughput.to_csv(
                file.replace(".json", "_throughput.csv"), index=False
            )
            latency = get_frontier(write, "latency")
            latency.to_csv(file.replace(".json", "_latency.csv"), index=False)
        except Exception as e:
            print(
                "An error occurred processing file %s (%s). Skipping..."