Update raft-ann-bench output filenames and add features to plotting (#2043)

This PR:
1. Adds more clarity to filenames by using `,` as the separator instead of `_` (see the parsing sketch after this list)
2. Adds 80% and 99% recall bars to build plots
3. Does not plot a recall level in build plot if no data is present
4. Adds an `x-start` argument that controls the minimum recall level used on the x-axis of the search plot
5. Fixes an occasionally occurring multi-line issue in search plots
6. Build time plots now show the average build time of the indices corresponding to the search queries in each recall range
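As a quick illustration of the naming scheme from item 1, here is a minimal, hypothetical helper (not part of this PR) that recovers the fields of a search result filename. It assumes the `<algo>[,<group>],k<k>,bs<batch_size>[,<suffix>]` layout described in the updated docs below, with the `,<group>` field omitted for the `base` group:

```python
import os
import re

def parse_search_result_filename(path):
    """Best-effort parse of '<algo>[,<group>],k<k>,bs<batch_size>[,<suffix>]' filenames."""
    stem = os.path.splitext(os.path.basename(path))[0]
    parts = stem.split(",")
    algo = parts[0]
    # The group token is omitted for the "base" group, so sniff for the k field.
    group = "base" if re.fullmatch(r"k\d+", parts[1]) else parts[1]
    k = next(int(p[1:]) for p in parts if re.fullmatch(r"k\d+", p))
    batch_size = next(int(p[2:]) for p in parts if re.fullmatch(r"bs\d+", p))
    return algo, group, k, batch_size

# parse_search_result_filename("raft_cagra,large,k10,bs10000,throughput.csv")
# -> ("raft_cagra", "large", 10, 10000)
```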

Authors:
  - Divye Gala (https://github.com/divyegala)

Approvers:
  - Corey J. Nolet (https://github.com/cjnolet)

URL: #2043
divyegala authored Dec 21, 2023
1 parent 7e098b2 commit bae049b
Showing 4 changed files with 136 additions and 75 deletions.
13 changes: 7 additions & 6 deletions docs/source/raft_ann_benchmarks.md
@@ -171,8 +171,8 @@ options:
`algo-groups`: this parameter is helpful to append any specific algorithm+group combination to run the benchmark for in addition to all the arguments from `algorithms` and `groups`. It is of the format `<algorithm>.<group>`, or for example, `raft_cagra.large`
For every algorithm run by this script, it outputs an index build statistics JSON file in `<dataset-path/<dataset>/result/build/<algo_{group}-{k}-{batch_size}.json>`
and an index search statistics JSON file in `<dataset-path/<dataset>/result/search/<algo_{group}-{k}-{batch_size}.json>`. NOTE: The filenames will not have "_{group}" if `group = "base"`.
For every algorithm run by this script, it outputs an index build statistics JSON file in `<dataset-path/<dataset>/result/build/<{algo},{group}.json>`
and an index search statistics JSON file in `<dataset-path/<dataset>/result/search/<{algo},{group},k{k},bs{batch_size}.json>`. NOTE: The filenames will not have ",{group}" if `group = "base"`.
`dataset-path` :
1. data is read from `<dataset-path>/<dataset>`
@@ -198,8 +198,8 @@ options:
--dataset-path DATASET_PATH
path to dataset folder (default: ${RAPIDS_DATASET_ROOT_DIR})
```
Build statistics CSV file is stored in `<dataset-path/<dataset>/result/build/<algo_group.csv>`
and index search statistics CSV file in `<dataset-path/<dataset>/result/search/<algo_group-k{k}-batch_size{batch_size}_{suffix}.csv>`, where suffix has three values:
Build statistics CSV file is stored in `<dataset-path/<dataset>/result/build/<{algo},{group}.csv>`
and index search statistics CSV file in `<dataset-path/<dataset>/result/search/<{algo},{group},k{k},bs{batch_size},{suffix}.csv>`, where suffix has three values:
1. `raw`: All search results are exported
2. `throughput`: Pareto frontier of throughput results is exported
3. `latency`: Pareto frontier of latency results is exported
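As a rough illustration of the `throughput`/`latency` exports described above, a Pareto frontier over (recall, throughput) pairs can be computed as in the sketch below. This is illustrative only, not the `get_frontier` routine used by the export script, and the column names are assumptions:

```python
import pandas as pd

def pareto_frontier(df, x="recall", y="throughput"):
    """Keep only rows not dominated by any other row, i.e. no other point
    has both higher recall and higher throughput. Illustrative sketch only."""
    df = df.sort_values(x, ascending=False).reset_index(drop=True)
    best_y = float("-inf")
    keep = []
    for _, row in df.iterrows():
        if row[y] > best_y:  # the highest-recall point is always kept
            best_y = row[y]
            keep.append(row)
    return pd.DataFrame(keep)[::-1].reset_index(drop=True)

# pareto_frontier(pd.DataFrame({"recall": [0.80, 0.90, 0.95],
#                               "throughput": [5000, 7000, 3000]}))
# keeps (0.95, 3000) and (0.90, 7000); (0.80, 5000) is dominated by (0.90, 7000).
```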
@@ -212,8 +212,8 @@ CSV files `<dataset-path/<dataset>/result/search/*.csv`.
The usage of this script is:
```bash
usage: [-h] [--dataset DATASET] [--dataset-path DATASET_PATH] [--output-filepath OUTPUT_FILEPATH] [--algorithms ALGORITHMS] [--groups GROUPS] [--algo-groups ALGO_GROUPS]
[-k COUNT] [-bs BATCH_SIZE] [--build] [--search] [--x-scale X_SCALE] [--y-scale {linear,log,symlog,logit}] [--mode {throughput,latency}] [--time-unit {s,ms,us}]
[--raw]
[-k COUNT] [-bs BATCH_SIZE] [--build] [--search] [--x-scale X_SCALE] [--y-scale {linear,log,symlog,logit}] [--x-start X_START] [--mode {throughput,latency}]
[--time-unit {s,ms,us}] [--raw]
options:
-h, --help show this help message and exit
@@ -237,6 +237,7 @@ options:
--x-scale X_SCALE Scale to use when drawing the X-axis. Typically linear, logit or a2 (default: linear)
--y-scale {linear,log,symlog,logit}
Scale to use when drawing the Y-axis (default: linear)
--x-start X_START Recall values to start the x-axis from (default: 0.8)
--mode {throughput,latency}
search mode whose Pareto frontier is used on the y-axis (default: throughput)
--time-unit {s,ms,us}
34 changes: 21 additions & 13 deletions python/raft-ann-bench/src/raft-ann-bench/data_export/__main__.py
@@ -74,7 +74,9 @@ def read_file(dataset, dataset_path, method):
try:
data = json.load(f)
df = pd.DataFrame(data["benchmarks"])
yield os.path.join(dir, file), file.split("-")[0], df
filename_split = file.split(",")
algo_name = (filename_split[0], filename_split[1])
yield os.path.join(dir, file), algo_name, df
except Exception as e:
print(
"An error occurred processing file %s (%s). "
@@ -85,7 +87,10 @@ def convert_json_to_csv_build(dataset, dataset_path):
def convert_json_to_csv_build(dataset, dataset_path):
for file, algo_name, df in read_file(dataset, dataset_path, "build"):
try:
algo_name = algo_name.replace("_base", "")
if "base" in algo_name[1]:
algo_name = algo_name[0]
else:
algo_name = "_".join(algo_name)
df["name"] = df["name"].str.split("/").str[0]
write = pd.DataFrame(
{
@@ -97,12 +102,7 @@ def convert_json_to_csv_build(dataset, dataset_path):
for name in df:
if name not in skip_build_cols:
write[name] = df[name]
filepath = os.path.normpath(file).split(os.sep)
filename = filepath[-1].split("-")[0] + ".csv"
write.to_csv(
os.path.join(f"{os.sep}".join(filepath[:-1]), filename),
index=False,
)
write.to_csv(file.replace(".json", ".csv"), index=False)
except Exception as e:
print(
"An error occurred processing file %s (%s). Skipping..."
@@ -140,9 +140,17 @@ def convert_json_to_csv_search(dataset, dataset_path):
for file, algo_name, df in read_file(dataset, dataset_path, "search"):
try:
build_file = os.path.join(
dataset_path, dataset, "result", "build", f"{algo_name}.csv"
dataset_path,
dataset,
"result",
"build",
f"{','.join(algo_name)}.csv",
)
algo_name = algo_name.replace("_base", "")
print(build_file)
if "base" in algo_name[1]:
algo_name = algo_name[0]
else:
algo_name = "_".join(algo_name)
df["name"] = df["name"].str.split("/").str[0]
try:
write = pd.DataFrame(
Expand Down Expand Up @@ -201,13 +209,13 @@ def convert_json_to_csv_search(dataset, dataset_path):
"appended in the Search CSV"
)

write.to_csv(file.replace(".json", "_raw.csv"), index=False)
write.to_csv(file.replace(".json", ",raw.csv"), index=False)
throughput = get_frontier(write, "throughput")
throughput.to_csv(
file.replace(".json", "_throughput.csv"), index=False
file.replace(".json", ",throughput.csv"), index=False
)
latency = get_frontier(write, "latency")
latency.to_csv(file.replace(".json", "_latency.csv"), index=False)
latency.to_csv(file.replace(".json", ",latency.csv"), index=False)
except Exception as e:
print(
"An error occurred processing file %s (%s). Skipping..."
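The export changes above carry the `(algo, group)` pair taken from the filename and collapse the `base` group when building the displayed name. A condensed sketch of that mapping, with a hypothetical helper name:

```python
def display_name(algo: str, group: str) -> str:
    """Mirror of the naming logic in convert_json_to_csv_build/search:
    the 'base' group is dropped so labels read 'raft_cagra' rather than
    'raft_cagra_base'."""
    return algo if "base" in group else f"{algo}_{group}"

# display_name("raft_cagra", "base")  -> "raft_cagra"
# display_name("raft_cagra", "large") -> "raft_cagra_large"
```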
131 changes: 86 additions & 45 deletions python/raft-ann-bench/src/raft-ann-bench/plot/__main__.py
@@ -62,6 +62,19 @@ def positive_int(input_str: str) -> int:
return i


def positive_float(input_str: str) -> float:
try:
i = float(input_str)
if i < 0.0:
raise ValueError
except ValueError:
raise argparse.ArgumentTypeError(
f"{input_str} is not a positive float"
)

return i


def generate_n_colors(n):
vs = np.linspace(0.3, 0.9, 7)
colors = [(0.9, 0.4, 0.4, 1.0)]
@@ -113,9 +126,11 @@ def create_plot_search(
batch_size,
mode,
time_unit,
x_start,
):
xn = "k-nn"
xm, ym = (metrics[xn], metrics[mode])
xm["lim"][0] = x_start
# Now generate each plot
handles = []
labels = []
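Setting `xm["lim"][0] = x_start` above makes `--x-start` the lower bound of the recall axis. The effect is comparable to constraining the x-limit in matplotlib, as in this standalone sketch with made-up data (not the plot module's own code path):

```python
import matplotlib.pyplot as plt

recall = [0.75, 0.85, 0.92, 0.97]   # made-up search results
qps = [9000, 7000, 4000, 1500]

fig, ax = plt.subplots()
ax.plot(recall, qps, marker="o")
ax.set_xlim(left=0.8)               # like --x-start 0.8: hide points below 80% recall
ax.set_xlabel("Recall")
ax.set_ylabel("Queries per second")
fig.savefig("search_plot_sketch.png")
```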
@@ -211,20 +226,15 @@ def inv_fun(x):


def create_plot_build(
build_results, search_results, linestyles, fn_out, dataset
build_results, search_results, linestyles, fn_out, dataset, k, batch_size
):
bt_80 = [0] * len(linestyles)

qps_85 = [-1] * len(linestyles)
bt_85 = [0] * len(linestyles)
i_85 = [-1] * len(linestyles)

qps_90 = [-1] * len(linestyles)
bt_90 = [0] * len(linestyles)
i_90 = [-1] * len(linestyles)

qps_95 = [-1] * len(linestyles)
bt_95 = [0] * len(linestyles)
i_95 = [-1] * len(linestyles)

bt_99 = [0] * len(linestyles)

data = OrderedDict()
colors = OrderedDict()
@@ -237,35 +247,59 @@ def mean_y(algo):

for pos, algo in enumerate(sorted(search_results.keys(), key=mean_y)):
points = np.array(search_results[algo], dtype=object)
# x is recall, ls is algo_name, idxs is index_name
xs = points[:, 2]
ys = points[:, 3]
ls = points[:, 0]
idxs = points[:, 1]
# x is recall, y is qps, ls is algo_name, idxs is index_name

len_80, len_90, len_95, len_99 = 0, 0, 0, 0
for i in range(len(xs)):
if xs[i] >= 0.85 and xs[i] < 0.9 and ys[i] > qps_85[pos]:
qps_85[pos] = ys[i]
bt_85[pos] = build_results[(ls[i], idxs[i])][0][2]
i_85[pos] = idxs[i]
elif xs[i] >= 0.9 and xs[i] < 0.95 and ys[i] > qps_90[pos]:
qps_90[pos] = ys[i]
bt_90[pos] = build_results[(ls[i], idxs[i])][0][2]
i_90[pos] = idxs[i]
elif xs[i] >= 0.95 and ys[i] > qps_95[pos]:
qps_95[pos] = ys[i]
bt_95[pos] = build_results[(ls[i], idxs[i])][0][2]
i_95[pos] = idxs[i]
data[algo] = [bt_85[pos], bt_90[pos], bt_95[pos]]
if xs[i] >= 0.80 and xs[i] < 0.90:
bt_80[pos] = bt_80[pos] + build_results[(ls[i], idxs[i])][0][2]
len_80 = len_80 + 1
elif xs[i] >= 0.9 and xs[i] < 0.95:
bt_90[pos] = bt_90[pos] + build_results[(ls[i], idxs[i])][0][2]
len_90 = len_90 + 1
elif xs[i] >= 0.95 and xs[i] < 0.99:
bt_95[pos] = bt_95[pos] + build_results[(ls[i], idxs[i])][0][2]
len_95 = len_95 + 1
elif xs[i] >= 0.99:
bt_99[pos] = bt_99[pos] + build_results[(ls[i], idxs[i])][0][2]
len_99 = len_99 + 1
if len_80 > 0:
bt_80[pos] = bt_80[pos] / len_80
if len_90 > 0:
bt_90[pos] = bt_90[pos] / len_90
if len_95 > 0:
bt_95[pos] = bt_95[pos] / len_95
if len_99 > 0:
bt_99[pos] = bt_99[pos] / len_99
data[algo] = [
bt_80[pos],
bt_90[pos],
bt_95[pos],
bt_99[pos],
]
colors[algo] = linestyles[algo][0]

index = ["@85% Recall", "@90% Recall", "@95% Recall"]
index = [
"@80% Recall",
"@90% Recall",
"@95% Recall",
"@99% Recall",
]

df = pd.DataFrame(data, index=index)
df.replace(0.0, np.nan, inplace=True)
df = df.dropna(how="all")
plt.figure(figsize=(12, 9))
ax = df.plot.bar(rot=0, color=colors)
fig = ax.get_figure()
print(f"writing build output to {fn_out}")
plt.title("Build Time for Highest QPS")
plt.title(
"Average Build Time within Recall Range "
f"for k={k} batch_size={batch_size}"
)
plt.suptitle(f"{dataset}")
plt.ylabel("Build Time (s)")
fig.savefig(fn_out)
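The loop above buckets every search point by recall ([0.80, 0.90), [0.90, 0.95), [0.95, 0.99), ≥ 0.99), looks up the build time of the index that produced it, and averages per bucket, skipping empty buckets. A compact pandas sketch of the same idea, assuming a flat table with `recall` and `build_time` columns:

```python
import pandas as pd

def average_build_time_by_recall(points: pd.DataFrame) -> pd.Series:
    """Mean build time per recall bucket; empty buckets are dropped,
    matching the 'do not plot a recall level with no data' behaviour."""
    buckets = pd.cut(
        points["recall"],
        bins=[0.80, 0.90, 0.95, 0.99, 1.01],
        right=False,  # [0.80, 0.90), [0.90, 0.95), [0.95, 0.99), [0.99, 1.01)
        labels=["@80% Recall", "@90% Recall", "@95% Recall", "@99% Recall"],
    )
    return points.groupby(buckets, observed=True)["build_time"].mean()
```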
@@ -344,9 +378,9 @@ def load_all_results(
]
elif method == "search":
if raw:
suffix = "_raw"
suffix = ",raw"
else:
suffix = f"_{mode}"
suffix = f",{mode}"
result_files = [
result_file
for result_file in result_files
@@ -356,22 +390,20 @@
raise FileNotFoundError(f"No CSV result files found in {results_path}")

if method == "search":
result_files = [
result_filename
for result_filename in result_files
if f"{k}-{batch_size}" in result_filename
]
algo_group_files = [
result_filename.split("-")[0] for result_filename in result_files
]
else:
algo_group_files = [
result_filename for result_filename in result_files
]

for i in range(len(algo_group_files)):
algo_group = algo_group_files[i].replace(".csv", "").split("_")
algo_group_files[i] = ("_".join(algo_group[:-1]), algo_group[-1])
filter_k_bs = []
for result_filename in result_files:
filename_split = result_filename.split(",")
if (
int(filename_split[-3][1:]) == k
and int(filename_split[-2][2:]) == batch_size
):
filter_k_bs.append(result_filename)
result_files = filter_k_bs

algo_group_files = [
result_filename.replace(".csv", "").split(",")[:2]
for result_filename in result_files
]
algo_group_files = list(zip(*algo_group_files))

if len(algorithms) > 0:
@@ -478,6 +510,12 @@ def main():
choices=["linear", "log", "symlog", "logit"],
default="linear",
)
parser.add_argument(
"--x-start",
help="Recall values to start the x-axis from",
default=0.8,
type=positive_float,
)
parser.add_argument(
"--mode",
help="search mode whose Pareto frontier is used on the y-axis",
@@ -525,7 +563,7 @@
)
build_output_filepath = os.path.join(
args.output_filepath,
f"build-{args.dataset}.png",
f"build-{args.dataset}-k{k}-batch_size{batch_size}.png",
)

search_results = load_all_results(
@@ -554,6 +592,7 @@
batch_size,
args.mode,
args.time_unit,
args.x_start,
)
if build:
build_results = load_all_results(
@@ -575,6 +614,8 @@
linestyles,
build_output_filepath,
args.dataset,
k,
batch_size,
)


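Finally, the comma-separated fields also drive the new k/batch-size filter in `load_all_results` shown above. A standalone sketch of the same filter, with a hypothetical helper name:

```python
def filter_by_k_and_batch_size(result_files, k, batch_size):
    """Keep search CSVs whose trailing ',k<k>,bs<batch_size>,<suffix>' fields
    match the requested values, per the new comma-separated naming scheme."""
    kept = []
    for name in result_files:
        fields = name.replace(".csv", "").split(",")
        # fields[-3] is 'k<k>', fields[-2] is 'bs<batch_size>', fields[-1] is the suffix
        if int(fields[-3][1:]) == k and int(fields[-2][2:]) == batch_size:
            kept.append(name)
    return kept

# filter_by_k_and_batch_size(
#     ["raft_cagra,large,k10,bs10000,throughput.csv"], k=10, batch_size=10000
# ) -> ["raft_cagra,large,k10,bs10000,throughput.csv"]
```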