diff --git a/experiments/Aggregate Experimental Results.ipynb b/experiments/Aggregate Experimental Results.ipynb
new file mode 100644
index 0000000..c698769
--- /dev/null
+++ b/experiments/Aggregate Experimental Results.ipynb
@@ -0,0 +1,531 @@
+{
+ "cells": [
+ {
+ "cell_type": "code",
+ "execution_count": 1,
+ "id": "882091db-451c-4bfc-8a93-ef26dde618b8",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "import pandas as pd"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 5,
+ "id": "8a2ad73b-9d9f-4a24-bd1f-306dea2ccfe8",
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "text/html": [
+ "
\n",
+ "\n",
+ "
\n",
+ " \n",
+ " \n",
+ " | \n",
+ " harmbench | \n",
+ " advbench | \n",
+ " catqa | \n",
+ " xstest | \n",
+ "
\n",
+ " \n",
+ " \n",
+ " \n",
+ " aya-23 | \n",
+ " 0.725 | \n",
+ " 0.913462 | \n",
+ " 0.898182 | \n",
+ " 0.7 | \n",
+ "
\n",
+ " \n",
+ " gemma-1.1-7b | \n",
+ " 0.965 | \n",
+ " 0.994231 | \n",
+ " 0.938182 | \n",
+ " 0.626667 | \n",
+ "
\n",
+ " \n",
+ " gemma-7b | \n",
+ " 0.92 | \n",
+ " 0.978846 | \n",
+ " 0.961818 | \n",
+ " 0.64 | \n",
+ "
\n",
+ " \n",
+ " gemma2-9b | \n",
+ " 0.995 | \n",
+ " 1.0 | \n",
+ " 0.994545 | \n",
+ " 0.7 | \n",
+ "
\n",
+ " \n",
+ " llama2-7b | \n",
+ " 0.99 | \n",
+ " 1.0 | \n",
+ " 0.996364 | \n",
+ " 0.097778 | \n",
+ "
\n",
+ " \n",
+ " llama3-8b | \n",
+ " 0.95 | \n",
+ " 0.990385 | \n",
+ " 0.990909 | \n",
+ " 0.737778 | \n",
+ "
\n",
+ " \n",
+ " llama3.1-8b | \n",
+ " 0.98 | \n",
+ " 1.0 | \n",
+ " 0.996364 | \n",
+ " 0.626667 | \n",
+ "
\n",
+ " \n",
+ " mistral-7b | \n",
+ " 0.635 | \n",
+ " 0.709615 | \n",
+ " 0.790909 | \n",
+ " 0.911111 | \n",
+ "
\n",
+ " \n",
+ " mistral-nemo-12b | \n",
+ " 0.77 | \n",
+ " 0.9 | \n",
+ " 0.914545 | \n",
+ " 0.777778 | \n",
+ "
\n",
+ " \n",
+ " mixtral-8x7b | \n",
+ " 0.825 | \n",
+ " 0.857143 | \n",
+ " 0.627273 | \n",
+ " 0.755556 | \n",
+ "
\n",
+ " \n",
+ " phi3-mini | \n",
+ " 0.975 | \n",
+ " 0.996154 | \n",
+ " 0.992727 | \n",
+ " 0.788889 | \n",
+ "
\n",
+ " \n",
+ " qwen2-0.5b | \n",
+ " 0.94 | \n",
+ " 0.973077 | \n",
+ " 0.898182 | \n",
+ " 0.493333 | \n",
+ "
\n",
+ " \n",
+ " qwen2-1.5b | \n",
+ " 0.95 | \n",
+ " 0.992308 | \n",
+ " 0.985455 | \n",
+ " 0.782222 | \n",
+ "
\n",
+ " \n",
+ " qwen2-7b | \n",
+ " 0.94 | \n",
+ " 0.998077 | \n",
+ " 0.989091 | \n",
+ " 0.853333 | \n",
+ "
\n",
+ " \n",
+ "
\n",
+ "
"
+ ],
+ "text/plain": [
+ " harmbench advbench catqa xstest\n",
+ "aya-23 0.725 0.913462 0.898182 0.7\n",
+ "gemma-1.1-7b 0.965 0.994231 0.938182 0.626667\n",
+ "gemma-7b 0.92 0.978846 0.961818 0.64\n",
+ "gemma2-9b 0.995 1.0 0.994545 0.7\n",
+ "llama2-7b 0.99 1.0 0.996364 0.097778\n",
+ "llama3-8b 0.95 0.990385 0.990909 0.737778\n",
+ "llama3.1-8b 0.98 1.0 0.996364 0.626667\n",
+ "mistral-7b 0.635 0.709615 0.790909 0.911111\n",
+ "mistral-nemo-12b 0.77 0.9 0.914545 0.777778\n",
+ "mixtral-8x7b 0.825 0.857143 0.627273 0.755556\n",
+ "phi3-mini 0.975 0.996154 0.992727 0.788889\n",
+ "qwen2-0.5b 0.94 0.973077 0.898182 0.493333\n",
+ "qwen2-1.5b 0.95 0.992308 0.985455 0.782222\n",
+ "qwen2-7b 0.94 0.998077 0.989091 0.853333"
+ ]
+ },
+ "execution_count": 5,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "data = {}\n",
+ "\n",
+ "for dataset in [\"harmbench\", \"advbench\", \"catqa\", \"xstest\"]:\n",
+ " data[dataset] = {}\n",
+ " for llm in [\n",
+ " \"llama3.1-8b\", \"llama3-8b\", \"llama2-7b\",\n",
+ " \"gemma2-9b\", \"gemma-1.1-7b\", \"gemma-7b\",\n",
+ " \"mistral-nemo-12b\", \"mistral-7b\", \"mixtral-8x7b\",\n",
+ " \"phi3-mini\", #\"phi3-small\",\n",
+ " \"qwen2-7b\", \"qwen2-1.5b\", \"qwen2-0.5b\",\n",
+ " \"aya-23\"\n",
+ " # \"yi-1.5-6b\"\n",
+ " ]:\n",
+ " try:\n",
+ " data[dataset][llm] = pd.read_json(f\"logs/{dataset}/{llm}.json\").to_dict(\"index\")#.to_dict(\"index\")\n",
+ " except FileNotFoundError:\n",
+ " print(f\"could not load {dataset}/{llm}\")\n",
+ "\n",
+ "data = pd.DataFrame.from_dict({\n",
+ " (i, j, k): data[i][j][k] for i in data for j in data[i] for k in data[i][j]\n",
+ "}, orient=\"index\")\n",
+ "\n",
+ "data.score.unstack().mean(axis=1).unstack().T[[\"harmbench\", \"advbench\", \"catqa\", \"xstest\"]]"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 10,
+ "id": "5478fd81-2bdf-47e1-98cc-209d820c5a60",
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "text/html": [
+ "\n",
+ "\n",
+ "
\n",
+ " \n",
+ " \n",
+ " | \n",
+ " Aya Arabic | \n",
+ " Aya English | \n",
+ " Aya Filipino | \n",
+ " Aya French | \n",
+ " Aya Hindi | \n",
+ " Aya Russian | \n",
+ " Aya Serbian | \n",
+ " Aya Spanish | \n",
+ "
\n",
+ " \n",
+ " \n",
+ " \n",
+ " gemma-7b | \n",
+ " 95.444444 | \n",
+ " 99.088146 | \n",
+ " 89.990089 | \n",
+ " 99.261993 | \n",
+ " 88.52459 | \n",
+ " 97.020854 | \n",
+ " 93.439364 | \n",
+ " 98.081841 | \n",
+ "
\n",
+ " \n",
+ " llama2-7b | \n",
+ " 99.222222 | \n",
+ " 99.392097 | \n",
+ " 98.612488 | \n",
+ " 99.753998 | \n",
+ " 99.016393 | \n",
+ " 97.517378 | \n",
+ " 99.403579 | \n",
+ " 98.976982 | \n",
+ "
\n",
+ " \n",
+ " mistral-7b | \n",
+ " 90.777778 | \n",
+ " 95.035461 | \n",
+ " 92.368682 | \n",
+ " 95.940959 | \n",
+ " 79.562842 | \n",
+ " 90.168818 | \n",
+ " 94.035785 | \n",
+ " 93.478261 | \n",
+ "
\n",
+ " \n",
+ " phi3-mini | \n",
+ " 84.555556 | \n",
+ " 97.87234 | \n",
+ " 88.800793 | \n",
+ " 98.646986 | \n",
+ " 66.338798 | \n",
+ " 88.083416 | \n",
+ " 85.487078 | \n",
+ " 96.29156 | \n",
+ "
\n",
+ " \n",
+ "
\n",
+ "
"
+ ],
+ "text/plain": [
+ " Aya Arabic Aya English Aya Filipino Aya French Aya Hindi \\\n",
+ "gemma-7b 95.444444 99.088146 89.990089 99.261993 88.52459 \n",
+ "llama2-7b 99.222222 99.392097 98.612488 99.753998 99.016393 \n",
+ "mistral-7b 90.777778 95.035461 92.368682 95.940959 79.562842 \n",
+ "phi3-mini 84.555556 97.87234 88.800793 98.646986 66.338798 \n",
+ "\n",
+ " Aya Russian Aya Serbian Aya Spanish \n",
+ "gemma-7b 97.020854 93.439364 98.081841 \n",
+ "llama2-7b 97.517378 99.403579 98.976982 \n",
+ "mistral-7b 90.168818 94.035785 93.478261 \n",
+ "phi3-mini 88.083416 85.487078 96.29156 "
+ ]
+ },
+ "execution_count": 10,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "ling_data = {}\n",
+ "\n",
+ "for dataset in [\"arabic\", \"english\", \"filipino\", \"french\", \"hindi\", \"russian\", \"serbian\", \"spanish\"]:\n",
+ " ling_data[\"Aya \" + dataset.title()] = {}\n",
+ " for llm in [\n",
+ " #\"llama3.1-8b\", \"llama3-8b\", \n",
+ " \"llama2-7b\",\n",
+ " #\"gemma2-9b\", \"gemma-1.1-7b\", \n",
+ " \"gemma-7b\",\n",
+ " #\"mistral-nemo-12b\", \n",
+ " \"mistral-7b\", \n",
+ " #\"mixtral-8x7b\",\n",
+ " \"phi3-mini\", #\"phi3-small\",\n",
+ " #\"qwen2-7b\", \"qwen2-1.5b\", \"qwen2-0.5b\",\n",
+ " #\"aya-23\"\n",
+ " # \"yi-1.5-6b\"\n",
+ " ]:\n",
+ " try:\n",
+ " ling_data[\"Aya \" + dataset.title()][llm] = pd.read_json(f\"logs/aya-{dataset[:2]}/{llm}.json\").to_dict(\"index\")#.to_dict(\"index\")\n",
+ " except FileNotFoundError:\n",
+ " print(f\"could not load {dataset}/{llm}\")\n",
+ "\n",
+ "ling_data = pd.DataFrame.from_dict({\n",
+ " (i, j, k): ling_data[i][j][k] for i in ling_data for j in ling_data[i] for k in ling_data[i][j]\n",
+ "}, orient=\"index\")\n",
+ "\n",
+ "ling_data.score.unstack().mean(axis=1).unstack().T * 100"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 7,
+ "id": "1b62757c-7d8e-4b40-8ecb-d0fe92a6245e",
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "text/html": [
+ "\n",
+ "\n",
+ "
\n",
+ " \n",
+ " \n",
+ " | \n",
+ " aya-ar | \n",
+ " aya-en | \n",
+ " aya-fi | \n",
+ " aya-fr | \n",
+ " aya-hi | \n",
+ " aya-ru | \n",
+ " aya-se | \n",
+ " aya-sp | \n",
+ " xstest | \n",
+ "
\n",
+ " \n",
+ " \n",
+ " \n",
+ " llamaguard3 | \n",
+ " 0.442222 | \n",
+ " 0.536981 | \n",
+ " 0.322101 | \n",
+ " 0.634686 | \n",
+ " 0.66776 | \n",
+ " 0.633565 | \n",
+ " 0.487078 | \n",
+ " 0.641944 | \n",
+ " 0.893333 | \n",
+ "
\n",
+ " \n",
+ " llamaguard2 | \n",
+ " 0.414444 | \n",
+ " 0.671733 | \n",
+ " 0.3667 | \n",
+ " 0.714637 | \n",
+ " 0.66776 | \n",
+ " 0.619662 | \n",
+ " 0.516899 | \n",
+ " 0.671355 | \n",
+ " 0.888889 | \n",
+ "
\n",
+ " \n",
+ " llamaguard | \n",
+ " 0.192222 | \n",
+ " 0.715299 | \n",
+ " 0.248761 | \n",
+ " 0.745387 | \n",
+ " 0.231694 | \n",
+ " 0.616683 | \n",
+ " 0.507952 | \n",
+ " 0.705882 | \n",
+ " 0.831111 | \n",
+ "
\n",
+ " \n",
+ " walledguard | \n",
+ " 0.233333 | \n",
+ " 0.743668 | \n",
+ " 0.075322 | \n",
+ " 0.653137 | \n",
+ " 0.0 | \n",
+ " 0.503476 | \n",
+ " 0.121272 | \n",
+ " 0.644501 | \n",
+ " 0.873333 | \n",
+ "
\n",
+ " \n",
+ " lionguard | \n",
+ " 0.005556 | \n",
+ " 0.302938 | \n",
+ " 0.078295 | \n",
+ " 0.089791 | \n",
+ " 0.073224 | \n",
+ " 0.006951 | \n",
+ " 0.119284 | \n",
+ " 0.071611 | \n",
+ " 0.64 | \n",
+ "
\n",
+ " \n",
+ " promptguard | \n",
+ " 0.996667 | \n",
+ " 1.0 | \n",
+ " 0.999009 | \n",
+ " 0.99877 | \n",
+ " 1.0 | \n",
+ " 1.0 | \n",
+ " 1.0 | \n",
+ " 1.0 | \n",
+ " 0.444444 | \n",
+ "
\n",
+ " \n",
+ "
\n",
+ "
"
+ ],
+ "text/plain": [
+ " aya-ar aya-en aya-fi aya-fr aya-hi aya-ru \\\n",
+ "llamaguard3 0.442222 0.536981 0.322101 0.634686 0.66776 0.633565 \n",
+ "llamaguard2 0.414444 0.671733 0.3667 0.714637 0.66776 0.619662 \n",
+ "llamaguard 0.192222 0.715299 0.248761 0.745387 0.231694 0.616683 \n",
+ "walledguard 0.233333 0.743668 0.075322 0.653137 0.0 0.503476 \n",
+ "lionguard 0.005556 0.302938 0.078295 0.089791 0.073224 0.006951 \n",
+ "promptguard 0.996667 1.0 0.999009 0.99877 1.0 1.0 \n",
+ "\n",
+ " aya-se aya-sp xstest \n",
+ "llamaguard3 0.487078 0.641944 0.893333 \n",
+ "llamaguard2 0.516899 0.671355 0.888889 \n",
+ "llamaguard 0.507952 0.705882 0.831111 \n",
+ "walledguard 0.121272 0.644501 0.873333 \n",
+ "lionguard 0.119284 0.071611 0.64 \n",
+ "promptguard 1.0 1.0 0.444444 "
+ ]
+ },
+ "execution_count": 7,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "judge_data = {}\n",
+ "\n",
+ "for dataset in [\"xstest\"]+[\"aya-\"+i[:2] for i in [\"arabic\", \"english\", \"filipino\", \"french\", \"hindi\", \"russian\", \"serbian\", \"spanish\"]]:\n",
+ " judge_data[dataset] = {}\n",
+ " for judge in [\n",
+ " \"llamaguard3\", \"llamaguard2\", \"llamaguard\", \n",
+ " \"walledguard\", \"lionguard\", \"promptguard\",\n",
+ " #\"toxicitymodel\", \"toxic-bert\", \n",
+ " #\"multilingual-toxic-xlm-roberta\", \"unbiased-toxic-roberta\"\n",
+ " ]:\n",
+ " try:\n",
+ " judge_data[dataset][judge] = pd.read_json(f\"logs/judge-eval/{dataset}/{judge}.json\").to_dict(\"index\")#.to_dict(\"index\")\n",
+ " except FileNotFoundError:\n",
+ " #print(f\"could not load {dataset}/{judge}\")\n",
+ " pass\n",
+ "\n",
+ "judge_data = pd.DataFrame.from_dict({\n",
+ " (i, j, k): judge_data[i][j][k] for i in judge_data for j in judge_data[i] for k in judge_data[i][j]\n",
+ "}, orient=\"index\")\n",
+ "\n",
+ "judge_data.score.unstack().mean(axis=1).unstack().T.loc[[\n",
+ " \"llamaguard3\", \"llamaguard2\", \"llamaguard\",\n",
+ " \"walledguard\", \"lionguard\", \"promptguard\"\n",
+ "]]"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "bf73e5a6-e835-41fc-8462-5a3f1886d19a",
+ "metadata": {},
+ "outputs": [],
+ "source": []
+ }
+ ],
+ "metadata": {
+ "citation-manager": {
+ "items": {}
+ },
+ "kernelspec": {
+ "display_name": "Python 3 (ipykernel)",
+ "language": "python",
+ "name": "python3"
+ },
+ "language_info": {
+ "codemirror_mode": {
+ "name": "ipython",
+ "version": 3
+ },
+ "file_extension": ".py",
+ "mimetype": "text/x-python",
+ "name": "python",
+ "nbconvert_exporter": "python",
+ "pygments_lexer": "ipython3",
+ "version": "3.10.10"
+ }
+ },
+ "nbformat": 4,
+ "nbformat_minor": 5
+}