diff --git a/experiments/Aggregate Experimental Results.ipynb b/experiments/Aggregate Experimental Results.ipynb new file mode 100644 index 0000000..c698769 --- /dev/null +++ b/experiments/Aggregate Experimental Results.ipynb @@ -0,0 +1,531 @@ +{ + "cells": [ + { + "cell_type": "code", + "execution_count": 1, + "id": "882091db-451c-4bfc-8a93-ef26dde618b8", + "metadata": {}, + "outputs": [], + "source": [ + "import pandas as pd" + ] + }, + { + "cell_type": "code", + "execution_count": 5, + "id": "8a2ad73b-9d9f-4a24-bd1f-306dea2ccfe8", + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
harmbenchadvbenchcatqaxstest
aya-230.7250.9134620.8981820.7
gemma-1.1-7b0.9650.9942310.9381820.626667
gemma-7b0.920.9788460.9618180.64
gemma2-9b0.9951.00.9945450.7
llama2-7b0.991.00.9963640.097778
llama3-8b0.950.9903850.9909090.737778
llama3.1-8b0.981.00.9963640.626667
mistral-7b0.6350.7096150.7909090.911111
mistral-nemo-12b0.770.90.9145450.777778
mixtral-8x7b0.8250.8571430.6272730.755556
phi3-mini0.9750.9961540.9927270.788889
qwen2-0.5b0.940.9730770.8981820.493333
qwen2-1.5b0.950.9923080.9854550.782222
qwen2-7b0.940.9980770.9890910.853333
\n", + "
" + ], + "text/plain": [ + " harmbench advbench catqa xstest\n", + "aya-23 0.725 0.913462 0.898182 0.7\n", + "gemma-1.1-7b 0.965 0.994231 0.938182 0.626667\n", + "gemma-7b 0.92 0.978846 0.961818 0.64\n", + "gemma2-9b 0.995 1.0 0.994545 0.7\n", + "llama2-7b 0.99 1.0 0.996364 0.097778\n", + "llama3-8b 0.95 0.990385 0.990909 0.737778\n", + "llama3.1-8b 0.98 1.0 0.996364 0.626667\n", + "mistral-7b 0.635 0.709615 0.790909 0.911111\n", + "mistral-nemo-12b 0.77 0.9 0.914545 0.777778\n", + "mixtral-8x7b 0.825 0.857143 0.627273 0.755556\n", + "phi3-mini 0.975 0.996154 0.992727 0.788889\n", + "qwen2-0.5b 0.94 0.973077 0.898182 0.493333\n", + "qwen2-1.5b 0.95 0.992308 0.985455 0.782222\n", + "qwen2-7b 0.94 0.998077 0.989091 0.853333" + ] + }, + "execution_count": 5, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "data = {}\n", + "\n", + "for dataset in [\"harmbench\", \"advbench\", \"catqa\", \"xstest\"]:\n", + " data[dataset] = {}\n", + " for llm in [\n", + " \"llama3.1-8b\", \"llama3-8b\", \"llama2-7b\",\n", + " \"gemma2-9b\", \"gemma-1.1-7b\", \"gemma-7b\",\n", + " \"mistral-nemo-12b\", \"mistral-7b\", \"mixtral-8x7b\",\n", + " \"phi3-mini\", #\"phi3-small\",\n", + " \"qwen2-7b\", \"qwen2-1.5b\", \"qwen2-0.5b\",\n", + " \"aya-23\"\n", + " # \"yi-1.5-6b\"\n", + " ]:\n", + " try:\n", + " data[dataset][llm] = pd.read_json(f\"logs/{dataset}/{llm}.json\").to_dict(\"index\")#.to_dict(\"index\")\n", + " except FileNotFoundError:\n", + " print(f\"could not load {dataset}/{llm}\")\n", + "\n", + "data = pd.DataFrame.from_dict({\n", + " (i, j, k): data[i][j][k] for i in data for j in data[i] for k in data[i][j]\n", + "}, orient=\"index\")\n", + "\n", + "data.score.unstack().mean(axis=1).unstack().T[[\"harmbench\", \"advbench\", \"catqa\", \"xstest\"]]" + ] + }, + { + "cell_type": "code", + "execution_count": 10, + "id": "5478fd81-2bdf-47e1-98cc-209d820c5a60", + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
Aya ArabicAya EnglishAya FilipinoAya FrenchAya HindiAya RussianAya SerbianAya Spanish
gemma-7b95.44444499.08814689.99008999.26199388.5245997.02085493.43936498.081841
llama2-7b99.22222299.39209798.61248899.75399899.01639397.51737899.40357998.976982
mistral-7b90.77777895.03546192.36868295.94095979.56284290.16881894.03578593.478261
phi3-mini84.55555697.8723488.80079398.64698666.33879888.08341685.48707896.29156
\n", + "
" + ], + "text/plain": [ + " Aya Arabic Aya English Aya Filipino Aya French Aya Hindi \\\n", + "gemma-7b 95.444444 99.088146 89.990089 99.261993 88.52459 \n", + "llama2-7b 99.222222 99.392097 98.612488 99.753998 99.016393 \n", + "mistral-7b 90.777778 95.035461 92.368682 95.940959 79.562842 \n", + "phi3-mini 84.555556 97.87234 88.800793 98.646986 66.338798 \n", + "\n", + " Aya Russian Aya Serbian Aya Spanish \n", + "gemma-7b 97.020854 93.439364 98.081841 \n", + "llama2-7b 97.517378 99.403579 98.976982 \n", + "mistral-7b 90.168818 94.035785 93.478261 \n", + "phi3-mini 88.083416 85.487078 96.29156 " + ] + }, + "execution_count": 10, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "ling_data = {}\n", + "\n", + "for dataset in [\"arabic\", \"english\", \"filipino\", \"french\", \"hindi\", \"russian\", \"serbian\", \"spanish\"]:\n", + " ling_data[\"Aya \" + dataset.title()] = {}\n", + " for llm in [\n", + " #\"llama3.1-8b\", \"llama3-8b\", \n", + " \"llama2-7b\",\n", + " #\"gemma2-9b\", \"gemma-1.1-7b\", \n", + " \"gemma-7b\",\n", + " #\"mistral-nemo-12b\", \n", + " \"mistral-7b\", \n", + " #\"mixtral-8x7b\",\n", + " \"phi3-mini\", #\"phi3-small\",\n", + " #\"qwen2-7b\", \"qwen2-1.5b\", \"qwen2-0.5b\",\n", + " #\"aya-23\"\n", + " # \"yi-1.5-6b\"\n", + " ]:\n", + " try:\n", + " ling_data[\"Aya \" + dataset.title()][llm] = pd.read_json(f\"logs/aya-{dataset[:2]}/{llm}.json\").to_dict(\"index\")#.to_dict(\"index\")\n", + " except FileNotFoundError:\n", + " print(f\"could not load {dataset}/{llm}\")\n", + "\n", + "ling_data = pd.DataFrame.from_dict({\n", + " (i, j, k): ling_data[i][j][k] for i in ling_data for j in ling_data[i] for k in ling_data[i][j]\n", + "}, orient=\"index\")\n", + "\n", + "ling_data.score.unstack().mean(axis=1).unstack().T * 100" + ] + }, + { + "cell_type": "code", + "execution_count": 7, + "id": "1b62757c-7d8e-4b40-8ecb-d0fe92a6245e", + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
aya-araya-enaya-fiaya-fraya-hiaya-ruaya-seaya-spxstest
llamaguard30.4422220.5369810.3221010.6346860.667760.6335650.4870780.6419440.893333
llamaguard20.4144440.6717330.36670.7146370.667760.6196620.5168990.6713550.888889
llamaguard0.1922220.7152990.2487610.7453870.2316940.6166830.5079520.7058820.831111
walledguard0.2333330.7436680.0753220.6531370.00.5034760.1212720.6445010.873333
lionguard0.0055560.3029380.0782950.0897910.0732240.0069510.1192840.0716110.64
promptguard0.9966671.00.9990090.998771.01.01.01.00.444444
\n", + "
" + ], + "text/plain": [ + " aya-ar aya-en aya-fi aya-fr aya-hi aya-ru \\\n", + "llamaguard3 0.442222 0.536981 0.322101 0.634686 0.66776 0.633565 \n", + "llamaguard2 0.414444 0.671733 0.3667 0.714637 0.66776 0.619662 \n", + "llamaguard 0.192222 0.715299 0.248761 0.745387 0.231694 0.616683 \n", + "walledguard 0.233333 0.743668 0.075322 0.653137 0.0 0.503476 \n", + "lionguard 0.005556 0.302938 0.078295 0.089791 0.073224 0.006951 \n", + "promptguard 0.996667 1.0 0.999009 0.99877 1.0 1.0 \n", + "\n", + " aya-se aya-sp xstest \n", + "llamaguard3 0.487078 0.641944 0.893333 \n", + "llamaguard2 0.516899 0.671355 0.888889 \n", + "llamaguard 0.507952 0.705882 0.831111 \n", + "walledguard 0.121272 0.644501 0.873333 \n", + "lionguard 0.119284 0.071611 0.64 \n", + "promptguard 1.0 1.0 0.444444 " + ] + }, + "execution_count": 7, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "judge_data = {}\n", + "\n", + "for dataset in [\"xstest\"]+[\"aya-\"+i[:2] for i in [\"arabic\", \"english\", \"filipino\", \"french\", \"hindi\", \"russian\", \"serbian\", \"spanish\"]]:\n", + " judge_data[dataset] = {}\n", + " for judge in [\n", + " \"llamaguard3\", \"llamaguard2\", \"llamaguard\", \n", + " \"walledguard\", \"lionguard\", \"promptguard\",\n", + " #\"toxicitymodel\", \"toxic-bert\", \n", + " #\"multilingual-toxic-xlm-roberta\", \"unbiased-toxic-roberta\"\n", + " ]:\n", + " try:\n", + " judge_data[dataset][judge] = pd.read_json(f\"logs/judge-eval/{dataset}/{judge}.json\").to_dict(\"index\")#.to_dict(\"index\")\n", + " except FileNotFoundError:\n", + " #print(f\"could not load {dataset}/{judge}\")\n", + " pass\n", + "\n", + "judge_data = pd.DataFrame.from_dict({\n", + " (i, j, k): judge_data[i][j][k] for i in judge_data for j in judge_data[i] for k in judge_data[i][j]\n", + "}, orient=\"index\")\n", + "\n", + "judge_data.score.unstack().mean(axis=1).unstack().T.loc[[\n", + " \"llamaguard3\", \"llamaguard2\", \"llamaguard\",\n", + " \"walledguard\", \"lionguard\", \"promptguard\"\n", + "]]" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "bf73e5a6-e835-41fc-8462-5a3f1886d19a", + "metadata": {}, + "outputs": [], + "source": [] + } + ], + "metadata": { + "citation-manager": { + "items": {} + }, + "kernelspec": { + "display_name": "Python 3 (ipykernel)", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.10.10" + } + }, + "nbformat": 4, + "nbformat_minor": 5 +}