From 73079395a8c62d19bf82db80ab3cb8a46914a096 Mon Sep 17 00:00:00 2001
From: Prannaya
Date: Fri, 2 Aug 2024 10:11:32 +0800
Subject: [PATCH] feat(experiments): scripts to run various experiments one
 after another

---
 experiments/scripts/run_catqa.sh          |  37 ++++++++
 experiments/scripts/run_judge.sh          | 109 ++++++++++++++++++++++
 experiments/scripts/run_llama2_7b.sh      |   7 ++
 experiments/scripts/run_llama_refusal.sh  |  30 ++++++
 experiments/scripts/run_mistral.sh        |   4 +
 experiments/scripts/run_qwen_phi_gemma.sh |  33 +++++++
 experiments/scripts/run_qwen_yi.sh        |  15 +++
 7 files changed, 235 insertions(+)
 create mode 100644 experiments/scripts/run_catqa.sh
 create mode 100644 experiments/scripts/run_judge.sh
 create mode 100644 experiments/scripts/run_llama2_7b.sh
 create mode 100644 experiments/scripts/run_llama_refusal.sh
 create mode 100644 experiments/scripts/run_mistral.sh
 create mode 100644 experiments/scripts/run_qwen_phi_gemma.sh
 create mode 100644 experiments/scripts/run_qwen_yi.sh

diff --git a/experiments/scripts/run_catqa.sh b/experiments/scripts/run_catqa.sh
new file mode 100644
index 0000000..a9d7593
--- /dev/null
+++ b/experiments/scripts/run_catqa.sh
@@ -0,0 +1,37 @@
+echo "Running in $(pwd)"
+
+echo "Beginning Llama 3.1 8B on CatQA"
+python experiments/prompt_benchmarks.py -m llama3.1-8b -d catqa --verbose > ./experiments/runs/llama3.1-8b-catqa.log 2>&1
+echo "Beginning Llama 3 8B on CatQA"
+python experiments/prompt_benchmarks.py -m llama3-8b -d catqa --verbose > ./experiments/runs/llama3-8b-catqa.log 2>&1
+echo "Beginning Llama 2 7B on CatQA"
+python experiments/prompt_benchmarks.py -m llama2-7b -d catqa --verbose > ./experiments/runs/llama2-7b-catqa.log 2>&1
+echo "Experiments on Llama Completed"
+
+# echo "Beginning Qwen 2 0.5B on CatQA"
+# python experiments/prompt_benchmarks.py -m qwen2-0.5b -d catqa --verbose > ./experiments/runs/qwen2-0.5b-catqa.log 2>&1
+# echo "Beginning Qwen 2 1.5B on CatQA"
+# python experiments/prompt_benchmarks.py -m qwen2-1.5b -d catqa --verbose > ./experiments/runs/qwen2-1.5b-catqa.log 2>&1
+# echo "Beginning Qwen 2 7B on CatQA"
+# python experiments/prompt_benchmarks.py -m qwen2-7b -d catqa --verbose > ./experiments/runs/qwen2-7b-catqa.log 2>&1
+# echo "Experiments on Qwen 2 Completed"
+
+# echo "Beginning Phi 3 Mini on CatQA"
+# python experiments/prompt_benchmarks.py -m phi3-mini -d catqa --verbose > ./experiments/runs/phi3-mini-catqa.log 2>&1
+# echo "Experiments on Phi 3 Mini Completed"
+
+# echo "Beginning Mistral Nemo 12B on CatQA"
+# python experiments/prompt_benchmarks.py -m mistral-nemo-12b -d catqa --verbose > ./experiments/runs/mistral-nemo-12b-catqa.log 2>&1
+# echo "Beginning Mixtral 8x7B on CatQA"
+# python experiments/prompt_benchmarks.py -m mistral-8x7b -d catqa --verbose > ./experiments/runs/mistral-8x7b-catqa.log 2>&1
+# echo "Beginning Mistral 7B on CatQA"
+# python experiments/prompt_benchmarks.py -m mistral-7b -d catqa --verbose > ./experiments/runs/mistral-7b-catqa.log 2>&1
+# echo "Experiments on Mistral Completed"
+
+# echo "Beginning Gemma 2 9B on CatQA"
+# python experiments/prompt_benchmarks.py -m gemma2-9b -d catqa --verbose > ./experiments/runs/gemma2-9b-catqa.log 2>&1
+# echo "Beginning Gemma 1.1 7B on CatQA"
+# python experiments/prompt_benchmarks.py -m gemma-1.1-7b -d catqa --verbose > ./experiments/runs/gemma-1.1-7b-catqa.log 2>&1
+# echo "Beginning Gemma 7B on CatQA"
+# python experiments/prompt_benchmarks.py -m gemma-7b -d catqa --verbose > ./experiments/runs/gemma-7b-catqa.log 2>&1
+# echo "Experiments on Gemma Completed"
\ No newline at end of file
diff --git a/experiments/scripts/run_judge.sh b/experiments/scripts/run_judge.sh
new file mode 100644
index 0000000..62cfe49
--- /dev/null
+++ b/experiments/scripts/run_judge.sh
@@ -0,0 +1,109 @@
+echo "Running in $(pwd)"
+
+# echo "Beginning Tests on xstest"
+# python experiments/judge_benchmarks.py -j llamaguard3 -d xstest -v > ./experiments/runs/llamaguard3-xstest.log 2>&1
+# python experiments/judge_benchmarks.py -j llamaguard2 -d xstest -v > ./experiments/runs/llamaguard2-xstest.log 2>&1
+# python experiments/judge_benchmarks.py -j llamaguard -d xstest -v > ./experiments/runs/llamaguard-xstest.log 2>&1
+# python experiments/judge_benchmarks.py -j walledguard -d xstest -v > ./experiments/runs/walledguard-xstest.log 2>&1
+# python experiments/judge_benchmarks.py -j lionguard -d xstest -v > ./experiments/runs/lionguard-xstest.log 2>&1
+# python experiments/judge_benchmarks.py -j promptguard -d xstest -v > ./experiments/runs/promptguard-xstest.log 2>&1
+# python experiments/judge_benchmarks.py -j toxicitymodel -d xstest -v > ./experiments/runs/toxicitymodel-xstest.log 2>&1
+# python experiments/judge_benchmarks.py -j toxic-bert -d xstest -v > ./experiments/runs/toxic-bert-xstest.log 2>&1
+# python experiments/judge_benchmarks.py -j multilingual-toxic-xlm-roberta -d xstest -v > ./experiments/runs/multilingual-toxic-xlm-roberta-xstest.log 2>&1
+# python experiments/judge_benchmarks.py -j unbiased-toxic-roberta -d xstest -v > ./experiments/runs/unbiased-toxic-roberta-xstest.log 2>&1
+
+# echo "Beginning Tests on aya-ar"
+# python experiments/judge_benchmarks.py -j llamaguard3 -d aya-ar -v > ./experiments/runs/llamaguard3-aya-ar.log 2>&1
+# python experiments/judge_benchmarks.py -j llamaguard2 -d aya-ar -v > ./experiments/runs/llamaguard2-aya-ar.log 2>&1
+# python experiments/judge_benchmarks.py -j llamaguard -d aya-ar -v > ./experiments/runs/llamaguard-aya-ar.log 2>&1
+# python experiments/judge_benchmarks.py -j walledguard -d aya-ar -v > ./experiments/runs/walledguard-aya-ar.log 2>&1
+# python experiments/judge_benchmarks.py -j lionguard -d aya-ar -v > ./experiments/runs/lionguard-aya-ar.log 2>&1
+# python experiments/judge_benchmarks.py -j promptguard -d aya-ar -v > ./experiments/runs/promptguard-aya-ar.log 2>&1
+# python experiments/judge_benchmarks.py -j toxicitymodel -d aya-ar -v > ./experiments/runs/toxicitymodel-aya-ar.log 2>&1
+# python experiments/judge_benchmarks.py -j toxic-bert -d aya-ar -v > ./experiments/runs/toxic-bert-aya-ar.log 2>&1
+# python experiments/judge_benchmarks.py -j multilingual-toxic-xlm-roberta -d aya-ar -v > ./experiments/runs/multilingual-toxic-xlm-roberta-aya-ar.log 2>&1
+# python experiments/judge_benchmarks.py -j unbiased-toxic-roberta -d aya-ar -v > ./experiments/runs/unbiased-toxic-roberta-aya-ar.log 2>&1
+
+# echo "Beginning Tests on aya-en"
+# python experiments/judge_benchmarks.py -j llamaguard3 -d aya-en -v > ./experiments/runs/llamaguard3-aya-en.log 2>&1
+# python experiments/judge_benchmarks.py -j llamaguard2 -d aya-en -v > ./experiments/runs/llamaguard2-aya-en.log 2>&1
+# python experiments/judge_benchmarks.py -j llamaguard -d aya-en -v > ./experiments/runs/llamaguard-aya-en.log 2>&1
+# python experiments/judge_benchmarks.py -j walledguard -d aya-en -v > ./experiments/runs/walledguard-aya-en.log 2>&1
+# python experiments/judge_benchmarks.py -j lionguard -d aya-en -v > ./experiments/runs/lionguard-aya-en.log 2>&1
+# python experiments/judge_benchmarks.py -j promptguard -d aya-en -v > ./experiments/runs/promptguard-aya-en.log 2>&1
+# python experiments/judge_benchmarks.py -j toxicitymodel -d aya-en -v > ./experiments/runs/toxicitymodel-aya-en.log 2>&1
+# python experiments/judge_benchmarks.py -j toxic-bert -d aya-en -v > ./experiments/runs/toxic-bert-aya-en.log 2>&1
+# python experiments/judge_benchmarks.py -j multilingual-toxic-xlm-roberta -d aya-en -v > ./experiments/runs/multilingual-toxic-xlm-roberta-aya-en.log 2>&1
+# python experiments/judge_benchmarks.py -j unbiased-toxic-roberta -d aya-en -v > ./experiments/runs/unbiased-toxic-roberta-aya-en.log 2>&1
+
+# echo "Beginning Tests on aya-fi"
+# python experiments/judge_benchmarks.py -j llamaguard3 -d aya-fi -v > ./experiments/runs/llamaguard3-aya-fi.log 2>&1
+# python experiments/judge_benchmarks.py -j llamaguard2 -d aya-fi -v > ./experiments/runs/llamaguard2-aya-fi.log 2>&1
+# python experiments/judge_benchmarks.py -j llamaguard -d aya-fi -v > ./experiments/runs/llamaguard-aya-fi.log 2>&1
+# python experiments/judge_benchmarks.py -j walledguard -d aya-fi -v > ./experiments/runs/walledguard-aya-fi.log 2>&1
+# python experiments/judge_benchmarks.py -j lionguard -d aya-fi -v > ./experiments/runs/lionguard-aya-fi.log 2>&1
+# python experiments/judge_benchmarks.py -j promptguard -d aya-fi -v > ./experiments/runs/promptguard-aya-fi.log 2>&1
+# python experiments/judge_benchmarks.py -j toxicitymodel -d aya-fi -v > ./experiments/runs/toxicitymodel-aya-fi.log 2>&1
+# python experiments/judge_benchmarks.py -j toxic-bert -d aya-fi -v > ./experiments/runs/toxic-bert-aya-fi.log 2>&1
+# python experiments/judge_benchmarks.py -j multilingual-toxic-xlm-roberta -d aya-fi -v > ./experiments/runs/multilingual-toxic-xlm-roberta-aya-fi.log 2>&1
+# python experiments/judge_benchmarks.py -j unbiased-toxic-roberta -d aya-fi -v > ./experiments/runs/unbiased-toxic-roberta-aya-fi.log 2>&1
+
+echo "Beginning Tests on aya-fr"
+# python experiments/judge_benchmarks.py -j llamaguard3 -d aya-fr -v > ./experiments/runs/llamaguard3-aya-fr.log 2>&1
+# python experiments/judge_benchmarks.py -j llamaguard2 -d aya-fr -v > ./experiments/runs/llamaguard2-aya-fr.log 2>&1
+# python experiments/judge_benchmarks.py -j llamaguard -d aya-fr -v > ./experiments/runs/llamaguard-aya-fr.log 2>&1
+# python experiments/judge_benchmarks.py -j walledguard -d aya-fr -v > ./experiments/runs/walledguard-aya-fr.log 2>&1
+python experiments/judge_benchmarks.py -j lionguard -d aya-fr -v > ./experiments/runs/lionguard-aya-fr.log 2>&1
+python experiments/judge_benchmarks.py -j promptguard -d aya-fr -v > ./experiments/runs/promptguard-aya-fr.log 2>&1
+# python experiments/judge_benchmarks.py -j toxicitymodel -d aya-fr -v > ./experiments/runs/toxicitymodel-aya-fr.log 2>&1
+# python experiments/judge_benchmarks.py -j toxic-bert -d aya-fr -v > ./experiments/runs/toxic-bert-aya-fr.log 2>&1
+# python experiments/judge_benchmarks.py -j multilingual-toxic-xlm-roberta -d aya-fr -v > ./experiments/runs/multilingual-toxic-xlm-roberta-aya-fr.log 2>&1
+# python experiments/judge_benchmarks.py -j unbiased-toxic-roberta -d aya-fr -v > ./experiments/runs/unbiased-toxic-roberta-aya-fr.log 2>&1
+
+echo "Beginning Tests on aya-hi"
+python experiments/judge_benchmarks.py -j llamaguard3 -d aya-hi -v > ./experiments/runs/llamaguard3-aya-hi.log 2>&1
+python experiments/judge_benchmarks.py -j llamaguard2 -d aya-hi -v > ./experiments/runs/llamaguard2-aya-hi.log 2>&1
+python experiments/judge_benchmarks.py -j llamaguard -d aya-hi -v > ./experiments/runs/llamaguard-aya-hi.log 2>&1
+python experiments/judge_benchmarks.py -j walledguard -d aya-hi -v > ./experiments/runs/walledguard-aya-hi.log 2>&1
+python experiments/judge_benchmarks.py -j lionguard -d aya-hi -v > ./experiments/runs/lionguard-aya-hi.log 2>&1
+python experiments/judge_benchmarks.py -j promptguard -d aya-hi -v > ./experiments/runs/promptguard-aya-hi.log 2>&1
+# python experiments/judge_benchmarks.py -j toxicitymodel -d aya-hi -v > ./experiments/runs/toxicitymodel-aya-hi.log 2>&1
+# python experiments/judge_benchmarks.py -j toxic-bert -d aya-hi -v > ./experiments/runs/toxic-bert-aya-hi.log 2>&1
+# python experiments/judge_benchmarks.py -j multilingual-toxic-xlm-roberta -d aya-hi -v > ./experiments/runs/multilingual-toxic-xlm-roberta-aya-hi.log 2>&1
+# python experiments/judge_benchmarks.py -j unbiased-toxic-roberta -d aya-hi -v > ./experiments/runs/unbiased-toxic-roberta-aya-hi.log 2>&1
+
+echo "Beginning Tests on aya-ru"
+python experiments/judge_benchmarks.py -j llamaguard3 -d aya-ru -v > ./experiments/runs/llamaguard3-aya-ru.log 2>&1
+python experiments/judge_benchmarks.py -j llamaguard2 -d aya-ru -v > ./experiments/runs/llamaguard2-aya-ru.log 2>&1
+python experiments/judge_benchmarks.py -j llamaguard -d aya-ru -v > ./experiments/runs/llamaguard-aya-ru.log 2>&1
+python experiments/judge_benchmarks.py -j walledguard -d aya-ru -v > ./experiments/runs/walledguard-aya-ru.log 2>&1
+python experiments/judge_benchmarks.py -j lionguard -d aya-ru -v > ./experiments/runs/lionguard-aya-ru.log 2>&1
+python experiments/judge_benchmarks.py -j promptguard -d aya-ru -v > ./experiments/runs/promptguard-aya-ru.log 2>&1
+# python experiments/judge_benchmarks.py -j toxicitymodel -d aya-ru -v > ./experiments/runs/toxicitymodel-aya-ru.log 2>&1
+# python experiments/judge_benchmarks.py -j toxic-bert -d aya-ru -v > ./experiments/runs/toxic-bert-aya-ru.log 2>&1
+# python experiments/judge_benchmarks.py -j multilingual-toxic-xlm-roberta -d aya-ru -v > ./experiments/runs/multilingual-toxic-xlm-roberta-aya-ru.log 2>&1
+# python experiments/judge_benchmarks.py -j unbiased-toxic-roberta -d aya-ru -v > ./experiments/runs/unbiased-toxic-roberta-aya-ru.log 2>&1
+
+echo "Beginning Tests on aya-se"
+python experiments/judge_benchmarks.py -j llamaguard3 -d aya-se -v > ./experiments/runs/llamaguard3-aya-se.log 2>&1
+python experiments/judge_benchmarks.py -j llamaguard2 -d aya-se -v > ./experiments/runs/llamaguard2-aya-se.log 2>&1
+python experiments/judge_benchmarks.py -j llamaguard -d aya-se -v > ./experiments/runs/llamaguard-aya-se.log 2>&1
+python experiments/judge_benchmarks.py -j walledguard -d aya-se -v > ./experiments/runs/walledguard-aya-se.log 2>&1
+python experiments/judge_benchmarks.py -j lionguard -d aya-se -v > ./experiments/runs/lionguard-aya-se.log 2>&1
+python experiments/judge_benchmarks.py -j promptguard -d aya-se -v > ./experiments/runs/promptguard-aya-se.log 2>&1
+# python experiments/judge_benchmarks.py -j toxicitymodel -d aya-se -v > ./experiments/runs/toxicitymodel-aya-se.log 2>&1
+# python experiments/judge_benchmarks.py -j toxic-bert -d aya-se -v > ./experiments/runs/toxic-bert-aya-se.log 2>&1
+# python experiments/judge_benchmarks.py -j multilingual-toxic-xlm-roberta -d aya-se -v > ./experiments/runs/multilingual-toxic-xlm-roberta-aya-se.log 2>&1
+# python experiments/judge_benchmarks.py -j unbiased-toxic-roberta -d aya-se -v > ./experiments/runs/unbiased-toxic-roberta-aya-se.log 2>&1
+
+echo "Beginning Tests on aya-sp"
+python experiments/judge_benchmarks.py -j llamaguard3 -d aya-sp -v > ./experiments/runs/llamaguard3-aya-sp.log 2>&1
+python experiments/judge_benchmarks.py -j llamaguard2 -d aya-sp -v > ./experiments/runs/llamaguard2-aya-sp.log 2>&1
+python experiments/judge_benchmarks.py -j llamaguard -d aya-sp -v > ./experiments/runs/llamaguard-aya-sp.log 2>&1
+python experiments/judge_benchmarks.py -j walledguard -d aya-sp -v > ./experiments/runs/walledguard-aya-sp.log 2>&1
+python experiments/judge_benchmarks.py -j lionguard -d aya-sp -v > ./experiments/runs/lionguard-aya-sp.log 2>&1
+python experiments/judge_benchmarks.py -j promptguard -d aya-sp -v > ./experiments/runs/promptguard-aya-sp.log 2>&1
+# python experiments/judge_benchmarks.py -j toxicitymodel -d aya-sp -v > ./experiments/runs/toxicitymodel-aya-sp.log 2>&1
+# python experiments/judge_benchmarks.py -j toxic-bert -d aya-sp -v > ./experiments/runs/toxic-bert-aya-sp.log 2>&1
+# python experiments/judge_benchmarks.py -j multilingual-toxic-xlm-roberta -d aya-sp -v > ./experiments/runs/multilingual-toxic-xlm-roberta-aya-sp.log 2>&1
+# python experiments/judge_benchmarks.py -j unbiased-toxic-roberta -d aya-sp -v > ./experiments/runs/unbiased-toxic-roberta-aya-sp.log 2>&1
\ No newline at end of file
diff --git a/experiments/scripts/run_llama2_7b.sh b/experiments/scripts/run_llama2_7b.sh
new file mode 100644
index 0000000..65d753a
--- /dev/null
+++ b/experiments/scripts/run_llama2_7b.sh
@@ -0,0 +1,7 @@
+echo "Running in $(pwd)"
+
+echo "Beginning Llama 2 7B on HarmBench"
+python experiments/prompt_benchmarks.py -m llama2-7b -d harmbench --verbose > ./experiments/runs/llama2-7b-harmbench.log 2>&1
+echo "Beginning Llama 2 7B on AdvBench"
+python experiments/prompt_benchmarks.py -m llama2-7b -d advbench --verbose > ./experiments/runs/llama2-7b-advbench.log 2>&1
+echo "Experiments on Llama 2 7B Completed"
\ No newline at end of file
diff --git a/experiments/scripts/run_llama_refusal.sh b/experiments/scripts/run_llama_refusal.sh
new file mode 100644
index 0000000..8b2a4b5
--- /dev/null
+++ b/experiments/scripts/run_llama_refusal.sh
@@ -0,0 +1,30 @@
+echo "Running in $(pwd)"
+
+# echo "Beginning Llama 3.1 8B on XSTest x MCQ"
+# python experiments/refusal_benchmarks.py -m llama3.1-8b --verbose > ./experiments/runs/llama3.1-8b-refusal.log 2>&1
+# echo "Beginning Llama 3 8B on XSTest x MCQ"
+# python experiments/refusal_benchmarks.py -m llama3-8b --verbose > ./experiments/runs/llama3-8b-refusal.log 2>&1
+# echo "Beginning Llama 2 7B on XSTest x MCQ"
+# python experiments/refusal_benchmarks.py -m llama2-7b --verbose > ./experiments/runs/llama2-7b-refusal.log 2>&1
+# echo "Refusal Experiments on Llama Completed"
+
+
+# echo "Beginning Qwen 2 0.5B on XSTest x MCQ"
+# python experiments/refusal_benchmarks.py -m qwen2-0.5b --verbose > ./experiments/runs/qwen2-0.5b-refusal.log 2>&1
+# echo "Refusal Experiments on Qwen 2 0.5B Completed"
+
+# echo "Beginning Qwen 2 1.5B on XSTest x MCQ"
+# python experiments/refusal_benchmarks.py -m qwen2-1.5b --verbose > ./experiments/runs/qwen2-1.5b-refusal.log 2>&1
+# echo "Refusal Experiments on Qwen 2 1.5B Completed"
+
+# echo "Beginning Phi 3 Mini on XSTest x MCQ"
+# python experiments/refusal_benchmarks.py -m phi3-mini --verbose > ./experiments/runs/phi3-mini-refusal.log 2>&1
+# echo "Refusal Experiments on Phi 3 Mini Completed"
+
+# echo "Beginning Mistral 7B on XSTest x MCQ"
+# python experiments/refusal_benchmarks.py -m mistral-7b --verbose > ./experiments/runs/mistral-7b-refusal.log 2>&1
+# echo "Refusal Experiments on Mistral 7B Completed"
+
+echo "Beginning Gemma 7B on XSTest x MCQ"
+python experiments/refusal_benchmarks.py -m gemma-7b --verbose > ./experiments/runs/gemma-7b-refusal.log 2>&1
+echo "Refusal Experiments on Gemma 7B Completed"
\ No newline at end of file
diff --git a/experiments/scripts/run_mistral.sh b/experiments/scripts/run_mistral.sh
new file mode 100644
index 0000000..52f668e
--- /dev/null
+++ b/experiments/scripts/run_mistral.sh
@@ -0,0 +1,4 @@
+echo "Beginning Mistral 7B v0.3 on HarmBench"
+python experiments/prompt_benchmarks.py -m mistral-7b -d harmbench --verbose > mistral-7b-harmbench.log 2>&1
+echo "Beginning Mistral 7B v0.3 on AdvBench"
+python experiments/prompt_benchmarks.py -m mistral-7b -d advbench --verbose > mistral-7b-advbench.log 2>&1
diff --git a/experiments/scripts/run_qwen_phi_gemma.sh b/experiments/scripts/run_qwen_phi_gemma.sh
new file mode 100644
index 0000000..053a742
--- /dev/null
+++ b/experiments/scripts/run_qwen_phi_gemma.sh
@@ -0,0 +1,33 @@
+echo "Running in $(pwd)"
+
+# echo "Beginning Phi 3 Mini on HarmBench"
+# python experiments/prompt_benchmarks.py -m phi3-mini -d harmbench --verbose > ./experiments/runs/phi3-mini-harmbench.log 2>&1
+# echo "Beginning Phi 3 Mini on AdvBench"
+# python experiments/prompt_benchmarks.py -m phi3-mini -d advbench --verbose > ./experiments/runs/phi3-mini-advbench.log 2>&1
+# echo "Experiments on Phi 3 Mini Completed"
+
+# echo "Beginning Qwen 2 7B on HarmBench"
+# python experiments/prompt_benchmarks.py -m qwen2-7b -d harmbench --verbose > ./experiments/runs/qwen2-7b-harmbench.log 2>&1
+# echo "Beginning Qwen 2 7B on AdvBench"
+# python experiments/prompt_benchmarks.py -m qwen2-7b -d advbench --verbose > ./experiments/runs/qwen2-7b-advbench.log 2>&1
+# echo "Experiments on Qwen 2 7B Completed"
+
+# echo "Beginning Gemma 1.1 7B on HarmBench"
+# python experiments/prompt_benchmarks.py -m gemma-1.1-7b -d harmbench --verbose > ./experiments/runs/gemma-1.1-7b-harmbench.log 2>&1
+# echo "Beginning Gemma 1.1 7B on AdvBench"
+# python experiments/prompt_benchmarks.py -m gemma-1.1-7b -d advbench --verbose > ./experiments/runs/gemma-1.1-7b-advbench.log 2>&1
+# echo "Experiments on Gemma 1.1 7B Completed"
+
+# echo "Beginning Qwen 1.5 7B on HarmBench"
+# python experiments/prompt_benchmarks.py -m qwen-1.5-7b -d harmbench --verbose > ./experiments/runs/qwen-1.5-7b-harmbench.log 2>&1
+# echo "Beginning Qwen 1.5 7B on AdvBench"
+# python experiments/prompt_benchmarks.py -m qwen-1.5-7b -d advbench --verbose > ./experiments/runs/qwen-1.5-7b-advbench.log 2>&1
+# echo "Experiments on Qwen 1.5 7B Completed"
+
+# echo "Beginning Gemma 7B on HarmBench"
+# python experiments/prompt_benchmarks.py -m gemma-7b -d harmbench --verbose > ./experiments/runs/gemma-7b-harmbench.log 2>&1
+# echo "Beginning Gemma 7B on AdvBench"
+# python experiments/prompt_benchmarks.py -m gemma-7b -d advbench --verbose > ./experiments/runs/gemma-7b-advbench.log 2>&1
+# echo "Experiments on Gemma 7B Completed"
+
+
diff --git a/experiments/scripts/run_qwen_yi.sh b/experiments/scripts/run_qwen_yi.sh
new file mode 100644
index 0000000..06ff199
--- /dev/null
+++ b/experiments/scripts/run_qwen_yi.sh
@@ -0,0 +1,15 @@
+echo "Beginning Qwen 2 0.5B on HarmBench"
+python experiments/prompt_benchmarks.py -m qwen2-0.5b -d harmbench --verbose > qwen2-0.5b-harmbench.log 2>&1
+
+echo "Beginning Qwen 2 0.5B on AdvBench"
+python experiments/prompt_benchmarks.py -m qwen2-0.5b -d advbench --verbose > qwen2-0.5b-advbench.log 2>&1
+
+echo "Beginning Qwen 2 1.5B on HarmBench"
+python experiments/prompt_benchmarks.py -m qwen2-1.5b -d harmbench --verbose > qwen2-1.5b-harmbench.log 2>&1
+echo "Beginning Qwen 2 1.5B on AdvBench"
+python experiments/prompt_benchmarks.py -m qwen2-1.5b -d advbench --verbose > qwen2-1.5b-advbench.log 2>&1
+
+echo "Beginning Yi 1.5 6B on HarmBench"
+python experiments/prompt_benchmarks.py -m yi-1.5-6b -d harmbench --verbose > yi-1.5-6b-harmbench.log 2>&1
+echo "Beginning Yi 1.5 6B on AdvBench"
+python experiments/prompt_benchmarks.py -m yi-1.5-6b -d advbench --verbose > yi-1.5-6b-advbench.log 2>&1
\ No newline at end of file