Skip to content

Commit

Permalink
feat(experiments): scripts to run various experiments one after another
Browse files Browse the repository at this point in the history
  • Loading branch information
ThePyProgrammer committed Aug 2, 2024
1 parent 7f29519 commit 7307939
Show file tree
Hide file tree
Showing 7 changed files with 235 additions and 0 deletions.
37 changes: 37 additions & 0 deletions experiments/scripts/run_catqa.sh
Original file line number Diff line number Diff line change
@@ -0,0 +1,37 @@
#!/usr/bin/env bash
# Runs the CatQA prompt benchmark against a series of models, one after another.
# Each run's stdout AND stderr go to a single log file under ./experiments/runs/.
# Commented-out lines record model runs that are currently disabled.
echo "Running in $(pwd)"

# Ensure the log directory exists; otherwise every redirection below fails.
mkdir -p ./experiments/runs

# run_catqa MODEL — benchmark MODEL on CatQA, logging to ./experiments/runs/MODEL-catqa.log.
# Uses '2>&1' so both streams share one file descriptor. The original
# '> log 2> log' opened the same file twice with independent offsets,
# causing stdout and stderr to overwrite each other.
run_catqa() {
  python experiments/prompt_benchmarks.py -m "$1" -d catqa --verbose \
    > "./experiments/runs/$1-catqa.log" 2>&1
}

echo "Beginning Llama 3.1 8B on CatQA"
run_catqa llama3.1-8b
echo "Beginning Llama 3 8B on CatQA"
run_catqa llama3-8b
echo "Beginning Llama 2 7B on CatQA"
run_catqa llama2-7b
echo "Experiments on Llama Completed"

# echo "Beginning Qwen 2 0.5B on CatQA"
# run_catqa qwen2-0.5b
# echo "Beginning Qwen 2 1.5B on CatQA"
# run_catqa qwen2-1.5b
# echo "Beginning Qwen 2 7B on CatQA"
# run_catqa qwen2-7b
# echo "Experiments on Qwen 2 Completed"

# echo "Beginning Phi 3 Mini on CatQA"
# run_catqa phi3-mini
# echo "Experiments on Phi 3 Mini Completed"

# echo "Beginning Mistral Nemo 12B on CatQA"
# run_catqa mistral-nemo-12b
# echo "Beginning Mixtral 8x7B on CatQA"
# run_catqa mistral-8x7b  # NOTE(review): flag says mistral-8x7b but echo says Mixtral — confirm model id
# echo "Beginning Mistral 7B on CatQA"
# run_catqa mistral-7b
# echo "Experiments on Mistral Completed"

# echo "Beginning Gemma 2 9B on CatQA"
# run_catqa gemma2-9b
# echo "Beginning Gemma 1.1 7B on CatQA"
# run_catqa gemma-1.1-7b
# echo "Beginning Gemma 7B on CatQA"
# run_catqa gemma-7b
# echo "Experiments on Gemma Completed"
109 changes: 109 additions & 0 deletions experiments/scripts/run_judge.sh
Original file line number Diff line number Diff line change
@@ -0,0 +1,109 @@
#!/usr/bin/env bash
# Runs judge-model benchmarks over several datasets, one combination after another.
# Each run's stdout AND stderr go to a single log file under ./experiments/runs/.
# Commented-out lines record judge/dataset combinations that are currently disabled.
echo "Running in $(pwd)"

# Ensure the log directory exists; otherwise every redirection below fails.
mkdir -p ./experiments/runs

# run_judge JUDGE DATASET — benchmark JUDGE on DATASET, logging to
# ./experiments/runs/JUDGE-DATASET.log. Uses '2>&1' so both streams share one
# file descriptor; the original '> log 2> log' opened the same file twice with
# independent offsets, causing stdout and stderr to overwrite each other.
run_judge() {
  python experiments/judge_benchmarks.py -j "$1" -d "$2" -v \
    > "./experiments/runs/$1-$2.log" 2>&1
}

# echo "Beginning Tests on xstest"
# run_judge llamaguard3 xstest
# run_judge llamaguard2 xstest
# run_judge llamaguard xstest
# run_judge walledguard xstest
# run_judge lionguard xstest
# run_judge promptguard xstest
# run_judge toxicitymodel xstest
# run_judge toxic-bert xstest
# run_judge multilingual-toxic-xlm-roberta xstest
# run_judge unbiased-toxic-roberta xstest

# echo "Beginning Tests on aya-ar"
# run_judge llamaguard3 aya-ar
# run_judge llamaguard2 aya-ar
# run_judge llamaguard aya-ar
# run_judge walledguard aya-ar
# run_judge lionguard aya-ar
# run_judge promptguard aya-ar
# run_judge toxicitymodel aya-ar
# run_judge toxic-bert aya-ar
# run_judge multilingual-toxic-xlm-roberta aya-ar
# run_judge unbiased-toxic-roberta aya-ar

# echo "Beginning Tests on aya-en"
# run_judge llamaguard3 aya-en
# run_judge llamaguard2 aya-en
# run_judge llamaguard aya-en
# run_judge walledguard aya-en
# run_judge lionguard aya-en
# run_judge promptguard aya-en
# run_judge toxicitymodel aya-en
# run_judge toxic-bert aya-en
# run_judge multilingual-toxic-xlm-roberta aya-en
# run_judge unbiased-toxic-roberta aya-en

# echo "Beginning Tests on aya-fi"
# run_judge llamaguard3 aya-fi
# run_judge llamaguard2 aya-fi
# run_judge llamaguard aya-fi
# run_judge walledguard aya-fi
# run_judge lionguard aya-fi
# run_judge promptguard aya-fi
# run_judge toxicitymodel aya-fi
# run_judge toxic-bert aya-fi
# run_judge multilingual-toxic-xlm-roberta aya-fi
# run_judge unbiased-toxic-roberta aya-fi

echo "Beginning Tests on aya-fr"
# run_judge llamaguard3 aya-fr
# run_judge llamaguard2 aya-fr
# run_judge llamaguard aya-fr
# run_judge walledguard aya-fr
run_judge lionguard aya-fr
run_judge promptguard aya-fr
# run_judge toxicitymodel aya-fr
# run_judge toxic-bert aya-fr
# run_judge multilingual-toxic-xlm-roberta aya-fr
# run_judge unbiased-toxic-roberta aya-fr

echo "Beginning Tests on aya-hi"
run_judge llamaguard3 aya-hi
run_judge llamaguard2 aya-hi
run_judge llamaguard aya-hi
run_judge walledguard aya-hi
run_judge lionguard aya-hi
run_judge promptguard aya-hi
# run_judge toxicitymodel aya-hi
# run_judge toxic-bert aya-hi
# run_judge multilingual-toxic-xlm-roberta aya-hi
# run_judge unbiased-toxic-roberta aya-hi

echo "Beginning Tests on aya-ru"
run_judge llamaguard3 aya-ru
run_judge llamaguard2 aya-ru
run_judge llamaguard aya-ru
run_judge walledguard aya-ru
run_judge lionguard aya-ru
run_judge promptguard aya-ru
# run_judge toxicitymodel aya-ru
# run_judge toxic-bert aya-ru
# run_judge multilingual-toxic-xlm-roberta aya-ru
# run_judge unbiased-toxic-roberta aya-ru

echo "Beginning Tests on aya-se"
run_judge llamaguard3 aya-se
run_judge llamaguard2 aya-se
run_judge llamaguard aya-se
run_judge walledguard aya-se
run_judge lionguard aya-se
run_judge promptguard aya-se
# run_judge toxicitymodel aya-se
# run_judge toxic-bert aya-se
# run_judge multilingual-toxic-xlm-roberta aya-se
# run_judge unbiased-toxic-roberta aya-se

echo "Beginning Tests on aya-sp"
run_judge llamaguard3 aya-sp
run_judge llamaguard2 aya-sp
run_judge llamaguard aya-sp
run_judge walledguard aya-sp
run_judge lionguard aya-sp
run_judge promptguard aya-sp
# run_judge toxicitymodel aya-sp
# run_judge toxic-bert aya-sp
# run_judge multilingual-toxic-xlm-roberta aya-sp
# run_judge unbiased-toxic-roberta aya-sp
7 changes: 7 additions & 0 deletions experiments/scripts/run_llama2_7b.sh
Original file line number Diff line number Diff line change
@@ -0,0 +1,7 @@
#!/usr/bin/env bash
# Runs Llama 2 7B on the HarmBench and AdvBench prompt benchmarks sequentially.
# Each run's stdout AND stderr go to a single log file under ./experiments/runs/.
echo "Running in $(pwd)"

# Ensure the log directory exists; otherwise the redirections below fail.
mkdir -p ./experiments/runs

# run_bench DATASET — benchmark llama2-7b on DATASET, logging to
# ./experiments/runs/llama2-7b-DATASET.log. Uses '2>&1' so both streams share
# one file descriptor; the original '> log 2> log' opened the same file twice,
# letting stdout and stderr overwrite each other.
run_bench() {
  python experiments/prompt_benchmarks.py -m llama2-7b -d "$1" --verbose \
    > "./experiments/runs/llama2-7b-$1.log" 2>&1
}

echo "Beginning Llama 2 7B on HarmBench"
run_bench harmbench
echo "Beginning Llama 2 7B on AdvBench"
run_bench advbench
echo "Experiments on Llama 2 7B Completed"
Loading

0 comments on commit 7307939

Please sign in to comment.