Skip to content

Commit

Permalink
Organize scripts for generating paper tables/figures
Browse files Browse the repository at this point in the history
  • Loading branch information
iamgroot42 committed Jan 29, 2024
1 parent 27b2591 commit a246f3a
Show file tree
Hide file tree
Showing 7 changed files with 116 additions and 2 deletions.
3 changes: 2 additions & 1 deletion configs/mi.json
Original file line number Diff line number Diff line change
Expand Up @@ -23,7 +23,8 @@
"pct_words_masked": 0.3,
"span_length": 2,
"dump_cache": false,
"load_from_cache": true
"load_from_cache": true,
"neighbor_strategy": "random"
},
"env_config": {
"results": "results_new",
Expand Down
24 changes: 24 additions & 0 deletions scripts/paper_scripts/figure2.sh
Original file line number Diff line number Diff line change
@@ -0,0 +1,24 @@
#!/bin/bash
# Runs the experiments tagged "figure2": one run.py invocation for every
# combination of Pythia (deduped) model size and checkpoint range below,
# using presampled, pretokenized member / non-member data.
#
# Each entry of `ranges` is "<start> <end> <step>":
#   <start>-<end>  picks the presampled member index file and names the source
#   <step>         picks the model revision ("step<step>")
version=figure2
n_samples=1000

model_sizes=("1.4b" "12b" "2.8b" "6.9b")

ranges=(
  "900 1000 1000"
  "9900 10000 10000"
  "19900 20000 20000"
  "29900 30000 30000"
  "39900 40000 40000"
  "49900 50000 50000"
  "4900 5000 5000"
  "14900 15000 15000"
  "24900 25000 25000"
  "34900 35000 35000"
  "44900 45000 45000"
  "59900 60000 60000"
  "69900 70000 70000"
  "79900 80000 80000"
  "89900 90000 90000"
  "98900 99000 99000"
  "54900 55000 55000"
  "64900 65000 65000"
  "74900 75000 75000"
  "84900 85000 85000"
  "94900 95000 95000"
)

# Locations of the presampled data (cluster-specific absolute paths).
member_dir=/gscratch/h2lab/micdun/pile-domains/pythia/utils/batch_viewing/token_indicies
nonmember_file=/gscratch/h2lab/micdun/mimir/data/tokenized_test/0.0-0.8/full_pile_${n_samples}/test_tk.npy

for model in "${model_sizes[@]}"; do
  for range_spec in "${ranges[@]}"; do
    # Split the three whitespace-separated fields into named variables.
    read -r idx_start idx_end ckpt_step <<< "$range_spec"

    # Progress marker: which checkpoint is about to be evaluated.
    echo "ckpt${ckpt_step}"

    python run.py \
      --config configs/mi.json \
      --base_model "EleutherAI/pythia-${model}-deduped" \
      --revision "step${ckpt_step}" \
      --presampled_dataset_member "${member_dir}/${idx_start}-${idx_end}-indicies-n${n_samples}-samples.npy" \
      --presampled_dataset_nonmember "$nonmember_file" \
      --n_samples "$n_samples" \
      --specific_source "${idx_start}-${idx_end}-pile" \
      --output_name "$version" \
      --pretokenized true
  done
done
25 changes: 25 additions & 0 deletions scripts/paper_scripts/table1.sh
Original file line number Diff line number Diff line change
@@ -0,0 +1,25 @@
#!/bin/bash
# Runs the experiments tagged "table1": for every Pythia (deduped) model at
# revision step99000, one run.py invocation per n-gram-filtered subset, plus
# one invocation on "full_pile" with 10000 samples.
version=table1
ngram=13

for model in "pythia-160m" "pythia-1.4b" "pythia-2.8b" "pythia-6.9b" "pythia-12b"; do
  for subset in "wikipedia_(en)" "github" "pile_cc" "pubmed_central" "arxiv" "dm_mathematics" "hackernews"; do
    # NOTE(review): `knocky` is not defined in this script; presumably a
    # wrapper command available on the author's PATH -- confirm before running.
    knocky python run.py \
      --config configs/mi.json \
      --revision step99000 \
      --base_model "EleutherAI/${model}-deduped" \
      --specific_source "${subset}_ngram_${ngram}_<0.8_truncated" \
      --output_name "$version"
  done

  # full_pile specifically.
  # Every argument line except the last must end in a backslash: without it
  # the command ends early and "--n_samples 10000" is executed as a separate
  # (non-existent) command instead of being passed to run.py.
  python run.py \
    --config configs/mi.json \
    --revision step99000 \
    --base_model "EleutherAI/${model}-deduped" \
    --specific_source "full_pile" \
    --output_name "$version" \
    --n_samples 10000
done

23 changes: 23 additions & 0 deletions scripts/paper_scripts/table11.sh
Original file line number Diff line number Diff line change
@@ -0,0 +1,23 @@
#!/bin/bash
# Runs the experiments tagged "table11": for every GPT-Neo model listed below,
# one run.py invocation per n-gram-filtered subset, plus one invocation on
# "full_pile" with 10000 samples.
version=table11
ngram=13

for model in "gpt-neo-125m" "gpt-neo-1.3B" "gpt-neo-2.7B"; do
  for subset in "wikipedia_(en)" "github" "pile_cc" "pubmed_central" "arxiv" "dm_mathematics" "hackernews"; do
    # NOTE(review): `knocky` is not defined in this script; presumably a
    # wrapper command available on the author's PATH -- confirm before running.
    knocky python run.py \
      --config configs/mi.json \
      --base_model "EleutherAI/${model}" \
      --specific_source "${subset}_ngram_${ngram}_<0.8_truncated" \
      --output_name "$version"
  done

  # full_pile specifically.
  # Every argument line except the last must end in a backslash: without it
  # the command ends early and "--n_samples 10000" is executed as a separate
  # (non-existent) command instead of being passed to run.py.
  python run.py \
    --config configs/mi.json \
    --base_model "EleutherAI/${model}" \
    --specific_source "full_pile" \
    --output_name "$version" \
    --n_samples 10000
done

16 changes: 16 additions & 0 deletions scripts/paper_scripts/table3.sh
Original file line number Diff line number Diff line change
@@ -0,0 +1,16 @@
#!/bin/bash
# Runs the experiments tagged "table3": Pythia-12b (deduped, revision
# step99000) on each subset below, once per n-gram size, using the
# "<0.2" truncated variant of every source.
version=table3

ngram_sizes=("7" "13")
subsets=("wikipedia_(en)" "github" "pubmed_central" "pile_cc" "arxiv")

for ngram in "${ngram_sizes[@]}"; do
  for subset in "${subsets[@]}"; do
    # Source name passed to run.py, e.g. "github_ngram_7_<0.2_truncated".
    source_name="${subset}_ngram_${ngram}_<0.2_truncated"

    # NOTE(review): `knocky` is not defined in this script; presumably a
    # wrapper command available on the author's PATH -- confirm before running.
    knocky python run.py \
      --config configs/mi.json \
      --revision step99000 \
      --base_model "EleutherAI/pythia-12b-deduped" \
      --specific_source "$source_name" \
      --output_name "$version"
  done
done

25 changes: 25 additions & 0 deletions scripts/paper_scripts/table8.sh
Original file line number Diff line number Diff line change
@@ -0,0 +1,25 @@
#!/bin/bash
# Runs the experiments tagged "table8": for every Pythia model listed below
# (model names used as-is, without a "-deduped" suffix) at revision step99000,
# one run.py invocation per n-gram-filtered subset, plus one invocation on
# "full_pile" with 10000 samples.
version=table8
ngram=13

for model in "pythia-160m" "pythia-1.4b" "pythia-2.8b" "pythia-6.9b" "pythia-12b"; do
  for subset in "wikipedia_(en)" "github" "pile_cc" "pubmed_central" "arxiv" "dm_mathematics" "hackernews"; do
    # NOTE(review): `knocky` is not defined in this script; presumably a
    # wrapper command available on the author's PATH -- confirm before running.
    knocky python run.py \
      --config configs/mi.json \
      --revision step99000 \
      --base_model "EleutherAI/${model}" \
      --specific_source "${subset}_ngram_${ngram}_<0.8_truncated" \
      --output_name "$version"
  done

  # full_pile specifically.
  # Every argument line except the last must end in a backslash: without it
  # the command ends early and "--n_samples 10000" is executed as a separate
  # (non-existent) command instead of being passed to run.py.
  python run.py \
    --config configs/mi.json \
    --revision step99000 \
    --base_model "EleutherAI/${model}" \
    --specific_source "full_pile" \
    --output_name "$version" \
    --n_samples 10000
done

2 changes: 1 addition & 1 deletion setup.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,7 +5,7 @@
description="Python package for measuring memorization in LLMs",
author="Anshuman Suri, Michael Duan, Niloofar Mireshghallah",
author_email="[email protected]",
version="0.5",
version="0.6",
url="https://github.com/iamgroot42/mimir",
license="MIT",
python_requires=">=3.9",
Expand Down

0 comments on commit a246f3a

Please sign in to comment.