Organize scripts for generating paper tables/figures

iamgroot42 · Jan 29, 2024 · a246f3a · a246f3a
1 parent 27b2591
commit a246f3a
Show file tree

Hide file tree

Showing 7 changed files with 116 additions and 2 deletions.
diff --git a/configs/mi.json b/configs/mi.json
@@ -23,7 +23,8 @@
         "pct_words_masked": 0.3,
         "span_length": 2,
         "dump_cache": false,
-        "load_from_cache": true
+        "load_from_cache": true,
+        "neighbor_strategy": "random"
     },
     "env_config": {
         "results": "results_new",

diff --git a/scripts/paper_scripts/figure2.sh b/scripts/paper_scripts/figure2.sh
@@ -0,0 +1,24 @@
+#!/bin/bash
+# Figure 2 runs: invokes run.py once per model size and per index window below,
+# evaluating the Pythia (deduped) checkpoint named by each window's third field.
+version=figure2
+n_samples=1000
+
+for model in "1.4b" "12b" "2.8b" "6.9b"; do
+    for i_range in "900 1000 1000" "9900 10000 10000" "19900 20000 20000" "29900 30000 30000" "39900 40000 40000" "49900 50000 50000" "4900 5000 5000" "14900 15000 15000" "24900 25000 25000" "34900 35000 35000" "44900 45000 45000" "59900 60000 60000" "69900 70000 70000" "79900 80000 80000" "89900 90000 90000" "98900 99000 99000" "54900 55000 55000" "64900 65000 65000" "74900 75000 75000" "84900 85000 85000" "94900 95000 95000"; do
+        # Each entry holds three space-separated fields: the two index bounds used in
+        # the member-data filename and source name, then the checkpoint step number.
+        read -r lo hi step <<< "$i_range"
+        echo "ckpt$step"
+        python run.py \
+            --config configs/mi.json \
+            --base_model "EleutherAI/pythia-$model-deduped" \
+            --revision "step$step" \
+            --presampled_dataset_member "/gscratch/h2lab/micdun/pile-domains/pythia/utils/batch_viewing/token_indicies/$lo-$hi-indicies-n$n_samples-samples.npy" \
+            --presampled_dataset_nonmember "/gscratch/h2lab/micdun/mimir/data/tokenized_test/0.0-0.8/full_pile_$n_samples/test_tk.npy" \
+            --n_samples "$n_samples" \
+            --specific_source "$lo-$hi-pile" \
+            --output_name "$version" \
+            --pretokenized true
+    done
+done
diff --git a/scripts/paper_scripts/table1.sh b/scripts/paper_scripts/table1.sh
@@ -0,0 +1,25 @@
+#!/bin/bash
+version=table1
+ngram=13
+# For each Pythia (deduped) size at step99000: one run per subset, then one on full_pile.
+for model in "pythia-160m" "pythia-1.4b" "pythia-2.8b" "pythia-6.9b" "pythia-12b"
+do
+    for subset in "wikipedia_(en)" "github" "pile_cc" "pubmed_central" "arxiv" "dm_mathematics" "hackernews"
+    do
+        knocky python run.py \
+            --config configs/mi.json \
+            --revision step99000 \
+            --base_model "EleutherAI/${model}-deduped" \
+            --specific_source "${subset}_ngram_${ngram}_<0.8_truncated" \
+            --output_name "$version"
+    done
+    # full_pile specifically; each flag line except the last must end in a backslash
+    python run.py \
+            --config configs/mi.json \
+            --revision step99000 \
+            --base_model "EleutherAI/${model}-deduped" \
+            --specific_source "full_pile" \
+            --output_name "$version" \
+            --n_samples 10000
+done
+
diff --git a/scripts/paper_scripts/table11.sh b/scripts/paper_scripts/table11.sh
@@ -0,0 +1,23 @@
+#!/bin/bash
+version=table11
+ngram=13
+# For each GPT-Neo size: one run.py invocation per subset, then one on full_pile.
+for model in "gpt-neo-125m" "gpt-neo-1.3B" "gpt-neo-2.7B"
+do
+    for subset in "wikipedia_(en)" "github" "pile_cc" "pubmed_central" "arxiv" "dm_mathematics" "hackernews"
+    do
+        knocky python run.py \
+            --config configs/mi.json \
+            --base_model "EleutherAI/${model}" \
+            --specific_source "${subset}_ngram_${ngram}_<0.8_truncated" \
+            --output_name "$version"
+    done
+    # full_pile specifically; each flag line except the last must end in a backslash
+    python run.py \
+            --config configs/mi.json \
+            --base_model "EleutherAI/${model}" \
+            --specific_source "full_pile" \
+            --output_name "$version" \
+            --n_samples 10000
+done
+
diff --git a/scripts/paper_scripts/table3.sh b/scripts/paper_scripts/table3.sh
@@ -0,0 +1,16 @@
+#!/bin/bash
+# Table 3 runs: pythia-12b-deduped at step99000, for each n-gram size and subset.
+version=table3
+
+for ngram in "7" "13"; do
+    for subset in "wikipedia_(en)" "github" "pubmed_central" "pile_cc" "arxiv"; do
+        # The source name combines the subset, the n-gram size and a fixed "<0.2" tag.
+        knocky python run.py \
+            --config configs/mi.json \
+            --revision step99000 \
+            --base_model "EleutherAI/pythia-12b-deduped" \
+            --specific_source "${subset}_ngram_${ngram}_<0.2_truncated" \
+            --output_name "$version"
+    done
+done
+
diff --git a/scripts/paper_scripts/table8.sh b/scripts/paper_scripts/table8.sh
@@ -0,0 +1,25 @@
+#!/bin/bash
+version=table8
+ngram=13
+# For each Pythia size at step99000: one run.py invocation per subset, then one on full_pile.
+for model in "pythia-160m" "pythia-1.4b" "pythia-2.8b" "pythia-6.9b" "pythia-12b"
+do
+    for subset in "wikipedia_(en)" "github" "pile_cc" "pubmed_central" "arxiv" "dm_mathematics" "hackernews"
+    do
+        knocky python run.py \
+            --config configs/mi.json \
+            --revision step99000 \
+            --base_model "EleutherAI/${model}" \
+            --specific_source "${subset}_ngram_${ngram}_<0.8_truncated" \
+            --output_name "$version"
+    done
+    # full_pile specifically; each flag line except the last must end in a backslash
+    python run.py \
+            --config configs/mi.json \
+            --revision step99000 \
+            --base_model "EleutherAI/${model}" \
+            --specific_source "full_pile" \
+            --output_name "$version" \
+            --n_samples 10000
+done
+
diff --git a/setup.py b/setup.py
@@ -5,7 +5,7 @@
     description="Python package for measuring memorization in LLMs",
     author="Anshuman Suri, Michael Duan, Niloofar Mireshghallah",
     author_email="[email protected]",
-    version="0.5",
+    version="0.6",
     url="https://github.com/iamgroot42/mimir",
     license="MIT",
     python_requires=">=3.9",