Merge branch 'main' of https://github.com/iamgroot42/mimir
iamgroot42 committed Jan 29, 2024
2 parents fc50f52 + e639e7a commit 27b2591
Showing 26 changed files with 907 additions and 480 deletions.
9 changes: 2 additions & 7 deletions configs/mi.json
@@ -9,15 +9,10 @@
     "output_name": "unified_mia",
     "specific_source": "Github",
     "n_samples": 1000,
-    "blackbox_attacks": ["ref"],
+    "blackbox_attacks": ["loss", "ref", "zlib", "min_k"],
     "ref_config": {
         "models": [
-            "stabilityai/stablelm-base-alpha-3b-v2",
-            "EleutherAI/gpt-neo-1.3B",
-            "EleutherAI/pythia-1.4b-deduped",
-            "EleutherAI/pythia-1.4b",
-            "facebook/opt-1.3B",
-            "distilgpt2"
+            "stabilityai/stablelm-base-alpha-3b-v2"
         ]
     },
     "neighborhood_config": {
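(For context: these attack keys presumably select mimir's standard membership-inference baselines — "loss" for loss thresholding, "ref" for reference-model calibration against the models listed in ref_config, "zlib" for zlib-entropy-normalized loss, "min_k" for Min-K% probability, and "ne" for the neighborhood attack configured by neighborhood_config. This reading is inferred from the config and key names, not stated in the commit.)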
34 changes: 34 additions & 0 deletions configs/new_mi.json
@@ -0,0 +1,34 @@
+{
+    "base_model": "EleutherAI/gpt-neo-125m",
+    "dataset_member": "the_pile",
+    "dataset_nonmember": "the_pile",
+    "min_words": 100,
+    "max_words": 200,
+    "max_tokens": 512,
+    "max_data": 100000,
+    "output_name": "unified_mia",
+    "specific_source": "Github",
+    "n_samples": 1000,
+    "blackbox_attacks": ["loss", "ref", "min_k"],
+    "ref_config": {
+        "models": [
+            "stabilityai/stablelm-base-alpha-3b-v2"
+        ]
+    },
+    "neighborhood_config": {
+        "model": "bert",
+        "n_perturbation_list": [
+            25
+        ],
+        "pct_words_masked": 0.3,
+        "span_length": 2,
+        "dump_cache": false,
+        "load_from_cache": true
+    },
+    "env_config": {
+        "results": "results_new",
+        "device_map": "balanced_low_0"
+    },
+    "dump_cache": false,
+    "load_from_cache": true
+}
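A config like this would presumably be consumed the same way the shell scripts in this commit consume the other configs, via run.py's --config flag. A minimal invocation sketch, mirroring data/cache_data.sh and data/neighbor_gen.sh (the --specific_source override is hypothetical here and redundant with the value already in the config):

python run.py \
    --config configs/new_mi.json \
    --specific_source Github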
13 changes: 1 addition & 12 deletions configs/ref_exp_mi.json
@@ -12,18 +12,7 @@
     "blackbox_attacks": ["ref"],
     "ref_config": {
         "models": [
-            "EleutherAI/pythia-70m-deduped",
-            "EleutherAI/pythia-1.4b-deduped",
-            "EleutherAI/pythia-2.8b-deduped",
-            "gpt2",
-            "distilgpt2",
-            "EleutherAI/gpt-neo-2.7B",
-            "EleutherAI/gpt-neo-1.3B",
-            "EleutherAI/gpt-neo-125m",
-            "facebook/opt-350m",
-            "facebook/opt-1.3B",
-            "facebook/opt-125m",
-            "kernelmachine/silo-pdswby-1.3b"
+            "huggyllama/llama-7b"
         ]
     },
     "neighborhood_config": {
9 changes: 4 additions & 5 deletions configs/single_gpu_mi.json
@@ -9,11 +9,9 @@
     "output_name": "unified_mia",
     "specific_source": "Github",
     "n_samples": 1000,
-    "blackbox_attacks": ["ref"],
+    "blackbox_attacks": ["ne"],
     "ref_config": {
-        "models": [
-            "huggyllama/llama-13b"
-        ]
+        "models": []
     },
     "neighborhood_config": {
         "model": "bert",
@@ -27,7 +25,8 @@
     },
     "env_config": {
         "results": "results_new",
-        "device": "cuda:0"
+        "device": "cuda:0",
+        "device_aux": "cuda:0"
     },
     "dump_cache": false,
     "load_from_cache": true
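(The added device_aux key presumably lets auxiliary models — reference models, the neighborhood model — be placed on a device separately from the target model; pinning both to cuda:0 is consistent with this being the single-GPU config, where new_mi.json above instead spreads models with device_map: balanced_low_0.)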
34 changes: 27 additions & 7 deletions data/cache_data.sh
@@ -1,15 +1,21 @@
 #!/bin/bash
 ngram=7
-for subset in "full_pile"
+for date in "arxiv_2020-08" #"arxiv_2021-01" "arxiv_2021-06" "arxiv_2022-06" "arxiv_2023-06" #"full_pile"
 do
-    echo caching data for $subset
+    # echo caching data for $subset
+    # python run.py \
+    #     --config configs/cache_data.json \
+    #     --presampled_dataset_member "/gscratch/h2lab/micdun/mimir/data/full_pile/full_pile_10000/train_raw.jsonl" \
+    #     --presampled_dataset_nonmember "/gscratch/h2lab/micdun/mimir/data/full_pile/0.0-0.8/full_pile_10000/test_raw.jsonl" \
+    #     --specific_source $subset \
+    #     --max_data 10000 \
+    #     --n_samples 10000
     python run.py \
         --config configs/cache_data.json \
-        --presampled_dataset_member "/gscratch/h2lab/micdun/mimir/data/full_pile/full_pile_10000/train_raw.jsonl" \
-        --presampled_dataset_nonmember "/gscratch/h2lab/micdun/mimir/data/full_pile/0.0-0.8/full_pile_10000/test_raw.jsonl" \
-        --specific_source $subset \
-        --max_data 10000 \
-        --n_samples 10000
+        --presampled_dataset_member "/gscratch/h2lab/micdun/mimir/data/pile_subsets/arxiv/train_raw.jsonl" \
+        --presampled_dataset_nonmember "/mmfs1/gscratch/h2lab/micdun/mimir/data/temporal_arxiv/${date}/${date}/test_raw.jsonl" \
+        --specific_source $date \
+        --n_samples 1000
 done
 #"/gscratch/h2lab/micdun/mimir/data/ngram_overlap_thresholded_pile_subsets/truncated+ngram_$ngram/0.0-0.2/$subset/test_raw.jsonl"
 
@@ -32,3 +38,17 @@ done
 # --presampled_dataset_member "/gscratch/h2lab/micdun/mimir/data/wikiMIA/sampled/wikiMIA/train_raw.jsonl" \
 # --presampled_dataset_nonmember "/gscratch/h2lab/micdun/mimir/data/wikiMIA/sampled/wikiMIA/test_raw.jsonl" \
 # --specific_source wikiMIA
+
+# python run.py \
+#     --config configs/cache_data.json \
+#     --presampled_dataset_member "/gscratch/h2lab/micdun/mimir/data/pile_subsets/wikipedia_(en)/train_raw.jsonl" \
+#     --presampled_dataset_nonmember "/mmfs1/gscratch/h2lab/micdun/mimir/data/temporal_wiki/temporal_wiki/test_raw.jsonl" \
+#     --specific_source temporal_wiki_full \
+#     --n_samples 1000
+
+# python run.py \
+#     --config configs/cache_data.json \
+#     --presampled_dataset_member "/gscratch/h2lab/micdun/mimir/data/pile_subsets/arxiv/train_raw.jsonl" \
+#     --presampled_dataset_nonmember "/mmfs1/gscratch/h2lab/micdun/mimir/data/temporal_arxiv/${date}/${date}/test_raw.jsonl" \
+#     --specific_source $date \
+#     --n_samples 1000
3 changes: 2 additions & 1 deletion data/create_datasets.py
@@ -29,7 +29,8 @@ def process_texts(data, min_len, provided_subset=None):
         tokenized = text.split()
         # Initial simple filter to get candidates surpassing min_len requirement
         if len(tokenized) >= min_len:
-            dp["raw"] = text
+            # TODO: for temporal_wiki, need to append title metadata to front. Should refactor this
+            dp["raw"] = dp["title"] + "\n\n" + text if "title" in dp and provided_subset == 'temporal_wiki' else text
             subset_samples[pile_subset].append(dp)
 
     return subset_samples, subset_counts
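The inline conditional added above is exactly what its TODO flags for refactoring. One possible shape for that refactor — a hypothetical helper, not part of this commit — keeps the subset-specific formatting out of the filter loop:

def format_raw_text(dp, text, provided_subset=None):
    # Hypothetical helper: temporal_wiki samples get their title metadata
    # prepended; every other subset passes the text through unchanged.
    if provided_subset == 'temporal_wiki' and "title" in dp:
        return dp["title"] + "\n\n" + text
    return text

# Inside process_texts, the one-liner would then become:
#     dp["raw"] = format_raw_text(dp, text, provided_subset)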
9 changes: 8 additions & 1 deletion data/create_pile_subsets.sh
@@ -64,4 +64,11 @@ done
 # --n_samples 1000 \
 # --max_ngram_overlap 0.8 \
 # --provided_subset arxiv_2020-06 \
-# --split test
+# --split test
+
+python create_datasets.py \
+    /mmfs1/gscratch/h2lab/micdun/mimir/data/temporal_wiki/wikitext_latest_full.json \
+    --benchmark_dir temporal_wiki \
+    --provided_subset temporal_wiki \
+    --split test \
+    --n_samples 1000
6 changes: 2 additions & 4 deletions data/neighbor_gen.sh
@@ -1,11 +1,9 @@
 #!/bin/bash
 # Ideally, run after caching data with cache_data.json
-for subset in "full_pile" #"wikimia" "arxiv_2021_01" "arxiv_2021_06" "arxiv_2022_01" "arxiv_2022_06" "arxiv_2023_01" "arxiv_2023_06" "c4"
+for subset in "arxiv_2020-08" "arxiv_2021-01" "arxiv_2021-06" "arxiv_2022-01" "arxiv_2022-06" "arxiv_2023-01" "arxiv_2023-06" #"temporal_wiki_full" #"wikimia" "c4"
 do
     echo generating neighbors for $subset
     python run.py \
         --config configs/neighbor_gen_new.json \
-        --specific_source $subset \
-        --max_data 10000 \
-        --n_samples 10000
+        --specific_source $subset
 done