
Commit

Merge pull request #3 from iamgroot42/michael/exp_changes
Update to NE code
iamgroot42 authored Jan 28, 2024
2 parents 8640376 + a9691fa commit b16e461
Showing 25 changed files with 734 additions and 410 deletions.
9 changes: 2 additions & 7 deletions configs/mi.json
@@ -9,15 +9,10 @@
"output_name": "unified_mia",
"specific_source": "Github",
"n_samples": 1000,
"blackbox_attacks": ["ref"],
"blackbox_attacks": ["loss","ref", "zlib", "min_k"],
"ref_config": {
"models": [
"stabilityai/stablelm-base-alpha-3b-v2",
"EleutherAI/gpt-neo-1.3B",
"EleutherAI/pythia-1.4b-deduped",
"EleutherAI/pythia-1.4b",
"facebook/opt-1.3B",
"distilgpt2"
"stabilityai/stablelm-base-alpha-3b-v2"
]
},
"neighborhood_config": {
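Note on the widened `blackbox_attacks` list above: it enables the loss, zlib, and min-k scores alongside the reference attack. As a rough sketch of the two simplest of these, assuming per-text negative log-likelihoods already computed by the target model (function names here are illustrative, not the repository's API):

```python
# Minimal sketch of the "loss" and "zlib" membership scores; illustrative only.
import zlib

def loss_attack_score(target_nll: float) -> float:
    # "loss" attack: a lower negative log-likelihood suggests membership.
    return target_nll

def zlib_attack_score(text: str, target_nll: float) -> float:
    # "zlib" attack: normalise the NLL by the zlib-compressed length so that
    # highly repetitive (easily compressed) text is not over-flagged.
    return target_nll / len(zlib.compress(text.encode("utf-8")))
```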
34 changes: 34 additions & 0 deletions configs/new_mi.json
@@ -0,0 +1,34 @@
{
"base_model": "EleutherAI/gpt-neo-125m",
"dataset_member": "the_pile",
"dataset_nonmember": "the_pile",
"min_words": 100,
"max_words": 200,
"max_tokens": 512,
"max_data": 100000,
"output_name": "unified_mia",
"specific_source": "Github",
"n_samples": 1000,
"blackbox_attacks": ["loss", "ref", "min_k"],
"ref_config": {
"models": [
"stabilityai/stablelm-base-alpha-3b-v2"
]
},
"neighborhood_config": {
"model": "bert",
"n_perturbation_list": [
25
],
"pct_words_masked": 0.3,
"span_length": 2,
"dump_cache": false,
"load_from_cache": true
},
"env_config": {
"results": "results_new",
"device_map": "balanced_low_0"
},
"dump_cache": false,
"load_from_cache": true
}
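For orientation, a minimal sketch of how a config like this might be consumed. The loop below only prints what it would run; it does not reproduce the repository's actual driver:

```python
# Illustrative config walkthrough, assuming the file above is on disk.
import json

with open("configs/new_mi.json") as f:
    config = json.load(f)

ref_models = config["ref_config"]["models"]             # used by the "ref" attack
n_perturbations = config["neighborhood_config"]["n_perturbation_list"]

for attack in config["blackbox_attacks"]:                # ["loss", "ref", "min_k"]
    print(f"{attack}: {config['base_model']} on {config['n_samples']} "
          f"{config['specific_source']} samples")
```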
13 changes: 1 addition & 12 deletions configs/ref_exp_mi.json
@@ -12,18 +12,7 @@
"blackbox_attacks": ["ref"],
"ref_config": {
"models": [
"EleutherAI/pythia-70m-deduped",
"EleutherAI/pythia-1.4b-deduped",
"EleutherAI/pythia-2.8b-deduped",
"gpt2",
"distilgpt2",
"EleutherAI/gpt-neo-2.7B",
"EleutherAI/gpt-neo-1.3B",
"EleutherAI/gpt-neo-125m",
"facebook/opt-350m",
"facebook/opt-1.3B",
"facebook/opt-125m",
"kernelmachine/silo-pdswby-1.3b"
"huggyllama/llama-7b"
]
},
"neighborhood_config": {
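The reference list is narrowed to a single model here. For context, a minimal sketch of the reference-calibrated score this kind of attack is generally understood to compute (illustrative, not the repository's code):

```python
# Reference ("ref") attack score, sketched under the usual formulation.
def ref_attack_score(target_nll: float, reference_nll: float) -> float:
    # More negative => the target model fits the text unusually well relative
    # to the reference model, which is evidence of membership.
    return target_nll - reference_nll
```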
9 changes: 4 additions & 5 deletions configs/single_gpu_mi.json
@@ -9,11 +9,9 @@
"output_name": "unified_mia",
"specific_source": "Github",
"n_samples": 1000,
"blackbox_attacks": ["ref"],
"blackbox_attacks": ["ne"],
"ref_config": {
"models": [
"huggyllama/llama-13b"
]
"models": []
},
"neighborhood_config": {
"model": "bert",
@@ -27,7 +25,8 @@
},
"env_config": {
"results": "results_new",
"device": "cuda:0"
"device": "cuda:0",
"device_aux": "cuda:0"
},
"dump_cache": false,
"load_from_cache": true
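This config switches to the neighbourhood ("ne") attack and adds a `device_aux` entry, presumably for the auxiliary neighbour-generation model; both devices point at `cuda:0` in the single-GPU case. A minimal sketch of the neighbourhood comparison itself (illustrative names, not the repository's API):

```python
# Neighbourhood attack score, sketched: compare the candidate's loss with the
# mean loss of its perturbed "neighbour" texts.
import numpy as np

def neighborhood_score(candidate_nll: float, neighbor_nlls: list) -> float:
    # Members tend to have a noticeably lower loss than their neighbours.
    return candidate_nll - float(np.mean(neighbor_nlls))
```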
34 changes: 27 additions & 7 deletions data/cache_data.sh
@@ -1,15 +1,21 @@
#!/bin/bash
ngram=7
for subset in "full_pile"
for date in "arxiv_2020-08" #"arxiv_2021-01" "arxiv_2021-06" "arxiv_2022-06" "arxiv_2023-06" #"full_pile"
do
echo caching data for $subset
# echo caching data for $subset
# python run.py \
# --config configs/cache_data.json \
# --presampled_dataset_member "/gscratch/h2lab/micdun/mimir/data/full_pile/full_pile_10000/train_raw.jsonl" \
# --presampled_dataset_nonmember "/gscratch/h2lab/micdun/mimir/data/full_pile/0.0-0.8/full_pile_10000/test_raw.jsonl" \
# --specific_source $subset \
# --max_data 10000 \
# --n_samples 10000
python run.py \
--config configs/cache_data.json \
--presampled_dataset_member "/gscratch/h2lab/micdun/mimir/data/full_pile/full_pile_10000/train_raw.jsonl" \
--presampled_dataset_nonmember "/gscratch/h2lab/micdun/mimir/data/full_pile/0.0-0.8/full_pile_10000/test_raw.jsonl" \
--specific_source $subset \
--max_data 10000 \
--n_samples 10000
--presampled_dataset_member "/gscratch/h2lab/micdun/mimir/data/pile_subsets/arxiv/train_raw.jsonl" \
--presampled_dataset_nonmember "/mmfs1/gscratch/h2lab/micdun/mimir/data/temporal_arxiv/${date}/${date}/test_raw.jsonl" \
--specific_source $date \
--n_samples 1000
done
#"/gscratch/h2lab/micdun/mimir/data/ngram_overlap_thresholded_pile_subsets/truncated+ngram_$ngram/0.0-0.2/$subset/test_raw.jsonl"

@@ -32,3 +38,17 @@ done
# --presampled_dataset_member "/gscratch/h2lab/micdun/mimir/data/wikiMIA/sampled/wikiMIA/train_raw.jsonl" \
# --presampled_dataset_nonmember "/gscratch/h2lab/micdun/mimir/data/wikiMIA/sampled/wikiMIA/test_raw.jsonl" \
# --specific_source wikiMIA

# python run.py \
# --config configs/cache_data.json \
# --presampled_dataset_member "/gscratch/h2lab/micdun/mimir/data/pile_subsets/wikipedia_(en)/train_raw.jsonl" \
# --presampled_dataset_nonmember "/mmfs1/gscratch/h2lab/micdun/mimir/data/temporal_wiki/temporal_wiki/test_raw.jsonl" \
# --specific_source temporal_wiki_full \
# --n_samples 1000

# python run.py \
# --config configs/cache_data.json \
# --presampled_dataset_member "/gscratch/h2lab/micdun/mimir/data/pile_subsets/arxiv/train_raw.jsonl" \
# --presampled_dataset_nonmember "/mmfs1/gscratch/h2lab/micdun/mimir/data/temporal_arxiv/{$date}/{$date}/test_raw.jsonl" \
# --specific_source $date \
# --n_samples 1000
3 changes: 2 additions & 1 deletion data/create_datasets.py
@@ -29,7 +29,8 @@ def process_texts(data, min_len, provided_subset=None):
tokenized = text.split()
# Initial simple filter to get candidates surpassing min_len requirement
if len(tokenized) >= min_len:
dp["raw"] = text
# TODO: for temporal_wiki, need to append title metadata to front. Should refactor this
dp["raw"] = dp["title"] + "\n\n" + text if "title" in dp and provided_subset == 'temporal_wiki' else text
subset_samples[pile_subset].append(dp)

return subset_samples, subset_counts
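The inline conditional above carries a TODO about refactoring. One possible shape for that helper (hypothetical, not part of the repository):

```python
# Sketch of the refactor the TODO asks for; names are illustrative.
def format_raw_text(dp: dict, text: str, provided_subset=None) -> str:
    """Prepend title metadata for temporal_wiki samples; pass others through."""
    if provided_subset == "temporal_wiki" and "title" in dp:
        return dp["title"] + "\n\n" + text
    return text
```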
9 changes: 8 additions & 1 deletion data/create_pile_subsets.sh
@@ -64,4 +64,11 @@ done
# --n_samples 1000 \
# --max_ngram_overlap 0.8 \
# --provided_subset arxiv_2020-06 \
# --split test
# --split test

python create_datasets.py \
/mmfs1/gscratch/h2lab/micdun/mimir/data/temporal_wiki/wikitext_latest_full.json \
--benchmark_dir temporal_wiki \
--provided_subset temporal_wiki \
--split test \
--n_samples 1000
6 changes: 2 additions & 4 deletions data/neighbor_gen.sh
@@ -1,11 +1,9 @@
#!/bin/bash
# Ideally, run after caching data with cache_data.json
for subset in "full_pile" #"wikimia" "arxiv_2021_01" "arxiv_2021_06" "arxiv_2022_01" "arxiv_2022_06" "arxiv_2023_01" "arxiv_2023_06" "c4"
for subset in "arxiv_2020-08" "arxiv_2021-01" "arxiv_2021-06" "arxiv_2022-01" "arxiv_2022-06" "arxiv_2023-01" "arxiv_2023-06" #"temporal_wiki_full" #"wikimia" "c4"
do
echo generating neighbors for $subset
python run.py \
--config configs/neighbor_gen_new.json \
--specific_source $subset \
--max_data 10000 \
--n_samples 10000
--specific_source $subset
done
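For readers unfamiliar with the neighbour-generation step this script drives: neighbours are produced by masking short spans and letting a masked LM fill them in, matching the bert-based `neighborhood_config` seen in the configs above. A rough, self-contained sketch of that idea (not the repository's implementation):

```python
# Generate one "neighbour" of a text by span masking + BERT mask filling.
import random
from transformers import pipeline

fill = pipeline("fill-mask", model="bert-base-uncased")

def make_neighbor(text: str, pct_words_masked: float = 0.3, span_length: int = 2) -> str:
    words = text.split()
    n_spans = max(1, int(len(words) * pct_words_masked / span_length))
    for _ in range(n_spans):
        start = random.randrange(0, max(1, len(words) - span_length))
        # Replace one span with a single mask token and let BERT propose a fill.
        masked = words[:start] + [fill.tokenizer.mask_token] + words[start + span_length:]
        best = fill(" ".join(masked), top_k=1)[0]["token_str"]
        words = words[:start] + [best] + words[start + span_length:]
    return " ".join(words)
```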
4 changes: 2 additions & 2 deletions local/agg_ref_mia.py
@@ -16,8 +16,8 @@
for ref in args.ref_files:
f_ref_metrics = open(ref)
ref_metrics_dict = json.load(f_ref_metrics)
ref_member_scores = np.array(ref_metrics_dict['predictions']["members"])
ref_nonmember_scores = np.array(ref_metrics_dict['predictions']["nonmembers"])
ref_member_scores = np.array(ref_metrics_dict['predictions']["member"])
ref_nonmember_scores = np.array(ref_metrics_dict['predictions']["nonmember"])

if sum_member_scores is None:
sum_member_scores = ref_member_scores
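The fix above matches the singular "member"/"nonmember" keys in the prediction dictionaries. For context, a self-contained sketch of the aggregation pattern the surrounding loop implements, summing per-sample scores across reference runs and then averaging (illustrative, not the script itself):

```python
# Aggregate per-sample MIA scores across several reference-model result files.
import json
import numpy as np

def aggregate_ref_scores(ref_files):
    sum_member = sum_nonmember = None
    for path in ref_files:
        with open(path) as f:
            preds = json.load(f)["predictions"]
        member = np.array(preds["member"])
        nonmember = np.array(preds["nonmember"])
        sum_member = member if sum_member is None else sum_member + member
        sum_nonmember = nonmember if sum_nonmember is None else sum_nonmember + nonmember
    # Average over the number of reference models.
    return sum_member / len(ref_files), sum_nonmember / len(ref_files)
```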
650 changes: 416 additions & 234 deletions local/figures_for_paper.ipynb

Large diffs are not rendered by default.

4 changes: 2 additions & 2 deletions local/report_results.sh
@@ -1,4 +1,4 @@
#!/bin/bash
python parse_results.py \
/gscratch/h2lab/micdun/mimir/results_new/mia_unified_mia_v5_ne/*/* \
--output mia_unified_mia_v5_ne_results.json
/gscratch/h2lab/micdun/mimir/results_new/mia_unified_mia_v5_temporal_arxiv_v2/*/* \
--output mia_unified_mia_v5_temporal_arxiv_v2_results.json
10 changes: 8 additions & 2 deletions local/temp.sh
@@ -1,7 +1,13 @@
#!/bin/bash
python agg_ref_mia.py \
"/mmfs1/gscratch/h2lab/micdun/mimir/results_new/mia_unified_mia_v4_llama_ref/EleutherAI_pythia-2.8b-deduped--bert-temp/fp32-0.3-1-the_pile-the_pile-1000200100_plen30_--tok_false-wikipedia_(en)/ref_model_decapoda-research_llama-7b-hf_lira_ratio_threshold_results.json" \
"/mmfs1/gscratch/h2lab/micdun/mimir/results_new/mia_unified_mia_v4_pile_refs/EleutherAI_pythia-2.8b-deduped--bert-temp/fp32-0.3-1-the_pile-the_pile-1000200100_plen30_--tok_false-wikipedia_(en)/ref_model_facebook_opt-125m_lira_ratio_threshold_results.json" \
"/mmfs1/gscratch/h2lab/micdun/mimir/results_new/mia_unified_mia_v5_ref_tab1_v2_remainder_llama/EleutherAI_pythia-12b-deduped--bert-temp/fp32-0.3-1-the_pile-the_pile-1000200100_plen30_--tok_false-arxiv_ngram_13_<0.8_truncated/ref-llama-7b_results.json" \
"/mmfs1/gscratch/h2lab/micdun/mimir/results_new/mia_unified_mia_v5_ref_tab1_v2/EleutherAI_pythia-12b-deduped--bert-temp/fp32-0.3-1-the_pile-the_pile-1000200100_plen30_--tok_false-arxiv_ngram_13_<0.8_truncated/ref-stablelm-base-alpha-3b-v2_results.json" \
"/mmfs1/gscratch/h2lab/micdun/mimir/results_new/mia_unified_mia_v5_ref_tab1_v2_remainder/EleutherAI_pythia-12b-deduped--bert-temp/fp32-0.3-1-the_pile-the_pile-1000200100_plen30_--tok_false-arxiv_ngram_13_<0.8_truncated/ref-gpt2_results.json" \
"/mmfs1/gscratch/h2lab/micdun/mimir/results_new/mia_unified_mia_v5_ref_tab1_v2_remainder/EleutherAI_pythia-12b-deduped--bert-temp/fp32-0.3-1-the_pile-the_pile-1000200100_plen30_--tok_false-arxiv_ngram_13_<0.8_truncated/ref-silo-pdswby-1.3b_results.json" \
"/mmfs1/gscratch/h2lab/micdun/mimir/results_new/mia_unified_mia_v5_ref_tab1_v2/EleutherAI_pythia-12b-deduped--bert-temp/fp32-0.3-1-the_pile-the_pile-1000200100_plen30_--tok_false-arxiv_ngram_13_<0.8_truncated/ref-distilgpt2_results.json" \
"/mmfs1/gscratch/h2lab/micdun/mimir/results_new/mia_unified_mia_v5_ref_tab1_v2/EleutherAI_pythia-12b-deduped--bert-temp/fp32-0.3-1-the_pile-the_pile-1000200100_plen30_--tok_false-arxiv_ngram_13_<0.8_truncated/ref-opt-1.3B_results.json" \
"/mmfs1/gscratch/h2lab/micdun/mimir/results_new/mia_unified_mia_v5_ref_tab1_v2/EleutherAI_pythia-12b-deduped--bert-temp/fp32-0.3-1-the_pile-the_pile-1000200100_plen30_--tok_false-arxiv_ngram_13_<0.8_truncated/ref-gpt-neo-1.3B_results.json" \
"/mmfs1/gscratch/h2lab/micdun/mimir/results_new/mia_unified_mia_v5_ref_tab1_v2/EleutherAI_pythia-12b-deduped--bert-temp/fp32-0.3-1-the_pile-the_pile-1000200100_plen30_--tok_false-arxiv_ngram_13_<0.8_truncated/ref-pythia-1.4b-deduped_results.json"



81 changes: 58 additions & 23 deletions mimir/models.py
@@ -8,6 +8,7 @@
import numpy as np
import transformers
import time
from collections import defaultdict
from tqdm import tqdm
from multiprocessing.pool import ThreadPool
import torch.nn.functional as F
@@ -80,18 +81,17 @@ def get_probabilities(self, text: str, tokens=None):
if labels.shape[0] != 1:
# expand first dimension
labels = labels.unsqueeze(0)
labels = labels.to(self.device)
else:
tokenized = self.tokenizer(
text, return_tensors="pt").to(self.device)
text, return_tensors="pt")
labels = tokenized.input_ids

all_prob = []
for i in range(0, labels.size(1), self.stride):
begin_loc = max(i + self.stride - self.max_length, 0)
end_loc = min(i + self.stride, labels.size(1))
trg_len = end_loc - i # may be different from stride on last loop
input_ids = labels[:, begin_loc:end_loc]
input_ids = labels[:, begin_loc:end_loc].to(self.device)
target_ids = input_ids.clone()
target_ids[:, :-trg_len] = -100

@@ -171,7 +171,7 @@ def load_base_model_and_tokenizer(self, model_kwargs):
else:
tokenizer = transformers.AutoTokenizer.from_pretrained(
self.name, **optional_tok_kwargs, cache_dir=self.cache_dir)
tokenizer.pad_token_id = tokenizer.eos_token_id
tokenizer.add_special_tokens({'pad_token': '[PAD]'})

return model, tokenizer
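A caveat worth flagging for the pad-token change above (this is an assumption about code outside the diff, not something shown here): registering a brand-new `[PAD]` special token grows the tokenizer vocabulary, so the model's embedding matrix generally needs to be resized to match, e.g.:

```python
# Keep the embedding table in sync when a new special token is registered.
import transformers

tokenizer = transformers.AutoTokenizer.from_pretrained("EleutherAI/gpt-neo-125m")
model = transformers.AutoModelForCausalLM.from_pretrained("EleutherAI/gpt-neo-125m")

num_added = tokenizer.add_special_tokens({"pad_token": "[PAD]"})
if num_added > 0:
    model.resize_token_embeddings(len(tokenizer))
```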

@@ -320,28 +320,63 @@ def get_rank(self, text: str, log: bool=False):

# TODO extend for longer sequences
@torch.no_grad()
def get_lls(self, texts: str, batch_size: int = 6):
# return [self.get_ll(text) for text in texts]
tokenized = self.tokenizer(texts, return_tensors="pt", padding=True)
labels = tokenized.input_ids
total_size = labels.shape[0]
def get_lls(self, texts: List[str], batch_size: int = 6):
#return [self.get_ll(text) for text in texts] # -np.mean([self.get_ll(text) for text in texts])
# tokenized = self.tokenizer(texts, return_tensors="pt", padding=True)
# labels = tokenized.input_ids
total_size = len(texts)
losses = []
for i in range(0, total_size, batch_size):
label_batch = labels[i:i+batch_size].to(self.device)
output = self.model(label_batch, labels=label_batch, return_dict=False)
loss = output[0]
# logits = output.logits
# # Shift so that tokens < n predict n
# shift_logits = logits[..., :-1, :].contiguous()
# shift_logits = torch.transpose(shift_logits, 1, 2)
# shift_labels = label_batch[..., 1:].contiguous()
# loss = F.cross_entropy(input=shift_logits, target=shift_labels)#, reduction='none').mean(dim=1)
losses.append(loss.item() * batch_size)
# del label_batch
# del shift_logits
# Delegate batches and tokenize
batch = texts[i:i+batch_size]
tokenized = self.tokenizer(batch, return_tensors="pt", padding=True, return_attention_mask=True)
label_batch = tokenized.input_ids

# # mask out padding tokens
attention_mask = tokenized.attention_mask
assert attention_mask.size() == label_batch.size()

needs_sliding = label_batch.size(1) > self.max_length
if not needs_sliding:
label_batch = label_batch.to(self.device)
attention_mask = attention_mask.to(self.device)

# Collect token probabilities per sample in batch
all_prob = defaultdict(list)
for i in range(0, label_batch.size(1), self.stride):
begin_loc = max(i + self.stride - self.max_length, 0)
end_loc = min(i + self.stride, label_batch.size(1))
trg_len = end_loc - i # may be different from stride on last loop
input_ids = label_batch[:, begin_loc:end_loc]
mask = attention_mask[:, begin_loc:end_loc]
if needs_sliding:
input_ids = input_ids.to(self.device)
mask = mask.to(self.device)

target_ids = input_ids.clone()
# Don't count padded tokens or tokens that already have computed probabilities
target_ids[:, :-trg_len] = -100
# target_ids[attention_mask == 0] = -100

outputs = self.model(input_ids, labels=target_ids, attention_mask=mask)
logits = outputs.logits
shift_logits = logits[..., :-1, :].contiguous()
probabilities = torch.nn.functional.log_softmax(shift_logits, dim=-1)
shift_labels = target_ids[..., 1:].contiguous()

for i, sample in enumerate(shift_labels):
for j, token_id in enumerate(sample):
if token_id != -100 and token_id != self.tokenizer.pad_token_id:
probability = probabilities[i, j, token_id].item()
all_prob[i].append(probability)

# average over each sample to get losses
batch_losses = [-np.mean(all_prob[idx]) for idx in range(label_batch.size(0))]
# print(batch_losses)
losses.extend(batch_losses)
del label_batch
del output
return np.sum(losses) / total_size
del attention_mask
return losses #np.mean(losses)

@torch.no_grad()
def get_min_k_prob(self, text: str, tokens=None, probs=None, k=.2, window=1, stride=1):
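Two things change in `get_lls`: padding tokens are excluded via the attention mask, and the method now returns one loss per input text instead of a single averaged scalar, leaving aggregation to the caller. A compact, self-contained restatement of the per-sample computation (illustrative, not the repository's code):

```python
# Mean negative log-probability per sequence, skipping -100 and pad labels.
import torch
import torch.nn.functional as F

def per_sample_nll(logits: torch.Tensor, labels: torch.Tensor, pad_token_id: int):
    # logits: (batch, seq, vocab); labels: (batch, seq) with -100 at ignored positions.
    log_probs = F.log_softmax(logits[..., :-1, :], dim=-1)
    shift_labels = labels[..., 1:]
    losses = []
    for sample_logp, sample_labels in zip(log_probs, shift_labels):
        keep = (sample_labels != -100) & (sample_labels != pad_token_id)
        token_logp = sample_logp.gather(
            -1, sample_labels.clamp(min=0).unsqueeze(-1)).squeeze(-1)
        # Average over the kept tokens only, one loss per sequence.
        losses.append(-token_logp[keep].mean().item())
    return losses
```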