
Commit

Merge pull request #3 from iamgroot42/michael/exp_changes
Update to NE code
iamgroot42 authored Jan 28, 2024
2 parents 8640376 + a9691fa commit b16e461
Showing 25 changed files with 734 additions and 410 deletions.
9 changes: 2 additions & 7 deletions configs/mi.json
@@ -9,15 +9,10 @@
"output_name": "unified_mia",
"specific_source": "Github",
"n_samples": 1000,
"blackbox_attacks": ["ref"],
"blackbox_attacks": ["loss","ref", "zlib", "min_k"],
"ref_config": {
"models": [
"stabilityai/stablelm-base-alpha-3b-v2",
"EleutherAI/gpt-neo-1.3B",
"EleutherAI/pythia-1.4b-deduped",
"EleutherAI/pythia-1.4b",
"facebook/opt-1.3B",
"distilgpt2"
"stabilityai/stablelm-base-alpha-3b-v2"
]
},
"neighborhood_config": {
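Note on the widened `blackbox_attacks` list above: it enables the loss, zlib, and min-k scores alongside the reference attack. As a rough sketch of the two simplest of these, assuming per-text negative log-likelihoods already computed by the target model (function names here are illustrative, not the repository's API):

```python
# Minimal sketch of the "loss" and "zlib" membership scores; illustrative only.
import zlib

def loss_attack_score(target_nll: float) -> float:
    # "loss" attack: a lower negative log-likelihood suggests membership.
    return target_nll

def zlib_attack_score(text: str, target_nll: float) -> float:
    # "zlib" attack: normalise the NLL by the zlib-compressed length so that
    # highly repetitive (easily compressed) text is not over-flagged.
    return target_nll / len(zlib.compress(text.encode("utf-8")))
```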
34 changes: 34 additions & 0 deletions configs/new_mi.json
@@ -0,0 +1,34 @@
{
"base_model": "EleutherAI/gpt-neo-125m",
"dataset_member": "the_pile",
"dataset_nonmember": "the_pile",
"min_words": 100,
"max_words": 200,
"max_tokens": 512,
"max_data": 100000,
"output_name": "unified_mia",
"specific_source": "Github",
"n_samples": 1000,
"blackbox_attacks": ["loss", "ref", "min_k"],
"ref_config": {
"models": [
"stabilityai/stablelm-base-alpha-3b-v2"
]
},
"neighborhood_config": {
"model": "bert",
"n_perturbation_list": [
25
],
"pct_words_masked": 0.3,
"span_length": 2,
"dump_cache": false,
"load_from_cache": true
},
"env_config": {
"results": "results_new",
"device_map": "balanced_low_0"
},
"dump_cache": false,
"load_from_cache": true
}
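For orientation, a minimal sketch of how a config like this might be consumed. The loop below only prints what it would run; it does not reproduce the repository's actual driver:

```python
# Illustrative config walkthrough, assuming the file above is on disk.
import json

with open("configs/new_mi.json") as f:
    config = json.load(f)

ref_models = config["ref_config"]["models"]             # used by the "ref" attack
n_perturbations = config["neighborhood_config"]["n_perturbation_list"]

for attack in config["blackbox_attacks"]:                # ["loss", "ref", "min_k"]
    print(f"{attack}: {config['base_model']} on {config['n_samples']} "
          f"{config['specific_source']} samples")
```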
13 changes: 1 addition & 12 deletions configs/ref_exp_mi.json
@@ -12,18 +12,7 @@
"blackbox_attacks": ["ref"],
"ref_config": {
"models": [
"EleutherAI/pythia-70m-deduped",
"EleutherAI/pythia-1.4b-deduped",
"EleutherAI/pythia-2.8b-deduped",
"gpt2",
"distilgpt2",
"EleutherAI/gpt-neo-2.7B",
"EleutherAI/gpt-neo-1.3B",
"EleutherAI/gpt-neo-125m",
"facebook/opt-350m",
"facebook/opt-1.3B",
"facebook/opt-125m",
"kernelmachine/silo-pdswby-1.3b"
"huggyllama/llama-7b"
]
},
"neighborhood_config": {
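The reference list is narrowed to a single model here. For context, a minimal sketch of the reference-calibrated score this kind of attack is generally understood to compute (illustrative, not the repository's code):

```python
# Reference ("ref") attack score, sketched under the usual formulation.
def ref_attack_score(target_nll: float, reference_nll: float) -> float:
    # More negative => the target model fits the text unusually well relative
    # to the reference model, which is evidence of membership.
    return target_nll - reference_nll
```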
9 changes: 4 additions & 5 deletions configs/single_gpu_mi.json
@@ -9,11 +9,9 @@
"output_name": "unified_mia",
"specific_source": "Github",
"n_samples": 1000,
"blackbox_attacks": ["ref"],
"blackbox_attacks": ["ne"],
"ref_config": {
"models": [
"huggyllama/llama-13b"
]
"models": []
},
"neighborhood_config": {
"model": "bert",
@@ -27,7 +25,8 @@
},
"env_config": {
"results": "results_new",
"device": "cuda:0"
"device": "cuda:0",
"device_aux": "cuda:0"
},
"dump_cache": false,
"load_from_cache": true
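This config switches to the neighbourhood ("ne") attack and adds a `device_aux` entry, presumably for the auxiliary neighbour-generation model; both devices point at `cuda:0` in the single-GPU case. A minimal sketch of the neighbourhood comparison itself (illustrative names, not the repository's API):

```python
# Neighbourhood attack score, sketched: compare the candidate's loss with the
# mean loss of its perturbed "neighbour" texts.
import numpy as np

def neighborhood_score(candidate_nll: float, neighbor_nlls: list) -> float:
    # Members tend to have a noticeably lower loss than their neighbours.
    return candidate_nll - float(np.mean(neighbor_nlls))
```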
34 changes: 27 additions & 7 deletions data/cache_data.sh
@@ -1,15 +1,21 @@
#!/bin/bash
ngram=7
for subset in "full_pile"
for date in "arxiv_2020-08" #"arxiv_2021-01" "arxiv_2021-06" "arxiv_2022-06" "arxiv_2023-06" #"full_pile"
do
echo caching data for $subset
# echo caching data for $subset
# python run.py \
# --config configs/cache_data.json \
# --presampled_dataset_member "/gscratch/h2lab/micdun/mimir/data/full_pile/full_pile_10000/train_raw.jsonl" \
# --presampled_dataset_nonmember "/gscratch/h2lab/micdun/mimir/data/full_pile/0.0-0.8/full_pile_10000/test_raw.jsonl" \
# --specific_source $subset \
# --max_data 10000 \
# --n_samples 10000
python run.py \
--config configs/cache_data.json \
--presampled_dataset_member "/gscratch/h2lab/micdun/mimir/data/full_pile/full_pile_10000/train_raw.jsonl" \
--presampled_dataset_nonmember "/gscratch/h2lab/micdun/mimir/data/full_pile/0.0-0.8/full_pile_10000/test_raw.jsonl" \
--specific_source $subset \
--max_data 10000 \
--n_samples 10000
--presampled_dataset_member "/gscratch/h2lab/micdun/mimir/data/pile_subsets/arxiv/train_raw.jsonl" \
--presampled_dataset_nonmember "/mmfs1/gscratch/h2lab/micdun/mimir/data/temporal_arxiv/${date}/${date}/test_raw.jsonl" \
--specific_source $date \
--n_samples 1000
done
#"/gscratch/h2lab/micdun/mimir/data/ngram_overlap_thresholded_pile_subsets/truncated+ngram_$ngram/0.0-0.2/$subset/test_raw.jsonl"

@@ -32,3 +38,17 @@ done
# --presampled_dataset_member "/gscratch/h2lab/micdun/mimir/data/wikiMIA/sampled/wikiMIA/train_raw.jsonl" \
# --presampled_dataset_nonmember "/gscratch/h2lab/micdun/mimir/data/wikiMIA/sampled/wikiMIA/test_raw.jsonl" \
# --specific_source wikiMIA

# python run.py \
# --config configs/cache_data.json \
# --presampled_dataset_member "/gscratch/h2lab/micdun/mimir/data/pile_subsets/wikipedia_(en)/train_raw.jsonl" \
# --presampled_dataset_nonmember "/mmfs1/gscratch/h2lab/micdun/mimir/data/temporal_wiki/temporal_wiki/test_raw.jsonl" \
# --specific_source temporal_wiki_full \
# --n_samples 1000

# python run.py \
# --config configs/cache_data.json \
# --presampled_dataset_member "/gscratch/h2lab/micdun/mimir/data/pile_subsets/arxiv/train_raw.jsonl" \
# --presampled_dataset_nonmember "/mmfs1/gscratch/h2lab/micdun/mimir/data/temporal_arxiv/{$date}/{$date}/test_raw.jsonl" \
# --specific_source $date \
# --n_samples 1000
3 changes: 2 additions & 1 deletion data/create_datasets.py
@@ -29,7 +29,8 @@ def process_texts(data, min_len, provided_subset=None):
tokenized = text.split()
# Initial simple filter to get candidates surpassing min_len requirement
if len(tokenized) >= min_len:
dp["raw"] = text
# TODO: for temporal_wiki, need to append title metadata to front. Should refactor this
dp["raw"] = dp["title"] + "\n\n" + text if "title" in dp and provided_subset == 'temporal_wiki' else text
subset_samples[pile_subset].append(dp)

return subset_samples, subset_counts
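The inline conditional above carries a TODO about refactoring. One possible shape for that helper (hypothetical, not part of the repository):

```python
# Sketch of the refactor the TODO asks for; names are illustrative.
def format_raw_text(dp: dict, text: str, provided_subset=None) -> str:
    """Prepend title metadata for temporal_wiki samples; pass others through."""
    if provided_subset == "temporal_wiki" and "title" in dp:
        return dp["title"] + "\n\n" + text
    return text
```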
9 changes: 8 additions & 1 deletion data/create_pile_subsets.sh
@@ -64,4 +64,11 @@ done
# --n_samples 1000 \
# --max_ngram_overlap 0.8 \
# --provided_subset arxiv_2020-06 \
# --split test
# --split test

python create_datasets.py \
/mmfs1/gscratch/h2lab/micdun/mimir/data/temporal_wiki/wikitext_latest_full.json \
--benchmark_dir temporal_wiki \
--provided_subset temporal_wiki \
--split test \
--n_samples 1000
6 changes: 2 additions & 4 deletions data/neighbor_gen.sh
@@ -1,11 +1,9 @@
#!/bin/bash
# Ideally, run after caching data with cache_data.json
for subset in "full_pile" #"wikimia" "arxiv_2021_01" "arxiv_2021_06" "arxiv_2022_01" "arxiv_2022_06" "arxiv_2023_01" "arxiv_2023_06" "c4"
for subset in "arxiv_2020-08" "arxiv_2021-01" "arxiv_2021-06" "arxiv_2022-01" "arxiv_2022-06" "arxiv_2023-01" "arxiv_2023-06" #"temporal_wiki_full" #"wikimia" "c4"
do
echo generating neighbors for $subset
python run.py \
--config configs/neighbor_gen_new.json \
--specific_source $subset \
--max_data 10000 \
--n_samples 10000
--specific_source $subset
done
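For readers unfamiliar with the neighbour-generation step this script drives: neighbours are produced by masking short spans and letting a masked LM fill them in, matching the bert-based `neighborhood_config` seen in the configs above. A rough, self-contained sketch of that idea (not the repository's implementation):

```python
# Generate one "neighbour" of a text by span masking + BERT mask filling.
import random
from transformers import pipeline

fill = pipeline("fill-mask", model="bert-base-uncased")

def make_neighbor(text: str, pct_words_masked: float = 0.3, span_length: int = 2) -> str:
    words = text.split()
    n_spans = max(1, int(len(words) * pct_words_masked / span_length))
    for _ in range(n_spans):
        start = random.randrange(0, max(1, len(words) - span_length))
        # Replace one span with a single mask token and let BERT propose a fill.
        masked = words[:start] + [fill.tokenizer.mask_token] + words[start + span_length:]
        best = fill(" ".join(masked), top_k=1)[0]["token_str"]
        words = words[:start] + [best] + words[start + span_length:]
    return " ".join(words)
```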
4 changes: 2 additions & 2 deletions local/agg_ref_mia.py
@@ -16,8 +16,8 @@
for ref in args.ref_files:
f_ref_metrics = open(ref)
ref_metrics_dict = json.load(f_ref_metrics)
ref_member_scores = np.array(ref_metrics_dict['predictions']["members"])
ref_nonmember_scores = np.array(ref_metrics_dict['predictions']["nonmembers"])
ref_member_scores = np.array(ref_metrics_dict['predictions']["member"])
ref_nonmember_scores = np.array(ref_metrics_dict['predictions']["nonmember"])

if sum_member_scores is None:
sum_member_scores = ref_member_scores
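The fix above matches the singular "member"/"nonmember" keys in the prediction dictionaries. For context, a self-contained sketch of the aggregation pattern the surrounding loop implements, summing per-sample scores across reference runs and then averaging (illustrative, not the script itself):

```python
# Aggregate per-sample MIA scores across several reference-model result files.
import json
import numpy as np

def aggregate_ref_scores(ref_files):
    sum_member = sum_nonmember = None
    for path in ref_files:
        with open(path) as f:
            preds = json.load(f)["predictions"]
        member = np.array(preds["member"])
        nonmember = np.array(preds["nonmember"])
        sum_member = member if sum_member is None else sum_member + member
        sum_nonmember = nonmember if sum_nonmember is None else sum_nonmember + nonmember
    # Average over the number of reference models.
    return sum_member / len(ref_files), sum_nonmember / len(ref_files)
```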
650 changes: 416 additions & 234 deletions local/figures_for_paper.ipynb

Large diffs are not rendered by default.

4 changes: 2 additions & 2 deletions local/report_results.sh
@@ -1,4 +1,4 @@
#!/bin/bash
python parse_results.py \
/gscratch/h2lab/micdun/mimir/results_new/mia_unified_mia_v5_ne/*/* \
--output mia_unified_mia_v5_ne_results.json
/gscratch/h2lab/micdun/mimir/results_new/mia_unified_mia_v5_temporal_arxiv_v2/*/* \
--output mia_unified_mia_v5_temporal_arxiv_v2_results.json
10 changes: 8 additions & 2 deletions local/temp.sh
@@ -1,7 +1,13 @@
#!/bin/bash
python agg_ref_mia.py \
"/mmfs1/gscratch/h2lab/micdun/mimir/results_new/mia_unified_mia_v4_llama_ref/EleutherAI_pythia-2.8b-deduped--bert-temp/fp32-0.3-1-the_pile-the_pile-1000200100_plen30_--tok_false-wikipedia_(en)/ref_model_decapoda-research_llama-7b-hf_lira_ratio_threshold_results.json" \
"/mmfs1/gscratch/h2lab/micdun/mimir/results_new/mia_unified_mia_v4_pile_refs/EleutherAI_pythia-2.8b-deduped--bert-temp/fp32-0.3-1-the_pile-the_pile-1000200100_plen30_--tok_false-wikipedia_(en)/ref_model_facebook_opt-125m_lira_ratio_threshold_results.json" \
"/mmfs1/gscratch/h2lab/micdun/mimir/results_new/mia_unified_mia_v5_ref_tab1_v2_remainder_llama/EleutherAI_pythia-12b-deduped--bert-temp/fp32-0.3-1-the_pile-the_pile-1000200100_plen30_--tok_false-arxiv_ngram_13_<0.8_truncated/ref-llama-7b_results.json" \
"/mmfs1/gscratch/h2lab/micdun/mimir/results_new/mia_unified_mia_v5_ref_tab1_v2/EleutherAI_pythia-12b-deduped--bert-temp/fp32-0.3-1-the_pile-the_pile-1000200100_plen30_--tok_false-arxiv_ngram_13_<0.8_truncated/ref-stablelm-base-alpha-3b-v2_results.json" \
"/mmfs1/gscratch/h2lab/micdun/mimir/results_new/mia_unified_mia_v5_ref_tab1_v2_remainder/EleutherAI_pythia-12b-deduped--bert-temp/fp32-0.3-1-the_pile-the_pile-1000200100_plen30_--tok_false-arxiv_ngram_13_<0.8_truncated/ref-gpt2_results.json" \
"/mmfs1/gscratch/h2lab/micdun/mimir/results_new/mia_unified_mia_v5_ref_tab1_v2_remainder/EleutherAI_pythia-12b-deduped--bert-temp/fp32-0.3-1-the_pile-the_pile-1000200100_plen30_--tok_false-arxiv_ngram_13_<0.8_truncated/ref-silo-pdswby-1.3b_results.json" \
"/mmfs1/gscratch/h2lab/micdun/mimir/results_new/mia_unified_mia_v5_ref_tab1_v2/EleutherAI_pythia-12b-deduped--bert-temp/fp32-0.3-1-the_pile-the_pile-1000200100_plen30_--tok_false-arxiv_ngram_13_<0.8_truncated/ref-distilgpt2_results.json" \
"/mmfs1/gscratch/h2lab/micdun/mimir/results_new/mia_unified_mia_v5_ref_tab1_v2/EleutherAI_pythia-12b-deduped--bert-temp/fp32-0.3-1-the_pile-the_pile-1000200100_plen30_--tok_false-arxiv_ngram_13_<0.8_truncated/ref-opt-1.3B_results.json" \
"/mmfs1/gscratch/h2lab/micdun/mimir/results_new/mia_unified_mia_v5_ref_tab1_v2/EleutherAI_pythia-12b-deduped--bert-temp/fp32-0.3-1-the_pile-the_pile-1000200100_plen30_--tok_false-arxiv_ngram_13_<0.8_truncated/ref-gpt-neo-1.3B_results.json" \
"/mmfs1/gscratch/h2lab/micdun/mimir/results_new/mia_unified_mia_v5_ref_tab1_v2/EleutherAI_pythia-12b-deduped--bert-temp/fp32-0.3-1-the_pile-the_pile-1000200100_plen30_--tok_false-arxiv_ngram_13_<0.8_truncated/ref-pythia-1.4b-deduped_results.json"



81 changes: 58 additions & 23 deletions mimir/models.py
@@ -8,6 +8,7 @@
import numpy as np
import transformers
import time
from collections import defaultdict
from tqdm import tqdm
from multiprocessing.pool import ThreadPool
import torch.nn.functional as F
@@ -80,18 +81,17 @@ def get_probabilities(self, text: str, tokens=None):
if labels.shape[0] != 1:
# expand first dimension
labels = labels.unsqueeze(0)
labels = labels.to(self.device)
else:
tokenized = self.tokenizer(
text, return_tensors="pt").to(self.device)
text, return_tensors="pt")
labels = tokenized.input_ids

all_prob = []
for i in range(0, labels.size(1), self.stride):
begin_loc = max(i + self.stride - self.max_length, 0)
end_loc = min(i + self.stride, labels.size(1))
trg_len = end_loc - i # may be different from stride on last loop
input_ids = labels[:, begin_loc:end_loc]
input_ids = labels[:, begin_loc:end_loc].to(self.device)
target_ids = input_ids.clone()
target_ids[:, :-trg_len] = -100

@@ -171,7 +171,7 @@ def load_base_model_and_tokenizer(self, model_kwargs):
else:
tokenizer = transformers.AutoTokenizer.from_pretrained(
self.name, **optional_tok_kwargs, cache_dir=self.cache_dir)
tokenizer.pad_token_id = tokenizer.eos_token_id
tokenizer.add_special_tokens({'pad_token': '[PAD]'})

return model, tokenizer
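A caveat worth flagging for the pad-token change above (this is an assumption about code outside the diff, not something shown here): registering a brand-new `[PAD]` special token grows the tokenizer vocabulary, so the model's embedding matrix generally needs to be resized to match, e.g.:

```python
# Keep the embedding table in sync when a new special token is registered.
import transformers

tokenizer = transformers.AutoTokenizer.from_pretrained("EleutherAI/gpt-neo-125m")
model = transformers.AutoModelForCausalLM.from_pretrained("EleutherAI/gpt-neo-125m")

num_added = tokenizer.add_special_tokens({"pad_token": "[PAD]"})
if num_added > 0:
    model.resize_token_embeddings(len(tokenizer))
```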

@@ -320,28 +320,63 @@ def get_rank(self, text: str, log: bool=False):

# TODO extend for longer sequences
@torch.no_grad()
def get_lls(self, texts: str, batch_size: int = 6):
# return [self.get_ll(text) for text in texts]
tokenized = self.tokenizer(texts, return_tensors="pt", padding=True)
labels = tokenized.input_ids
total_size = labels.shape[0]
def get_lls(self, texts: List[str], batch_size: int = 6):
#return [self.get_ll(text) for text in texts] # -np.mean([self.get_ll(text) for text in texts])
# tokenized = self.tokenizer(texts, return_tensors="pt", padding=True)
# labels = tokenized.input_ids
total_size = len(texts)
losses = []
for i in range(0, total_size, batch_size):
label_batch = labels[i:i+batch_size].to(self.device)
output = self.model(label_batch, labels=label_batch, return_dict=False)
loss = output[0]
# logits = output.logits
# # Shift so that tokens < n predict n
# shift_logits = logits[..., :-1, :].contiguous()
# shift_logits = torch.transpose(shift_logits, 1, 2)
# shift_labels = label_batch[..., 1:].contiguous()
# loss = F.cross_entropy(input=shift_logits, target=shift_labels)#, reduction='none').mean(dim=1)
losses.append(loss.item() * batch_size)
# del label_batch
# del shift_logits
# Delegate batches and tokenize
batch = texts[i:i+batch_size]
tokenized = self.tokenizer(batch, return_tensors="pt", padding=True, return_attention_mask=True)
label_batch = tokenized.input_ids

# # mask out padding tokens
attention_mask = tokenized.attention_mask
assert attention_mask.size() == label_batch.size()

needs_sliding = label_batch.size(1) > self.max_length
if not needs_sliding:
label_batch = label_batch.to(self.device)
attention_mask = attention_mask.to(self.device)

# Collect token probabilities per sample in batch
all_prob = defaultdict(list)
for i in range(0, label_batch.size(1), self.stride):
begin_loc = max(i + self.stride - self.max_length, 0)
end_loc = min(i + self.stride, label_batch.size(1))
trg_len = end_loc - i # may be different from stride on last loop
input_ids = label_batch[:, begin_loc:end_loc]
mask = attention_mask[:, begin_loc:end_loc]
if needs_sliding:
input_ids = input_ids.to(self.device)
mask = mask.to(self.device)

target_ids = input_ids.clone()
# Don't count padded tokens or tokens that already have computed probabilities
target_ids[:, :-trg_len] = -100
# target_ids[attention_mask == 0] = -100

outputs = self.model(input_ids, labels=target_ids, attention_mask=mask)
logits = outputs.logits
shift_logits = logits[..., :-1, :].contiguous()
probabilities = torch.nn.functional.log_softmax(shift_logits, dim=-1)
shift_labels = target_ids[..., 1:].contiguous()

for i, sample in enumerate(shift_labels):
for j, token_id in enumerate(sample):
if token_id != -100 and token_id != self.tokenizer.pad_token_id:
probability = probabilities[i, j, token_id].item()
all_prob[i].append(probability)

# average over each sample to get losses
batch_losses = [-np.mean(all_prob[idx]) for idx in range(label_batch.size(0))]
# print(batch_losses)
losses.extend(batch_losses)
del label_batch
del output
return np.sum(losses) / total_size
del attention_mask
return losses #np.mean(losses)

@torch.no_grad()
def get_min_k_prob(self, text: str, tokens=None, probs=None, k=.2, window=1, stride=1):
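Two things change in `get_lls`: padding tokens are excluded via the attention mask, and the method now returns one loss per input text instead of a single averaged scalar, leaving aggregation to the caller. A compact, self-contained restatement of the per-sample computation (illustrative, not the repository's code):

```python
# Mean negative log-probability per sequence, skipping -100 and pad labels.
import torch
import torch.nn.functional as F

def per_sample_nll(logits: torch.Tensor, labels: torch.Tensor, pad_token_id: int):
    # logits: (batch, seq, vocab); labels: (batch, seq) with -100 at ignored positions.
    log_probs = F.log_softmax(logits[..., :-1, :], dim=-1)
    shift_labels = labels[..., 1:]
    losses = []
    for sample_logp, sample_labels in zip(log_probs, shift_labels):
        keep = (sample_labels != -100) & (sample_labels != pad_token_id)
        token_logp = sample_logp.gather(
            -1, sample_labels.clamp(min=0).unsqueeze(-1)).squeeze(-1)
        # Average over the kept tokens only, one loss per sequence.
        losses.append(-token_logp[keep].mean().item())
    return losses
```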