diff --git a/docs/attacks/all_attacks.html b/docs/attacks/all_attacks.html
index 2f8af53..a6a4391 100644
--- a/docs/attacks/all_attacks.html
+++ b/docs/attacks/all_attacks.html
@@ -41,6 +41,7 @@

Module mimir.attacks.all_attacks

 REFERENCE_BASED = "ref" # Done
 ZLIB = "zlib" # Done
 MIN_K = "min_k" # Done
+MIN_K_PLUS_PLUS = "min_k++" # Done
 NEIGHBOR = "ne" # Done
 GRADNORM = "gradnorm" # Done
 # QUANTILE = "quantile" # Uncomment when tested implementation is available
@@ -122,6 +123,7 @@

Classes

 REFERENCE_BASED = "ref" # Done
 ZLIB = "zlib" # Done
 MIN_K = "min_k" # Done
+MIN_K_PLUS_PLUS = "min_k++" # Done
 NEIGHBOR = "ne" # Done
 GRADNORM = "gradnorm" # Done
 # QUANTILE = "quantile" # Uncomment when tested implementation is available
@@ -145,6 +147,10 @@

Class variables

+ var MIN_K_PLUS_PLUS
var NEIGHBOR
@@ -224,6 +230,7 @@

Subclasses

  • GradNormAttack
  • LOSSAttack
  • MinKProbAttack
+ • MinKPlusPlusAttack
  • NeighborhoodAttack
  • QuantileAttack
  • ReferenceAttack
@@ -326,6 +333,7 @@

  • GRADNORM
  • LOSS
  • MIN_K
+ • MIN_K_PLUS_PLUS
  • NEIGHBOR
  • REFERENCE_BASED
  • ZLIB
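
As the listing above shows, the member name and its string value differ; a quick sketch, assuming AllAttacks is a string-valued Enum as the rendered class variables suggest, of selecting the new attack by name:

    from mimir.attacks.all_attacks import AllAttacks

    # The enum member is MIN_K_PLUS_PLUS; its config-facing value is "min_k++"
    assert AllAttacks.MIN_K_PLUS_PLUS.value == "min_k++"
    assert AllAttacks("min_k++") is AllAttacks.MIN_K_PLUS_PLUS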
diff --git a/docs/attacks/index.html b/docs/attacks/index.html
index 66733df..7d97cc6 100644
--- a/docs/attacks/index.html
+++ b/docs/attacks/index.html
@@ -55,6 +55,10 @@

    Sub-modules

    Min-k % Prob Attack: https://arxiv.org/pdf/2310.16789.pdf

+ mimir.attacks.min_k_plus_plus

+ Min-K%++ Attack: https://github.com/zjysteven/mink-plus-plus
    mimir.attacks.neighborhood

    Neighborhood-MIA attack https://arxiv.org/pdf/2305.18462.pdf

    @@ -108,6 +112,7 @@

    Index

  • mimir.attacks.gradnorm
  • mimir.attacks.loss
  • mimir.attacks.min_k
+ • mimir.attacks.min_k_plus_plus
  • mimir.attacks.neighborhood
  • mimir.attacks.quantile
  • mimir.attacks.reference
diff --git a/docs/attacks/min_k_plus_plus.html b/docs/attacks/min_k_plus_plus.html
new file mode 100644
index 0000000..cb4bd9f
--- /dev/null
+++ b/docs/attacks/min_k_plus_plus.html
@@ -0,0 +1,165 @@
+ mimir.attacks.min_k_plus_plus API documentation
+ Module mimir.attacks.min_k_plus_plus

+ Min-K%++ Attack: https://github.com/zjysteven/mink-plus-plus
    """
    +    Min-K%++ Attack: https://github.com/zjysteven/mink-plus-plus
    +"""
    +import torch as ch
    +import numpy as np
    +from mimir.attacks.all_attacks import Attack
    +from mimir.models import Model
    +from mimir.config import ExperimentConfig
    +
    +
    +class MinKPlusPlusAttack(Attack):
    +
    +    def __init__(self, config: ExperimentConfig, model: Model):
    +        super().__init__(config, model, ref_model=None)
    +
    +    @ch.no_grad()
    +    def _attack(self, document, probs, tokens=None, **kwargs):
    +        """
+        Min-K%++ Attack.
+        Gets token log-probabilities, normalizes each with the mean and std of the
+        model's full next-token distribution, and returns the negated mean of the
+        normalized scores over the lowest-scoring k% of tokens.
    +        """
+        # Hyper-params specific to the Min-K%++ attack
    +        k: float = kwargs.get("k", 0.2)
    +        all_probs = kwargs.get("all_probs", None)
    +
+        # these are all log probabilities
    +        target_prob, all_probs = (
    +            (probs, all_probs)
    +            if (probs is not None and all_probs is not None)
    +            else self.model.get_probabilities(document, tokens=tokens, return_all_probs=True)
    +        )
+
+        # Per-position mean and variance of log p under the model's full
+        # next-token distribution: mu = E[log p], sigma = E[(log p)^2] - mu^2
+        mu = (ch.exp(all_probs) * all_probs).sum(-1)
+        sigma = (ch.exp(all_probs) * ch.square(all_probs)).sum(-1) - ch.square(mu)
+        # sigma holds the variance, hence .sqrt() to standardize each token's
+        # observed log-prob
+        scores = (np.array(target_prob) - mu.numpy()) / sigma.sqrt().numpy()
+
    +        return -np.mean(sorted(scores)[:int(len(scores) * k)])
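
To make the normalization concrete, here is a minimal self-contained sketch that recomputes the Min-K%++ score on toy tensors. The arithmetic mirrors the mu/sigma lines above; all_probs, target_prob, and the 10x4 shapes are random stand-ins for real model output, not values produced by this module.

    import numpy as np
    import torch as ch

    # Stand-in model output: log-softmax over a 4-word vocabulary at each of
    # 10 token positions of one document
    all_probs = ch.log_softmax(ch.randn(10, 4), dim=-1)
    # Log-prob assigned to the token actually observed at each position
    observed = ch.randint(0, 4, (10,))
    target_prob = all_probs[ch.arange(10), observed].tolist()

    k = 0.2
    mu = (ch.exp(all_probs) * all_probs).sum(-1)  # E[log p] per position
    var = (ch.exp(all_probs) * ch.square(all_probs)).sum(-1) - ch.square(mu)
    scores = (np.array(target_prob) - mu.numpy()) / var.sqrt().numpy()
    # Average the lowest k% of standardized scores, negated so that a larger
    # value suggests membership
    print(-np.mean(sorted(scores)[: int(len(scores) * k)]))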

    Classes

+ class MinKPlusPlusAttack(config: ExperimentConfig, model: Model)

    Ancestors

+ • mimir.attacks.all_attacks.Attack

    Inherited members

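Before the registry changes below, a hedged usage sketch: config, target_model, and document are placeholders for objects built elsewhere in a mimir experiment, not names defined by this module.

    from mimir.attacks.min_k_plus_plus import MinKPlusPlusAttack

    # config: ExperimentConfig; target_model: mimir.models.Model (assumed
    # constructed elsewhere); no reference model is needed for this attack
    attack = MinKPlusPlusAttack(config, target_model)

    # With probs=None, _attack fetches both the target-token log-probs and the
    # full-vocabulary distributions via get_probabilities(return_all_probs=True)
    score = attack._attack(document, probs=None, k=0.2)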
\ No newline at end of file
diff --git a/docs/attacks/utils.html b/docs/attacks/utils.html
index 6ec49c8..264ee55 100644
--- a/docs/attacks/utils.html
+++ b/docs/attacks/utils.html
@@ -32,6 +32,7 @@

    Module mimir.attacks.utils

 from mimir.attacks.reference import ReferenceAttack
 from mimir.attacks.zlib import ZLIBAttack
 from mimir.attacks.min_k import MinKProbAttack
+from mimir.attacks.min_k_plus_plus import MinKPlusPlusAttack
 from mimir.attacks.neighborhood import NeighborhoodAttack
 from mimir.attacks.gradnorm import GradNormAttack
@@ -43,6 +44,7 @@

    Module mimir.attacks.utils

     AllAttacks.REFERENCE_BASED: ReferenceAttack,
     AllAttacks.ZLIB: ZLIBAttack,
     AllAttacks.MIN_K: MinKProbAttack,
+    AllAttacks.MIN_K_PLUS_PLUS: MinKPlusPlusAttack,
     AllAttacks.NEIGHBOR: NeighborhoodAttack,
     AllAttacks.GRADNORM: GradNormAttack,
 }
@@ -74,6 +76,7 @@

    Functions

     AllAttacks.REFERENCE_BASED: ReferenceAttack,
     AllAttacks.ZLIB: ZLIBAttack,
     AllAttacks.MIN_K: MinKProbAttack,
+    AllAttacks.MIN_K_PLUS_PLUS: MinKPlusPlusAttack,
     AllAttacks.NEIGHBOR: NeighborhoodAttack,
     AllAttacks.GRADNORM: GradNormAttack,
 }
diff --git a/docs/models.html b/docs/models.html
index 9638c98..568f229 100644
--- a/docs/models.html
+++ b/docs/models.html
@@ -99,7 +99,8 @@

    Module mimir.models

 def get_probabilities(self,
                       text: str,
                       tokens: np.ndarray = None,
-                      no_grads: bool = True):
+                      no_grads: bool = True,
+                      return_all_probs: bool = False):
     """
         Get the probabilities or log-softmaxed logits for a text under the current model.
         Args:
@@ -127,7 +128,8 @@

    Module mimir.models

         text, return_tensors="pt")
     labels = tokenized.input_ids
-    all_prob = []
+    target_token_log_prob = []
+    all_token_log_prob = []
     for i in range(0, labels.size(1), self.stride):
         begin_loc = max(i + self.stride - self.max_length, 0)
         end_loc = min(i + self.stride, labels.size(1))
@@ -140,7 +142,7 @@

    Module mimir.models

         if no_grads:
             logits = logits.cpu()
         shift_logits = logits[..., :-1, :].contiguous()
-        probabilities = torch.nn.functional.log_softmax(shift_logits, dim=-1)
+        log_probabilities = torch.nn.functional.log_softmax(shift_logits, dim=-1)
         shift_labels = target_ids[..., 1:]
         if no_grads:
             shift_labels = shift_labels.cpu()
@@ -152,17 +154,23 @@

    Module mimir.models

         for i, token_id in enumerate(labels_processed):
             if token_id != -100:
-                probability = probabilities[0, i, token_id]
+                log_probability = log_probabilities[0, i, token_id]
                 if no_grads:
-                    probability = probability.item()
-                all_prob.append(probability)
+                    log_probability = log_probability.item()
+                target_token_log_prob.append(log_probability)
+                all_token_log_prob.append(log_probabilities[0, i])
+
     # Should be equal to # of tokens - 1 to account for shift
-    assert len(all_prob) == labels.size(1) - 1
+    assert len(target_token_log_prob) == labels.size(1) - 1
+    all_token_log_prob = torch.stack(all_token_log_prob, dim=0)
+    assert len(target_token_log_prob) == len(all_token_log_prob)

     if not no_grads:
-        all_prob = torch.stack(all_prob)
+        target_token_log_prob = torch.stack(target_token_log_prob)

-    return all_prob
+    if not return_all_probs:
+        return target_token_log_prob
+    return target_token_log_prob, all_token_log_prob

 @torch.no_grad()
 def get_ll(self,
@@ -1244,7 +1252,8 @@

    Inherited members

 def get_probabilities(self,
                       text: str,
                       tokens: np.ndarray = None,
-                      no_grads: bool = True):
+                      no_grads: bool = True,
+                      return_all_probs: bool = False):
     """
         Get the probabilities or log-softmaxed logits for a text under the current model.
         Args:
@@ -1272,7 +1281,8 @@

    Inherited members

         text, return_tensors="pt")
     labels = tokenized.input_ids
-    all_prob = []
+    target_token_log_prob = []
+    all_token_log_prob = []
     for i in range(0, labels.size(1), self.stride):
         begin_loc = max(i + self.stride - self.max_length, 0)
         end_loc = min(i + self.stride, labels.size(1))
@@ -1285,7 +1295,7 @@

    Inherited members

         if no_grads:
             logits = logits.cpu()
         shift_logits = logits[..., :-1, :].contiguous()
-        probabilities = torch.nn.functional.log_softmax(shift_logits, dim=-1)
+        log_probabilities = torch.nn.functional.log_softmax(shift_logits, dim=-1)
         shift_labels = target_ids[..., 1:]
         if no_grads:
             shift_labels = shift_labels.cpu()
@@ -1297,17 +1307,23 @@

    Inherited members

         for i, token_id in enumerate(labels_processed):
             if token_id != -100:
-                probability = probabilities[0, i, token_id]
+                log_probability = log_probabilities[0, i, token_id]
                 if no_grads:
-                    probability = probability.item()
-                all_prob.append(probability)
+                    log_probability = log_probability.item()
+                target_token_log_prob.append(log_probability)
+                all_token_log_prob.append(log_probabilities[0, i])
+
     # Should be equal to # of tokens - 1 to account for shift
-    assert len(all_prob) == labels.size(1) - 1
+    assert len(target_token_log_prob) == labels.size(1) - 1
+    all_token_log_prob = torch.stack(all_token_log_prob, dim=0)
+    assert len(target_token_log_prob) == len(all_token_log_prob)

     if not no_grads:
-        all_prob = torch.stack(all_prob)
+        target_token_log_prob = torch.stack(target_token_log_prob)

-    return all_prob
+    if not return_all_probs:
+        return target_token_log_prob
+    return target_token_log_prob, all_token_log_prob

 @torch.no_grad()
 def get_ll(self,
@@ -1501,7 +1517,7 @@

    Args

-def get_probabilities(self, text: str, tokens: numpy.ndarray = None, no_grads: bool = True)
+def get_probabilities(self, text: str, tokens: numpy.ndarray = None, no_grads: bool = True, return_all_probs: bool = False)

    Get the probabilities or log-softmaxed logits for a text under the current model.

    @@ -1530,7 +1546,8 @@

    Returns

    def get_probabilities(self,
                           text: str,
                           tokens: np.ndarray = None,
    -                      no_grads: bool = True):
    +                      no_grads: bool = True,
    +                      return_all_probs: bool = False):
         """
             Get the probabilities or log-softmaxed logits for a text under the current model.
             Args:
    @@ -1558,7 +1575,8 @@ 

    Returns

         text, return_tensors="pt")
     labels = tokenized.input_ids
-    all_prob = []
+    target_token_log_prob = []
+    all_token_log_prob = []
     for i in range(0, labels.size(1), self.stride):
         begin_loc = max(i + self.stride - self.max_length, 0)
         end_loc = min(i + self.stride, labels.size(1))
@@ -1571,7 +1589,7 @@

    Returns

         if no_grads:
             logits = logits.cpu()
         shift_logits = logits[..., :-1, :].contiguous()
-        probabilities = torch.nn.functional.log_softmax(shift_logits, dim=-1)
+        log_probabilities = torch.nn.functional.log_softmax(shift_logits, dim=-1)
         shift_labels = target_ids[..., 1:]
         if no_grads:
             shift_labels = shift_labels.cpu()
@@ -1583,17 +1601,23 @@

    Returns

         for i, token_id in enumerate(labels_processed):
             if token_id != -100:
-                probability = probabilities[0, i, token_id]
+                log_probability = log_probabilities[0, i, token_id]
                 if no_grads:
-                    probability = probability.item()
-                all_prob.append(probability)
+                    log_probability = log_probability.item()
+                target_token_log_prob.append(log_probability)
+                all_token_log_prob.append(log_probabilities[0, i])
+
     # Should be equal to # of tokens - 1 to account for shift
-    assert len(all_prob) == labels.size(1) - 1
+    assert len(target_token_log_prob) == labels.size(1) - 1
+    all_token_log_prob = torch.stack(all_token_log_prob, dim=0)
+    assert len(target_token_log_prob) == len(all_token_log_prob)

     if not no_grads:
-        all_prob = torch.stack(all_prob)
+        target_token_log_prob = torch.stack(target_token_log_prob)

-    return all_prob
+    if not return_all_probs:
+        return target_token_log_prob
+    return target_token_log_prob, all_token_log_prob
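
Seen from the caller's side, the new flag changes only the return arity; a short sketch, assuming model is an already-loaded mimir model and text is a Python string:

    # Default behaviour is unchanged: one log-prob per (shifted) token of text
    target_log_probs = model.get_probabilities(text)

    # With the flag set, the full per-position vocabulary distribution is also
    # returned; this is the tensor Min-K%++ standardizes against
    target_log_probs, all_log_probs = model.get_probabilities(
        text, return_all_probs=True)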