refactor evaluation

SeanLee97 · Jul 26, 2024 · c51e05d · c51e05d
1 parent ccce515
commit c51e05d
Show file tree

Hide file tree

Showing 5 changed files with 138 additions and 76 deletions.
diff --git a/angle_emb/__init__.py b/angle_emb/__init__.py
@@ -1,6 +1,7 @@
 # -*- coding: utf-8 -*-
 
-from .angle import *
+from .angle import *  # NOQA
+from .evaluation import *  # NOQA
 
 
 __version__ = '0.4.8'
diff --git a/angle_emb/angle.py b/angle_emb/angle.py
@@ -10,15 +10,10 @@
 from typing import Any, Dict, Optional, List, Union, Tuple, Callable
 from dataclasses import dataclass
 
-import scipy
-import scipy.stats
-import numpy as np
 import torch
 import torch.nn as nn
 import torch.nn.functional as F
 import bitsandbytes as bnb
-from tqdm import tqdm
-from boltons.iterutils import chunked_iter
 from datasets import Dataset
 from transformers import (
     AutoModelForCausalLM, AutoModel, AutoTokenizer,
@@ -35,6 +30,7 @@
 from peft.tuners.lora import LoraLayer
 
 from .utils import logger
+from .evaluation import CorrelationEvaluator
 
 
 DEFAULT_LLM_PATTERNS = [r'.*llama.*', r'.*qwen.*', r'.*baichuan.*', r'.*mistral.*']
@@ -237,44 +233,6 @@ def contrastive_with_negative_loss(
     return nn.CrossEntropyLoss()(scores, labels)
 
 
-def compute_corrcoef(x: np.ndarray, y: np.ndarray) -> float:
-    """
-    Compute correlation coefficients
-
-    :param x: np.ndarry, x array
-    :param y: np.ndarry, y array
-
-    :return: float
-    """
-    return scipy.stats.spearmanr(x, y).correlation
-
-
-def l2_normalize(arr: np.ndarray) -> np.ndarray:
-    """
-    Normalize array using L2
-
-    :param arr: np.ndarray, input array
-
-    :return: np.ndarray
-    """
-    norms = (arr**2).sum(axis=1, keepdims=True)**0.5
-    return arr / np.clip(norms, 1e-8, np.inf)
-
-
-def optimal_threshold(y_true: np.ndarray, y_pred: np.ndarray) -> Tuple[float, float]:
-    """
-    Compute optimal threshold
-
-    :param y_true: np.ndarray, y_true
-    :param y_pred: np.ndarray, y_true
-
-    :return: Tuple[float, float]
-    """
-    loss = lambda t: -np.mean((y_true > 0.5) == (y_pred > np.tanh(t)))  # NOQA
-    result = scipy.optimize.minimize(loss, 1, method='Powell')
-    return np.tanh(result.x), -result.fun
-
-
 def check_llm(model_name_or_path: str, llm_regex_patterns: List[str] = None) -> bool:
     if llm_regex_patterns is not None:
         llm_regex_patterns += DEFAULT_LLM_PATTERNS
@@ -1499,35 +1457,21 @@ def fit(self,
             trainer.push_to_hub()
         self.backbone.save_pretrained(output_dir)
 
-    def evaluate(self, data: Dataset, batch_size: int = 32, threshold: Optional[float] = None, device: Any = None):
-        self.backbone.eval()
-        data_collator = AngleDataCollator(
-            self.tokenizer,
-            return_tensors="pt",
-            max_length=self.max_length,
-            filter_duplicate=False,
-        )
-        y_trues, y_preds = [], []
-        # for X, y in data.make_iter(random=False):
-        for features in tqdm(chunked_iter(data, batch_size), desc='Evaluate'):
-            X = data_collator(features)
-            y = X.pop('labels', None)
-            y_trues.extend(y[::2, 0].detach().cpu().numpy())
-            with torch.no_grad():
-                X.to(device or self.device)
-                x_vecs = self.pooler(X,
-                                     pooling_strategy=self.pooling_strategy).detach().float().cpu().numpy()
-            x_vecs = l2_normalize(x_vecs)
-            pred = (x_vecs[::2] * x_vecs[1::2]).sum(1)
-            y_preds.extend(pred)
-
-        y_trues, y_preds = np.array(y_trues), np.array(y_preds)
-        corrcoef = compute_corrcoef(y_trues, y_preds)
-        if threshold is None:
-            _, accuracy = optimal_threshold(y_trues, y_preds)
-        else:
-            accuracy = np.mean((y_trues > 0.5) == (y_preds > threshold))
-        return corrcoef, accuracy
+    def evaluate(self, data: Dataset, batch_size: int = 32, metric: str = 'spearman_cosine') -> float:
+        """ evaluate
+
+        :param data: Dataset, DatasetFormats.A is required
+        :param batch_size: int. Default 32.
+        :param metric: str. Default 'spearman_cosine'.
+
+        :return: float.
+        """
+        return CorrelationEvaluator(
+            text1=data['text1'],
+            text2=data['text2'],
+            labels=data['label'],
+            batch_size=batch_size,
+        )(self)[metric]
 
     def encode(self,
                inputs: Union[List[str], Tuple[str], List[Dict], str],
@@ -1656,7 +1600,7 @@ def __init__(self,
         self.hub_private_repo = hub_private_repo
 
     def on_epoch_end(self, args, state, control, **kwargs):
-        corrcoef, accuracy = self.evaluate_fn(self.valid_ds)
+        corrcoef = self.evaluate_fn(self.valid_ds)
         if corrcoef > self.best_corrcoef:
             self.best_corrcoef = corrcoef
             print('new best corrcoef!')
@@ -1669,4 +1613,4 @@ def on_epoch_end(self, args, state, control, **kwargs):
                         private=self.hub_private_repo,
                         exist_ok=True,
                         commit_message='new best checkpoint')
-        print(f'corrcoef: {corrcoef}, accuracy: {accuracy}, best corrcoef: {self.best_corrcoef}')
+        logger.info(f'corrcoef: {corrcoef}, best corrcoef: {self.best_corrcoef}')
diff --git a/angle_emb/evaluation.py b/angle_emb/evaluation.py
@@ -0,0 +1,97 @@
+# -*- coding: utf-8 -*-
+
+from __future__ import annotations
+
+from typing import TYPE_CHECKING, List
+
+import numpy as np
+from boltons.iterutils import chunked_iter
+from tqdm import tqdm
+from sklearn.metrics.pairwise import (
+    paired_cosine_distances,
+    paired_euclidean_distances,
+    paired_manhattan_distances
+)
+from scipy.stats import pearsonr, spearmanr
+
+if TYPE_CHECKING:
+    # Needed for annotations only. angle.py imports this module while it is
+    # itself still being initialised, so importing it back at runtime fails
+    # with a circular-import error. Annotations are lazy in this module
+    # (see the __future__ import), so the name is never looked up at runtime.
+    from .angle import AnglE
+
+
+class CorrelationEvaluator(object):
+    def __init__(
+        self,
+        text1: List[str],
+        text2: List[str],
+        labels: List[float],
+        batch_size: int = 32
+    ):
+        assert len(text1) == len(text2) == len(labels), "text1, text2, and labels must have the same length"
+        self.text1 = text1
+        self.text2 = text2
+        self.labels = labels
+        self.batch_size = batch_size
+
+    def __call__(self, model: AnglE, **kwargs) -> dict:
+        """ Evaluate the model on the given dataset.
+
+        :param model: AnglE, the model to evaluate.
+        :param kwargs: Additional keyword arguments to pass to the `encode` method of the model.
+
+        :return: dict, The evaluation results.
+        """
+        embeddings1 = []
+        embeddings2 = []
+        for chunk in tqdm(chunked_iter(range(len(self.text1)), self.batch_size)):
+            batch_text1 = [self.text1[i] for i in chunk]
+            batch_text2 = [self.text2[i] for i in chunk]
+
+            batch_embeddings1 = model.encode(batch_text1, **kwargs)
+            batch_embeddings2 = model.encode(batch_text2, **kwargs)
+            embeddings1.append(batch_embeddings1)
+            embeddings2.append(batch_embeddings2)
+
+        embeddings1 = np.concatenate(embeddings1, axis=0)
+        embeddings2 = np.concatenate(embeddings2, axis=0)
+
+        cosine_labels = 1 - (paired_cosine_distances(embeddings1, embeddings2))
+        manhattan_distances = -paired_manhattan_distances(embeddings1, embeddings2)
+        euclidean_distances = -paired_euclidean_distances(embeddings1, embeddings2)
+        dot_products = [np.dot(emb1, emb2) for emb1, emb2 in zip(embeddings1, embeddings2)]
+
+        pearson_cosine, _ = pearsonr(self.labels, cosine_labels)
+        spearman_cosine, _ = spearmanr(self.labels, cosine_labels)
+
+        pearson_manhattan, _ = pearsonr(self.labels, manhattan_distances)
+        spearman_manhattan, _ = spearmanr(self.labels, manhattan_distances)
+
+        pearson_euclidean, _ = pearsonr(self.labels, euclidean_distances)
+        spearman_euclidean, _ = spearmanr(self.labels, euclidean_distances)
+
+        pearson_dot, _ = pearsonr(self.labels, dot_products)
+        spearman_dot, _ = spearmanr(self.labels, dot_products)
+
+        metrics = {
+            "pearson_cosine": pearson_cosine,
+            "spearman_cosine": spearman_cosine,
+            "pearson_manhattan": pearson_manhattan,
+            "spearman_manhattan": spearman_manhattan,
+            "pearson_euclidean": pearson_euclidean,
+            "spearman_euclidean": spearman_euclidean,
+            "pearson_dot": pearson_dot,
+            "spearman_dot": spearman_dot,
+        }
+        return metrics
+
+    def list_all_metrics(self) -> List[str]:
+        """ Get a list of all the metrics that can be computed by this evaluator.
+
+        :return: List[str], A list of all the metrics that can be computed by this evaluator.
+        """
+        return [
+            "pearson_cosine",
+            "spearman_cosine",
+            "pearson_manhattan",
+            "spearman_manhattan",
+            "pearson_euclidean",
+            "spearman_euclidean",
+            "pearson_dot",
+            "spearman_dot",
+        ]
diff --git a/requirements.txt b/requirements.txt
@@ -6,4 +6,5 @@ prettytable
 transformers>=4.32.1
 scipy
 einops
-wandb
+wandb
+scikit-learn
diff --git a/tests/test_eval.py b/tests/test_eval.py
@@ -0,0 +1,19 @@
+# -*- coding: utf-8 -*-
+
+
+def test_eval():
+    from datasets import load_dataset
+    from angle_emb import AnglE, CorrelationEvaluator
+
+    angle = AnglE.from_pretrained('WhereIsAI/UAE-Large-V1', pooling_strategy='cls')
+    eval_dataset = load_dataset('sentence-transformers/stsb', split="test")
+
+    spearman = CorrelationEvaluator(
+        text1=eval_dataset["sentence1"],
+        text2=eval_dataset["sentence2"],
+        labels=eval_dataset["score"],
+    )(angle)['spearman_cosine']
+    assert spearman > 0.9
+
+    spearman = angle.evaluate(eval_dataset)
+    assert spearman > 0.9
-Original file line number
+Diff line change
@@ Expand Up / @@ -6,4 +6,5 @@ prettytable @@
     transformers>=4.32.1
     scipy
     einops
-    wandb
+    wandb
+    scikit-learn