Skip to content

Commit

Permalink
bs_sent np optimization and more confs
Browse files Browse the repository at this point in the history
bs_sent: mnli allows specifying the classifier; cos_sim allows specifying the embedder
fix: #5, #10 direction 1
  • Loading branch information
TURX committed Dec 9, 2022
1 parent de4de4b commit 1a5d751
Show file tree
Hide file tree
Showing 6 changed files with 72 additions and 65 deletions.
8 changes: 6 additions & 2 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -58,8 +58,10 @@ Implemented in `/bertscore_sentence`
Usage:
```python
import functools

import bertscore_sentence.eval as bertscore_sentence
import dar_env

# Pass an embedder explicitly: the default cosine-similarity path of
# `compute` reads `embedder.__name__`, so it cannot run without one.
metrics = {
    "bertscore-sentence-cos-mpnet": functools.partial(bertscore_sentence.compute, embedder=dar_env.sent_embedder_mpnet),
    "bertscore-sentence-cos-roberta": functools.partial(bertscore_sentence.compute, embedder=dar_env.sent_embedder_roberta),
}
```

Expand All @@ -72,8 +74,10 @@ Implemented in `/mnli`
Usage:
```python
import functools

import dar_env
import mnli.eval as mnli

# `classifier` is a required argument of `bertscore_sentence_compute`,
# so every entry binds one.
metrics = {
    "bertscore-sentence-mnli-roberta": functools.partial(mnli.bertscore_sentence_compute, classifier=dar_env.mnli_classifier_roberta),
    "bertscore-sentence-mnli-bart": functools.partial(mnli.bertscore_sentence_compute, classifier=dar_env.mnli_classifier_bart),
}
```

Expand Down
60 changes: 27 additions & 33 deletions bertscore_sentence/eval.py
Original file line number Diff line number Diff line change
@@ -1,72 +1,66 @@
import sys
from os import path

file_path = path.abspath(__file__)
sys.path.append(path.dirname(path.dirname(file_path)))

import typing
import numpy as np
import torch
from tqdm.auto import trange
from dar_env import nlp
from dar_env import sent_embedder as embedder
from dar_env import nlp_spacy
import functools
import sentence_transformers


def cos_sim_mat_f(cand, ref, embedder) -> typing.Tuple[np.ndarray, typing.List[str], typing.List[str]]:
    """Build the sentence-level cosine-similarity matrix between two texts.

    Both texts are split into sentences with ``nlp_spacy`` and each sentence
    is embedded with ``embedder.encode(sentence, convert_to_numpy=True)``.

    Args:
        cand: Candidate text.
        ref: Reference text.
        embedder: Object exposing ``encode(sentence, convert_to_numpy=True)``.

    Returns:
        A tuple ``(sim_mat, cand_sents, ref_sents)``. ``sim_mat[i][j]`` is the
        cosine similarity between reference sentence ``i`` and candidate
        sentence ``j``; the two lists hold the segmented sentences.
    """

    def bert_encode(piece: str):
        # Segment first, then embed sentence by sentence.
        doc_sents = [sent.text for sent in nlp_spacy(piece).sents]
        with torch.no_grad():
            sentence_emb = [embedder.encode(sentence, convert_to_numpy=True) for sentence in doc_sents]
        return sentence_emb, doc_sents

    ref_sent_emb_list, ref_sents = bert_encode(ref)
    cand_sent_emb_list, cand_sents = bert_encode(cand)

    if not ref_sent_emb_list or not cand_sent_emb_list:
        # np.stack rejects an empty list; return an empty matrix instead.
        return np.zeros((len(ref_sents), len(cand_sents))), cand_sents, ref_sents

    ref_sent_emb = np.stack(ref_sent_emb_list, axis=0)
    cand_sent_emb = np.stack(cand_sent_emb_list, axis=0)

    # All pairwise dot products, divided by all pairwise norm products.
    numerators = np.inner(ref_sent_emb, cand_sent_emb)
    denominators = np.outer(
        np.linalg.norm(ref_sent_emb, axis=1),
        np.linalg.norm(cand_sent_emb, axis=1),
    )
    # A zero-norm embedding would otherwise yield NaN/inf; score it as 0.
    safe_out = np.zeros(numerators.shape, dtype=np.result_type(numerators, denominators, np.float32))
    sim_mat = np.divide(numerators, denominators, out=safe_out, where=denominators != 0)
    return sim_mat, cand_sents, ref_sents


def score_np(predictions: typing.List[str], references: typing.List[str], sim_mat_f: typing.Callable) -> np.ndarray:
    """Compute sentence-level precision, recall and F for each text pair.

    Args:
        predictions: Candidate texts.
        references: Reference texts, aligned index-by-index with
            ``predictions``.
        sim_mat_f: Callable invoked as ``sim_mat_f(cand=..., ref=...)`` that
            returns ``(sim_mat, cand_sents, ref_sents)``, where ``sim_mat``
            has one row per reference sentence and one column per candidate
            sentence.

    Returns:
        An array of shape ``(len(predictions), 3)`` whose columns are P, R
        and F. A pair in which either text has no sentences scores all zeros.
    """
    cands, refs = predictions, references  # simple renaming.
    all_scores = np.zeros((len(cands), 3))
    # A functools.partial has no __name__ unless the caller assigned one.
    sim_name = getattr(sim_mat_f, "__name__", repr(sim_mat_f))

    # all pieces, len(cands) == len(refs)
    for index in trange(len(cands), desc="bertscore-sentence {}".format(sim_name), leave=False):
        sim_mat, cand_sents, ref_sents = sim_mat_f(cand=cands[index], ref=refs[index])
        if not cand_sents or not ref_sents:
            # Nothing to match: avoid 1/0 and max() over an empty axis.
            continue

        # Recall: best match for each reference sentence (row-wise max).
        R = (1 / len(ref_sents)) * np.sum(np.max(sim_mat, axis=1))
        # Precision: best match for each candidate sentence (column-wise max).
        P = (1 / len(cand_sents)) * np.sum(np.max(sim_mat, axis=0))
        # Harmonic mean, defined as 0 when both P and R are 0.
        F = 2 * ((P * R) / (P + R)) if P + R != 0 else 0.0
        all_scores[index, :] = np.array([P, R, F])

    return all_scores


def compute(predictions: typing.List[str], references: typing.List[str], sim_mat_f: typing.Callable = cos_sim_mat_f) -> typing.Dict:
cands, refs = predictions, references # simple renaming.
def compute(predictions: typing.List[str], references: typing.List[str], sim_mat_f: typing.Optional[typing.Callable] = None, embedder: typing.Optional[sentence_transformers.SentenceTransformer] = None) -> typing.Dict:
cands, refs = predictions, references # simple renaming
if sim_mat_f is None: # cosine similarity by default
sim_mat_f = functools.partial(cos_sim_mat_f, embedder=embedder)
sim_mat_f.__name__ = " ".join(["cos", embedder.__name__])
score_arr = score_np(predictions=cands, references=refs, sim_mat_f=sim_mat_f)
return {
"P": score_arr[:, 0].tolist(),
Expand Down
13 changes: 9 additions & 4 deletions dar_env.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,10 +4,15 @@
import evaluate


# Sentence segmenter shared by the sentence-level metrics.
nlp_spacy = spacy.load("en_core_web_lg")
# Backward-compatible alias for modules that still import the previous name.
nlp = nlp_spacy

# MNLI classifiers. top_k=None asks the pipeline for the score of every label.
# __name__ is attached so callers can build readable progress-bar descriptions.
mnli_classifier_roberta = pipeline("text-classification", model="roberta-large-mnli", top_k=None)
mnli_classifier_roberta.__name__ = "roberta-large-mnli"
mnli_classifier_bart = pipeline("text-classification", model="facebook/bart-large-mnli", top_k=None)
mnli_classifier_bart.__name__ = "bart-large-mnli"

# Sentence embedders for the cosine-similarity variant.
sent_embedder_mpnet = sentence_transformers.SentenceTransformer("all-mpnet-base-v2")
sent_embedder_mpnet.__name__ = "all-mpnet-base-v2"
sent_embedder_roberta = sentence_transformers.SentenceTransformer("all-roberta-large-v1")
sent_embedder_roberta.__name__ = "all-roberta-large-v1"

# Reference metrics (bertscore was previously loaded twice; once is enough).
bertscore = evaluate.load("bertscore")
rouge = evaluate.load("rouge")
Expand Down
26 changes: 14 additions & 12 deletions mnli/eval.py
Original file line number Diff line number Diff line change
@@ -1,6 +1,5 @@
import sys
from os import path

file_path = path.abspath(__file__)
sys.path.append(path.dirname(path.dirname(file_path)))

Expand All @@ -9,22 +8,25 @@
import numpy as np
from mnli.sim import similarity
from dar_env import nlp
import functools
import transformers


def mnli_sim_mat(cand: str, ref: str, classifier: transformers.Pipeline) -> typing.Tuple[np.ndarray, typing.List[str], typing.List[str]]:
    """Build the sentence-level MNLI similarity matrix between two texts.

    Every (reference sentence, candidate sentence) pair is joined with a
    single space and scored by ``similarity`` using ``classifier``.

    Args:
        cand: Candidate text.
        ref: Reference text.
        classifier: Text-classification pipeline forwarded to ``similarity``.

    Returns:
        A tuple ``(sim_mat, cand_sents, ref_sents)``. ``sim_mat[i][j]`` is the
        score for reference sentence ``i`` against candidate sentence ``j``.

    Raises:
        ValueError: If ``similarity`` does not return exactly one score per
            sentence pair.
    """
    # Imported here because this module's top-level import still names `nlp`
    # rather than `nlp_spacy`.
    from dar_env import nlp_spacy

    def segmentation(piece: str) -> typing.List[str]:
        return [sent.text for sent in nlp_spacy(piece).sents]

    cand_sents = segmentation(cand)
    ref_sents = segmentation(ref)
    # Row-major order: all candidates for ref 0, then all for ref 1, ...
    sent_pairs = [" ".join([x, y]) for x in ref_sents for y in cand_sents]
    scores = similarity(sent_pairs, classifier) if sent_pairs else []
    # reshape (unlike assigning to `.flat`) fails loudly when the number of
    # scores does not match the matrix size instead of recycling values.
    sim_mat = np.asarray(scores, dtype=float).reshape(len(ref_sents), len(cand_sents))
    return sim_mat, cand_sents, ref_sents


def bertscore_sentence_compute(predictions: typing.List[str], references: typing.List[str], classifier: transformers.Pipeline) -> typing.Dict:
    """Score text pairs with the MNLI-based sentence similarity matrix.

    Args:
        predictions: Candidate texts.
        references: Reference texts, aligned with ``predictions``.
        classifier: Text-classification pipeline used to score sentence pairs.

    Returns:
        Whatever ``eval.compute`` returns for the MNLI similarity function.
    """
    sim_mat_f = functools.partial(mnli_sim_mat, classifier=classifier)
    # Label the progress bar; fall back to the class name when the pipeline
    # was not given a custom __name__.
    classifier_name = getattr(classifier, "__name__", type(classifier).__name__)
    sim_mat_f.__name__ = " ".join(["mnli", classifier_name])
    return eval.compute(predictions=predictions, references=references, sim_mat_f=sim_mat_f)
25 changes: 14 additions & 11 deletions mnli/sim.py
Original file line number Diff line number Diff line change
@@ -1,21 +1,24 @@
import sys
from os import path

file_path = path.abspath(__file__)
sys.path.append(path.dirname(path.dirname(file_path)))

from dar_env import mnli_classifier
import typing
import transformers


def similarity(sent_pairs: typing.List[str], classifier: "transformers.Pipeline") -> typing.List[float]:
    """Score sentence pairs as ``1 - P(neutral)``.

    Args:
        sent_pairs: Input strings, each holding one sentence pair.
        classifier: Callable that takes the list of inputs and returns, for
            each input, a list of ``{"label": ..., "score": ...}`` dicts.

    Returns:
        One score per input, in input order. An empty input yields ``[]``
        without calling the classifier.

    Raises:
        ValueError: If the output for some input has no NEUTRAL label.
            Skipping it would misalign the scores with the inputs.
    """
    if not sent_pairs:
        return []

    scores = []
    for pair, categories in zip(sent_pairs, classifier(sent_pairs)):
        # Label casing differs between checkpoints, so compare case-insensitively.
        neutral = next(
            (category["score"] for category in categories if category["label"].upper() == "NEUTRAL"),
            None,
        )
        if neutral is None:
            raise ValueError(f"No NEUTRAL label in classifier output for input {pair!r}")
        scores.append(1 - neutral)
    return scores


if __name__ == "__main__":
    # Manual smoke test on two unrelated sentences.
    # Imported here so that only running this file as a script pulls in the
    # classifier from dar_env.
    from dar_env import mnli_classifier_roberta

    sample_a = "Each computer program uses a region of memory called the stack to enable functions to work properly."
    sample_b = "From the outside, Les 4G, a Lyonnais bouchon (traditional restaurant), looked much like the nondescript cafe-cum-tobacco shops that can be found in most small French towns, but inside the decor was as warm and inviting as a country pub."
    # `similarity` requires the classifier as its second argument.
    print(similarity([" ".join([sample_a, sample_b])], mnli_classifier_roberta))
5 changes: 2 additions & 3 deletions topk/eval.py
Original file line number Diff line number Diff line change
@@ -1,15 +1,14 @@
import sys
from os import path

file_path = path.abspath(__file__)
sys.path.append(path.dirname(path.dirname(file_path)))

import typing
from dar_env import nlp, bertscore, rouge, bleurt
from dar_env import nlp_spacy, bertscore, rouge, bleurt


def extract_topk_doc(ref: str, topk: int) -> str:
    """Return the first ``topk`` sentences of ``ref`` joined by single spaces.

    Args:
        ref: Text to segment into sentences with ``nlp_spacy``.
        topk: Number of leading sentences to keep.

    Returns:
        The kept sentences joined with ``" "``.
    """
    doc_sents = [sent.text for sent in nlp_spacy(ref).sents]
    return " ".join(doc_sents[:topk])
Expand Down

0 comments on commit 1a5d751

Please sign in to comment.