Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Added metrics for genai text #2514

Merged
merged 19 commits into from
Jan 29, 2024
Merged
Show file tree
Hide file tree
Changes from 15 commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
Original file line number Diff line number Diff line change
@@ -0,0 +1,24 @@
# Copyright (c) Microsoft Corporation
# Licensed under the MIT License.

"""Compute AI-assisted metrics for generative text models."""

from pathlib import Path

import evaluate
kartikc727 marked this conversation as resolved.
Show resolved Hide resolved


def get_genai_metric(metric_name, **metric_kwargs):
    """Load a bundled genai metric script and compute it.

    The metric implementation is looked up as ``scripts/<metric_name>.py``
    relative to the directory that contains this module, loaded through
    ``evaluate.load`` and then computed with the supplied keyword
    arguments.

    :param metric_name: The name of the metric, i.e. the file stem of
        the metric script inside the ``scripts`` directory.
    :type metric_name: str
    :param metric_kwargs: The keyword arguments forwarded unchanged to
        the loaded metric's ``compute`` method.
    :type metric_kwargs: dict
    :return: Whatever the loaded metric's ``compute`` call returns.
    :rtype: dict
    """
    # Resolve against this file rather than the current working directory
    # so the lookup does not depend on where the process was started.
    module_dir = Path(__file__).resolve().parent
    script_path = module_dir / f'scripts/{metric_name}.py'
    return evaluate.load(str(script_path)).compute(**metric_kwargs)
Original file line number Diff line number Diff line change
@@ -0,0 +1,103 @@
# Copyright (c) Microsoft Corporation
# Licensed under the MIT License.

"""Coherence metric."""

import datasets
kartikc727 marked this conversation as resolved.
Show resolved Hide resolved
import evaluate
import pandas as pd

# Module-level logger obtained from the evaluate library's logging helpers;
# used in _compute to warn about model responses that fail integer parsing.
logger = evaluate.logging.get_logger(__name__)


_CITATION = """
kartikc727 marked this conversation as resolved.
Show resolved Hide resolved
"""

# Short description of the metric; passed to MetricInfo and to the
# add_start_docstrings decorator on the metric class.
_DESCRIPTION = """The coherence metric.
"""

# TODO: placeholder text -- replace with a real description of the inputs
# expected by the metric (it is surfaced as MetricInfo.inputs_description).
_KWARGS_DESCRIPTION = """
**SOME DESCRIPTION**
"""

# System prompt placed in the 'sys_prompt' column of the DataFrame that
# _compute hands to the wrapper model. Backslash line continuations keep
# each paragraph on a single line in the resulting string.
_SYS_PROMPT = """
You are an AI assistant. You will be given the definition of an evaluation \
metric for assessing the quality of an answer in a question-answering task. \
Your job is to compute an accurate evaluation score using the provided \
evaluation metric.
Your response will be used in automated evaluation of question-answering \
systems, and must be an integer between 1 and 5, and nothing else.
""".strip()

# Per-example prompt template. _compute fills {question} and {prediction}
# with str.format, so any literal braces added here would need escaping.
_TEMPLATE = """
Coherence of an answer is measured by how well all the sentences fit together \
and sound naturally as a whole. Consider the overall quality of the answer \
when evaluating coherence. Given the question and answer, score the coherence \
of answer between one to five stars using the following rating scale:
One star: the answer completely lacks coherence
Two stars: the answer mostly lacks coherence
Three stars: the answer is partially coherent
Four stars: the answer is mostly coherent
Five stars: the answer has perfect coherency

This rating value should always be an integer between 1 and 5. So the rating \
produced should be 1 or 2 or 3 or 4 or 5.
Some examples of valid responses are:
1
2
5
Some examples of invalid responses are:
1/5
1.5
3.0
5 stars

QUESTION:
{question}

ANSWER:
{prediction}

RATING:
""".strip()


@evaluate.utils.file_utils.add_start_docstrings(
    _DESCRIPTION, _KWARGS_DESCRIPTION)
class Coherence(evaluate.Metric):
    """AI-assisted coherence metric scored by a wrapped language model."""

    def _info(self):
        """Describe the metric and the input features it accepts.

        :return: The metric metadata, declaring string ``predictions``
            and ``references`` features.
        :rtype: evaluate.MetricInfo
        """
        return evaluate.MetricInfo(
            description=_DESCRIPTION,
            citation=_CITATION,
            inputs_description=_KWARGS_DESCRIPTION,
            features=datasets.Features(
                {
                    "predictions": datasets.Value("string", id="sequence"),
                    "references": datasets.Value("string", id="sequence")
                }
            ),
        )

    def _compute(self, *, predictions=None, references=None, **kwargs):
        """Ask the wrapped model to rate the coherence of each answer.

        :param predictions: The answers to rate; substituted for
            ``{prediction}`` in the prompt template.
        :type predictions: iterable of str
        :param references: The questions the answers respond to;
            substituted for ``{question}`` in the prompt template.
        :type references: iterable of str
        :param kwargs: Must contain ``wrapper_model``, an object whose
            ``predict`` method is called with a DataFrame that has
            ``questions`` and ``sys_prompt`` columns and returns an
            iterable of responses.
        :return: A dict with a single ``scores`` key holding one integer
            per response; a response that cannot be parsed as an integer
            is logged and scored as 0.
        :rtype: dict
        :raises KeyError: If ``wrapper_model`` is not supplied.
        """
        templated_ques = [
            _TEMPLATE.format(question=question, prediction=prediction)
            for prediction, question in zip(predictions, references)
        ]

        model = kwargs['wrapper_model']

        inp = pd.DataFrame({
            'questions': templated_ques,
            'sys_prompt': _SYS_PROMPT})

        responses = model.predict(inp)

        scores = []
        for response in responses:
            try:
                scores.append(int(response))
            except (TypeError, ValueError) as e:
                # TypeError covers non-string responses such as None, so a
                # single bad response cannot abort scoring of the batch.
                logger.warning(
                    'Failed to parse metric `%s`: %s', response, e)
                scores.append(0)
        return {'scores': scores}
Original file line number Diff line number Diff line change
@@ -0,0 +1,100 @@
# Copyright (c) Microsoft Corporation
# Licensed under the MIT License.

"""Equivalence metric."""

import datasets
import evaluate
import pandas as pd

# Module-level logger obtained from the evaluate library's logging helpers;
# used in _compute to warn about model responses that fail integer parsing.
logger = evaluate.logging.get_logger(__name__)


# Citation text handed to MetricInfo; currently empty apart from a newline.
_CITATION = """
"""

# Short description of the metric; passed to MetricInfo and to the
# add_start_docstrings decorator on the metric class.
_DESCRIPTION = """The equivalence metric.
"""

# TODO: placeholder text -- replace with a real description of the inputs
# expected by the metric (it is surfaced as MetricInfo.inputs_description).
_KWARGS_DESCRIPTION = """
**SOME DESCRIPTION**
"""

# System prompt placed in the 'sys_prompt' column of the DataFrame that
# _compute hands to the wrapper model. Backslash line continuations keep
# each paragraph on a single line in the resulting string.
_SYS_PROMPT = """
You are an AI assistant. You will be given the definition of an evaluation \
metric for assessing the quality of an answer in a question-answering task. \
Your job is to compute an accurate evaluation score using the provided \
evaluation metric.
Your response will be used in automated evaluation of question-answering \
systems, and must be an integer between 1 and 5, and nothing else.
""".strip()

# Per-example prompt template. _compute fills {question}, {answer} and
# {prediction} with str.format, so any literal braces added here would
# need escaping.
_TEMPLATE = """
Equivalence, as a metric, measures the similarity between the predicted \
answer and the correct answer. If the information and content in the \
predicted answer is similar or equivalent to the correct answer, then the \
value of the Equivalence metric should be high, else it should be low. Given \
the question, correct answer, and predicted answer, determine the value of \
Equivalence metric using the following rating scale:
One star: the predicted answer is not at all similar to the correct answer
Two stars: the predicted answer is mostly not similar to the correct answer
Three stars: the predicted answer is somewhat similar to the correct answer
Four stars: the predicted answer is mostly similar to the correct answer
Five stars: the predicted answer is completely similar to the correct answer

This rating value should always be an integer between 1 and 5. So the rating \
produced should be 1 or 2 or 3 or 4 or 5.

QUESTION:
{question}

CORRECT ANSWER:
{answer}

PREDICTED ANSWER:
{prediction}
""".strip()


@evaluate.utils.file_utils.add_start_docstrings(
    _DESCRIPTION, _KWARGS_DESCRIPTION)
class Equivalence(evaluate.Metric):
    """AI-assisted equivalence metric scored by a wrapped language model."""

    def _info(self):
        """Describe the metric and the input features it accepts.

        :return: The metric metadata, declaring string ``predictions``,
            ``references`` and ``answers`` features.
        :rtype: evaluate.MetricInfo
        """
        return evaluate.MetricInfo(
            description=_DESCRIPTION,
            citation=_CITATION,
            inputs_description=_KWARGS_DESCRIPTION,
            features=datasets.Features(
                {
                    "predictions": datasets.Value("string", id="sequence"),
                    "references": datasets.Value("string", id="sequence"),
                    "answers": datasets.Value("string", id="sequence")
                }
            ),
        )

    def _compute(self, *, predictions=None, references=None, **kwargs):
        """Ask the wrapped model to rate each prediction against its answer.

        :param predictions: The predicted answers to rate; substituted
            for ``{prediction}`` in the prompt template.
        :type predictions: iterable of str
        :param references: The questions being answered; substituted for
            ``{question}`` in the prompt template.
        :type references: iterable of str
        :param kwargs: Must contain ``answers``, the correct answers
            substituted for ``{answer}`` in the prompt template, and
            ``wrapper_model``, an object whose ``predict`` method is
            called with a DataFrame that has ``questions`` and
            ``sys_prompt`` columns and returns an iterable of responses.
        :return: A dict with a single ``scores`` key holding one integer
            per response; a response that cannot be parsed as an integer
            is logged and scored as 0.
        :rtype: dict
        :raises KeyError: If ``answers`` or ``wrapper_model`` is not
            supplied.
        """
        answers = kwargs['answers']
        templated_ques = [
            _TEMPLATE.format(
                question=question, prediction=prediction, answer=answer)
            for prediction, question, answer
            in zip(predictions, references, answers)
        ]

        model = kwargs['wrapper_model']

        inp = pd.DataFrame({
            'questions': templated_ques,
            'sys_prompt': _SYS_PROMPT})

        responses = model.predict(inp)

        scores = []
        for response in responses:
            try:
                scores.append(int(response))
            except (TypeError, ValueError) as e:
                # TypeError covers non-string responses such as None, so a
                # single bad response cannot abort scoring of the batch.
                logger.warning(
                    'Failed to parse metric `%s`: %s', response, e)
                scores.append(0)
        return {'scores': scores}
Original file line number Diff line number Diff line change
@@ -0,0 +1,93 @@
# Copyright (c) Microsoft Corporation
# Licensed under the MIT License.

"""Fluency metric."""

import datasets
import evaluate
import pandas as pd

# Module-level logger obtained from the evaluate library's logging helpers;
# used in _compute to warn about model responses that fail integer parsing.
logger = evaluate.logging.get_logger(__name__)


# Citation text handed to MetricInfo; currently empty apart from a newline.
_CITATION = """
"""

# Short description of the metric; passed to MetricInfo and to the
# add_start_docstrings decorator on the metric class.
_DESCRIPTION = """The fluency metric.
"""

# TODO: placeholder text -- replace with a real description of the inputs
# expected by the metric (it is surfaced as MetricInfo.inputs_description).
_KWARGS_DESCRIPTION = """
**SOME DESCRIPTION**
"""

# System prompt placed in the 'sys_prompt' column of the DataFrame that
# _compute hands to the wrapper model. Backslash line continuations keep
# each paragraph on a single line in the resulting string.
_SYS_PROMPT = """
You are an AI assistant. You will be given the definition of an evaluation \
metric for assessing the quality of an answer in a question-answering task. \
Your job is to compute an accurate evaluation score using the provided \
evaluation metric.
Your response will be used in automated evaluation of question-answering \
systems, and must be an integer between 1 and 5, and nothing else.
""".strip()

# Per-example prompt template. _compute fills {question} and {prediction}
# with str.format, so any literal braces added here would need escaping.
_TEMPLATE = """
Fluency measures the quality of individual sentences in the answer, and \
whether they are well-written and grammatically correct. Consider the quality \
of individual sentences when evaluating fluency. Given the question and \
answer, score the fluency of the answer between one to five stars using the \
following rating scale:
One star: the answer completely lacks fluency
Two stars: the answer mostly lacks fluency
Three stars: the answer is partially fluent
Four stars: the answer is mostly fluent
Five stars: the answer has perfect fluency

This rating value should always be an integer between 1 and 5. So the rating \
produced should be 1 or 2 or 3 or 4 or 5.

QUESTION:
{question}

ANSWER:
{prediction}
""".strip()


@evaluate.utils.file_utils.add_start_docstrings(
    _DESCRIPTION, _KWARGS_DESCRIPTION)
class Fluency(evaluate.Metric):
    """AI-assisted fluency metric scored by a wrapped language model."""

    def _info(self):
        """Describe the metric and the input features it accepts.

        :return: The metric metadata, declaring string ``predictions``
            and ``references`` features.
        :rtype: evaluate.MetricInfo
        """
        return evaluate.MetricInfo(
            description=_DESCRIPTION,
            citation=_CITATION,
            inputs_description=_KWARGS_DESCRIPTION,
            features=datasets.Features(
                {
                    "predictions": datasets.Value("string", id="sequence"),
                    "references": datasets.Value("string", id="sequence")
                }
            ),
        )

    def _compute(self, *, predictions=None, references=None, **kwargs):
        """Ask the wrapped model to rate the fluency of each answer.

        :param predictions: The answers to rate; substituted for
            ``{prediction}`` in the prompt template.
        :type predictions: iterable of str
        :param references: The questions the answers respond to;
            substituted for ``{question}`` in the prompt template.
        :type references: iterable of str
        :param kwargs: Must contain ``wrapper_model``, an object whose
            ``predict`` method is called with a DataFrame that has
            ``questions`` and ``sys_prompt`` columns and returns an
            iterable of responses.
        :return: A dict with a single ``scores`` key holding one integer
            per response; a response that cannot be parsed as an integer
            is logged and scored as 0.
        :rtype: dict
        :raises KeyError: If ``wrapper_model`` is not supplied.
        """
        templated_ques = [
            _TEMPLATE.format(question=question, prediction=prediction)
            for prediction, question in zip(predictions, references)
        ]

        model = kwargs['wrapper_model']

        inp = pd.DataFrame({
            'questions': templated_ques,
            'sys_prompt': _SYS_PROMPT})

        responses = model.predict(inp)

        scores = []
        for response in responses:
            try:
                scores.append(int(response))
            except (TypeError, ValueError) as e:
                # TypeError covers non-string responses such as None, so a
                # single bad response cannot abort scoring of the batch.
                logger.warning(
                    'Failed to parse metric `%s`: %s', response, e)
                scores.append(0)
        return {'scores': scores}
Loading
Loading