Commit
Merge branch 'main' into genai-explainer
Showing 9 changed files with 569 additions and 0 deletions.
responsibleai_text/responsibleai_text/utils/genai_metrics/constants.py: 16 additions & 0 deletions
@@ -0,0 +1,16 @@
# Copyright (c) Microsoft Corporation
# Licensed under the MIT License.

"""Constants for genai_metrics."""

_CITATION = """
"""

_SYS_PROMPT = """
You are an AI assistant. You will be given the definition of an evaluation \
metric for assessing the quality of an answer in a question-answering task. \
Your job is to compute an accurate evaluation score using the provided \
evaluation metric.
Your response will be used in automated evaluation of question-answering \
systems, and must be an integer between 1 and 5, and nothing else.
""".strip()
responsibleai_text/responsibleai_text/utils/genai_metrics/metrics.py: 32 additions & 0 deletions
@@ -0,0 +1,32 @@
# Copyright (c) Microsoft Corporation
# Licensed under the MIT License.

"""Compute AI-assisted metrics for generative text models."""

import logging
from pathlib import Path

module_logger = logging.getLogger(__name__)
module_logger.setLevel(logging.INFO)

try:
    import evaluate
except ImportError:
    module_logger.debug(
        'Could not import evaluate, required if using a genai model')


def get_genai_metric(metric_name, **metric_kwargs):
    """Get the metric from the genai library.

    :param metric_name: The name of the metric.
    :type metric_name: str
    :param metric_kwargs: The keyword arguments to pass to the metric.
    :type metric_kwargs: dict
    :return: The computed metric scores.
    :rtype: dict
    """
    curr_file_dir = Path(__file__).resolve().parent
    metric = evaluate.load(
        str(curr_file_dir.joinpath(f'scripts/{metric_name}.py')))
    return metric.compute(**metric_kwargs)
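
For orientation (not part of the diff), a minimal usage sketch of get_genai_metric: the function loads scripts/<metric_name>.py through evaluate.load and forwards every keyword argument to the metric's compute, which hands wrapper_model on to _compute_metric. The wrapper object and example inputs below are assumptions.

from responsibleai_text.utils.genai_metrics.metrics import get_genai_metric

# my_llm_wrapper is a hypothetical object exposing predict(DataFrame) and
# returning one integer-like string per row, as _compute_metric expects.
result = get_genai_metric(
    'coherence',
    predictions=['It sat on the mat.'],
    references=['What did the cat do?'],
    wrapper_model=my_llm_wrapper)
# result resembles {'scores': [5]}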
responsibleai_text/responsibleai_text/utils/genai_metrics/scripts/_compute.py: 38 additions & 0 deletions
@@ -0,0 +1,38 @@
# Copyright (c) Microsoft Corporation
# Licensed under the MIT License.

"""Helper function to compute metrics."""

import pandas as pd

from responsibleai_text.utils.genai_metrics.constants import _SYS_PROMPT


def format_str(s, **kwargs):
    """Zip all the kwargs together and format the string in a loop."""
    keys = list(kwargs.keys())
    lists = [kwargs[k] for k in keys]
    formatted = []
    for vals in zip(*lists):
        fmt_kwargs = {k: v for k, v in zip(keys, vals)}
        formatted.append(s.format(**fmt_kwargs))
    return formatted


def _compute_metric(template, logger, wrapper_model, **kwargs):
    m = []
    templated_ques = format_str(template, **kwargs)

    inp = pd.DataFrame({
        'questions': templated_ques,
        'sys_prompt': _SYS_PROMPT})

    responses = wrapper_model.predict(inp)

    for r in responses:
        try:
            m.append(int(r))
        except ValueError as e:
            logger.warning('Failed to parse metric `%s`: %s', r, e)
            m.append(0)
    return {'scores': m}
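
To make the helper concrete, a small illustration of format_str (example values invented): each keyword argument is a list of equal length, the lists are zipped row by row, and the template is formatted once per row.

template = 'QUESTION:\n{question}\nANSWER:\n{prediction}'
format_str(template,
           question=['What did the cat do?', 'Where is Paris?'],
           prediction=['It sat on the mat.', 'It is in France.'])
# -> ['QUESTION:\nWhat did the cat do?\nANSWER:\nIt sat on the mat.',
#     'QUESTION:\nWhere is Paris?\nANSWER:\nIt is in France.']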
responsibleai_text/responsibleai_text/utils/genai_metrics/scripts/coherence.py: 87 additions & 0 deletions
@@ -0,0 +1,87 @@
# Copyright (c) Microsoft Corporation
# Licensed under the MIT License.

"""Coherence metric."""

import logging

from responsibleai_text.utils.genai_metrics.constants import _CITATION
from responsibleai_text.utils.genai_metrics.scripts._compute import \
    _compute_metric

module_logger = logging.getLogger(__name__)
module_logger.setLevel(logging.INFO)

try:
    import evaluate
except ImportError:
    module_logger.debug(
        'Could not import evaluate, required if using a genai model')

try:
    import datasets
except ImportError:
    module_logger.debug(
        'Could not import datasets, required if using a genai model')

logger = evaluate.logging.get_logger(__name__)

_DESCRIPTION = """The coherence metric.
"""

_KWARGS_DESCRIPTION = """
**SOME DESCRIPTION**
"""

_TEMPLATE = """
Coherence of an answer is measured by how well all the sentences fit together \
and sound naturally as a whole. Consider the overall quality of the answer \
when evaluating coherence. Given the question and answer, score the coherence \
of answer between one to five stars using the following rating scale:
One star: the answer completely lacks coherence
Two stars: the answer mostly lacks coherence
Three stars: the answer is partially coherent
Four stars: the answer is mostly coherent
Five stars: the answer has perfect coherency
This rating value should always be an integer between 1 and 5. So the rating \
produced should be 1 or 2 or 3 or 4 or 5.
Some examples of valid responses are:
1
2
5
Some examples of invalid responses are:
1/5
1.5
3.0
5 stars
QUESTION:
{question}
ANSWER:
{prediction}
RATING:
""".strip()


@evaluate.utils.file_utils.add_start_docstrings(
    _DESCRIPTION, _KWARGS_DESCRIPTION)
class Coherence(evaluate.Metric):
    def _info(self):
        return evaluate.MetricInfo(
            description=_DESCRIPTION,
            citation=_CITATION,
            inputs_description=_KWARGS_DESCRIPTION,
            features=datasets.Features({
                "predictions": datasets.Value("string", id="sequence"),
                "references": datasets.Value("string", id="sequence")}))

    def _compute(self, *, predictions=None, references=None, **kwargs):
        return _compute_metric(
            _TEMPLATE,
            logger,
            kwargs['wrapper_model'],
            prediction=predictions,
            question=references)
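
The metric classes delegate the actual model call to kwargs['wrapper_model']. A minimal stub satisfying the contract assumed by _compute_metric (hypothetical class, purely illustrative): predict receives the DataFrame with 'questions' and 'sys_prompt' columns and returns one integer-like string per row.

import pandas as pd


class ConstantRatingWrapper:
    """Hypothetical wrapper stub that always answers with a rating of 3."""

    def predict(self, data: pd.DataFrame):
        # A real wrapper would send each sys_prompt/question pair to an LLM;
        # this stub just returns a fixed, parseable rating per row.
        return ['3'] * len(data)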
responsibleai_text/responsibleai_text/utils/genai_metrics/scripts/equivalence.py: 83 additions & 0 deletions
@@ -0,0 +1,83 @@
# Copyright (c) Microsoft Corporation
# Licensed under the MIT License.

"""Equivalence metric."""

import logging

from responsibleai_text.utils.genai_metrics.constants import _CITATION
from responsibleai_text.utils.genai_metrics.scripts._compute import \
    _compute_metric

module_logger = logging.getLogger(__name__)
module_logger.setLevel(logging.INFO)

try:
    import evaluate
except ImportError:
    module_logger.debug(
        'Could not import evaluate, required if using a genai model')

try:
    import datasets
except ImportError:
    module_logger.debug(
        'Could not import datasets, required if using a genai model')

logger = evaluate.logging.get_logger(__name__)

_DESCRIPTION = """The equivalence metric.
"""

_KWARGS_DESCRIPTION = """
**SOME DESCRIPTION**
"""

_TEMPLATE = """
Equivalence, as a metric, measures the similarity between the predicted \
answer and the correct answer. If the information and content in the \
predicted answer is similar or equivalent to the correct answer, then the \
value of the Equivalence metric should be high, else it should be low. Given \
the question, correct answer, and predicted answer, determine the value of \
Equivalence metric using the following rating scale:
One star: the predicted answer is not at all similar to the correct answer
Two stars: the predicted answer is mostly not similar to the correct answer
Three stars: the predicted answer is somewhat similar to the correct answer
Four stars: the predicted answer is mostly similar to the correct answer
Five stars: the predicted answer is completely similar to the correct answer
This rating value should always be an integer between 1 and 5. So the rating \
produced should be 1 or 2 or 3 or 4 or 5.
QUESTION:
{question}
CORRECT ANSWER:
{answer}
PREDICTED ANSWER:
{prediction}
""".strip()


@evaluate.utils.file_utils.add_start_docstrings(
    _DESCRIPTION, _KWARGS_DESCRIPTION)
class Equivalence(evaluate.Metric):
    def _info(self):
        return evaluate.MetricInfo(
            description=_DESCRIPTION,
            citation=_CITATION,
            inputs_description=_KWARGS_DESCRIPTION,
            features=datasets.Features({
                "predictions": datasets.Value("string", id="sequence"),
                "references": datasets.Value("string", id="sequence"),
                "answers": datasets.Value("string", id="sequence")}))

    def _compute(self, *, predictions=None, references=None, **kwargs):
        return _compute_metric(
            _TEMPLATE,
            logger,
            kwargs['wrapper_model'],
            prediction=predictions,
            question=references,
            answer=kwargs['answers'])
responsibleai_text/responsibleai_text/utils/genai_metrics/scripts/fluency.py: 77 additions & 0 deletions
@@ -0,0 +1,77 @@
# Copyright (c) Microsoft Corporation
# Licensed under the MIT License.

"""Fluency metric."""

import logging

from responsibleai_text.utils.genai_metrics.constants import _CITATION
from responsibleai_text.utils.genai_metrics.scripts._compute import \
    _compute_metric

module_logger = logging.getLogger(__name__)
module_logger.setLevel(logging.INFO)

try:
    import evaluate
except ImportError:
    module_logger.debug(
        'Could not import evaluate, required if using a genai model')

try:
    import datasets
except ImportError:
    module_logger.debug(
        'Could not import datasets, required if using a genai model')

logger = evaluate.logging.get_logger(__name__)

_DESCRIPTION = """The fluency metric.
"""

_KWARGS_DESCRIPTION = """
**SOME DESCRIPTION**
"""

_TEMPLATE = """
Fluency measures the quality of individual sentences in the answer, and \
whether they are well-written and grammatically correct. Consider the quality \
of individual sentences when evaluating fluency. Given the question and \
answer, score the fluency of the answer between one to five stars using the \
following rating scale:
One star: the answer completely lacks fluency
Two stars: the answer mostly lacks fluency
Three stars: the answer is partially fluent
Four stars: the answer is mostly fluent
Five stars: the answer has perfect fluency
This rating value should always be an integer between 1 and 5. So the rating \
produced should be 1 or 2 or 3 or 4 or 5.
QUESTION:
{question}
ANSWER:
{prediction}
""".strip()


@evaluate.utils.file_utils.add_start_docstrings(
    _DESCRIPTION, _KWARGS_DESCRIPTION)
class Fluency(evaluate.Metric):
    def _info(self):
        return evaluate.MetricInfo(
            description=_DESCRIPTION,
            citation=_CITATION,
            inputs_description=_KWARGS_DESCRIPTION,
            features=datasets.Features({
                "predictions": datasets.Value("string", id="sequence"),
                "references": datasets.Value("string", id="sequence")}))

    def _compute(self, *, predictions=None, references=None, **kwargs):
        return _compute_metric(
            _TEMPLATE,
            logger,
            kwargs['wrapper_model'],
            prediction=predictions,
            question=references)