add genai metrics endpoint in UI for model overview metrics (#2517) (#…
imatiach-msft authored Jan 30, 2024
1 parent 7aa72fb commit 84428aa
Showing 15 changed files with 367 additions and 28 deletions.
14 changes: 14 additions & 0 deletions apps/widget/src/app/ModelAssessment.tsx
@@ -71,6 +71,20 @@ export class ModelAssessment extends React.Component<IModelAssessmentProps> {
        abortSignal
      );
    };
    callBack.requestGenerativeTextMetrics = async (
      selectionIndexes: number[][],
      generativeTextCache: Map<string, Map<string, number>>,
      abortSignal: AbortSignal
    ): Promise<any[]> => {
      const parameters = [selectionIndexes, generativeTextCache];
      return connectToFlaskServiceWithBackupCall(
        this.props.config,
        parameters,
        "handle_generative_text_json",
        "/get_generative_text_metrics",
        abortSignal
      );
    };
    callBack.requestMatrix = async (
      data: any[]
    ): Promise<IErrorAnalysisMatrix> => {
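A minimal usage sketch (not part of this commit) of how a host might invoke the new callback; the function name `fetchGenAiMetrics` and the sample cohort index arrays below are illustrative assumptions, not code from the repository.

```typescript
// Sketch only: exercising the requestGenerativeTextMetrics callback shape.
// The widget's implementation above forwards these arguments to the Flask
// backend's handle_generative_text_json method via /get_generative_text_metrics.
type RequestGenerativeTextMetrics = (
  selectionIndexes: number[][],
  generativeTextCache: Map<string, Map<string, number>>,
  abortSignal: AbortSignal
) => Promise<any[]>;

async function fetchGenAiMetrics(
  request: RequestGenerativeTextMetrics
): Promise<any[]> {
  // One array of dataset row indexes per cohort (values are illustrative).
  const selectionIndexes: number[][] = [
    [0, 1, 2],
    [3, 4, 5]
  ];
  // Previously computed metrics, keyed per cohort; empty on the first request.
  const generativeTextCache = new Map<string, Map<string, number>>();
  const controller = new AbortController();
  return request(selectionIndexes, generativeTextCache, controller.signal);
}
```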
1 change: 1 addition & 0 deletions apps/widget/src/app/ModelAssessmentUtils.tsx
@@ -16,6 +16,7 @@ export interface IModelAssessmentProps {
export type CallbackType = Pick<
  IModelAssessmentDashboardProps,
  | "requestExp"
  | "requestGenerativeTextMetrics"
  | "requestObjectDetectionMetrics"
  | "requestPredictions"
  | "requestQuestionAnsweringMetrics"
1 change: 1 addition & 0 deletions libs/core-ui/src/index.ts
@@ -56,6 +56,7 @@ export * from "./lib/util/getFilterBoundsArgs";
export * from "./lib/util/calculateBoxData";
export * from "./lib/util/calculateConfusionMatrixData";
export * from "./lib/util/calculateLineData";
export * from "./lib/util/GenerativeTextStatisticsUtils";
export * from "./lib/util/MultilabelStatisticsUtils";
export * from "./lib/util/ObjectDetectionStatisticsUtils";
export * from "./lib/util/QuestionAnsweringStatisticsUtils";
7 changes: 7 additions & 0 deletions libs/core-ui/src/lib/Context/ModelAssessmentContext.tsx
@@ -140,6 +140,13 @@ export interface IModelAssessmentContext {
  requestExp?:
    | ((index: number | number[], abortSignal: AbortSignal) => Promise<any[]>)
    | undefined;
  requestGenerativeTextMetrics?:
    | ((
        selectionIndexes: number[][],
        generativeTextCache: Map<string, Map<string, number>>,
        abortSignal: AbortSignal
      ) => Promise<any[]>)
    | undefined;
  requestObjectDetectionMetrics?:
    | ((
        selectionIndexes: number[][],
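Because the context member is declared optional, consumers have to check for `undefined` before using it. A hedged sketch of that pattern follows; the interface and function names are invented for illustration and are not part of this change.

```typescript
// Sketch only: guarding the optional context callback before requesting
// per-cohort generative text metrics.
interface IGenAiMetricsContext {
  requestGenerativeTextMetrics?: (
    selectionIndexes: number[][],
    generativeTextCache: Map<string, Map<string, number>>,
    abortSignal: AbortSignal
  ) => Promise<any[]>;
}

async function loadGenAiMetrics(
  context: IGenAiMetricsContext,
  selectionIndexes: number[][],
  cache: Map<string, Map<string, number>>
): Promise<any[]> {
  if (!context.requestGenerativeTextMetrics) {
    // No backend endpoint registered; nothing to fetch.
    return [];
  }
  const controller = new AbortController();
  return context.requestGenerativeTextMetrics(
    selectionIndexes,
    cache,
    controller.signal
  );
}
```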
1 change: 1 addition & 0 deletions libs/core-ui/src/lib/Interfaces/IExplanationContext.ts
@@ -8,6 +8,7 @@ import { JointDataset } from "../util/JointDataset";
export enum ModelTypes {
  Regression = "regression",
  Binary = "binary",
  GenerativeText = "generativetext",
  Multiclass = "multiclass",
  ImageBinary = "imagebinary",
  ImageMulticlass = "imagemulticlass",
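Elsewhere in this diff, model types are checked either directly (`modelType === ModelTypes.GenerativeText`) or through small predicates such as `IsBinary` and `IsMultilabel`. A hypothetical predicate in the same style for the new enum value might look like the following; the helper and its import path are assumptions, not part of this commit.

```typescript
// Hypothetical helper (not in this commit), following the IsBinary/IsMultilabel
// style referenced in StatisticsUtils.ts and MetricSelector below.
import { ModelTypes } from "../Interfaces/IExplanationContext";

export function IsGenerativeText(modelType: ModelTypes): boolean {
  return modelType === ModelTypes.GenerativeText;
}
```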
88 changes: 88 additions & 0 deletions libs/core-ui/src/lib/util/GenerativeTextStatisticsUtils.ts
@@ -0,0 +1,88 @@
// Copyright (c) Microsoft Corporation.
// Licensed under the MIT License.

import { localization } from "@responsible-ai/localization";

import {
  ILabeledStatistic,
  TotalCohortSamples
} from "../Interfaces/IStatistic";

import { QuestionAnsweringMetrics } from "./QuestionAnsweringStatisticsUtils";

export enum GenerativeTextMetrics {
  Coherence = "coherence",
  Fluency = "fluency",
  Equivalence = "equivalence",
  Groundedness = "groundedness",
  Relevance = "relevance"
}

export const generateGenerativeTextStats: (
  selectionIndexes: number[][],
  generativeTextCache: Map<string, Map<string, number>>
) => ILabeledStatistic[][] = (
  selectionIndexes: number[][],
  generativeTextCache: Map<string, Map<string, number>>
): ILabeledStatistic[][] => {
  return selectionIndexes.map((selectionArray) => {
    const count = selectionArray.length;

    const value = generativeTextCache.get(selectionArray.toString());
    const stat: Map<string, number> = value ? value : new Map<string, number>();

    const stats = [
      {
        key: TotalCohortSamples,
        label: localization.Interpret.Statistics.samples,
        stat: count
      }
    ];
    for (const [key, value] of stat.entries()) {
      let label = "";
      switch (key) {
        case GenerativeTextMetrics.Coherence:
          label = localization.Interpret.Statistics.coherence;
          break;
        case GenerativeTextMetrics.Fluency:
          label = localization.Interpret.Statistics.fluency;
          break;
        case GenerativeTextMetrics.Equivalence:
          label = localization.Interpret.Statistics.equivalence;
          break;
        case GenerativeTextMetrics.Groundedness:
          label = localization.Interpret.Statistics.groundedness;
          break;
        case GenerativeTextMetrics.Relevance:
          label = localization.Interpret.Statistics.relevance;
          break;
        case QuestionAnsweringMetrics.ExactMatchRatio:
          label = localization.Interpret.Statistics.exactMatchRatio;
          break;
        case QuestionAnsweringMetrics.F1Score:
          label = localization.Interpret.Statistics.f1Score;
          break;
        case QuestionAnsweringMetrics.MeteorScore:
          label = localization.Interpret.Statistics.meteorScore;
          break;
        case QuestionAnsweringMetrics.BleuScore:
          label = localization.Interpret.Statistics.bleuScore;
          break;
        case QuestionAnsweringMetrics.BertScore:
          label = localization.Interpret.Statistics.bertScore;
          break;
        case QuestionAnsweringMetrics.RougeScore:
          label = localization.Interpret.Statistics.rougeScore;
          break;
        default:
          break;
      }
      stats.push({
        key,
        label,
        stat: value
      });
    }
    return stats;
  });
};
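The cache that `generateGenerativeTextStats` consumes is keyed by the cohort's selection indexes via `Array.prototype.toString()` (a comma-joined string), matching the `generativeTextCache.get(selectionArray.toString())` lookup above. A small sketch of building such a cache, with made-up metric values:

```typescript
// Sketch only: one cohort of three rows with illustrative metric scores.
const cohortIndexes: number[][] = [[0, 1, 2]];

const generativeTextCache = new Map<string, Map<string, number>>([
  [
    cohortIndexes[0].toString(), // "0,1,2"
    new Map<string, number>([
      ["coherence", 4.2],
      ["fluency", 4.5],
      ["groundedness", 3.9]
    ])
  ]
]);

// generateGenerativeTextStats(cohortIndexes, generativeTextCache) would then
// return, for this cohort, the sample count plus one labeled statistic per
// metric key found in the cache.
```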
7 changes: 6 additions & 1 deletion libs/core-ui/src/lib/util/StatisticsUtils.ts
@@ -10,6 +10,7 @@ import {
} from "../Interfaces/IStatistic";
import { IsBinary } from "../util/ExplanationUtils";

import { generateGenerativeTextStats } from "./GenerativeTextStatisticsUtils";
import { JointDataset } from "./JointDataset";
import { ClassificationEnum } from "./JointDatasetUtils";
import { generateMulticlassStats } from "./MulticlassStatisticsUtils";
@@ -156,7 +157,8 @@ export const generateMetrics: (
  modelType: ModelTypes,
  objectDetectionCache?: Map<string, [number, number, number]>,
  objectDetectionInputs?: [string, string, number],
  questionAnsweringCache?: QuestionAnsweringCacheType
  questionAnsweringCache?: QuestionAnsweringCacheType,
  generativeTextCache?: Map<string, Map<string, number>>
): ILabeledStatistic[][] => {
  if (
    modelType === ModelTypes.ImageMultilabel ||
@@ -192,6 +194,9 @@
      objectDetectionInputs
    );
  }
  if (modelType === ModelTypes.GenerativeText && generativeTextCache) {
    return generateGenerativeTextStats(selectionIndexes, generativeTextCache);
  }
  const outcomes = jointDataset.unwrap(JointDataset.ClassificationError);
  if (IsBinary(modelType)) {
    return selectionIndexes.map((selectionArray) => {
@@ -58,9 +58,12 @@ export class MetricSelector extends React.Component<IMetricSelectorProps> {
      options.push(this.addDropdownOption(Metrics.AccuracyScore));
    } else if (
      IsMultilabel(modelType) ||
      modelType === ModelTypes.ObjectDetection
      modelType === ModelTypes.ObjectDetection ||
      modelType === ModelTypes.QuestionAnswering
    ) {
      options.push(this.addDropdownOption(Metrics.ErrorRate));
    } else if (modelType === ModelTypes.GenerativeText) {
      options.push(this.addDropdownOption(Metrics.MeanSquaredError));
    }
    return (
      <Dropdown
25 changes: 25 additions & 0 deletions libs/localization/src/lib/en.json
@@ -1221,19 +1221,24 @@
"_rSquared.comment": "the coefficient of determination, see https://en.wikipedia.org/wiki/Coefficient_of_determination",
"_recall.comment": "computed recall of model, see https://en.wikipedia.org/wiki/Evaluation_of_binary_classifiers",
"accuracy": "Accuracy: {0}",
"coherence": "Coherence: {0}",
"bleuScore": "Bleu score: {0}",
"bertScore": "Bert score: {0}",
"exactMatchRatio": "Exact match ratio: {0}",
"equivalence": "Equivalence: {0}",
"rougeScore": "Rouge Score: {0}",
"fluency": "Fluency: {0}",
"fnr": "False negative rate: {0}",
"fpr": "False positive rate: {0}",
"groundedness": "Groundedness: {0}",
"hammingScore": "Hamming score: {0}",
"meanPrediction": "Mean prediction {0}",
"meteorScore": "Meteor Score: {0}",
"mse": "Mean squared error: {0}",
"precision": "Precision: {0}",
"rSquared": "R²: {0}",
"recall": "Recall: {0}",
"relevance": "Relevance: {0}",
"selectionRate": "Selection rate: {0}",
"mae": "Mean absolute error: {0}",
"f1Score": "F1 score: {0}",
@@ -1766,10 +1771,26 @@
"name": "Accuracy score",
"description": "The fraction of data points classified correctly."
},
"coherence": {
"name": "Coherence",
"description": "Coherence of an answer is measured by how well all the sentences fit together and sound naturally as a whole."
},
"fluency": {
"name": "Fluency",
"description": "Fluency measures the quality of individual sentences in the answer, and whether they are well-written and grammatically correct."
},
"equivalence": {
"name": "Equivalence",
"description": "Equivalence, as a metric, measures the similarity between the predicted answer and the correct answer."
},
"exactMatchRatio": {
"name": "Exact match ratio",
"description": "The ratio of instances classified correctly for every label."
},
"groundedness": {
"name": "Groundedness",
"description": "Groundedness measures whether the answer follows logically from the information in the context."
},
"meteorScore": {
"name": "Meteor Score",
"description": "METEOR Score is calculated based on the harmonic mean of precision and recall, with recall weighted more than precision in question answering task."
@@ -1782,6 +1803,10 @@
"name": "Bert Score",
"description": "BERTScore focuses on computing semantic similarity between tokens of reference and machine generated text in question answering task."
},
"relevance": {
"name": "Relevance",
"description": "Relevance measures how well the answer addresses the main aspects of the question, based on the context"
},
"rougeScore": {
"name": "Rouge Score",
"description": "Rouge Score measures the ratio of words (and/or n-grams) in the reference text that appeared in the machine generated text in question answering task."