diff --git a/testing/scorer/scorer-core/src/main/java/io/quarkiverse/langchain4j/testing/scorer/EvaluationReport.java b/testing/scorer/scorer-core/src/main/java/io/quarkiverse/langchain4j/testing/scorer/EvaluationReport.java index b6994fae4..10454fedc 100644 --- a/testing/scorer/scorer-core/src/main/java/io/quarkiverse/langchain4j/testing/scorer/EvaluationReport.java +++ b/testing/scorer/scorer-core/src/main/java/io/quarkiverse/langchain4j/testing/scorer/EvaluationReport.java @@ -56,6 +56,17 @@ public double scoreForTag(String tag) { * @throws IOException if an error occurs while writing the report */ public void writeReport(File output) throws IOException { + writeReport(output, false); + } + + /** + * Write the report to a file using the Markdown syntax. + * + * @param output the output file, must not be {@code null} + * @param includeResult whether to include the expectedOutput and result of the evaluation in the report + * @throws IOException if an error occurs while writing the report + */ + public void writeReport(File output, boolean includeResult) throws IOException { StringBuilder buffer = new StringBuilder(); buffer.append("# Evaluation Report\n\n"); buffer.append("**Global Score**: ").append(score).append("\n\n"); @@ -69,9 +80,14 @@ public void writeReport(File output) throws IOException { } buffer.append("\n## Details\n\n"); + var detailHeader = includeResult ? "### " : "- "; for (Scorer.EvaluationResult evaluation : evaluations) { - buffer.append("- ").append(evaluation.sample().name()).append(": ") + buffer.append(detailHeader).append(evaluation.sample().name()).append(": ") .append(evaluation.passed() ? 
"PASSED" : "FAILED").append("\n"); + if (includeResult) { + buffer.append("#### Result\n").append(evaluation.result()).append("\n"); + buffer.append("#### Expected Output\n").append(evaluation.sample().expectedOutput()).append("\n"); + } } Files.write(output.toPath(), buffer.toString().getBytes()); diff --git a/testing/scorer/scorer-core/src/main/java/io/quarkiverse/langchain4j/testing/scorer/Scorer.java b/testing/scorer/scorer-core/src/main/java/io/quarkiverse/langchain4j/testing/scorer/Scorer.java index 34153814e..2c6195abe 100644 --- a/testing/scorer/scorer-core/src/main/java/io/quarkiverse/langchain4j/testing/scorer/Scorer.java +++ b/testing/scorer/scorer-core/src/main/java/io/quarkiverse/langchain4j/testing/scorer/Scorer.java @@ -39,8 +39,8 @@ public EvaluationReport evaluate(Samples samples, Function var response = execute(sample, function); LOG.infof("Evaluating sample `%s`", sample.name()); for (EvaluationStrategy strategy : strategies) { - EvaluationResult evaluation = new EvaluationResult<>(sample, - strategy.evaluate(sample, response)); + EvaluationResult evaluation = EvaluationResult.fromCompletedEvaluation(sample, + response, strategy.evaluate(sample, response)); LOG.infof("Evaluation of sample `%s` with strategy `%s`: %s", sample.name(), strategy.getClass().getSimpleName(), evaluation.passed() ? 
"OK" : "KO"); @@ -48,7 +48,7 @@ public EvaluationReport evaluate(Samples samples, Function } } catch (Throwable e) { LOG.errorf(e, "Failed to evaluate sample `%s`", sample.name()); - evaluations.add(new EvaluationResult<>(sample, false)); + evaluations.add(EvaluationResult.fromEvaluationThrowable(sample, e)); } finally { latch.countDown(); } @@ -66,7 +66,14 @@ public void close() { executor.shutdown(); } - public record EvaluationResult(EvaluationSample sample, boolean passed) { + public record EvaluationResult(EvaluationSample sample, T result, Throwable thrown, boolean passed) { + public static EvaluationResult fromCompletedEvaluation(EvaluationSample sample, T result, boolean passed) { + return new EvaluationResult<>(sample, result, null, passed); + } + + public static EvaluationResult fromEvaluationThrowable(EvaluationSample sample, Throwable thrown) { + return new EvaluationResult<>(sample, null, thrown, false); + } } private T execute(EvaluationSample sample, Function function) { diff --git a/testing/scorer/scorer-core/src/test/java/io/quarkiverse/langchain4j/testing/scorer/EvaluationReportTest.java b/testing/scorer/scorer-core/src/test/java/io/quarkiverse/langchain4j/testing/scorer/EvaluationReportTest.java index 74fcfe322..a887cd676 100644 --- a/testing/scorer/scorer-core/src/test/java/io/quarkiverse/langchain4j/testing/scorer/EvaluationReportTest.java +++ b/testing/scorer/scorer-core/src/test/java/io/quarkiverse/langchain4j/testing/scorer/EvaluationReportTest.java @@ -1,6 +1,6 @@ package io.quarkiverse.langchain4j.testing.scorer; -import static org.assertj.core.api.Assertions.*; +import static org.assertj.core.api.Assertions.assertThat; import java.io.File; import java.io.IOException; @@ -14,12 +14,14 @@ class EvaluationReportTest { @Test void globalScoreShouldBeCorrect() { // Create mock evaluations. 
- Scorer.EvaluationResult result1 = new Scorer.EvaluationResult<>( + Scorer.EvaluationResult result1 = Scorer.EvaluationResult.fromCompletedEvaluation( new EvaluationSample<>("Sample1", new Parameters(), "expected", List.of("tag1")), + "expected", true); - Scorer.EvaluationResult result2 = new Scorer.EvaluationResult<>( + Scorer.EvaluationResult result2 = Scorer.EvaluationResult.fromCompletedEvaluation( new EvaluationSample<>("Sample2", new Parameters(), "expected", List.of("tag2")), + "some-response", false); EvaluationReport report = new EvaluationReport(List.of(result1, result2)); @@ -31,16 +33,19 @@ void globalScoreShouldBeCorrect() { @Test void scoreForTagShouldBeCorrect() { // Create mock evaluations. - Scorer.EvaluationResult result1 = new Scorer.EvaluationResult<>( + Scorer.EvaluationResult result1 = Scorer.EvaluationResult.fromCompletedEvaluation( new EvaluationSample<>("Sample1", new Parameters(), "expected", List.of("tag1")), + "expected", true); - Scorer.EvaluationResult result2 = new Scorer.EvaluationResult<>( + Scorer.EvaluationResult result2 = Scorer.EvaluationResult.fromCompletedEvaluation( new EvaluationSample<>("Sample2", new Parameters(), "expected", List.of("tag2")), + "some-response", false); - Scorer.EvaluationResult result3 = new Scorer.EvaluationResult<>( + Scorer.EvaluationResult result3 = Scorer.EvaluationResult.fromCompletedEvaluation( new EvaluationSample<>("Sample3", new Parameters(), "expected", List.of("tag1", "tag2")), + "expected", true); EvaluationReport report = new EvaluationReport(List.of(result1, result2, result3)); @@ -53,12 +58,14 @@ void scoreForTagShouldBeCorrect() { @Test void writeReportShouldGenerateMarkdownFile() throws IOException { // Create mock evaluations. 
- Scorer.EvaluationResult result1 = new Scorer.EvaluationResult<>( + Scorer.EvaluationResult result1 = Scorer.EvaluationResult.fromCompletedEvaluation( new EvaluationSample<>("Sample1", new Parameters(), "expected", List.of("tag1")), + "expected", true); - Scorer.EvaluationResult result2 = new Scorer.EvaluationResult<>( + Scorer.EvaluationResult result2 = Scorer.EvaluationResult.fromCompletedEvaluation( new EvaluationSample<>("Sample2", new Parameters(), "expected", List.of("tag2")), + "some-response", false); EvaluationReport report = new EvaluationReport(List.of(result1, result2)); @@ -79,4 +86,40 @@ void writeReportShouldGenerateMarkdownFile() throws IOException { assertThat(content).contains("- Sample1: PASSED"); assertThat(content).contains("- Sample2: FAILED"); } + + @Test + void writeReportShouldGenerateMarkdownFileIncludingExpectedOutputAndResult() throws IOException { + // Create mock evaluations. + Scorer.EvaluationResult result1 = Scorer.EvaluationResult.fromCompletedEvaluation( + new EvaluationSample<>("Sample1", new Parameters(), "expected1", List.of("tag1")), + "expected1", + true); + + Scorer.EvaluationResult result2 = Scorer.EvaluationResult.fromCompletedEvaluation( + new EvaluationSample<>("Sample2", new Parameters(), "expected2", List.of("tag2")), + "some-response", + false); + + EvaluationReport report = new EvaluationReport(List.of(result1, result2)); + + // Write the report to a temporary file. 
+ File tempFile = File.createTempFile("evaluation-report", ".md"); + report.writeReport(tempFile, true); + + // Assertions + assertThat(tempFile).exists(); + String content = Files.readString(tempFile.toPath()); + assertThat(content).contains("# Evaluation Report"); + assertThat(content).contains("**Global Score**: 50.0"); + assertThat(content).contains("## Score per tags"); + assertThat(content).contains("- **tag1**: 100.0"); + assertThat(content).contains("- **tag2**: 0.0"); + assertThat(content).contains("## Details"); + assertThat(content).contains("### Sample1: PASSED"); + assertThat(content).contains("#### Result\nexpected1"); + assertThat(content).contains("#### Expected Output\nexpected1"); + assertThat(content).contains("### Sample2: FAILED"); + assertThat(content).contains("#### Result\nsome-response"); + assertThat(content).contains("#### Expected Output\nexpected2"); + } } diff --git a/testing/scorer/scorer-core/src/test/java/io/quarkiverse/langchain4j/testing/scorer/ScorerTest.java b/testing/scorer/scorer-core/src/test/java/io/quarkiverse/langchain4j/testing/scorer/ScorerTest.java index c039b762f..02dd615d2 100644 --- a/testing/scorer/scorer-core/src/test/java/io/quarkiverse/langchain4j/testing/scorer/ScorerTest.java +++ b/testing/scorer/scorer-core/src/test/java/io/quarkiverse/langchain4j/testing/scorer/ScorerTest.java @@ -27,7 +27,7 @@ void evaluateShouldReturnCorrectReport() { EvaluationSample sample1 = new EvaluationSample<>( "Sample1", new Parameters().add(new Parameter.UnnamedParameter("param1")), - "expected1", + "expected1:param1", List.of("tag1", "tag2")); EvaluationSample sample2 = new EvaluationSample<>( @@ -36,7 +36,7 @@ void evaluateShouldReturnCorrectReport() { "expected2", List.of("tag2")); - Function mockFunction = params -> "expected1"; + Function mockFunction = params -> "expected1:param1"; EvaluationStrategy strategy = (sample, actual) -> actual.equals(sample.expectedOutput()); Samples samples = new Samples<>(sample1, sample2); @@ 
-46,11 +46,12 @@ void evaluateShouldReturnCorrectReport() { assertThat(report.score()).isEqualTo(50.0); // Only one sample should pass. assertThat(report.evaluations()).hasSize(2); - Scorer.EvaluationResult result1 = report.evaluations().get(0); - assertThat(result1.passed()).isTrue(); - - Scorer.EvaluationResult result2 = report.evaluations().get(1); - assertThat(result2.passed()).isFalse(); + var actualEvaluations = report.evaluations().stream() + .map(e -> "%s[%s;%s=%s]".formatted(e.sample().name(), e.sample().expectedOutput(), e.result(), e.passed())) + .toList(); + assertThat(actualEvaluations).containsExactlyInAnyOrder( + "Sample1[expected1:param1;expected1:param1=true]", + "Sample2[expected2;expected1:param1=false]"); } @Test