From 60e8623710b7e2887ad37ebdbe76cd6f26675832 Mon Sep 17 00:00:00 2001 From: Alexander Milster Date: Tue, 28 Nov 2023 11:24:05 +0100 Subject: [PATCH 1/5] More similarity metrics --- .../java/de/jplag/options/SimilarityMetric.java | 9 ++++++++- .../jsonfactory/ComparisonReportWriter.java | 16 ++++++++++------ 2 files changed, 18 insertions(+), 7 deletions(-) diff --git a/core/src/main/java/de/jplag/options/SimilarityMetric.java b/core/src/main/java/de/jplag/options/SimilarityMetric.java index 0f38cb83d..ddac9997b 100644 --- a/core/src/main/java/de/jplag/options/SimilarityMetric.java +++ b/core/src/main/java/de/jplag/options/SimilarityMetric.java @@ -3,12 +3,19 @@ import java.util.function.ToDoubleFunction; import de.jplag.JPlagComparison; +import de.jplag.Match; public enum SimilarityMetric implements ToDoubleFunction { AVG("average similarity", JPlagComparison::similarity), MIN("minimum similarity", JPlagComparison::minimalSimilarity), MAX("maximal similarity", JPlagComparison::maximalSimilarity), - INTERSECTION("matched tokens", it -> (double) it.getNumberOfMatchedTokens()); + INTERSECTION("matched tokens", it -> (double) it.getNumberOfMatchedTokens()), + SYMMETRIC( + "symmetric similarity", + it -> 2.0 * it.getNumberOfMatchedTokens() / (it.firstSubmission().getNumberOfTokens() + it.secondSubmission().getNumberOfTokens())), + OVERLAP("overlap between both submissions (number of matched tokens)", JPlagComparison::getNumberOfMatchedTokens), + LONGEST_MATCH("number of tokens in the longest match", it -> it.matches().stream().mapToInt(Match::length).max().orElse(0)), + OVERALL("Sum of both submission lengths", it -> it.firstSubmission().getNumberOfTokens() + it.secondSubmission().getNumberOfTokens()); private final ToDoubleFunction similarityFunction; private final String description; diff --git a/core/src/main/java/de/jplag/reporting/jsonfactory/ComparisonReportWriter.java b/core/src/main/java/de/jplag/reporting/jsonfactory/ComparisonReportWriter.java index 
242d09138..0693cea00 100644 --- a/core/src/main/java/de/jplag/reporting/jsonfactory/ComparisonReportWriter.java +++ b/core/src/main/java/de/jplag/reporting/jsonfactory/ComparisonReportWriter.java @@ -1,9 +1,6 @@ package de.jplag.reporting.jsonfactory; -import java.util.Comparator; -import java.util.List; -import java.util.Map; -import java.util.Objects; +import java.util.*; import java.util.concurrent.ConcurrentHashMap; import java.util.concurrent.atomic.AtomicInteger; import java.util.function.Function; @@ -56,13 +53,20 @@ private void writeComparisons(String path, List comparisons) { String secondSubmissionId = submissionToIdFunction.apply(comparison.secondSubmission()); String fileName = generateComparisonName(firstSubmissionId, secondSubmissionId); addToLookUp(firstSubmissionId, secondSubmissionId, fileName); - var comparisonReport = new ComparisonReport(firstSubmissionId, secondSubmissionId, - Map.of(SimilarityMetric.AVG.name(), comparison.similarity(), SimilarityMetric.MAX.name(), comparison.maximalSimilarity()), + var comparisonReport = new ComparisonReport(firstSubmissionId, secondSubmissionId, createMetricMap(comparison), convertMatchesToReportMatches(comparison)); fileWriter.saveAsJSON(comparisonReport, path, fileName); }); } + private Map createMetricMap(JPlagComparison comparison) { + Map result = new HashMap<>(); + for (SimilarityMetric metric : SimilarityMetric.values()) { + result.put(metric.name(), metric.applyAsDouble(comparison)); + } + return result; + } + private void addToLookUp(String firstSubmissionId, String secondSubmissionId, String fileName) { writeToMap(secondSubmissionId, firstSubmissionId, fileName); writeToMap(firstSubmissionId, secondSubmissionId, fileName); From b56e4840c9c9e6f717cb8f5c21ce40666f7dc57f Mon Sep 17 00:00:00 2001 From: Alexander Milster Date: Tue, 28 Nov 2023 15:11:58 +0100 Subject: [PATCH 2/5] Fixed error in end-to-end tests due to new similarity metrics. 
--- .../src/main/java/de/jplag/endtoend/model/ExpectedResult.java | 1 + 1 file changed, 1 insertion(+) diff --git a/endtoend-testing/src/main/java/de/jplag/endtoend/model/ExpectedResult.java b/endtoend-testing/src/main/java/de/jplag/endtoend/model/ExpectedResult.java index ff2408adb..10b2b8dc4 100644 --- a/endtoend-testing/src/main/java/de/jplag/endtoend/model/ExpectedResult.java +++ b/endtoend-testing/src/main/java/de/jplag/endtoend/model/ExpectedResult.java @@ -24,6 +24,7 @@ public double getSimilarityForMetric(SimilarityMetric metric) { case MIN -> resultSimilarityMinimum(); case MAX -> resultSimilarityMaximum(); case INTERSECTION -> resultMatchedTokenNumber(); + default -> throw new IllegalArgumentException(String.format("Similarity metric %s not supported for end to end tests", metric.name())); }; } From b8b0da851c0c780a86b240791b9c23dae6f30ee6 Mon Sep 17 00:00:00 2001 From: Alexander Milster Date: Mon, 11 Dec 2023 12:11:23 +0100 Subject: [PATCH 3/5] Added new similarity metrics to the overview page. --- core/src/main/java/de/jplag/JPlagResult.java | 16 +++++-------- .../de/jplag/options/SimilarityMetric.java | 10 ++++++++ .../jsonfactory/ComparisonReportWriter.java | 12 ++-------- .../reportobject/mapper/MetricMapper.java | 24 ++++++------------- .../reportobject/mapper/MetricMapperTest.java | 10 +++++--- 5 files changed, 32 insertions(+), 40 deletions(-) diff --git a/core/src/main/java/de/jplag/JPlagResult.java b/core/src/main/java/de/jplag/JPlagResult.java index 2b1aabd32..beecb9af9 100644 --- a/core/src/main/java/de/jplag/JPlagResult.java +++ b/core/src/main/java/de/jplag/JPlagResult.java @@ -1,5 +1,6 @@ package de.jplag; +import java.util.Arrays; import java.util.List; import java.util.function.ToDoubleFunction; @@ -11,6 +12,7 @@ * Encapsulates the results of a comparison of a set of source code submissions. 
*/ public class JPlagResult { + private final static int SIMILARITY_DISTRIBUTION_SIZE = 100; private List comparisons; // comparisons whose similarity was about the specified threshold @@ -23,7 +25,6 @@ public class JPlagResult { private final int[] similarityDistribution; // 10-element array representing the similarity distribution of the detected matches. private List> clusteringResult; - private final int SIMILARITY_DISTRIBUTION_SIZE = 100; public JPlagResult(List comparisons, SubmissionSet submissions, long durationInMillis, JPlagOptions options) { // sort by similarity (descending) @@ -34,15 +35,6 @@ public JPlagResult(List comparisons, SubmissionSet submissions, similarityDistribution = calculateSimilarityDistribution(comparisons); } - /** - * Drops elements from the comparison list to free memory. Note, that this affects the similarity distribution and is - * only meant to be used if you don't need the information about comparisons with lower match similarity anymore. - * @param limit the number of comparisons to keep in the list - */ - public void dropComparisons(int limit) { - this.comparisons = this.getComparisons(limit); - } - public void setClusteringResult(List> clustering) { this.clusteringResult = clustering; } @@ -127,6 +119,10 @@ public String toString() { getDuration(), getOptions().language().getName(), submissions.numberOfSubmissions()); } + public List calculateDistributionFor(ToDoubleFunction similarityMetric) { + return Arrays.stream(calculateDistributionFor(this.comparisons, similarityMetric)).boxed().toList(); + } + /** * Note: Before, comparisons with a similarity below the given threshold were also included in the similarity matrix. 
*/ diff --git a/core/src/main/java/de/jplag/options/SimilarityMetric.java b/core/src/main/java/de/jplag/options/SimilarityMetric.java index ddac9997b..7b5f241f8 100644 --- a/core/src/main/java/de/jplag/options/SimilarityMetric.java +++ b/core/src/main/java/de/jplag/options/SimilarityMetric.java @@ -1,5 +1,7 @@ package de.jplag.options; +import java.util.HashMap; +import java.util.Map; import java.util.function.ToDoubleFunction; import de.jplag.JPlagComparison; @@ -38,4 +40,12 @@ public double applyAsDouble(JPlagComparison comparison) { public String toString() { return description; } + + public static Map createSimilarityMap(JPlagComparison comparison) { + Map result = new HashMap<>(); + for (SimilarityMetric metric : SimilarityMetric.values()) { + result.put(metric.name(), metric.applyAsDouble(comparison)); + } + return result; + } } diff --git a/core/src/main/java/de/jplag/reporting/jsonfactory/ComparisonReportWriter.java b/core/src/main/java/de/jplag/reporting/jsonfactory/ComparisonReportWriter.java index 0693cea00..6bc94e637 100644 --- a/core/src/main/java/de/jplag/reporting/jsonfactory/ComparisonReportWriter.java +++ b/core/src/main/java/de/jplag/reporting/jsonfactory/ComparisonReportWriter.java @@ -53,20 +53,12 @@ private void writeComparisons(String path, List comparisons) { String secondSubmissionId = submissionToIdFunction.apply(comparison.secondSubmission()); String fileName = generateComparisonName(firstSubmissionId, secondSubmissionId); addToLookUp(firstSubmissionId, secondSubmissionId, fileName); - var comparisonReport = new ComparisonReport(firstSubmissionId, secondSubmissionId, createMetricMap(comparison), + var comparisonReport = new ComparisonReport(firstSubmissionId, secondSubmissionId, SimilarityMetric.createSimilarityMap(comparison), convertMatchesToReportMatches(comparison)); fileWriter.saveAsJSON(comparisonReport, path, fileName); }); } - private Map createMetricMap(JPlagComparison comparison) { - Map result = new HashMap<>(); - for 
(SimilarityMetric metric : SimilarityMetric.values()) { - result.put(metric.name(), metric.applyAsDouble(comparison)); - } - return result; - } - private void addToLookUp(String firstSubmissionId, String secondSubmissionId, String fileName) { writeToMap(secondSubmissionId, firstSubmissionId, fileName); writeToMap(firstSubmissionId, secondSubmissionId, fileName); @@ -102,7 +94,7 @@ private Match convertMatchToReportMatch(JPlagComparison comparison, de.jplag.Mat List tokensFirst = comparison.firstSubmission().getTokenList().subList(match.startOfFirst(), match.endOfFirst() + 1); List tokensSecond = comparison.secondSubmission().getTokenList().subList(match.startOfSecond(), match.endOfSecond() + 1); - Comparator lineComparator = (first, second) -> first.getLine() - second.getLine(); + Comparator lineComparator = Comparator.comparingInt(Token::getLine); Token startOfFirst = tokensFirst.stream().min(lineComparator).orElseThrow(); Token endOfFirst = tokensFirst.stream().max(lineComparator).orElseThrow(); diff --git a/core/src/main/java/de/jplag/reporting/reportobject/mapper/MetricMapper.java b/core/src/main/java/de/jplag/reporting/reportobject/mapper/MetricMapper.java index bdad683a0..ca060d908 100644 --- a/core/src/main/java/de/jplag/reporting/reportobject/mapper/MetricMapper.java +++ b/core/src/main/java/de/jplag/reporting/reportobject/mapper/MetricMapper.java @@ -1,13 +1,10 @@ package de.jplag.reporting.reportobject.mapper; -import java.util.ArrayList; -import java.util.Arrays; -import java.util.Collections; +import java.util.HashMap; import java.util.List; import java.util.Map; import java.util.function.Function; -import de.jplag.JPlagComparison; import de.jplag.JPlagResult; import de.jplag.Submission; import de.jplag.options.SimilarityMetric; @@ -29,8 +26,11 @@ public MetricMapper(Function submissionToIdFunction) { * @return Map with key as name of metric and value as distribution */ public static Map> getDistributions(JPlagResult result) { - return 
Map.of(SimilarityMetric.AVG.name(), convertDistribution(result.getSimilarityDistribution()), SimilarityMetric.MAX.name(), - convertDistribution(result.getMaxSimilarityDistribution())); + Map> distributions = new HashMap<>(); + for (SimilarityMetric metric : SimilarityMetric.values()) { + distributions.put(metric.name(), result.calculateDistributionFor(metric)); + } + return distributions; } /** @@ -41,17 +41,7 @@ public static Map> getDistributions(JPlagResult result) { public List getTopComparisons(JPlagResult result) { return result.getComparisons(result.getOptions().maximumNumberOfComparisons()).stream() .map(comparison -> new TopComparison(submissionToIdFunction.apply(comparison.firstSubmission()), - submissionToIdFunction.apply(comparison.secondSubmission()), getComparisonMetricMap(comparison))) + submissionToIdFunction.apply(comparison.secondSubmission()), SimilarityMetric.createSimilarityMap(comparison))) .toList(); } - - private Map getComparisonMetricMap(JPlagComparison comparison) { - return Map.of(SimilarityMetric.AVG.name(), comparison.similarity(), SimilarityMetric.MAX.name(), comparison.maximalSimilarity()); - } - - private static List convertDistribution(int[] array) { - List list = new ArrayList<>(Arrays.stream(array).boxed().toList()); - Collections.reverse(list); - return list; - } } diff --git a/core/src/test/java/de/jplag/reporting/reportobject/mapper/MetricMapperTest.java b/core/src/test/java/de/jplag/reporting/reportobject/mapper/MetricMapperTest.java index 6419fda3b..16feeffb0 100644 --- a/core/src/test/java/de/jplag/reporting/reportobject/mapper/MetricMapperTest.java +++ b/core/src/test/java/de/jplag/reporting/reportobject/mapper/MetricMapperTest.java @@ -5,7 +5,7 @@ import static org.mockito.Mockito.mock; import java.util.ArrayList; -import java.util.Collections; +import java.util.Arrays; import java.util.List; import java.util.Map; @@ -16,6 +16,7 @@ import de.jplag.JPlagResult; import de.jplag.Submission; import 
de.jplag.options.JPlagOptions; +import de.jplag.options.SimilarityMetric; import de.jplag.reporting.reportobject.model.TopComparison; public class MetricMapperTest { @@ -39,7 +40,8 @@ public void test_getDistributions() { Map> result = MetricMapper.getDistributions(jPlagResult); // then - Assertions.assertEquals(Map.of("AVG", EXPECTED_AVG_DISTRIBUTION, "MAX", EXPECTED_MAX_DISTRIBUTION), result); + Assertions.assertEquals(EXPECTED_AVG_DISTRIBUTION, result.get("AVG")); + Assertions.assertEquals(EXPECTED_MAX_DISTRIBUTION, result.get("MAX")); } @Test @@ -59,7 +61,6 @@ public void test_getTopComparisons() { private int[] distribution(List expectedDistribution) { var reversedDistribution = new ArrayList<>(expectedDistribution); - Collections.reverse(reversedDistribution); return reversedDistribution.stream().mapToInt(Integer::intValue).toArray(); } @@ -76,6 +77,9 @@ private JPlagResult createJPlagResult(int[] avgDistribution, int[] maxDistributi doReturn(avgDistribution).when(jPlagResult).getSimilarityDistribution(); doReturn(maxDistribution).when(jPlagResult).getMaxSimilarityDistribution(); + doReturn(Arrays.stream(avgDistribution).boxed().toList()).when(jPlagResult).calculateDistributionFor(SimilarityMetric.AVG); + doReturn(Arrays.stream(maxDistribution).boxed().toList()).when(jPlagResult).calculateDistributionFor(SimilarityMetric.MAX); + JPlagOptions options = mock(JPlagOptions.class); doReturn(createComparisonsDto.length).when(options).maximumNumberOfComparisons(); doReturn(options).when(jPlagResult).getOptions(); From 2fce557fd220eab358623acfb1e5d5c3d98df8bc Mon Sep 17 00:00:00 2001 From: Alexander Milster Date: Mon, 11 Dec 2023 12:21:03 +0100 Subject: [PATCH 4/5] Fixed tests for new similarity metrics. 
--- .../java/de/jplag/options/SimilarityMetric.java | 12 ++++++++---- .../reportobject/mapper/MetricMapperTest.java | 14 ++++++++++++-- 2 files changed, 20 insertions(+), 6 deletions(-) diff --git a/core/src/main/java/de/jplag/options/SimilarityMetric.java b/core/src/main/java/de/jplag/options/SimilarityMetric.java index 7b5f241f8..b7aa810c2 100644 --- a/core/src/main/java/de/jplag/options/SimilarityMetric.java +++ b/core/src/main/java/de/jplag/options/SimilarityMetric.java @@ -12,10 +12,14 @@ public enum SimilarityMetric implements ToDoubleFunction { MIN("minimum similarity", JPlagComparison::minimalSimilarity), MAX("maximal similarity", JPlagComparison::maximalSimilarity), INTERSECTION("matched tokens", it -> (double) it.getNumberOfMatchedTokens()), - SYMMETRIC( - "symmetric similarity", - it -> 2.0 * it.getNumberOfMatchedTokens() / (it.firstSubmission().getNumberOfTokens() + it.secondSubmission().getNumberOfTokens())), - OVERLAP("overlap between both submissions (number of matched tokens)", JPlagComparison::getNumberOfMatchedTokens), + SYMMETRIC("symmetric similarity", it -> { + int divisor = it.firstSubmission().getNumberOfTokens() + it.secondSubmission().getNumberOfTokens(); + if (divisor != 0) { + return 2.0 * it.getNumberOfMatchedTokens() / divisor; + } else { + return .0; + } + }), LONGEST_MATCH("number of tokens in the longest match", it -> it.matches().stream().mapToInt(Match::length).max().orElse(0)), OVERALL("Sum of both submission lengths", it -> it.firstSubmission().getNumberOfTokens() + it.secondSubmission().getNumberOfTokens()); diff --git a/core/src/test/java/de/jplag/reporting/reportobject/mapper/MetricMapperTest.java b/core/src/test/java/de/jplag/reporting/reportobject/mapper/MetricMapperTest.java index 16feeffb0..386efb043 100644 --- a/core/src/test/java/de/jplag/reporting/reportobject/mapper/MetricMapperTest.java +++ b/core/src/test/java/de/jplag/reporting/reportobject/mapper/MetricMapperTest.java @@ -6,6 +6,7 @@ import java.util.ArrayList; 
import java.util.Arrays; +import java.util.HashMap; import java.util.List; import java.util.Map; @@ -55,8 +56,7 @@ public void test_getTopComparisons() { // then Assertions.assertEquals( - List.of(new TopComparison("1", "2", Map.of("AVG", .7, "MAX", .8)), new TopComparison("3", "4", Map.of("AVG", .3, "MAX", .9))), - result); + List.of(new TopComparison("1", "2", buildSimilarityMap(.7, .8)), new TopComparison("3", "4", buildSimilarityMap(.3, .9))), result); } private int[] distribution(List expectedDistribution) { @@ -109,4 +109,14 @@ private record Comparison(CreateSubmission submission1, CreateSubmission submiss private record CreateSubmission(String name) { } + private Map buildSimilarityMap(double avg, double max) { + Map map = new HashMap<>(); + for (SimilarityMetric value : SimilarityMetric.values()) { + map.put(value.name(), 0d); + } + map.put(SimilarityMetric.AVG.name(), avg); + map.put(SimilarityMetric.MAX.name(), max); + return map; + } + } \ No newline at end of file From 9e49ba78cc7b9a1c66436caa1aafe017747d10fb Mon Sep 17 00:00:00 2001 From: Alexander Vogt Date: Wed, 13 Dec 2023 14:34:52 +0100 Subject: [PATCH 5/5] reverse distribution in report viewer --- report-viewer/src/model/Distribution.ts | 2 +- report-viewer/src/model/HundredValueDistribution.ts | 6 ++---- report-viewer/tests/unit/model/Distribution.test.ts | 6 +++--- 3 files changed, 6 insertions(+), 8 deletions(-) diff --git a/report-viewer/src/model/Distribution.ts b/report-viewer/src/model/Distribution.ts index 00da64d22..d4a67838c 100644 --- a/report-viewer/src/model/Distribution.ts +++ b/report-viewer/src/model/Distribution.ts @@ -6,7 +6,7 @@ export abstract class Distribution { } /** - * Returns the distribution summed at every tenth percentile + * Returns the distribution summed at every tenth percentile, the last percentile (90%-100%) should be at index 0 */ public abstract splitIntoTenBuckets(): number[] } diff --git a/report-viewer/src/model/HundredValueDistribution.ts
b/report-viewer/src/model/HundredValueDistribution.ts index 91c2831d1..b3c37aafb 100644 --- a/report-viewer/src/model/HundredValueDistribution.ts +++ b/report-viewer/src/model/HundredValueDistribution.ts @@ -9,13 +9,11 @@ export class HundredValueDistribution extends Distribution { super(distribution) } - /** - * Returns the distribution summed at every tenth percentile - */ public splitIntoTenBuckets(): number[] { const tenValueArray = new Array(10).fill(0) + const reversedDistribution = [...this._distribution].reverse() for (let i = 99; i >= 0; i--) { - tenValueArray[Math.floor(i / 10)] += this._distribution[i] + tenValueArray[Math.floor(i / 10)] += reversedDistribution[i] } return tenValueArray } diff --git a/report-viewer/tests/unit/model/Distribution.test.ts b/report-viewer/tests/unit/model/Distribution.test.ts index f5b6fa301..6818c81c3 100644 --- a/report-viewer/tests/unit/model/Distribution.test.ts +++ b/report-viewer/tests/unit/model/Distribution.test.ts @@ -7,10 +7,10 @@ describe('Distribution', () => { it.each([ new TenValueDistribution([0, 0, 0, 0, 0, 0, 26, 13209, 58955, 5231]), new HundredValueDistribution([ + 0, 7, 15, 42, 109, 225, 470, 869, 1442, 2052, 3025, 4056, 5091, 6130, 7023, 7292, 7445, 7177, + 6343, 5373, 4309, 3163, 2244, 1544, 923, 493, 273, 168, 61, 31, 8, 12, 2, 1, 0, 1, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, - 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, - 0, 2, 1, 0, 1, 2, 12, 8, 31, 61, 168, 273, 493, 923, 1544, 2244, 3163, 4309, 5373, 6343, 7177, - 7445, 7292, 7023, 6130, 5091, 4056, 3025, 2052, 1442, 869, 470, 225, 109, 42, 15, 7, 0 + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 ]) ])('get in 10 Buckets', (distribution: Distribution) => { expect(distribution.splitIntoTenBuckets()).toEqual([0, 0, 0, 0, 0, 0, 26, 13209, 58955, 5231])