Skip to content

Commit

Permalink
Merge pull request #1872 from jplag/feature/improved-normalization
Browse files Browse the repository at this point in the history
Improve code quality of token sequence normalization
  • Loading branch information
tsaglam authored Jul 31, 2024
2 parents 6e4ef50 + 86af3f8 commit 36f025c
Show file tree
Hide file tree
Showing 9 changed files with 112 additions and 86 deletions.
4 changes: 2 additions & 2 deletions core/src/main/java/de/jplag/Submission.java
Original file line number Diff line number Diff line change
Expand Up @@ -24,7 +24,7 @@
import org.slf4j.LoggerFactory;

import de.jplag.exceptions.LanguageException;
import de.jplag.normalization.TokenStringNormalizer;
import de.jplag.normalization.TokenSequenceNormalizer;
import de.jplag.options.JPlagOptions;

/**
Expand Down Expand Up @@ -259,7 +259,7 @@ private static File createErrorDirectory(String... subdirectoryNames) {
*/
void normalize() {
List<Integer> originalOrder = getOrder(tokenList);
tokenList = TokenStringNormalizer.normalize(tokenList);
tokenList = TokenSequenceNormalizer.normalize(tokenList);
List<Integer> normalizedOrder = getOrder(tokenList);

logger.debug("original line order: {}", originalOrder);
Expand Down
5 changes: 5 additions & 0 deletions core/src/main/java/de/jplag/SubmissionSet.java
Original file line number Diff line number Diff line change
Expand Up @@ -99,6 +99,11 @@ public List<Submission> getInvalidSubmissions() {
return invalidSubmissions;
}

/**
* Normalizes the token sequences of all submissions (including basecode). This makes the token sequence invariant to
* dead code insertion and independent statement reordering by removing dead tokens and optionally reordering tokens to
* a deterministic order.
*/
public void normalizeSubmissions() {
if (baseCodeSubmission != null) {
baseCodeSubmission.normalize();
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -6,7 +6,7 @@
import de.jplag.semantics.Variable;

/**
* Models a multiple edge in the normalization graph. Contains multiple edges.
* Models multiple edges between two nodes in the normalization graph.
*/
class MultipleEdge {
private final Set<Edge> edges;
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -14,21 +14,28 @@
import de.jplag.semantics.Variable;

/**
* Constructs the normalization graph.
* Token normalization graph, which is a directed graph based on nodes of type {@link Statement} and edges of type
* {@link MultipleEdge}. This class inherits from {@link SimpleDirectedGraph} to provide a data structure for the
* token sequence normalization.
*/
class NormalizationGraphConstructor {
private final SimpleDirectedGraph<Statement, MultipleEdge> graph;
public class NormalizationGraph extends SimpleDirectedGraph<Statement, MultipleEdge> {

private static final long serialVersionUID = -8407465274643809647L; // generated

private int bidirectionalBlockDepth;
private final Collection<Statement> fullPositionSignificanceIncoming;
private Statement lastFullPositionSignificance;
private Statement lastPartialPositionSignificance;
private final Map<Variable, Collection<Statement>> variableReads;
private final Map<Variable, Collection<Statement>> variableWrites;
private final Set<Statement> inCurrentBidirectionalBlock;
private Statement current;

NormalizationGraphConstructor(List<Token> tokens) {
graph = new SimpleDirectedGraph<>(MultipleEdge.class);
private final transient Collection<Statement> fullPositionSignificanceIncoming;
private transient Statement lastFullPositionSignificance;
private transient Statement lastPartialPositionSignificance;
private final transient Map<Variable, Collection<Statement>> variableReads;
private final transient Map<Variable, Collection<Statement>> variableWrites;
private final transient Set<Statement> inCurrentBidirectionalBlock;
private transient Statement current;

/**
* Creates a new normalization graph.
*/
public NormalizationGraph(List<Token> tokens) {
super(MultipleEdge.class);
bidirectionalBlockDepth = 0;
fullPositionSignificanceIncoming = new ArrayList<>();
variableReads = new HashMap<>();
Expand All @@ -45,12 +52,8 @@ class NormalizationGraphConstructor {
addStatement(builderForCurrent.build());
}

SimpleDirectedGraph<Statement, MultipleEdge> get() {
return graph;
}

private void addStatement(Statement statement) {
graph.addVertex(statement);
addVertex(statement);
this.current = statement;
processBidirectionalBlock();
processFullPositionSignificance();
Expand Down Expand Up @@ -123,10 +126,10 @@ private void processWrites() {
* @param cause the variable that caused the edge, may be null
*/
private void addIncomingEdgeToCurrent(Statement start, EdgeType type, Variable cause) {
MultipleEdge multipleEdge = graph.getEdge(start, current);
MultipleEdge multipleEdge = getEdge(start, current);
if (multipleEdge == null) {
multipleEdge = new MultipleEdge();
graph.addEdge(start, current, multipleEdge);
addEdge(start, current, multipleEdge);
}
multipleEdge.addEdge(type, cause);
}
Expand All @@ -135,4 +138,5 @@ private void addVariableToMap(Map<Variable, Collection<Statement>> variableMap,
variableMap.putIfAbsent(variable, new ArrayList<>());
variableMap.get(variable).add(current);
}

}
11 changes: 8 additions & 3 deletions core/src/main/java/de/jplag/normalization/Statement.java
Original file line number Diff line number Diff line change
Expand Up @@ -8,14 +8,19 @@
import de.jplag.semantics.CodeSemantics;

/**
* Models statements, which are the nodes of the normalization graph.
* Models statements, which are the nodes of the normalization graph. A statement refers to one or more tokens.
*/
class Statement implements Comparable<Statement> {

private final List<Token> tokens;
private final int lineNumber;
private final CodeSemantics semantics;

/**
* Constructs a new Statement.
* @param tokens the list of tokens that represent this statement.
* @param lineNumber the line number where this statement occurs in the source code.
*/
Statement(List<Token> tokens, int lineNumber) {
this.tokens = Collections.unmodifiableList(tokens);
this.lineNumber = lineNumber;
Expand All @@ -30,8 +35,8 @@ CodeSemantics semantics() {
return semantics;
}

void markKeep() {
semantics.markKeep();
void markAsCritical() {
semantics.markAsCritical();
}

private int tokenOrdinal(Token token) {
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -13,6 +13,10 @@ class StatementBuilder {
private final List<Token> tokens;
private final int lineNumber;

/**
* Constructs a new StatementBuilder.
* @param lineNumber the line number where the statement starts in the source code.
*/
StatementBuilder(int lineNumber) {
this.lineNumber = lineNumber;
this.tokens = new ArrayList<>();
Expand Down
Original file line number Diff line number Diff line change
@@ -1,45 +1,50 @@
package de.jplag.normalization;

import java.util.ArrayList;
import java.util.Collections;
import java.util.LinkedList;
import java.util.List;
import java.util.PriorityQueue;
import java.util.Queue;
import java.util.stream.Collectors;

import org.jgrapht.Graphs;
import org.jgrapht.graph.SimpleDirectedGraph;

import de.jplag.Token;

/**
* Performs token sequence normalization.
*/
public class TokenStringNormalizer {
public final class TokenSequenceNormalizer {

private TokenStringNormalizer() {
private TokenSequenceNormalizer() {
// private constructor for non-instantiability.
}

/**
* Performs token sequence normalization. Tokens representing dead code have been eliminated and tokens representing
* subsequent independent statements have been put in a fixed order. Works by first constructing a Normalization Graph
* and then turning it back into a token sequence.
* subsequent independent statements have been put in a fixed order if sorting is true. Works by first constructing a
* Normalization Graph and then turning it back into a token sequence. For more information refer to the
* <a href="https://doi.org/10.1145/3639478.3643074">corresponding paper</a>.
* @param tokens The original token sequence, remains unaltered.
* @return The normalized token sequence as unmodifiable list.
* @return The normalized token sequence.
*/
public static List<Token> normalize(List<Token> tokens) {
SimpleDirectedGraph<Statement, MultipleEdge> normalizationGraph = new NormalizationGraphConstructor(tokens).get();
NormalizationGraph graph = new NormalizationGraph(tokens);
propagateCriticalityStatus(graph);
return normalizeWithSorting(tokens, graph);
}

// Add tokens in normalized original order, removing dead tokens
private static List<Token> normalizeWithSorting(List<Token> tokens, NormalizationGraph normalizationGraph) {
List<Token> normalizedTokens = new ArrayList<>(tokens.size());
spreadKeep(normalizationGraph);
PriorityQueue<Statement> roots = normalizationGraph.vertexSet().stream() //
.filter(v -> !Graphs.vertexHasPredecessors(normalizationGraph, v)) //
.collect(Collectors.toCollection(PriorityQueue::new));
while (!roots.isEmpty()) {
PriorityQueue<Statement> newRoots = new PriorityQueue<>();
do {
Statement statement = roots.poll();
if (statement.semantics().keep()) {
if (statement.semantics().isCritical()) {
normalizedTokens.addAll(statement.tokens());
}
for (Statement successor : Graphs.successorListOf(normalizationGraph, statement)) {
Expand All @@ -51,26 +56,29 @@ public static List<Token> normalize(List<Token> tokens) {
} while (!roots.isEmpty());
roots = newRoots;
}
return Collections.unmodifiableList(normalizedTokens);
return normalizedTokens;
}

/**
* Spread keep status to every node that does not represent dead code. Nodes without keep status are later eliminated.
* Spread criticality status to every node that does not represent dead code. Nodes without criticality status are later
* eliminated (dead nodes). Before calling this method, only the statements that directly affect the behavior are marked
* as critical. After calling this method, this also holds true for statements that (transitively) depend (read/write) on
* the critical ones.
*/
private static void spreadKeep(SimpleDirectedGraph<Statement, MultipleEdge> normalizationGraph) {
private static void propagateCriticalityStatus(NormalizationGraph normalizationGraph) {
Queue<Statement> visit = new LinkedList<>(normalizationGraph.vertexSet().stream() //
.filter(tl -> tl.semantics().keep()).toList());
.filter(tl -> tl.semantics().isCritical()).toList());
while (!visit.isEmpty()) {
Statement current = visit.remove();
for (Statement predecessor : Graphs.predecessorListOf(normalizationGraph, current)) { // performance of iteration?
if (!predecessor.semantics().keep() && normalizationGraph.getEdge(predecessor, current).isVariableFlow()) {
predecessor.markKeep();
if (!predecessor.semantics().isCritical() && normalizationGraph.getEdge(predecessor, current).isVariableFlow()) {
predecessor.markAsCritical();
visit.add(predecessor);
}
}
for (Statement successor : Graphs.successorListOf(normalizationGraph, current)) {
if (!successor.semantics().keep() && normalizationGraph.getEdge(current, successor).isVariableReverseFlow()) {
successor.markKeep();
if (!successor.semantics().isCritical() && normalizationGraph.getEdge(current, successor).isVariableReverseFlow()) {
successor.markAsCritical();
visit.add(successor);
}
}
Expand Down
2 changes: 1 addition & 1 deletion core/src/test/java/de/jplag/NormalizationTest.java
Original file line number Diff line number Diff line change
Expand Up @@ -39,4 +39,4 @@ void testReorderingNormalization() {
void testInsertionReorderingNormalization() {
Assertions.assertIterableEquals(originalTokenString, tokenStringMap.get("SquaresInsertedReordered.java"));
}
}
}
Loading

0 comments on commit 36f025c

Please sign in to comment.