Skip to content

Commit

Permalink
Merge pull request #13 from intuit/perf-improvements
Browse files Browse the repository at this point in the history
- Memory usage improvements by dropping storage of "childScore" in Ma…
  • Loading branch information
manishobhatia authored Aug 9, 2019
2 parents 39fe9c6 + 1d0eb51 commit 2e3793c
Show file tree
Hide file tree
Showing 9 changed files with 3,304 additions and 89 deletions.
3 changes: 2 additions & 1 deletion README.md
Original file line number Diff line number Diff line change
Expand Up @@ -54,7 +54,8 @@ be easily configured by passing a lambda expression.
* _Levenshtein_: Gets the Levenshtein distance score using apache commons similarity library
* _Jaccard_: Gets the Jaccard score using apache commons similarity library

* __Scoring__ : Expects a ```Function<Match, Double>```, this defines functions on how to accumulate scores from Tokens into Elements and from Elements into Documents
* __Scoring__ : Expects a ```BiFunction<Match, List<Score>, Score>```, which defines how scores are accumulated
from Tokens into Elements and from Elements into Documents.
* _Simple Average_: The sum of the scores of all child matches divided by the total number of children. This is the default scoring for Elements
* _Weighted Average_: This is useful for Document Scoring, where users can input weights on elements.
For example, a phone number or email could be considered an important element for identifying a match between 2 User objects, so weights can be added to such elements.
Expand Down
13 changes: 7 additions & 6 deletions src/main/java/com/intuit/fuzzymatcher/domain/Document.java
Original file line number Diff line number Diff line change
Expand Up @@ -10,6 +10,7 @@
import java.util.List;
import java.util.Set;
import java.util.concurrent.ConcurrentHashMap;
import java.util.function.BiFunction;
import java.util.function.Function;
import java.util.function.Predicate;
import java.util.stream.Collectors;
Expand All @@ -27,7 +28,7 @@
* </ul>
*/
public class Document implements Matchable {
private Document(String key, Set<Element> elements, double threshold, Function<Match, Score> scoringFunction) {
private Document(String key, Set<Element> elements, double threshold, BiFunction<Match, List<Score>, Score> scoringFunction) {
this.key = key;
this.elements = elements;
this.threshold = threshold;
Expand All @@ -38,11 +39,11 @@ private Document(String key, Set<Element> elements, double threshold, Function<M
private Set<Element> elements;
private Set<Element> preProcessedElement;
private double threshold;
private Function<Match, Score> scoringFunction;
private BiFunction<Match, List<Score>, Score> scoringFunction;
private Boolean source;
private Set<Document> matchedWith = new HashSet<>();

private static final Function<Match, Score> DEFAULT_DOCUMENT_SCORING = ScoringFunction.getExponentialWeightedAverageScore();
private static final BiFunction<Match, List<Score>, Score> DEFAULT_DOCUMENT_SCORING = ScoringFunction.getExponentialWeightedAverageScore();

public String getKey() {
return key;
Expand Down Expand Up @@ -111,7 +112,7 @@ public long getUnmatchedChildCount(Matchable other) {
}

@Override
public Function<Match, Score> getScoringFunction() {
public BiFunction<Match, List<Score>, Score> getScoringFunction() {
return this.scoringFunction != null ? this.scoringFunction : DEFAULT_DOCUMENT_SCORING;
}

Expand Down Expand Up @@ -140,7 +141,7 @@ public static class Builder {
private String key;
private Set<Element> elements;
private double threshold = 0.5;
private Function<Match, Score> scoringFunction;
private BiFunction<Match, List<Score>, Score> scoringFunction;

public Builder(String key) {
this.key = key;
Expand All @@ -159,7 +160,7 @@ public Builder addElement(Element element) {
return this;
}

public Builder setScoringFunction(Function<Match, Score> scoringFunction) {
public Builder setScoringFunction(BiFunction<Match, List<Score>, Score> scoringFunction) {
this.scoringFunction = scoringFunction;
return this;
}
Expand Down
17 changes: 11 additions & 6 deletions src/main/java/com/intuit/fuzzymatcher/domain/Element.java
Original file line number Diff line number Diff line change
Expand Up @@ -40,17 +40,17 @@ public class Element implements Matchable {
private Function<Object, Object> preProcessFunction;
private Function<Element, Stream<Token>> tokenizerFunction;
private BiFunction<Token, Token, Double> similarityMatchFunction;
private Function<Match, Score> scoringFunction;
private BiFunction<Match, List<Score>, Score> scoringFunction;
private List<Token> tokens;

private Object preProcessedValue;

private static final Function<Match, Score> DEFAULT_ELEMENT_SCORING = ScoringFunction.getSimpleAverageScore();
private static final BiFunction<Match, List<Score>, Score> DEFAULT_ELEMENT_SCORING = ScoringFunction.getSimpleAverageScore();

public Element(ElementType type, String variance, Object value, double weight, double threshold,
Function<Object, Object> preProcessFunction,
Function<Element, Stream<Token>> tokenizerFunction,
BiFunction<Token, Token, Double> similarityMatchFunction, Function<Match, Score> scoringFunction,
BiFunction<Token, Token, Double> similarityMatchFunction, BiFunction<Match, List<Score>, Score> scoringFunction,
Function<List<Token>, Stream<Match<Token>>> matchOptimizerFunction) {
this.weight = weight;
this.elementClassification = new ElementClassification(type, variance,
Expand Down Expand Up @@ -153,7 +153,7 @@ public long getUnmatchedChildCount(Matchable other) {
}

@Override
public Function<Match, Score> getScoringFunction() {
public BiFunction<Match, List<Score>, Score> getScoringFunction() {
return this.scoringFunction;
}

Expand All @@ -167,7 +167,7 @@ public static class Builder {

private Function<Element, Stream<Token>> tokenizerFunction;
private BiFunction<Token, Token, Double> similarityMatchFunction;
private Function<Match, Score> scoringFunction;
private BiFunction<Match, List<Score>, Score> scoringFunction;
private Function<List<Token>, Stream<Match<Token>>> matchOptimizerFunction;

public Builder setType(ElementType type) {
Expand All @@ -180,6 +180,11 @@ public Builder setVariance(String variance) {
return this;
}

/**
 * Stores the given value on this builder.
 *
 * @param value the object to assign to this builder's {@code value} field
 * @return this builder, so that calls can be chained
 */
public Builder setValue(Object value) {
this.value = value;
return this;
}

public Builder setValue(String value) {
this.value = value;
return this;
Expand Down Expand Up @@ -221,7 +226,7 @@ public Builder setSimilarityMatchFunction(BiFunction<Token, Token, Double> simil
return this;
}

public Builder setScoringFunction(Function<Match, Score> scoringFunction) {
public Builder setScoringFunction(BiFunction<Match, List<Score>, Score> scoringFunction) {
this.scoringFunction = scoringFunction;
return this;
}
Expand Down
29 changes: 13 additions & 16 deletions src/main/java/com/intuit/fuzzymatcher/domain/Match.java
Original file line number Diff line number Diff line change
Expand Up @@ -16,15 +16,19 @@
*/
public class Match<T extends Matchable> {

public Match(T t, T matchedWith, List<Score> childScores) {

public Match(T t, T matchedWith) {
this.data = t;
this.matchedWith = matchedWith;
this.childScores = childScores;
}
public Match(T t, T matchedWith, List<Score> childScores) {
this(t, matchedWith);
List<Score> maxDistinctChildScores = getMaxDistinctScores(childScores);
setScore(maxDistinctChildScores);
}

public Match(T t, T matchedWith, double result) {
this.data = t;
this.matchedWith = matchedWith;
this(t, matchedWith);
this.score = new Score(result, this);
}

Expand All @@ -34,8 +38,6 @@ public Match(T t, T matchedWith, double result) {

private Score score;

private List<Score> childScores;

/**
 * Returns the object held in this match's {@code data} field.
 *
 * @return the current value of {@code data}
 */
public T getData() {
return this.data;
}
Expand All @@ -44,23 +46,18 @@ public T getMatchedWith() {
return matchedWith;
}

public void setMatchedWith(T matchedWith) {
this.matchedWith = matchedWith;
}

public double getResult() {
return this.getScore().getResult();
return this.score.getResult();
}

public Score getScore() {
if (this.score == null) {
this.score = this.data.getScoringFunction().apply(this);
}
return this.score;
}

public List<Score> getChildScores() {
return getMaxDistinctScores(this.childScores);
public void setScore(List<Score> childScores) {
if (this.score == null) {
this.score = this.data.getScoringFunction().apply(this, childScores);
}
}

private List<Score> getMaxDistinctScores(List<Score> scoreList) {
Expand Down
5 changes: 3 additions & 2 deletions src/main/java/com/intuit/fuzzymatcher/domain/Matchable.java
Original file line number Diff line number Diff line change
@@ -1,6 +1,7 @@
package com.intuit.fuzzymatcher.domain;

import java.util.function.Function;
import java.util.List;
import java.util.function.BiFunction;

/**
*
Expand All @@ -10,7 +11,7 @@ public interface Matchable {

public long getChildCount(Matchable other);

public Function<Match, Score> getScoringFunction();
public BiFunction<Match, List<Score>, Score> getScoringFunction();

public double getWeight();

Expand Down
42 changes: 19 additions & 23 deletions src/main/java/com/intuit/fuzzymatcher/function/ScoringFunction.java
Original file line number Diff line number Diff line change
Expand Up @@ -4,13 +4,14 @@
import com.intuit.fuzzymatcher.domain.Score;

import java.util.List;
import java.util.function.BiFunction;
import java.util.function.Function;
import java.util.stream.Collectors;

/**
* A functional interface that computes the Score of a Match from the list of its child Scores
*/
public interface ScoringFunction extends Function<Match, Score> {
public interface ScoringFunction extends BiFunction<Match, List<Score>, Score> {

double EXPONENT = 1.5;
double EXPONENTIAL_INCREASE_THRESHOLD = 0.9;
Expand All @@ -24,8 +25,7 @@ public interface ScoringFunction extends Function<Match, Score> {
* @return the scoring function for Average
*/
static ScoringFunction getAverageScore() {
return match -> {
List<Score> childScores = match.getChildScores();
return (match, childScores) -> {
double numerator = getSumOfResult(childScores) + getUnmatchedChildScore(match);
double denominator = getChildCount(match);
return new Score(numerator / denominator, match);
Expand All @@ -39,8 +39,7 @@ static ScoringFunction getAverageScore() {
* @return the scoring function for Simple Average
*/
static ScoringFunction getSimpleAverageScore() {
return match -> {
List<Score> childScores = match.getChildScores();
return (match, childScores) -> {
double numerator = getSumOfResult(childScores);
double denominator = getChildCount(match);
return new Score(numerator / denominator, match);
Expand All @@ -54,13 +53,12 @@ static ScoringFunction getSimpleAverageScore() {
* @return the scoring function for WeightedAverage
*/
static ScoringFunction getWeightedAverageScore() {
return match -> {
List<Score> childScoreList = match.getChildScores();
double numerator = getSumOfWeightedResult(childScoreList)
return (match, childScores) -> {
double numerator = getSumOfWeightedResult(childScores)
+ getUnmatchedChildScore(match);
double denominator = getSumOfWeights(childScoreList)
double denominator = getSumOfWeights(childScores)
+ getChildCount(match)
- childScoreList.size();
- childScores.size();
return new Score(numerator / denominator, match);
};
}
Expand All @@ -72,21 +70,20 @@ static ScoringFunction getWeightedAverageScore() {
* @return the scoring function for ExponentialAverage
*/
static ScoringFunction getExponentialAverageScore() {
return match -> {
List<Score> childScoreList = match.getChildScores();
List<Score> perfectMatchedElements = getPerfectMatchedElement(childScoreList);
return (match, childScores) -> {
List<Score> perfectMatchedElements = getPerfectMatchedElement(childScores);

if (perfectMatchedElements.size() > 1 && getSumOfResult(perfectMatchedElements) > 1) {
double numerator = getExponentiallyIncreasedValue(getSumOfResult(perfectMatchedElements))
+ getSumOfResult(getNonPerfectMatchedElement(childScoreList))
+ getSumOfResult(getNonPerfectMatchedElement(childScores))
+ getUnmatchedChildScore(match);

double denominator = getExponentiallyIncreasedValue(perfectMatchedElements.size())
+ getChildCount(match)
- perfectMatchedElements.size();
return new Score(numerator / denominator, match);
} else
return getAverageScore().apply(match);
return getAverageScore().apply(match, childScores);
};
}

Expand All @@ -97,24 +94,23 @@ static ScoringFunction getExponentialAverageScore() {
* @return the scoring function for ExponentialWeightedAverage
*/
static ScoringFunction getExponentialWeightedAverageScore() {
return match -> {
List<Score> childScoreList = match.getChildScores();
List<Score> perfectMatchedElements = getPerfectMatchedElement(childScoreList);
return (match, childScores) -> {
List<Score> perfectMatchedElements = getPerfectMatchedElement(childScores);

// Apply Exponent if match elements > 1
if (perfectMatchedElements.size() > 1 && getSumOfWeightedResult(perfectMatchedElements) > 1) {
List<Score> notPerfectMachedElements = getNonPerfectMatchedElement(childScoreList);
List<Score> notPerfectMachedElements = getNonPerfectMatchedElement(childScores);
double numerator = getExponentiallyIncreasedValue(getSumOfWeightedResult(perfectMatchedElements))
+ getSumOfWeightedResult(notPerfectMachedElements)
+ getUnmatchedChildScore(match);

double denominator = getExponentiallyIncreasedValue(getSumOfWeights(perfectMatchedElements))
+ getSumOfWeights(notPerfectMachedElements)
+ getChildCount(match)
- childScoreList.size();
- childScores.size();
return new Score(numerator / denominator, match);
} else
return getWeightedAverageScore().apply(match);
return getWeightedAverageScore().apply(match, childScores);
};
}

Expand All @@ -125,8 +121,8 @@ static ScoringFunction getExponentialWeightedAverageScore() {
* @return the scoring function for Jaccard
*/
static ScoringFunction getJaccardScore() {
return match ->
new Score((double) match.getChildScores().size() /
return (match, childScores) ->
new Score((double) childScores.size() /
((match.getData().getChildCount(match.getMatchedWith()))), match);
}

Expand Down
Loading

0 comments on commit 2e3793c

Please sign in to comment.