Skip to content

Commit

Permalink
ReadOnlySpan<T> Support (#36)
Browse files Browse the repository at this point in the history
Resolves #35 for comparisons using byte[], or any scenario where, e.g., ReadOnlySpan might be preferred.
  • Loading branch information
paulirwin authored Nov 8, 2023
1 parent 1fa3892 commit cb08cfc
Show file tree
Hide file tree
Showing 22 changed files with 313 additions and 142 deletions.
1 change: 1 addition & 0 deletions .gitignore
Original file line number Diff line number Diff line change
Expand Up @@ -243,3 +243,4 @@ ModelManifest.xml
# FAKE - F# Make
.fake/
*.DS_Store
.idea/
4 changes: 2 additions & 2 deletions src/F23.StringSimilarity/Cosine.cs
Original file line number Diff line number Diff line change
Expand Up @@ -32,7 +32,7 @@ namespace F23.StringSimilarity
public class Cosine : ShingleBased, INormalizedStringSimilarity, INormalizedStringDistance
{
/// <summary>
/// Implements Cosine Similarity between strings.The strings are first
/// Implements Cosine Similarity between strings. The strings are first
/// transformed into vectors of occurrences of k-shingles (sequences of k
/// characters). In this n-dimensional space, the similarity between the two
/// strings is the cosine of their respective vectors.
Expand All @@ -41,7 +41,7 @@ public class Cosine : ShingleBased, INormalizedStringSimilarity, INormalizedStri
public Cosine(int k) : base(k) { }

/// <summary>
/// Implements Cosine Similarity between strings.The strings are first
/// Implements Cosine Similarity between strings. The strings are first
/// transformed into vectors of occurrences of k-shingles (sequences of k
/// characters). In this n-dimensional space, the similarity between the two
/// strings is the cosine of their respective vectors.
Expand Down
12 changes: 8 additions & 4 deletions src/F23.StringSimilarity/Damerau.cs
Original file line number Diff line number Diff line change
Expand Up @@ -41,7 +41,7 @@ namespace F23.StringSimilarity
/// This is not to be confused with the optimal string alignment distance, which
/// is an extension where no substring can be edited more than once.
/// </summary>
public class Damerau : IMetricStringDistance
public class Damerau : IMetricStringDistance, IMetricSpanDistance
{
/// <summary>
/// Compute the distance between strings: the minimum number of operations
Expand All @@ -54,6 +54,10 @@ public class Damerau : IMetricStringDistance
/// <returns>The computed distance.</returns>
/// <exception cref="ArgumentNullException">If s1 or s2 is null.</exception>
public double Distance(string s1, string s2)
=> Distance(s1.AsSpan(), s2.AsSpan());

public double Distance<T>(ReadOnlySpan<T> s1, ReadOnlySpan<T> s2)
where T : IEquatable<T>
{
if (s1 == null)
{
Expand All @@ -65,7 +69,7 @@ public double Distance(string s1, string s2)
throw new ArgumentNullException(nameof(s2));
}

if (s1.Equals(s2))
if (s1.SequenceEqual(s2))
{
return 0;
}
Expand All @@ -74,7 +78,7 @@ public double Distance(string s1, string s2)
int inf = s1.Length + s2.Length;

// Create and initialize the character array indices
var da = new Dictionary<char, int>();
var da = new Dictionary<T, int>();

for (int d = 0; d < s1.Length; d++)
{
Expand Down Expand Up @@ -115,7 +119,7 @@ public double Distance(string s1, string s2)
int j1 = db;

int cost = 1;
if (s1[i - 1] == s2[j - 1])
if (s1[i - 1].Equals(s2[j - 1]))
{
cost = 0;
db = j;
Expand Down
2 changes: 1 addition & 1 deletion src/F23.StringSimilarity/F23.StringSimilarity.csproj
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
<Project Sdk="Microsoft.NET.Sdk">
<PropertyGroup>
<TargetFrameworks>netstandard2.0</TargetFrameworks>
<TargetFramework>netstandard2.0</TargetFramework>
<PackageId>F23.StringSimilarity</PackageId>
<PackageTags>string;similarity;distance;levenshtein;jaro-winkler;lcs;cosine</PackageTags>
<Title>StringSimilarity.NET</Title>
Expand Down
23 changes: 23 additions & 0 deletions src/F23.StringSimilarity/Interfaces/IMetricSpanDistance.cs
Original file line number Diff line number Diff line change
@@ -0,0 +1,23 @@
using System;

namespace F23.StringSimilarity.Interfaces
{
/// <summary>
/// Span distances that implement this interface are metrics, which means:
/// - d(x, y) ≥ 0 (non-negativity, or separation axiom)
/// - d(x, y) = 0 if and only if x = y (identity, or coincidence axiom)
/// - d(x, y) = d(y, x) (symmetry)
/// - d(x, z) ≤ d(x, y) + d(y, z) (triangle inequality).
/// </summary>
/// <remarks>
/// Extends <see cref="ISpanDistance"/>; the only member is a re-declaration
/// of its <c>Distance</c> method.
/// </remarks>
public interface IMetricSpanDistance : ISpanDistance
{
/// <summary>
/// Compute and return the metric distance.
/// </summary>
/// <remarks>
/// Declared with the <c>new</c> modifier: it has the same signature as
/// <c>ISpanDistance.Distance</c> and hides that inherited member.
/// </remarks>
/// <typeparam name="T">
/// The element type of both spans; constrained to implement
/// <see cref="IEquatable{T}"/>.
/// </typeparam>
/// <param name="b1">The first span.</param>
/// <param name="b2">The second span.</param>
/// <returns>The metric distance.</returns>
new double Distance<T>(ReadOnlySpan<T> b1, ReadOnlySpan<T> b2)
where T : IEquatable<T>;
}
}
Original file line number Diff line number Diff line change
@@ -0,0 +1,6 @@
namespace F23.StringSimilarity.Interfaces
{
/// <summary>
/// Marker interface for span distances that are normalized. It declares no
/// members of its own; everything comes from <see cref="ISpanDistance"/>.
/// </summary>
/// <remarks>
/// NOTE(review): presumably implementers return distances in the range
/// [0, 1], as the name suggests — this interface does not state or enforce
/// that; confirm against the implementing classes.
/// </remarks>
public interface INormalizedSpanDistance : ISpanDistance
{
}
}
Original file line number Diff line number Diff line change
@@ -0,0 +1,6 @@
namespace F23.StringSimilarity.Interfaces
{
/// <summary>
/// Marker interface for span similarities that are normalized. It declares
/// no members of its own; everything comes from <see cref="ISpanSimilarity"/>.
/// </summary>
/// <remarks>
/// NOTE(review): presumably implementers return similarities in the range
/// [0, 1], as the name suggests — this interface does not state or enforce
/// that; confirm against the implementing classes.
/// </remarks>
public interface INormalizedSpanSimilarity : ISpanSimilarity
{
}
}
23 changes: 23 additions & 0 deletions src/F23.StringSimilarity/Interfaces/ISpanDistance.cs
Original file line number Diff line number Diff line change
@@ -0,0 +1,23 @@
using System;

namespace F23.StringSimilarity.Interfaces
{
/// <summary>
/// Contract for algorithms that compute a distance between two read-only
/// spans whose elements implement <see cref="IEquatable{T}"/>.
/// </summary>
public interface ISpanDistance
{
/// <summary>
/// Compute and return a measure of distance.
/// Must be >= 0.
///
/// This method operates on spans such as byte arrays.
/// Note that, when used on bytes, string encodings that
/// use more than one byte per codepoint (such as UTF-8)
/// are not supported and will most likely return
/// incorrect results.
/// </summary>
/// <typeparam name="T">
/// The element type of both spans; constrained to implement
/// <see cref="IEquatable{T}"/>.
/// </typeparam>
/// <param name="b1">The first span.</param>
/// <param name="b2">The second span.</param>
/// <returns>The measure of distance between the spans.</returns>
double Distance<T>(ReadOnlySpan<T> b1, ReadOnlySpan<T> b2)
where T : IEquatable<T>;
}
}
16 changes: 16 additions & 0 deletions src/F23.StringSimilarity/Interfaces/ISpanSimilarity.cs
Original file line number Diff line number Diff line change
@@ -0,0 +1,16 @@
using System;

namespace F23.StringSimilarity.Interfaces
{
/// <summary>
/// Contract for algorithms that compute a similarity score between two
/// read-only spans whose elements implement <see cref="IEquatable{T}"/>.
/// </summary>
public interface ISpanSimilarity
{
/// <summary>
/// Compute and return a measure of similarity between 2 spans.
/// </summary>
/// <typeparam name="T">
/// The element type of both spans; constrained to implement
/// <see cref="IEquatable{T}"/>.
/// </typeparam>
/// <param name="s1">The first span</param>
/// <param name="s2">The second span</param>
/// <returns>Similarity (0 means both spans are completely different)</returns>
double Similarity<T>(ReadOnlySpan<T> s1, ReadOnlySpan<T> s2)
where T : IEquatable<T>;
}
}
30 changes: 20 additions & 10 deletions src/F23.StringSimilarity/JaroWinkler.cs
Original file line number Diff line number Diff line change
Expand Up @@ -23,6 +23,7 @@
*/

using System;
using System.Data.SqlTypes;
using System.Linq;
using F23.StringSimilarity.Interfaces;
// ReSharper disable SuggestVarOrType_Elsewhere
Expand All @@ -38,7 +39,7 @@ namespace F23.StringSimilarity
/// Jaro-Winkler was developed in the area of record linkage (duplicate
/// detection) (Winkler, 1990). It returns a value in the interval [0.0, 1.0].
/// The distance is computed as 1 - Jaro-Winkler similarity.
public class JaroWinkler : INormalizedStringSimilarity, INormalizedStringDistance
public class JaroWinkler : INormalizedStringSimilarity, INormalizedStringDistance, INormalizedSpanSimilarity, INormalizedSpanDistance
{
private const double DEFAULT_THRESHOLD = 0.7;
private const int THREE = 3;
Expand Down Expand Up @@ -75,6 +76,10 @@ public JaroWinkler(double threshold)
/// <returns>The Jaro-Winkler similarity in the range [0, 1]</returns>
/// <exception cref="ArgumentNullException">If s1 or s2 is null.</exception>
public double Similarity(string s1, string s2)
=> Similarity(s1.AsSpan(), s2.AsSpan());

public double Similarity<T>(ReadOnlySpan<T> s1, ReadOnlySpan<T> s2)
where T : IEquatable<T>
{
if (s1 == null)
{
Expand All @@ -86,7 +91,7 @@ public double Similarity(string s1, string s2)
throw new ArgumentNullException(nameof(s2));
}

if (s1.Equals(s2))
if (s1.SequenceEqual(s2))
{
return 1f;
}
Expand Down Expand Up @@ -117,10 +122,15 @@ public double Similarity(string s1, string s2)
/// <exception cref="ArgumentNullException">If s1 or s2 is null.</exception>
public double Distance(string s1, string s2)
=> 1.0 - Similarity(s1, s2);

public double Distance<T>(ReadOnlySpan<T> s1, ReadOnlySpan<T> s2)
where T : IEquatable<T>
=> 1.0 - Similarity(s1, s2);

private static int[] Matches(string s1, string s2)
private static int[] Matches<T>(ReadOnlySpan<T> s1, ReadOnlySpan<T> s2)
where T : IEquatable<T>
{
string max, min;
ReadOnlySpan<T> max, min;
if (s1.Length > s2.Length)
{
max = s1;
Expand All @@ -141,11 +151,11 @@ private static int[] Matches(string s1, string s2)
int matches = 0;
for (int mi = 0; mi < min.Length; mi++)
{
char c1 = min[mi];
var c1 = min[mi];
for (int xi = Math.Max(mi - range, 0),
xn = Math.Min(mi + range + 1, max.Length); xi < xn; xi++)
{
if (!match_flags[xi] && c1 == max[xi])
if (!match_flags[xi] && c1.Equals(max[xi]))
{
match_indexes[mi] = xi;
match_flags[xi] = true;
Expand All @@ -154,8 +164,8 @@ private static int[] Matches(string s1, string s2)
}
}
}
char[] ms1 = new char[matches];
char[] ms2 = new char[matches];
T[] ms1 = new T[matches];
T[] ms2 = new T[matches];
for (int i = 0, si = 0; i < min.Length; i++)
{
if (match_indexes[i] != -1)
Expand All @@ -175,15 +185,15 @@ private static int[] Matches(string s1, string s2)
int transpositions = 0;
for (int mi = 0; mi < ms1.Length; mi++)
{
if (ms1[mi] != ms2[mi])
if (!ms1[mi].Equals(ms2[mi]))
{
transpositions++;
}
}
int prefix = 0;
for (int mi = 0; mi < min.Length; mi++)
{
if (s1[mi] == s2[mi])
if (s1[mi].Equals(s2[mi]))
{
prefix++;
}
Expand Down
28 changes: 17 additions & 11 deletions src/F23.StringSimilarity/Levenshtein.cs
Original file line number Diff line number Diff line change
Expand Up @@ -32,18 +32,15 @@ namespace F23.StringSimilarity
/// The Levenshtein distance between two words is the Minimum number of
/// single-character edits (insertions, deletions or substitutions) required to
/// change one string into the other.
public class Levenshtein : IMetricStringDistance
public class Levenshtein : IMetricStringDistance, IMetricSpanDistance
{
/// <summary>
/// Equivalent to Distance(s1, s2, Int32.MaxValue).
/// </summary>
/// <param name="s1">The first string to compare.</param>
/// <param name="s2">The second string to compare.</param>
/// <returns>The Levenshtein distance between strings</returns>
public double Distance(string s1, string s2)
{
return Distance(s1, s2, int.MaxValue);
}
public double Distance(string s1, string s2) => Distance(s1, s2, int.MaxValue);

/// <summary>
/// The Levenshtein distance, or edit distance, between two words is the
Expand Down Expand Up @@ -75,6 +72,14 @@ public double Distance(string s1, string s2)
/// <returns>The Levenshtein distance between strings</returns>
/// <exception cref="ArgumentNullException">If s1 or s2 is null.</exception>
public double Distance(string s1, string s2, int limit)
=> Distance(s1.AsSpan(), s2.AsSpan(), limit);

public double Distance<T>(ReadOnlySpan<T> s1, ReadOnlySpan<T> s2)
where T : IEquatable<T>
=> Distance(s1, s2, int.MaxValue);

public double Distance<T>(ReadOnlySpan<T> s1, ReadOnlySpan<T> s2, int limit)
where T : IEquatable<T>
{
if (s1 == null)
{
Expand All @@ -86,7 +91,7 @@ public double Distance(string s1, string s2, int limit)
throw new ArgumentNullException(nameof(s2));
}

if (s1.Equals(s2))
if (s1.SequenceEqual(s2))
{
return 0;
}
Expand Down Expand Up @@ -127,15 +132,16 @@ public double Distance(string s1, string s2, int limit)
for (int j = 0; j < s2.Length; j++)
{
int cost = 1;
if (s1[i] == s2[j])
if (s1[i].Equals(s2[j]))
{
cost = 0;
}

v1[j + 1] = Math.Min(
v1[j] + 1, // Cost of insertion
Math.Min(
v0[j + 1] + 1, // Cost of remove
v0[j] + cost)); // Cost of substitution
v1[j] + 1, // Cost of insertion
Math.Min(
v0[j + 1] + 1, // Cost of remove
v0[j] + cost)); // Cost of substitution

minv1 = Math.Min(minv1, v1[j + 1]);
}
Expand Down
17 changes: 11 additions & 6 deletions src/F23.StringSimilarity/LongestCommonSubsequence.cs
Original file line number Diff line number Diff line change
Expand Up @@ -44,7 +44,7 @@ namespace F23.StringSimilarity
///
/// ! This class currently implements the dynamic programming approach, which has
/// a space requirement O(m * n)!
public class LongestCommonSubsequence : IStringDistance
public class LongestCommonSubsequence : IStringDistance, ISpanDistance
{
/// <summary>
/// Return the LCS distance between strings s1 and s2, computed as |s1| +
Expand All @@ -58,6 +58,10 @@ public class LongestCommonSubsequence : IStringDistance
/// </returns>
/// <exception cref="ArgumentNullException">If s1 or s2 is null.</exception>
public double Distance(string s1, string s2)
=> Distance(s1.AsSpan(), s2.AsSpan());

public double Distance<T>(ReadOnlySpan<T> s1, ReadOnlySpan<T> s2)
where T : IEquatable<T>
{
if (s1 == null)
{
Expand All @@ -69,7 +73,7 @@ public double Distance(string s1, string s2)
throw new ArgumentNullException(nameof(s2));
}

if (s1.Equals(s2))
if (s1.SequenceEqual(s2))
{
return 0;
}
Expand All @@ -86,6 +90,10 @@ public double Distance(string s1, string s2)
/// <returns>The length of LCS(s1, s2)</returns>
/// <exception cref="ArgumentNullException">If s1 or s2 is null.</exception>
public int Length(string s1, string s2)
=> Length(s1.AsSpan(), s2.AsSpan());

internal static int Length<T>(ReadOnlySpan<T> s1, ReadOnlySpan<T> s2)
where T : IEquatable<T>
{
if (s1 == null)
{
Expand Down Expand Up @@ -113,8 +121,6 @@ public int Length(string s1, string s2)
*/
int s1_length = s1.Length;
int s2_length = s2.Length;
char[] x = s1.ToCharArray();
char[] y = s2.ToCharArray();

int[,] c = new int[s1_length + 1, s2_length + 1];

Expand All @@ -132,10 +138,9 @@ public int Length(string s1, string s2)
{
for (int j = 1; j <= s2_length; j++)
{
if (x[i - 1] == y[j - 1])
if (s1[i - 1].Equals(s2[j - 1]))
{
c[i, j] = c[i - 1, j - 1] + 1;

}
else
{
Expand Down
Loading

0 comments on commit cb08cfc

Please sign in to comment.