Skip to content

Commit

Permalink
1.0.2
Browse files Browse the repository at this point in the history
  • Loading branch information
cidrugHug8 committed Dec 28, 2023
1 parent 308d8eb commit 8a9925c
Show file tree
Hide file tree
Showing 4 changed files with 191 additions and 15 deletions.
5 changes: 3 additions & 2 deletions BleuNet/BleuNet.csproj
Original file line number Diff line number Diff line change
Expand Up @@ -13,8 +13,9 @@
<Description>BleuNet is a C# class library for calculating the BLEU score, a metric for evaluating the quality of machine translations.</Description>
<Title>BleuNet</Title>
<PackageProjectUrl>https://github.com/cidrugHug8/bleunet</PackageProjectUrl>
<AssemblyVersion>1.0.1.0</AssemblyVersion>
<FileVersion>1.0.1.0</FileVersion>
<AssemblyVersion>1.0.2.0</AssemblyVersion>
<FileVersion>1.0.2.0</FileVersion>
<VersionPrefix>1.0.2</VersionPrefix>
</PropertyGroup>

<ItemGroup>
Expand Down
165 changes: 164 additions & 1 deletion BleuNet/Utility.cs
Original file line number Diff line number Diff line change
Expand Up @@ -8,6 +8,39 @@ namespace BleuNet
/// </summary>
public static class Utility
{
/// <summary>
/// Non-breaking prefixes used by the tokenizer to decide whether a trailing
/// period ends a sentence. Value 1 = the prefix is always non-breaking
/// (e.g. "Dr."); value 2 = non-breaking only when followed by a number
/// (e.g. "No. 5").
/// </summary>
private static readonly Dictionary<string, int> nonBreakingPrefix = new();

/// <summary>
/// Populates the non-breaking prefix table once, before first use of the class.
/// </summary>
static Utility()
{
    LoadPrefixes();
}

/// <summary>
/// Loads the English non-breaking prefix list. The entries mirror the Moses
/// nonbreaking_prefix.en file format: a plain entry is always non-breaking,
/// and an entry tagged "#NUMERIC_ONLY#" is non-breaking only before a number.
/// ("May" appears to be absent on purpose, as in Moses, because it is a common
/// word — TODO confirm against nonbreaking_prefix.en.)
/// </summary>
private static void LoadPrefixes()
{
    string[] lines = new string[] { "A", "B", "C", "D", "E", "F", "G", "H", "I", "J", "K", "L", "M", "N", "O", "P", "Q", "R", "S", "T",
    "U", "V", "W", "X", "Y", "Z", "Adj", "Adm", "Adv", "Asst", "Bart", "Bldg", "Brig", "Bros", "Capt", "Cmdr", "Col", "Comdr", "Con",
    "Corp", "Cpl", "DR", "Dr", "Drs", "Ens", "Gen", "Gov", "Hon", "Hr", "Hosp", "Insp", "Lt", "MM", "MR", "MRS", "MS", "Maj", "Messrs",
    "Mlle", "Mme", "Mr", "Mrs", "Ms", "Msgr", "Op", "Ord", "Pfc", "Ph", "Prof", "Pvt", "Rep", "Reps", "Res", "Rev", "Rt", "Sen", "Sens",
    "Sfc", "Sgt", "Sr", "St", "Supt", "Surg", "v", "vs", "i.e", "rev", "e.g", "Rs", "No #NUMERIC_ONLY# ", "Nos", "Art #NUMERIC_ONLY#",
    "Nr", "pp #NUMERIC_ONLY#", "Jan", "Feb", "Mar", "Apr", "Jun", "Jul", "Aug", "Sep", "Oct", "Nov", "Dec" };
    foreach (var line in lines)
    {
        string item = line.Trim();

        // Skip blanks and comment entries (mirrors the Moses prefix-file format).
        if (string.IsNullOrEmpty(item) || item.StartsWith("#"))
        {
            continue;
        }

        // Match once; the original evaluated the same pattern twice (IsMatch + Match).
        Match match = Regex.Match(item, @"(.*)\s+\#NUMERIC_ONLY\#");
        if (match.Success)
        {
            nonBreakingPrefix[match.Groups[1].Value] = 2;   // break only before numbers
        }
        else
        {
            nonBreakingPrefix[item] = 1;                    // never break
        }
    }
}

/// <summary>
/// Tokenizes the input string into an array of words.
/// </summary>
Expand Down Expand Up @@ -50,6 +83,136 @@ public static string[] Tokenize(string line, bool lc = true)
return segmented;
}

/// <summary>
/// Tokenizes the input text.
/// </summary>
/// <remarks>
/// The tokenization is designed to closely match the output of the tokenizer.perl
/// script shipped with the Moses statistical machine translation toolkit when it
/// is invoked with "-l en". The Moses "protected patterns" feature is not implemented.
/// </remarks>
/// <param name="text">The input string to tokenize.</param>
/// <param name="lc">Whether to lower-case the resulting tokens. Default is true.</param>
/// <returns>An array of tokens.</returns>
public static string[] Tokenize2(string text, bool lc = true)
{
    // Pad with spaces so the boundary-sensitive regexes below behave uniformly
    // at the start and end of the input.
    text = " " + text.Trim() + " ";

    // Normalize whitespace and remove ASCII control characters.
    text = Regex.Replace(text, @"\s+", " ");
    text = Regex.Replace(text, "[\u0000-\u001F]", "");

    text = Regex.Replace(text, " +", " ");
    text = text.Trim();

    // Put spaces around every character that is not a word character, space,
    // period, apostrophe, backtick, comma, or hyphen.
    text = Regex.Replace(text, @"([^\w\s\.'`,\-])", " $1 ");

    // Multi-dots stay together: like tokenizer.perl's s/\.([\.]+)/ DOTMULTI$1/g,
    // the leading dot is consumed and the rest are kept. (The original used
    // "DOTMULTI$0", which kept the consumed dot too and therefore produced one
    // extra dot after restoration: ".." came back as "...".)
    text = Regex.Replace(text, @"\.(\.+)", " DOTMULTI$1");
    while (Regex.IsMatch(text, @"DOTMULTI\."))
    {
        text = Regex.Replace(text, @"DOTMULTI\.([^\.])", "DOTDOTMULTI $1");
        text = Regex.Replace(text, @"DOTMULTI\.", "DOTDOTMULTI");
    }

    // Separate "," unless it sits between two digits (keeps "1,000" intact).
    text = Regex.Replace(text, @"([^\d]),", "$1 , ");
    text = Regex.Replace(text, @",([^\d])", " , $1");

    // Separate a "," that follows a number at the very end of the input.
    text = Regex.Replace(text, @"(\d),$", "$1 ,");

    // Split contractions the English way: keep letter'letter (and digit's)
    // together, split the apostrophe off in every other context.
    text = Regex.Replace(text, @"([^\p{L}])'([^\p{L}])", "$1 ' $2");  // non-letter ' non-letter
    text = Regex.Replace(text, @"([^\p{L}\d])'([\p{L}])", "$1 ' $2"); // non-letter/digit ' letter
    text = Regex.Replace(text, @"([\p{L}])'([^\p{L}])", "$1 ' $2");   // letter ' non-letter
    text = Regex.Replace(text, @"([\p{L}])'([\p{L}])", "$1'$2");      // letter ' letter (don't split)
    text = Regex.Replace(text, @"(\d)'([sS])", "$1'$2");              // e.g. "1990's"

    // Decide, word by word, whether a trailing period is sentence-final
    // (split off) or part of an abbreviation (kept attached).
    string[] words = Regex.Split(text, @"\s+");
    var builder = new StringBuilder();
    for (int i = 0; i < words.Length; i++)
    {
        string word = words[i];
        Match match = Regex.Match(word, @"^(\S+)\.$");

        if (match.Success)
        {
            string pre = match.Groups[1].Value;
            if (i == words.Length - 1)
            {
                // The final period of the input is always split off.
                word = pre + " .";
            }
            else if ((pre.Contains(".") && Regex.IsMatch(pre, @"\p{L}")) ||
                (nonBreakingPrefix.TryGetValue(pre, out int alwaysKind) && alwaysKind == 1) ||
                Regex.IsMatch(words[i + 1], @"^[\p{Ll}]"))
            {
                // Keep the period attached: abbreviation with an internal dot,
                // unconditional non-breaking prefix, or next word starts lower-case.
            }
            else if (nonBreakingPrefix.TryGetValue(pre, out int numericKind) && numericKind == 2 &&
                Regex.IsMatch(words[i + 1], @"^[0-9]+"))
            {
                // Keep the period: numeric-only prefix followed by a number ("No. 5").
            }
            else
            {
                // Otherwise treat the period as sentence-final and split it off.
                word = pre + " .";
            }
        }

        builder.Append(word).Append(' ');
    }
    text = builder.ToString().Trim();

    // Clean up extraneous spaces.
    text = Regex.Replace(text, " +", " ");
    text = text.Trim();

    // Separate a sentence-final period before a closing quote ("xyz.'" -> "xyz . '"),
    // then trim again so the replacement's trailing space does not yield an empty token.
    text = Regex.Replace(text, @"\.\' ?$", " . ' ");
    text = text.Trim();

    // Restore multi-dots from their placeholders.
    while (text.Contains("DOTDOTMULTI"))
    {
        text = text.Replace("DOTDOTMULTI", "DOTMULTI.");
    }
    text = text.Replace("DOTMULTI", ".");

    // Honor the documented lc parameter; the original accepted it but never used it.
    // Applied after tokenization so the case-sensitive prefix lookup above still works.
    if (lc)
    {
        text = text.ToLowerInvariant();
    }

    return text.Split();
}

/// <summary>
/// Runs a benchmark on the CorpusBleu method.
/// </summary>
Expand All @@ -69,4 +232,4 @@ public static void Benchmark(string referenceFilepath="./reference.txt", string
}
}
}
}
}
26 changes: 14 additions & 12 deletions ConsoleTest/Program.cs
Original file line number Diff line number Diff line change
@@ -1,21 +1,23 @@
using BleuNet;

// Define the translated and reference sentences.
string referenceSentence = "The pessimist sees difficulty in every opportunity.";
string translatedSentence = "The pessimist sees difficulty at every opportunity.";
string referenceSentence = "Dr. Smith goes to the hospital. She arrives at 3:30 p.m.";
var z = Utility.Tokenize2(referenceSentence);
//// Define the translated and reference sentences.
//string referenceSentence = "The pessimist sees difficulty in every opportunity.";
//string translatedSentence = "The pessimist sees difficulty at every opportunity.";

var referenceSentenceTokens = new string[][] { Utility.Tokenize(referenceSentence) };
var translatedSentenceTokens = new string[][] { Utility.Tokenize(translatedSentence) };
//var referenceSentenceTokens = new string[][] { Utility.Tokenize(referenceSentence) };
//var translatedSentenceTokens = new string[][] { Utility.Tokenize(translatedSentence) };

// Calculate the BLEU score.
double score = Metrics.CorpusBleu(referenceSentenceTokens, translatedSentenceTokens);
//// Calculate the BLEU score.
//double score = Metrics.CorpusBleu(referenceSentenceTokens, translatedSentenceTokens);

// Display the result.
Console.WriteLine("BLEU Score: " + score);
//// Display the result.
//Console.WriteLine("BLEU Score: " + score);

// Calculate the sentence BLEU score.
double sentenceBleu = Metrics.SentenceBleu(referenceSentenceTokens, Utility.Tokenize(translatedSentence));
Console.WriteLine("Sentence BLEU Score: " + sentenceBleu);
//// Calculate the sentence BLEU score.
//double sentenceBleu = Metrics.SentenceBleu(referenceSentenceTokens, Utility.Tokenize(translatedSentence));
//Console.WriteLine("Sentence BLEU Score: " + sentenceBleu);


//// Define the translated and reference sentences.
Expand Down
10 changes: 10 additions & 0 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -34,6 +34,16 @@ double sentenceBleu = Metrics.SentenceBleu(referenceSentenceTokens, Utility.Toke
Console.WriteLine("Sentence BLEU Score: " + sentenceBleu);
```

## New Update: Tokenize2 Method
A new method, `Tokenize2`, has been added to the library. Its tokenization is designed to closely match that of the tokenizer.perl script included with the statistical machine translation tool Moses when run with `-l en`.

Here is a basic usage example:

```csharp
string text = "The quick brown fox jumps over the lazy dog.";
string[] tokens = Utility.Tokenize2(text);
```

## References

**BLEU**:
Expand Down

0 comments on commit 8a9925c

Please sign in to comment.