Skip to content

Commit

Permalink
1.0.2
Browse files Browse the repository at this point in the history
  • Loading branch information
cidrugHug8 committed Dec 28, 2023
1 parent 308d8eb commit 8a9925c
Show file tree
Hide file tree
Showing 4 changed files with 191 additions and 15 deletions.
5 changes: 3 additions & 2 deletions BleuNet/BleuNet.csproj
Original file line number Diff line number Diff line change
Expand Up @@ -13,8 +13,9 @@
<Description>BleuNet is a C# class library for calculating the BLEU score, a metric for evaluating the quality of machine translations.</Description>
<Title>BleuNet</Title>
<PackageProjectUrl>https://github.com/cidrugHug8/bleunet</PackageProjectUrl>
<AssemblyVersion>1.0.1.0</AssemblyVersion>
<FileVersion>1.0.1.0</FileVersion>
<AssemblyVersion>1.0.2.0</AssemblyVersion>
<FileVersion>1.0.2.0</FileVersion>
<VersionPrefix>1.0.2</VersionPrefix>
</PropertyGroup>

<ItemGroup>
Expand Down
165 changes: 164 additions & 1 deletion BleuNet/Utility.cs
Original file line number Diff line number Diff line change
Expand Up @@ -8,6 +8,39 @@ namespace BleuNet
/// </summary>
public static class Utility
{
/// <summary>
/// Non-breaking prefixes used by the tokenizer to decide whether a trailing
/// period ends a sentence. Value 1 = the prefix is always non-breaking
/// (e.g. "Dr."); value 2 = non-breaking only when followed by a number
/// (e.g. "No. 5").
/// </summary>
private static readonly Dictionary<string, int> nonBreakingPrefix = new();

/// <summary>
/// Populates the non-breaking prefix table once, before first use of the class.
/// </summary>
static Utility()
{
    LoadPrefixes();
}

/// <summary>
/// Loads the English non-breaking prefix list. The entries mirror the Moses
/// nonbreaking_prefix.en file format: a plain entry is always non-breaking,
/// and an entry tagged "#NUMERIC_ONLY#" is non-breaking only before a number.
/// ("May" appears to be absent on purpose, as in Moses, because it is a common
/// word — TODO confirm against nonbreaking_prefix.en.)
/// </summary>
private static void LoadPrefixes()
{
    string[] lines = new string[] { "A", "B", "C", "D", "E", "F", "G", "H", "I", "J", "K", "L", "M", "N", "O", "P", "Q", "R", "S", "T",
    "U", "V", "W", "X", "Y", "Z", "Adj", "Adm", "Adv", "Asst", "Bart", "Bldg", "Brig", "Bros", "Capt", "Cmdr", "Col", "Comdr", "Con",
    "Corp", "Cpl", "DR", "Dr", "Drs", "Ens", "Gen", "Gov", "Hon", "Hr", "Hosp", "Insp", "Lt", "MM", "MR", "MRS", "MS", "Maj", "Messrs",
    "Mlle", "Mme", "Mr", "Mrs", "Ms", "Msgr", "Op", "Ord", "Pfc", "Ph", "Prof", "Pvt", "Rep", "Reps", "Res", "Rev", "Rt", "Sen", "Sens",
    "Sfc", "Sgt", "Sr", "St", "Supt", "Surg", "v", "vs", "i.e", "rev", "e.g", "Rs", "No #NUMERIC_ONLY# ", "Nos", "Art #NUMERIC_ONLY#",
    "Nr", "pp #NUMERIC_ONLY#", "Jan", "Feb", "Mar", "Apr", "Jun", "Jul", "Aug", "Sep", "Oct", "Nov", "Dec" };
    foreach (var line in lines)
    {
        string item = line.Trim();

        // Skip blanks and comment entries (mirrors the Moses prefix-file format).
        if (string.IsNullOrEmpty(item) || item.StartsWith("#"))
        {
            continue;
        }

        // Match once; the original evaluated the same pattern twice (IsMatch + Match).
        Match match = Regex.Match(item, @"(.*)\s+\#NUMERIC_ONLY\#");
        if (match.Success)
        {
            nonBreakingPrefix[match.Groups[1].Value] = 2;   // break only before numbers
        }
        else
        {
            nonBreakingPrefix[item] = 1;                    // never break
        }
    }
}

/// <summary>
/// Tokenizes the input string into an array of words.
/// </summary>
Expand Down Expand Up @@ -50,6 +83,136 @@ public static string[] Tokenize(string line, bool lc = true)
return segmented;
}

/// <summary>
/// Tokenizes the input text.
/// </summary>
/// <remarks>
/// The tokenization is designed to closely match the output of the tokenizer.perl
/// script shipped with the Moses statistical machine translation toolkit when it
/// is invoked with "-l en". The Moses "protected patterns" feature is not implemented.
/// </remarks>
/// <param name="text">The input string to tokenize.</param>
/// <param name="lc">Whether to lower-case the resulting tokens. Default is true.</param>
/// <returns>An array of tokens.</returns>
public static string[] Tokenize2(string text, bool lc = true)
{
    // Pad with spaces so the boundary-sensitive regexes below behave uniformly
    // at the start and end of the input.
    text = " " + text.Trim() + " ";

    // Normalize whitespace and remove ASCII control characters.
    text = Regex.Replace(text, @"\s+", " ");
    text = Regex.Replace(text, "[\u0000-\u001F]", "");

    text = Regex.Replace(text, " +", " ");
    text = text.Trim();

    // Put spaces around every character that is not a word character, space,
    // period, apostrophe, backtick, comma, or hyphen.
    text = Regex.Replace(text, @"([^\w\s\.'`,\-])", " $1 ");

    // Multi-dots stay together: like tokenizer.perl's s/\.([\.]+)/ DOTMULTI$1/g,
    // the leading dot is consumed and the rest are kept. (The original used
    // "DOTMULTI$0", which kept the consumed dot too and therefore produced one
    // extra dot after restoration: ".." came back as "...".)
    text = Regex.Replace(text, @"\.(\.+)", " DOTMULTI$1");
    while (Regex.IsMatch(text, @"DOTMULTI\."))
    {
        text = Regex.Replace(text, @"DOTMULTI\.([^\.])", "DOTDOTMULTI $1");
        text = Regex.Replace(text, @"DOTMULTI\.", "DOTDOTMULTI");
    }

    // Separate "," unless it sits between two digits (keeps "1,000" intact).
    text = Regex.Replace(text, @"([^\d]),", "$1 , ");
    text = Regex.Replace(text, @",([^\d])", " , $1");

    // Separate a "," that follows a number at the very end of the input.
    text = Regex.Replace(text, @"(\d),$", "$1 ,");

    // Split contractions the English way: keep letter'letter (and digit's)
    // together, split the apostrophe off in every other context.
    text = Regex.Replace(text, @"([^\p{L}])'([^\p{L}])", "$1 ' $2");  // non-letter ' non-letter
    text = Regex.Replace(text, @"([^\p{L}\d])'([\p{L}])", "$1 ' $2"); // non-letter/digit ' letter
    text = Regex.Replace(text, @"([\p{L}])'([^\p{L}])", "$1 ' $2");   // letter ' non-letter
    text = Regex.Replace(text, @"([\p{L}])'([\p{L}])", "$1'$2");      // letter ' letter (don't split)
    text = Regex.Replace(text, @"(\d)'([sS])", "$1'$2");              // e.g. "1990's"

    // Decide, word by word, whether a trailing period is sentence-final
    // (split off) or part of an abbreviation (kept attached).
    string[] words = Regex.Split(text, @"\s+");
    var builder = new StringBuilder();
    for (int i = 0; i < words.Length; i++)
    {
        string word = words[i];
        Match match = Regex.Match(word, @"^(\S+)\.$");

        if (match.Success)
        {
            string pre = match.Groups[1].Value;
            if (i == words.Length - 1)
            {
                // The final period of the input is always split off.
                word = pre + " .";
            }
            else if ((pre.Contains(".") && Regex.IsMatch(pre, @"\p{L}")) ||
                (nonBreakingPrefix.TryGetValue(pre, out int alwaysKind) && alwaysKind == 1) ||
                Regex.IsMatch(words[i + 1], @"^[\p{Ll}]"))
            {
                // Keep the period attached: abbreviation with an internal dot,
                // unconditional non-breaking prefix, or next word starts lower-case.
            }
            else if (nonBreakingPrefix.TryGetValue(pre, out int numericKind) && numericKind == 2 &&
                Regex.IsMatch(words[i + 1], @"^[0-9]+"))
            {
                // Keep the period: numeric-only prefix followed by a number ("No. 5").
            }
            else
            {
                // Otherwise treat the period as sentence-final and split it off.
                word = pre + " .";
            }
        }

        builder.Append(word).Append(' ');
    }
    text = builder.ToString().Trim();

    // Clean up extraneous spaces.
    text = Regex.Replace(text, " +", " ");
    text = text.Trim();

    // Separate a sentence-final period before a closing quote ("xyz.'" -> "xyz . '"),
    // then trim again so the replacement's trailing space does not yield an empty token.
    text = Regex.Replace(text, @"\.\' ?$", " . ' ");
    text = text.Trim();

    // Restore multi-dots from their placeholders.
    while (text.Contains("DOTDOTMULTI"))
    {
        text = text.Replace("DOTDOTMULTI", "DOTMULTI.");
    }
    text = text.Replace("DOTMULTI", ".");

    // Honor the documented lc parameter; the original accepted it but never used it.
    // Applied after tokenization so the case-sensitive prefix lookup above still works.
    if (lc)
    {
        text = text.ToLowerInvariant();
    }

    return text.Split();
}

/// <summary>
/// Runs a benchmark on the CorpusBleu method.
/// </summary>
Expand All @@ -69,4 +232,4 @@ public static void Benchmark(string referenceFilepath="./reference.txt", string
}
}
}
}
}
26 changes: 14 additions & 12 deletions ConsoleTest/Program.cs
Original file line number Diff line number Diff line change
@@ -1,21 +1,23 @@
using BleuNet;

// Define the translated and reference sentences.
string referenceSentence = "The pessimist sees difficulty in every opportunity.";
string translatedSentence = "The pessimist sees difficulty at every opportunity.";
string referenceSentence = "Dr. Smith goes to the hospital. She arrives at 3:30 p.m.";
var z = Utility.Tokenize2(referenceSentence);
//// Define the translated and reference sentences.
//string referenceSentence = "The pessimist sees difficulty in every opportunity.";
//string translatedSentence = "The pessimist sees difficulty at every opportunity.";

var referenceSentenceTokens = new string[][] { Utility.Tokenize(referenceSentence) };
var translatedSentenceTokens = new string[][] { Utility.Tokenize(translatedSentence) };
//var referenceSentenceTokens = new string[][] { Utility.Tokenize(referenceSentence) };
//var translatedSentenceTokens = new string[][] { Utility.Tokenize(translatedSentence) };

// Calculate the BLEU score.
double score = Metrics.CorpusBleu(referenceSentenceTokens, translatedSentenceTokens);
//// Calculate the BLEU score.
//double score = Metrics.CorpusBleu(referenceSentenceTokens, translatedSentenceTokens);

// Display the result.
Console.WriteLine("BLEU Score: " + score);
//// Display the result.
//Console.WriteLine("BLEU Score: " + score);

// Calculate the sentence BLEU score.
double sentenceBleu = Metrics.SentenceBleu(referenceSentenceTokens, Utility.Tokenize(translatedSentence));
Console.WriteLine("Sentence BLEU Score: " + sentenceBleu);
//// Calculate the sentence BLEU score.
//double sentenceBleu = Metrics.SentenceBleu(referenceSentenceTokens, Utility.Tokenize(translatedSentence));
//Console.WriteLine("Sentence BLEU Score: " + sentenceBleu);


//// Define the translated and reference sentences.
Expand Down
10 changes: 10 additions & 0 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -34,6 +34,16 @@ double sentenceBleu = Metrics.SentenceBleu(referenceSentenceTokens, Utility.Toke
Console.WriteLine("Sentence BLEU Score: " + sentenceBleu);
```

## New Update: Tokenize2 Method
A new method, `Tokenize2`, has been added to the library. Its tokenization is designed to closely match that of the tokenizer.perl script included with the statistical machine translation tool Moses when run with `-l en`.

Here is a basic usage example:

```csharp
string text = "The quick brown fox jumps over the lazy dog.";
string[] tokens = Utility.Tokenize2(text);
```

## References

**BLEU**:
Expand Down

0 comments on commit 8a9925c

Please sign in to comment.