From 42b4c6b314cde0934da912ae14fbc5b78c02490a Mon Sep 17 00:00:00 2001 From: niemi Date: Wed, 14 Dec 2022 12:51:12 +0200 Subject: [PATCH] adding lemmatization to term matching --- OpusCatMTEngine/App.xaml.cs | 9 ++- OpusCatMTEngine/MTModel.cs | 61 +++++++++++++++---- OpusCatMTEngine/OpusCatMTEngine.csproj | 1 + .../Properties/Resources.Designer.cs | 6 +- OpusCatMTEngine/Properties/Resources.resx | 6 +- OpusCatMTEngine/PythonNetHelper.cs | 15 +++-- OpusCatMTEngine/Terminology/Term.cs | 54 +++++++++++++++- OpusCatMTEngine/Terminology/TermMatch.cs | 30 +++++++++ OpusCatMTEngine/UI/TerminologyView.xaml.cs | 7 ++- 9 files changed, 162 insertions(+), 27 deletions(-) create mode 100644 OpusCatMTEngine/Terminology/TermMatch.cs diff --git a/OpusCatMTEngine/App.xaml.cs b/OpusCatMTEngine/App.xaml.cs index ca54ab5..4a95714 100644 --- a/OpusCatMTEngine/App.xaml.cs +++ b/OpusCatMTEngine/App.xaml.cs @@ -100,6 +100,8 @@ private void Application_Startup(object sender, StartupEventArgs e) Log.Information("Setting Tls12 as security protocol (required for accessing online model storage"); ServicePointManager.SecurityProtocol = SecurityProtocolType.Tls12; + this.InitializePythonEngine(); + Log.Information("Opening OPUS-CAT MT Engine window"); // Create the startup window @@ -117,10 +119,11 @@ private void Application_Startup(object sender, StartupEventArgs e) App.CloseOverlay(); } - this.InitializePythonEngine(); - +//The update check is used to keep track of use counts, so disable it in DEBUG mode to keep counts +//more accurate +#if !DEBUG this.CheckForUpdatesAsync(); - +#endif } private async void CheckForUpdatesAsync() diff --git a/OpusCatMTEngine/MTModel.cs b/OpusCatMTEngine/MTModel.cs index 9d5f9ae..aef049c 100644 --- a/OpusCatMTEngine/MTModel.cs +++ b/OpusCatMTEngine/MTModel.cs @@ -192,29 +192,68 @@ public Task Translate( if (this.SupportsTerminology && applyTerminology) { - - var lemmatizedInput = PythonNetHelper.Lemmatize(this.sourceLanguages.First(), input); - + //Apply terminology //Use a simple method of removing overlapping matches of different terms: //For each position record only the longest term match, then when annotating term data, //start from the term closest to edge and skip overlapping terms. - var termMatches = new Dictionary>>(); + var termMatches = new Dictionary>(); + + //Get lemmatized input and find lemmatized term matches. Prioritize normal term matches + //in case of overlap + var lemmatizedInput = PythonNetHelper.Lemmatize(this.sourceLanguages.First().ShortestIsoCode, input); + + //Make dicts out of + var lemmaToPositionDict = new Dictionary>(); + int lemmaCounter = 0; + foreach (var lemma in lemmatizedInput.Select(x => x.Item3)) + { + if (lemmaToPositionDict.ContainsKey(lemma)) + { + lemmaToPositionDict[lemma].Add(lemmaCounter); + } + else + { + lemmaToPositionDict[lemma] = new List() { lemmaCounter }; + } + lemmaCounter++; + } foreach (var term in this.Terminology.Terms) { var thisTermMatches = term.SourcePatternRegex.Matches(input); foreach (Match termMatch in thisTermMatches) { + if (termMatches.ContainsKey(termMatch.Index)) { - termMatches[termMatch.Index].Add(new Tuple(term, termMatch)); + termMatches[termMatch.Index].Add( + new TermMatch(term,termMatch)); } else { - termMatches[termMatch.Index] = new List>() { - new Tuple(term, termMatch)}; + termMatches[termMatch.Index] = new List() { + new TermMatch(term,termMatch)}; + } + } + + //Match term at lemma level, if specified + if (term.MatchSourceLemma) + { + var sourceLemma = term.SourceLemmas; + + //Check if first lemma in term found in sentence + if (lemmaToPositionDict.ContainsKey(sourceLemma[0])) + { + var firstLemmaPositions = lemmaToPositionDict[sourceLemma[0]]; + + //Then check if the other lemmas of the term follow in the source sentence + foreach (var startPos in firstLemmaPositions) + { + + } } + } } @@ -222,13 +261,13 @@ public Task Translate( foreach (var index in termMatches.Keys.ToList().OrderByDescending(x => x)) { //Start from longest match - var matchesDescending = termMatches[index].OrderByDescending(x => x.Item2.Length); + var matchesDescending = termMatches[index].OrderByDescending(x => x.Length); foreach (var match in matchesDescending) { - if (match.Item2.Length + index <= lastEditStart) + if (match.Length + index <= lastEditStart) { - input = input.Remove(index, match.Item2.Length).Insert(index, - $" {match.Item1.TargetLemma} "); + input = input.Remove(index, match.Length).Insert(index, + $" {match.Term.TargetLemma} "); lastEditStart = index; continue; } diff --git a/OpusCatMTEngine/OpusCatMTEngine.csproj b/OpusCatMTEngine/OpusCatMTEngine.csproj index f8e976b..2261c10 100644 --- a/OpusCatMTEngine/OpusCatMTEngine.csproj +++ b/OpusCatMTEngine/OpusCatMTEngine.csproj @@ -273,6 +273,7 @@ + diff --git a/OpusCatMTEngine/Properties/Resources.Designer.cs b/OpusCatMTEngine/Properties/Resources.Designer.cs index 9c8d814..3f7feac 100644 --- a/OpusCatMTEngine/Properties/Resources.Designer.cs +++ b/OpusCatMTEngine/Properties/Resources.Designer.cs @@ -1000,10 +1000,10 @@ public static string Settings_StoreInAppdataCheckbox { /// /// def lemmatize(self,input): /// doc = self.stanza(input) - /// lemma_string = "" + /// lemma_tuples = [] /// for sentence in doc.sentences: - /// lemma_string += " ".join([x.lemma for x in sentence.words]) - /// return lemma_string. + /// lemma_tuples +=[(x.start_char,x.end_char,x.lemma) for x in sentence.words] + /// return lemma_tuples. /// public static string StanzaWrapperCode { get { diff --git a/OpusCatMTEngine/Properties/Resources.resx b/OpusCatMTEngine/Properties/Resources.resx index 68bbef1..f8907fb 100644 --- a/OpusCatMTEngine/Properties/Resources.resx +++ b/OpusCatMTEngine/Properties/Resources.resx @@ -485,9 +485,9 @@ class StanzaWrapper: def lemmatize(self,input): doc = self.stanza(input) - lemma_string = "" + lemma_tuples = [] for sentence in doc.sentences: - lemma_string += " ".join([x.lemma for x in sentence.words]) - return lemma_string + lemma_tuples +=[(x.start_char,x.end_char,x.lemma) for x in sentence.words] + return lemma_tuples \ No newline at end of file diff --git a/OpusCatMTEngine/PythonNetHelper.cs b/OpusCatMTEngine/PythonNetHelper.cs index 64a9912..4be53e3 100644 --- a/OpusCatMTEngine/PythonNetHelper.cs +++ b/OpusCatMTEngine/PythonNetHelper.cs @@ -9,10 +9,11 @@ namespace OpusCatMTEngine { static class PythonNetHelper { - private static Dictionary LemmatizerScopes = new Dictionary(); + private static Dictionary LemmatizerScopes = new Dictionary(); - internal static string Lemmatize(IsoLanguage lang, string input) + internal static List> Lemmatize(string lang, string input) { + List> lemmaList = new List>(); using (Py.GIL()) { if (!PythonNetHelper.LemmatizerScopes.ContainsKey(lang)) @@ -27,12 +28,18 @@ internal static string Lemmatize(IsoLanguage lang, string input) scope.Import(moduleScope, "stanza_wrapper"); PythonNetHelper.LemmatizerScopes[lang] = scope.Eval( - $"stanza_wrapper.StanzaWrapper('{lang.ShortestIsoCode}', processors='tokenize, pos, lemma, depparse')"); + $"stanza_wrapper.StanzaWrapper('{lang}', processors='tokenize, pos, lemma, depparse')"); } } } - return PythonNetHelper.LemmatizerScopes[lang].lemmatize(input); + var lemmatized = PythonNetHelper.LemmatizerScopes[lang].lemmatize(input); + var output = new List>(); + foreach (var lemma in lemmatized) + { + output.Add(new Tuple((int)lemma[0],(int)lemma[1],(string)lemma[2])); + } + return output; } } } diff --git a/OpusCatMTEngine/Terminology/Term.cs b/OpusCatMTEngine/Terminology/Term.cs index ac6c209..b71e02d 100644 --- a/OpusCatMTEngine/Terminology/Term.cs +++ b/OpusCatMTEngine/Terminology/Term.cs @@ -1,6 +1,8 @@  using System; +using System.Collections.Generic; +using System.Linq; using System.Text.RegularExpressions; using YamlDotNet.Serialization; @@ -15,10 +17,16 @@ public Term() } - public Term(string sourcePattern, string targetLemma) + public Term( + string sourcePattern, + string targetLemma, + IsoLanguage sourceLang, + IsoLanguage targetLang) { this.SourcePattern = sourcePattern; this.TargetLemma = targetLemma; + this.SourceLanguageCode = sourceLang.ShortestIsoCode; + this.TargetLanguageCode = sourceLang.ShortestIsoCode; } [YamlMember(Alias = "source-pattern", ApplyNamingConventions = false)] @@ -28,13 +36,49 @@ public string SourcePattern set { _sourcePattern = value; + this.UpdateSourcePatternRegex(); } } + [YamlMember(Alias = "match-source-lemma", ApplyNamingConventions = false)] + public bool MatchSourceLemma + { + get => _matchSourceLemma; + set + { + _matchSourceLemma = value; + //Nullify this to make sure a fresh lemma is generated when needed + this.SourceLemmas = null; + } + } + + public List SourceLemmas + { + get + { + if (_sourceLemmas == null) + { + if (this.SourceLanguageCode != null) + { + _sourceLemmas = PythonNetHelper.Lemmatize( + this.SourceLanguageCode, + this.SourcePattern).Select(x => x.Item3).ToList(); + } + } + return _sourceLemmas; + } + set => _sourceLemmas = value; } + [YamlMember(Alias = "target-lemma", ApplyNamingConventions = false)] public string TargetLemma { get; set; } + [YamlMember(Alias = "source-language-code", ApplyNamingConventions = false)] + public string SourceLanguageCode { get; set; } + + [YamlMember(Alias = "target-language-code", ApplyNamingConventions = false)] + public string TargetLanguageCode { get; set; } + [YamlMember(Alias = "source-pattern-is-regex", ApplyNamingConventions = false)] public bool SourcePatternIsRegex { @@ -69,11 +113,14 @@ private void UpdateSourcePatternRegex() if (this.SourcePatternIsRegex) { - this.sourcePatternRegex = new Regex($"\\b{this.SourcePattern}\\b",sourcePatternOptions); + this.sourcePatternRegex = new Regex($"\\b{this.SourcePattern}\\b", sourcePatternOptions); } else { this.sourcePatternRegex = new Regex($"\\b{Regex.Escape(this.SourcePattern)}\\b", sourcePatternOptions); + + //Nullify source lemma (it will be generated when requested) + this.SourceLemmas = null; } } } @@ -101,6 +148,8 @@ public string Description private bool _sourcePatternIsRegex; private string _sourcePattern; private bool _sourcePatternIsCaseSensitive; + private bool _matchSourceLemma; + private List _sourceLemmas; [YamlIgnore] public Regex SourcePatternRegex @@ -111,5 +160,6 @@ public Regex SourcePatternRegex } } + } } \ No newline at end of file diff --git a/OpusCatMTEngine/Terminology/TermMatch.cs b/OpusCatMTEngine/Terminology/TermMatch.cs new file mode 100644 index 0000000..89df545 --- /dev/null +++ b/OpusCatMTEngine/Terminology/TermMatch.cs @@ -0,0 +1,30 @@ +using System; +using System.Text.RegularExpressions; +using YamlDotNet.Serialization; + +namespace OpusCatMTEngine +{ + + public class TermMatch + { + + public TermMatch(Term term, Match termMatch) + { + this.Term = term; + + } + + public TermMatch(Term term, int start, int length, bool lemmaMatch) + { + this.Term = term; + this.Length = length; + this.Start = start; + this.LemmaMatch = lemmaMatch; + } + + public Term Term { get; private set; } + public int Length { get; private set; } + public int Start { get; private set; } + public bool LemmaMatch { get; private set; } + } +} \ No newline at end of file diff --git a/OpusCatMTEngine/UI/TerminologyView.xaml.cs b/OpusCatMTEngine/UI/TerminologyView.xaml.cs index 10de80f..f87e86a 100644 --- a/OpusCatMTEngine/UI/TerminologyView.xaml.cs +++ b/OpusCatMTEngine/UI/TerminologyView.xaml.cs @@ -98,7 +98,12 @@ private void ImportTbx_Click(object sender, RoutedEventArgs e) var sourceTerm = sourceLangSet.Descendants("term").First().Value; var targetTerm = targetLangSet.Descendants("term").First().Value; - importedTerms.Add(new Term(sourceTerm, targetTerm)); + importedTerms.Add( + new Term( + sourceTerm, + targetTerm, + this.model.SourceLanguages.First(), + this.model.TargetLanguages.First())); } }