From ea23dfdbfc8a8dc4efc8060943903caab5ed4e61 Mon Sep 17 00:00:00 2001 From: niemi Date: Fri, 9 Dec 2022 18:41:19 +0200 Subject: [PATCH] fix to edit rules handling, added stanza lemmatizer --- .../AutoEditRules/AutoEditRuleCollection.cs | 19 +++++++-- OpusCatMTEngine/MTModel.cs | 10 ++++- OpusCatMTEngine/OpusCatMTEngine.csproj | 3 +- .../Properties/Resources.Designer.cs | 23 +++++++++++ OpusCatMTEngine/Properties/Resources.resx | 17 ++++++++ OpusCatMTEngine/PythonNetHelper.cs | 39 +++++++++++++++++++ 6 files changed, 105 insertions(+), 6 deletions(-) create mode 100644 OpusCatMTEngine/PythonNetHelper.cs diff --git a/OpusCatMTEngine/AutoEditRules/AutoEditRuleCollection.cs b/OpusCatMTEngine/AutoEditRules/AutoEditRuleCollection.cs index 465af72..0aab595 100644 --- a/OpusCatMTEngine/AutoEditRules/AutoEditRuleCollection.cs +++ b/OpusCatMTEngine/AutoEditRules/AutoEditRuleCollection.cs @@ -221,7 +221,7 @@ public AutoEditResult ProcessPreEditRules(string unedited) string edited = unedited; List appliedReplacements = new List(); - List> coveredUneditedSourceSpans = new List>(); + //Collect matches for all rules Dictionary> uneditedSourceMatches = this.GetAllSourceMatches(unedited); @@ -230,6 +230,12 @@ public AutoEditResult ProcessPreEditRules(string unedited) int editingOffset = 0; foreach (var matchesAtPosition in uneditedSourceMatches.OrderBy(x => x.Key)) { + //If the previous replacement has overwritten this position, skip over the match + if (endOfLastMatchIndex > matchesAtPosition.Key) + { + continue; + } + //Select the longest match (selection could be based on other factors, but this is //the simplest) var longestMatch = matchesAtPosition.Value.OrderBy(x => x.Match.Length).Last(); @@ -273,15 +279,22 @@ public AutoEditResult ProcessPostEditRules(string source, string unedited) string edited = unedited; List appliedReplacements = new List(); - List> coveredUneditedOutputSpans = new List>(); + //Collect matches for all rules Dictionary> uneditedSourceMatches = 
this.GetAllOutputMatches(source,unedited); int endOfLastMatchIndex = -1; - //How much the length of the edited source has changed in comparison with unedited source + + //How much the length of the edited mt has changed in comparison with unedited mt int editingOffset = 0; foreach (var matchesAtPosition in uneditedSourceMatches.OrderBy(x => x.Key)) { + //If the previous replacement has overwritten this position, skip over the match + if (endOfLastMatchIndex > matchesAtPosition.Key) + { + continue; + } + //Select the longest match (selection could be based on other factors, but this is //the simplest) var longestMatch = matchesAtPosition.Value.OrderBy(x => x.Match.Length).Last(); diff --git a/OpusCatMTEngine/MTModel.cs b/OpusCatMTEngine/MTModel.cs index ecea299..9d5f9ae 100644 --- a/OpusCatMTEngine/MTModel.cs +++ b/OpusCatMTEngine/MTModel.cs @@ -1,4 +1,5 @@ -using Serilog; +using Python.Runtime; +using Serilog; using System; using System.Collections; using System.Collections.Generic; @@ -188,14 +189,18 @@ public Task Translate( input = preEditRuleCollection.ProcessPreEditRules(input).Result; } } - + if (this.SupportsTerminology && applyTerminology) { + + var lemmatizedInput = PythonNetHelper.Lemmatize(this.sourceLanguages.First(), input); + //Apply terminology //Use a simple method of removing overlapping matches of different terms: //For each position record only the longest term match, then when annotating term data, //start from the term closest to edge and skip overlapping terms. 
var termMatches = new Dictionary>>(); + foreach (var term in this.Terminology.Terms) { var thisTermMatches = term.SourcePatternRegex.Matches(input); @@ -1166,6 +1171,7 @@ public Terminology Terminology get; internal set; } + private dynamic SourceLemmatizer { get; set; } private MTModelStatus status; private MTModelConfig modelConfig; diff --git a/OpusCatMTEngine/OpusCatMTEngine.csproj b/OpusCatMTEngine/OpusCatMTEngine.csproj index e18be7b..f8e976b 100644 --- a/OpusCatMTEngine/OpusCatMTEngine.csproj +++ b/OpusCatMTEngine/OpusCatMTEngine.csproj @@ -272,6 +272,7 @@ + @@ -598,6 +599,6 @@ - xcopy "$(ProjectDir)python-3.8.10-embed-amd64\*.*" "$(TargetDir)python-3.8.10-embed-amd64" /Y /I /E + REM xcopy "$(ProjectDir)python-3.8.10-embed-amd64\*.*" "$(TargetDir)python-3.8.10-embed-amd64" /Y /I /E \ No newline at end of file diff --git a/OpusCatMTEngine/Properties/Resources.Designer.cs b/OpusCatMTEngine/Properties/Resources.Designer.cs index e4fca7b..9c8d814 100644 --- a/OpusCatMTEngine/Properties/Resources.Designer.cs +++ b/OpusCatMTEngine/Properties/Resources.Designer.cs @@ -988,6 +988,29 @@ public static string Settings_StoreInAppdataCheckbox { } } + /// + /// Looks up a localized string similar to import stanza + /// + ///class StanzaWrapper: + /// + /// def __init__(self, + /// lang, + /// processors): + /// self.stanza = stanza.Pipeline(lang, processors='tokenize,pos,lemma,depparse') + /// + /// def lemmatize(self,input): + /// doc = self.stanza(input) + /// lemma_string = "" + /// for sentence in doc.sentences: + /// lemma_string += " ".join([x.lemma for x in sentence.words]) + /// return lemma_string. + /// + public static string StanzaWrapperCode { + get { + return ResourceManager.GetString("StanzaWrapperCode", resourceCulture); + } + } + /// /// Looks up a localized string similar to Add. 
///
diff --git a/OpusCatMTEngine/Properties/Resources.resx b/OpusCatMTEngine/Properties/Resources.resx
index f8cc5db..68bbef1 100644
--- a/OpusCatMTEngine/Properties/Resources.resx
+++ b/OpusCatMTEngine/Properties/Resources.resx
@@ -473,4 +473,24 @@
 Terminology
+
+ import stanza
+
+class StanzaWrapper:
+    """Thin wrapper around a stanza pipeline that exposes lemmatization."""
+
+    def __init__(self,
+        lang,
+        processors='tokenize,pos,lemma,depparse'):
+        # Use the processors requested by the caller; the default is the list
+        # that used to be hard-coded here.
+        self.stanza = stanza.Pipeline(lang, processors=processors)
+
+    def lemmatize(self,input):
+        """Return the lemmas of every word in input, separated by single spaces."""
+        doc = self.stanza(input)
+        # Join across sentence boundaries too; concatenating the per-sentence strings
+        # directly would fuse the last lemma of a sentence with the next one's first.
+        return " ".join([x.lemma for sentence in doc.sentences for x in sentence.words])
+
\ No newline at end of file
diff --git a/OpusCatMTEngine/PythonNetHelper.cs b/OpusCatMTEngine/PythonNetHelper.cs
new file mode 100644
index 0000000..64a9912
--- /dev/null
+++ b/OpusCatMTEngine/PythonNetHelper.cs
@@ -0,0 +1,60 @@
+using System;
+using System.Collections.Generic;
+using System.Linq;
+using System.Text;
+using System.Threading.Tasks;
+using Python.Runtime;
+
+namespace OpusCatMTEngine
+{
+    /// <summary>
+    /// Helpers for running Python code through Python.NET.
+    /// </summary>
+    static class PythonNetHelper
+    {
+        //Lemmatizer wrapper objects that have already been created, keyed by language
+        private static readonly Dictionary<IsoLanguage, dynamic> LemmatizerScopes =
+            new Dictionary<IsoLanguage, dynamic>();
+
+        /// <summary>
+        /// Lemmatizes <paramref name="input"/> with the Stanza wrapper for <paramref name="lang"/>.
+        /// The wrapper is created on first use for a language and reused afterwards.
+        /// </summary>
+        /// <param name="lang">Language whose shortest ISO code selects the Stanza pipeline.</param>
+        /// <param name="input">Text to lemmatize.</param>
+        /// <returns>The value returned by the wrapper's lemmatize method; null or empty input is returned unchanged.</returns>
+        internal static string Lemmatize(IsoLanguage lang, string input)
+        {
+            //Nothing to lemmatize, so skip the Python call altogether
+            if (string.IsNullOrEmpty(input))
+            {
+                return input;
+            }
+
+            using (Py.GIL())
+            {
+                dynamic lemmatizer;
+                //Single lookup instead of ContainsKey followed by the indexer
+                if (!PythonNetHelper.LemmatizerScopes.TryGetValue(lang, out lemmatizer))
+                {
+                    //Initialize the lemmatizer
+                    using (var moduleScope = Py.CreateScope())
+                    {
+                        moduleScope.Exec(OpusCatMTEngine.Properties.Resources.StanzaWrapperCode);
+                        //Second scope that imports the wrapper code under the name stanza_wrapper
+                        using (PyScope scope = Py.CreateScope())
+                        {
+                            scope.Import(moduleScope, "stanza_wrapper");
+
+                            lemmatizer = scope.Eval(
+                                $"stanza_wrapper.StanzaWrapper('{lang.ShortestIsoCode}', processors='tokenize,pos,lemma,depparse')");
+                            PythonNetHelper.LemmatizerScopes[lang] = lemmatizer;
+                        }
+                    }
+                }
+
+                return lemmatizer.lemmatize(input);
+            }
+        }
+    }
+}