diff --git a/OpusCatMTEngine/App.xaml.cs b/OpusCatMTEngine/App.xaml.cs index 4a95714..71c53cf 100644 --- a/OpusCatMTEngine/App.xaml.cs +++ b/OpusCatMTEngine/App.xaml.cs @@ -181,11 +181,19 @@ private async void CheckForUpdatesAsync() private void InitializePythonEngine() { - Environment.SetEnvironmentVariable("PYTHONNET_PYDLL", ".\\python-3.8.10-embed-amd64\\python38.dll"); - Environment.SetEnvironmentVariable("PATH", ".\\python-3.8.10-embed-amd64"); - Environment.SetEnvironmentVariable("PYTHONPATH", ".\\python-3.8.10-embed-amd64"); + Environment.SetEnvironmentVariable("PYTHONNET_PYDLL", ".\\python-3.8.10-embed-amd64\\python38.dll;"); + Environment.SetEnvironmentVariable("PATH", ".\\python-3.8.10-embed-amd64;"); + Environment.SetEnvironmentVariable("PYTHONPATH", ".\\python-3.8.10-embed-amd64;"); + Environment.SetEnvironmentVariable("PYTHONHOME", ".\\python-3.8.10-embed-amd64;"); + + var home = PythonEngine.PythonHome; + PythonEngine.Initialize(); PythonEngine.BeginAllowThreads(); + using (Py.GIL()) + { + PythonEngine.ImportModule("sacremoses"); + } } /// diff --git a/OpusCatMTEngine/MTModel.cs b/OpusCatMTEngine/MTModel.cs index aef049c..5e8d785 100644 --- a/OpusCatMTEngine/MTModel.cs +++ b/OpusCatMTEngine/MTModel.cs @@ -24,7 +24,9 @@ namespace OpusCatMTEngine { + + public enum MTModelStatus { OK, @@ -66,6 +68,30 @@ public enum TagMethod public class MTModel : INotifyPropertyChanged { + private dynamic lemmatizer; + + internal List> Lemmatize(string input) + { + List> lemmaList = new List>(); + using (Py.GIL()) + { + var output = new List>(); + + dynamic processed = this.lemmatizer(input); + + foreach (var sentence in processed.sentences) + { + foreach (var word in sentence.words) + { + output.Add(new Tuple((int)word.start_char, (int)word.end_char, (string)word.lemma)); + } + } + + return output; + } + + } + public object this[string propertyName] { get @@ -168,6 +194,124 @@ internal void Shutdown() this.marianProcesses = null; } + private Dictionary> GetTermMatches(string input) + { + var termMatches = new Dictionary>(); + + foreach (var term in this.Terminology.Terms) + { + var thisTermMatches = term.SourcePatternRegex.Matches(input); + foreach (Match termMatch in thisTermMatches) + { + if (termMatches.ContainsKey(termMatch.Index)) + { + termMatches[termMatch.Index].Add( + new TermMatch(term, termMatch)); + } + else + { + termMatches[termMatch.Index] = new List() { + new TermMatch(term, termMatch)}; + } + } + + } + + return termMatches; + } + + + private Dictionary> GetLemmaMatches(string input, Dictionary> termMatches = null) + { + if (termMatches == null) + { + termMatches = new Dictionary>(); + } + + + //Get lemmatized input and find lemmatized term matches. Prioritize normal term matches + //in case of overlap + //var lemmatizedInput = PythonNetHelper.Lemmatize(this.sourceLanguages.First().ShortestIsoCode, input); + var lemmatizedInput = this.Lemmatize(input); + + var lemmaToPositionDict = new Dictionary>(); + int lemmaCounter = 0; + foreach (var lemma in lemmatizedInput.Select(x => x.Item3)) + { + if (lemmaToPositionDict.ContainsKey(lemma)) + { + lemmaToPositionDict[lemma].Add(lemmaCounter); + } + else + { + lemmaToPositionDict[lemma] = new List() { lemmaCounter }; + } + lemmaCounter++; + } + + foreach (var term in this.Terminology.Terms.Where(x => x.MatchSourceLemma)) + { + + if (term.SourceLemmas == null) + { + term.SourceLemmas = this.Lemmatize(term.SourcePattern).Select(x => x.Item3).ToList(); + } + var sourceLemmas = term.SourceLemmas; + + //Check if first lemma in term found in sentence + + bool termLemmaFound; + + if (lemmaToPositionDict.ContainsKey(sourceLemmas[0])) + { + var firstLemmaPositions = lemmaToPositionDict[sourceLemmas[0]]; + + //Then check if the other lemmas of the term follow in the source sentence + foreach (var startPos in firstLemmaPositions) + { + int sourceSentencePos = startPos; + termLemmaFound = true; + //start looking at second lemma + for (var termLemmaIndex = 1; termLemmaIndex < sourceLemmas.Count; termLemmaIndex++) + { + sourceSentencePos = startPos + termLemmaIndex; + if (sourceSentencePos < lemmatizedInput.Count) + { + if (lemmatizedInput[sourceSentencePos].Item3 != sourceLemmas[termLemmaIndex]) + { + termLemmaFound = false; + break; + } + } + else + { + termLemmaFound = false; + break; + } + } + + if (termLemmaFound) + { + var startChar = lemmatizedInput[startPos].Item1; + var endChar = lemmatizedInput[sourceSentencePos].Item2; + if (termMatches.ContainsKey(startChar)) + { + termMatches[startChar].Add( + new TermMatch(term, startChar, endChar - startChar, true)); + } + else + { + termMatches[startChar] = new List() { + new TermMatch(term, startChar, endChar-startChar, true)}; + } + } + } + } + } + + return termMatches; + } + public Task Translate( string input, IsoLanguage sourceLang, @@ -197,64 +341,12 @@ public Task Translate( //Use a simple method of removing overlapping matches of different terms: //For each position record only the longest term match, then when annotating term data, //start from the term closest to edge and skip overlapping terms. - var termMatches = new Dictionary>(); + var termMatches = this.GetTermMatches(input); - //Get lemmatized input and find lemmatized term matches. Prioritize normal term matches - //in case of overlap - var lemmatizedInput = PythonNetHelper.Lemmatize(this.sourceLanguages.First().ShortestIsoCode, input); - - //Make dicts out of - var lemmaToPositionDict = new Dictionary>(); - int lemmaCounter = 0; - foreach (var lemma in lemmatizedInput.Select(x => x.Item3)) + //Match term at lemma level, if specified + if (this.Terminology.Terms.Any(x => x.MatchSourceLemma)) { - if (lemmaToPositionDict.ContainsKey(lemma)) - { - lemmaToPositionDict[lemma].Add(lemmaCounter); - } - else - { - lemmaToPositionDict[lemma] = new List() { lemmaCounter }; - } - lemmaCounter++; - } - - foreach (var term in this.Terminology.Terms) - { - var thisTermMatches = term.SourcePatternRegex.Matches(input); - foreach (Match termMatch in thisTermMatches) - { - - if (termMatches.ContainsKey(termMatch.Index)) - { - termMatches[termMatch.Index].Add( - new TermMatch(term,termMatch)); - } - else - { - termMatches[termMatch.Index] = new List() { - new TermMatch(term,termMatch)}; - } - } - - //Match term at lemma level, if specified - if (term.MatchSourceLemma) - { - var sourceLemma = term.SourceLemmas; - - //Check if first lemma in term found in sentence - if (lemmaToPositionDict.ContainsKey(sourceLemma[0])) - { - var firstLemmaPositions = lemmaToPositionDict[sourceLemma[0]]; - - //Then check if the other lemmas of the term follow in the source sentence - foreach (var startPos in firstLemmaPositions) - { - - } - } - - } + termMatches = this.GetLemmaMatches(input, termMatches); } int lastEditStart = input.Length; @@ -768,6 +860,14 @@ public MTModel( this.ModelConfig.TerminologyGuid = this.Terminology.TerminologyGuid; this.SaveModelConfig(); } + + using (Py.GIL()) + { + dynamic stanza = PythonEngine.ImportModule("stanza"); + this.lemmatizer = stanza.Pipeline( + this.SourceLanguages[0].ShortestIsoCode, processors: "tokenize,mwt,pos,lemma"); + } + } this.ModelConfig.ModelTags.CollectionChanged += ModelTags_CollectionChanged; diff --git a/OpusCatMTEngine/OpusCatMTEngine.csproj.user b/OpusCatMTEngine/OpusCatMTEngine.csproj.user index 6cbe588..0bc87c2 100644 --- a/OpusCatMTEngine/OpusCatMTEngine.csproj.user +++ b/OpusCatMTEngine/OpusCatMTEngine.csproj.user @@ -3,4 +3,7 @@ ProjectFiles + + true + \ No newline at end of file diff --git a/OpusCatMTEngine/PythonNetHelper.cs b/OpusCatMTEngine/PythonNetHelper.cs index 4be53e3..690e8d1 100644 --- a/OpusCatMTEngine/PythonNetHelper.cs +++ b/OpusCatMTEngine/PythonNetHelper.cs @@ -10,37 +10,31 @@ namespace OpusCatMTEngine static class PythonNetHelper { private static Dictionary LemmatizerScopes = new Dictionary(); - + internal static List> Lemmatize(string lang, string input) { List> lemmaList = new List>(); using (Py.GIL()) { - if (!PythonNetHelper.LemmatizerScopes.ContainsKey(lang)) + var output = new List>(); + if (!LemmatizerScopes.ContainsKey(lang)) { - //Initialize the lemmatizer - using (var moduleScope = Py.CreateScope()) - { - moduleScope.Exec(OpusCatMTEngine.Properties.Resources.StanzaWrapperCode); - // create a Python scope - using (PyScope scope = Py.CreateScope()) - { - scope.Import(moduleScope, "stanza_wrapper"); - - PythonNetHelper.LemmatizerScopes[lang] = scope.Eval( - $"stanza_wrapper.StanzaWrapper('{lang}', processors='tokenize, pos, lemma, depparse')"); - } - } + dynamic stanza = Py.Import("stanza"); + LemmatizerScopes[lang] = stanza.Pipeline(lang, processors: "tokenize, pos, lemma, depparse"); } - var lemmatized = PythonNetHelper.LemmatizerScopes[lang].lemmatize(input); - var output = new List>(); - foreach (var lemma in lemmatized) + dynamic processed = LemmatizerScopes[lang](input); + foreach (var sentence in processed.sentences) { - output.Add(new Tuple((int)lemma[0],(int)lemma[1],(string)lemma[2])); + foreach (var word in sentence.words) + { + output.Add(new Tuple((int)word.start_char, (int)word.end_char, (string)word.lemma)); + } } + return output; } + } } } diff --git a/OpusCatMTEngine/Terminology/Term.cs b/OpusCatMTEngine/Terminology/Term.cs index b71e02d..9bbf076 100644 --- a/OpusCatMTEngine/Terminology/Term.cs +++ b/OpusCatMTEngine/Terminology/Term.cs @@ -53,18 +53,19 @@ public bool MatchSourceLemma } } + [YamlIgnore] public List SourceLemmas { get { - if (_sourceLemmas == null) + if (_sourceLemmas == null || _sourceLemmas.Count == 0) { - if (this.SourceLanguageCode != null) + /*if (this.SourceLanguageCode != null) { _sourceLemmas = PythonNetHelper.Lemmatize( this.SourceLanguageCode, this.SourcePattern).Select(x => x.Item3).ToList(); - } + }*/ } return _sourceLemmas; } diff --git a/OpusCatMTEngine/Terminology/TermMatch.cs b/OpusCatMTEngine/Terminology/TermMatch.cs index 89df545..35f6c4b 100644 --- a/OpusCatMTEngine/Terminology/TermMatch.cs +++ b/OpusCatMTEngine/Terminology/TermMatch.cs @@ -11,6 +11,9 @@ public class TermMatch public TermMatch(Term term, Match termMatch) { this.Term = term; + this.Start = termMatch.Index; + this.Length = termMatch.Length; + this.LemmaMatch = false; } diff --git a/OpusCatMTEngine/UI/TerminologyView.xaml b/OpusCatMTEngine/UI/TerminologyView.xaml index 1a8b24b..f9efb47 100644 --- a/OpusCatMTEngine/UI/TerminologyView.xaml +++ b/OpusCatMTEngine/UI/TerminologyView.xaml @@ -38,6 +38,7 @@ + diff --git a/OpusCatMTEngine/UI/TerminologyView.xaml.cs b/OpusCatMTEngine/UI/TerminologyView.xaml.cs index f87e86a..6f55b3c 100644 --- a/OpusCatMTEngine/UI/TerminologyView.xaml.cs +++ b/OpusCatMTEngine/UI/TerminologyView.xaml.cs @@ -49,11 +49,18 @@ public TerminologyView( this.Terminologies = terminologies; InitializeComponent(); + this.TermList.InitializingNewItem += InitializeNewTerm; this.TermList.ItemsSource = this.Model.Terminology.Terms; } - + private void InitializeNewTerm(object sender, InitializingNewItemEventArgs e) + { + var term = (Term)e.NewItem; + term.SourceLanguageCode = this.model.SourceLanguages.First().ShortestIsoCode; + term.TargetLanguageCode = this.model.TargetLanguages.First().ShortestIsoCode; + } + public MTModel Model { get => model; set => model = value; } public string Title { get; private set; } public ObservableCollection Terminologies { get; private set; } diff --git a/Trados2019Plugin/FinetuneTransUnitExtractor.cs b/Trados2019Plugin/FinetuneTransUnitExtractor.cs index e979ce5..a3324de 100644 --- a/Trados2019Plugin/FinetuneTransUnitExtractor.cs +++ b/Trados2019Plugin/FinetuneTransUnitExtractor.cs @@ -4,6 +4,7 @@ using Sdl.LanguagePlatform.TranslationMemoryApi; using System; using System.Collections.Generic; +using System.Globalization; using System.IO; using System.Linq; using System.Reflection; @@ -143,7 +144,11 @@ public FinetuneTransUnitExtractor( int maxConcordanceWindow) { this.tms = tms; +#if (TRADOS22) + this.sourceLanguage = new CultureInfo(tms.First().SourceLanguage.Name).TwoLetterISOLanguageName; +#else this.sourceLanguage = tms.First().SourceLanguage.TwoLetterISOLanguageName; +#endif //Shuffle the source segment to prevent focusing on the initial part of the job (in case //units needed value is reached before whole source has been processed). this.sourceSegments = sourceSegments.ToList(); diff --git a/Trados2019Plugin/OpusCatOptionControl.xaml.cs b/Trados2019Plugin/OpusCatOptionControl.xaml.cs index 40bc851..c1eab2a 100644 --- a/Trados2019Plugin/OpusCatOptionControl.xaml.cs +++ b/Trados2019Plugin/OpusCatOptionControl.xaml.cs @@ -6,6 +6,7 @@ using System.Collections.ObjectModel; using System.ComponentModel; using System.Diagnostics; +using System.Globalization; using System.Linq; using System.Runtime.CompilerServices; using System.ServiceModel; @@ -38,8 +39,15 @@ public OpusCatOptionControl(OpusCatOptionsFormWPF hostForm, this.DataContext = this; this.CredentialStore = credentialStore; this.Options = options; + +#if (TRADOS22) + this.projectLanguagePairs = languagePairs.Select( + x => $"{new CultureInfo(x.SourceCulture.Name).TwoLetterISOLanguageName}-" + + $"{new CultureInfo(x.TargetCulture.Name).TwoLetterISOLanguageName}").ToList(); +#else this.projectLanguagePairs = languagePairs.Select( x => $"{x.SourceCulture.TwoLetterISOLanguageName}-{x.TargetCulture.TwoLetterISOLanguageName}").ToList(); +#endif InitializeComponent(); this.ConnectionSelection.LanguagePairs = this.projectLanguagePairs; diff --git a/Trados2019Plugin/OpusCatProviderLanguageDirection.cs b/Trados2019Plugin/OpusCatProviderLanguageDirection.cs index acc8819..28dc2fb 100644 --- a/Trados2019Plugin/OpusCatProviderLanguageDirection.cs +++ b/Trados2019Plugin/OpusCatProviderLanguageDirection.cs @@ -14,6 +14,8 @@ using System.Text.RegularExpressions; using System.Windows.Controls; using System.Runtime.InteropServices; +using System.Globalization; +using Sdl.Core.Globalization; namespace OpusCatTranslationProvider { @@ -48,11 +50,17 @@ public OpusCatProviderLanguageDirection(OpusCatProvider provider, LanguagePair l _visitor = new OpusCatProviderElementVisitor(); +#if (TRADOS22) + var sourceCode = new CultureInfo(this._languageDirection.SourceCulture.Name).TwoLetterISOLanguageName; + var targetCode = new CultureInfo(this._languageDirection.TargetCulture.Name).TwoLetterISOLanguageName; +#else var sourceCode = this._languageDirection.SourceCulture.TwoLetterISOLanguageName; var targetCode = this._languageDirection.TargetCulture.TwoLetterISOLanguageName; +#endif + this.langpair = $"{sourceCode}-{targetCode}"; - #endregion +#endregion } @@ -79,7 +87,7 @@ public ITranslationProvider TranslationProvider /// /// /// - #region "SearchSegment" +#region "SearchSegment" public SearchResults SearchSegment(SearchSettings settings, Segment segment) { @@ -90,15 +98,21 @@ public SearchResults SearchSegment(SearchSettings settings, Segment segment) } - #region "SearchResultsObject" +#region "SearchResultsObject" SearchResults results = new SearchResults(); results.SourceSegment = segment.Duplicate(); - #endregion +#endregion string sourceText = _visitor.PlainText; - + +#if (TRADOS22) + var sourceCode = new CultureInfo(this._languageDirection.SourceCulture.Name).TwoLetterISOLanguageName; + var targetCode = new CultureInfo(this._languageDirection.TargetCulture.Name).TwoLetterISOLanguageName; +#else var sourceCode = this._languageDirection.SourceCulture.TwoLetterISOLanguageName; var targetCode = this._languageDirection.TargetCulture.TwoLetterISOLanguageName; +#endif + var langpair = $"{sourceCode}-{targetCode}"; List systemResults = this.GenerateSystemResult(sourceText, settings.Mode,segment,sourceCode,targetCode); @@ -108,7 +122,7 @@ public SearchResults SearchSegment(SearchSettings settings, Segment segment) } return results; - #endregion +#endregion } private List GenerateSystemResult( @@ -175,7 +189,7 @@ private List GenerateSystemResult( } return systemResults; } - #endregion +#endregion @@ -188,7 +202,7 @@ private List GenerateSystemResult( /// /// /// - #region "CreateSearchResult" +#region "CreateSearchResult" private SearchResult CreateSearchResult(Segment searchSegment, Segment translation, bool formattingPenalty,string mtSystem) { @@ -233,7 +247,7 @@ private SearchResult CreateSearchResult(Segment searchSegment, Segment translati return searchResult; } - #endregion +#endregion public bool CanReverseLanguageDirection @@ -243,6 +257,11 @@ public bool CanReverseLanguageDirection public static Segment CurrentTranslation { get; private set; } +#if (TRADOS22) + CultureCode ITranslationProviderLanguageDirection.SourceLanguage => new CultureCode(this.SourceLanguage); + CultureCode ITranslationProviderLanguageDirection.TargetLanguage => new CultureCode(this.TargetLanguage); +#endif + public SearchResults[] SearchSegments(SearchSettings settings, Segment[] segments) { SearchResults[] results = new SearchResults[segments.Length]; @@ -325,7 +344,7 @@ public SearchResults[] SearchTranslationUnitsMasked(SearchSettings settings, Tra } - #region "NotForThisImplementation" +#region "NotForThisImplementation" /// /// Not required for this implementation. /// @@ -403,6 +422,6 @@ public ImportResult[] AddOrUpdateTranslationUnits(TranslationUnit[] translationU { throw new NotImplementedException(); } - #endregion +#endregion } } diff --git a/Trados2019Plugin/TagRestorer.cs b/Trados2019Plugin/TagRestorer.cs index 0092c71..acd2cc1 100644 --- a/Trados2019Plugin/TagRestorer.cs +++ b/Trados2019Plugin/TagRestorer.cs @@ -120,7 +120,7 @@ private void GetInitialPositions() { Type elementType = segElement.GetType(); - if (elementType == typeof(Text)) + if (elementType == typeof(Text) && sourceSubwordQueue.Count > 0) { //Only check for numbers and letters, as they should be identical with both BPE //and SentencePiece subwords.