Skip to content

Commit

Permalink
term support modifications, trados 2022 plugin sr1 fix
Browse files Browse the repository at this point in the history
  • Loading branch information
TommiNieminen committed Jul 17, 2023
1 parent 42b4c6b commit b644792
Show file tree
Hide file tree
Showing 12 changed files with 243 additions and 94 deletions.
14 changes: 11 additions & 3 deletions OpusCatMTEngine/App.xaml.cs
Original file line number Diff line number Diff line change
Expand Up @@ -181,11 +181,19 @@ private async void CheckForUpdatesAsync()

/// <summary>
/// Configures the environment for the embedded Python 3.8.10 distribution (paths are
/// relative to the current working directory), initializes the pythonnet engine and
/// imports the <c>sacremoses</c> module once.
/// </summary>
private void InitializePythonEngine()
{
    // Every variable is written exactly once; the values carry a trailing ';' separator.
    Environment.SetEnvironmentVariable("PYTHONNET_PYDLL", ".\\python-3.8.10-embed-amd64\\python38.dll;");
    // NOTE(review): this replaces the whole process PATH instead of extending it - confirm
    // that nothing started later relies on the inherited PATH.
    Environment.SetEnvironmentVariable("PATH", ".\\python-3.8.10-embed-amd64;");
    Environment.SetEnvironmentVariable("PYTHONPATH", ".\\python-3.8.10-embed-amd64;");
    Environment.SetEnvironmentVariable("PYTHONHOME", ".\\python-3.8.10-embed-amd64;");

    // NOTE(review): the value is never used; the read is kept in case accessing the property
    // affects pythonnet's runtime state - confirm and remove if it does not.
    var home = PythonEngine.PythonHome;

    PythonEngine.Initialize();
    // Called before any Py.GIL() block so that the GIL can be acquired on demand
    // (see pythonnet's threading documentation).
    PythonEngine.BeginAllowThreads();
    using (Py.GIL())
    {
        PythonEngine.ImportModule("sacremoses");
    }
}

/// <summary>
Expand Down
212 changes: 156 additions & 56 deletions OpusCatMTEngine/MTModel.cs
Original file line number Diff line number Diff line change
Expand Up @@ -24,7 +24,9 @@

namespace OpusCatMTEngine
{



public enum MTModelStatus
{
OK,
Expand Down Expand Up @@ -66,6 +68,30 @@ public enum TagMethod

public class MTModel : INotifyPropertyChanged
{
// Callable Python object used by Lemmatize; assigned in the MTModel constructor
// from stanza.Pipeline(...) for the first source language.
private dynamic lemmatizer;

/// <summary>
/// Runs the model's lemmatizer over <paramref name="input"/> and flattens the result
/// into one entry per word, in the order the sentences and words are returned.
/// </summary>
/// <param name="input">Text to lemmatize.</param>
/// <returns>
/// A list of (start_char, end_char, lemma) tuples, one per word; an empty list when
/// <paramref name="input"/> is null or empty.
/// </returns>
internal List<Tuple<int, int, string>> Lemmatize(string input)
{
    var output = new List<Tuple<int, int, string>>();

    // Nothing to lemmatize: skip the round trip into Python entirely.
    if (string.IsNullOrEmpty(input))
    {
        return output;
    }

    // All access to Python objects has to happen while holding the GIL.
    using (Py.GIL())
    {
        dynamic processed = this.lemmatizer(input);

        foreach (var sentence in processed.sentences)
        {
            foreach (var word in sentence.words)
            {
                // NOTE(review): the pipeline is created with the "mwt" processor; confirm that
                // start_char/end_char are always populated for words, since a Python None
                // would make the int conversions below fail.
                output.Add(new Tuple<int, int, string>(
                    (int)word.start_char,
                    (int)word.end_char,
                    (string)word.lemma));
            }
        }
    }

    return output;
}

public object this[string propertyName]
{
get
Expand Down Expand Up @@ -168,6 +194,124 @@ internal void Shutdown()
this.marianProcesses = null;
}

/// <summary>
/// Finds every match of every term's source pattern regex in <paramref name="input"/>.
/// </summary>
/// <param name="input">Source text to search.</param>
/// <returns>
/// A dictionary keyed by the character index at which a match starts; each value
/// lists all term matches starting at that index.
/// </returns>
private Dictionary<int, List<TermMatch>> GetTermMatches(string input)
{
    var termMatches = new Dictionary<int, List<TermMatch>>();

    foreach (var term in this.Terminology.Terms)
    {
        foreach (Match termMatch in term.SourcePatternRegex.Matches(input))
        {
            // Single lookup: fetch the bucket for this start index, creating it on first use.
            List<TermMatch> matchesAtIndex;
            if (!termMatches.TryGetValue(termMatch.Index, out matchesAtIndex))
            {
                matchesAtIndex = new List<TermMatch>();
                termMatches[termMatch.Index] = matchesAtIndex;
            }

            matchesAtIndex.Add(new TermMatch(term, termMatch));
        }
    }

    return termMatches;
}


/// <summary>
/// Adds lemma-level term matches for <paramref name="input"/>: a term whose
/// MatchSourceLemma flag is set matches wherever its lemma sequence occurs as a
/// contiguous run in the lemmatized input.
/// </summary>
/// <param name="input">Source text to search.</param>
/// <param name="termMatches">
/// Existing matches keyed by start character index; lemma matches are added to this
/// dictionary. A new dictionary is created when null.
/// </param>
/// <returns>The dictionary that received the lemma matches.</returns>
private Dictionary<int,List<TermMatch>> GetLemmaMatches(string input, Dictionary<int, List<TermMatch>> termMatches = null)
{
    if (termMatches == null)
    {
        termMatches = new Dictionary<int, List<TermMatch>>();
    }

    var lemmatizedInput = this.Lemmatize(input);

    // Index the sentence: lemma -> every word position at which that lemma occurs.
    var lemmaToPositionDict = new Dictionary<string, List<int>>();
    for (var position = 0; position < lemmatizedInput.Count; position++)
    {
        var lemma = lemmatizedInput[position].Item3;
        if (lemma == null)
        {
            // A null lemma cannot be a dictionary key and can never start a term.
            continue;
        }

        List<int> positions;
        if (!lemmaToPositionDict.TryGetValue(lemma, out positions))
        {
            positions = new List<int>();
            lemmaToPositionDict[lemma] = positions;
        }
        positions.Add(position);
    }

    foreach (var term in this.Terminology.Terms.Where(x => x.MatchSourceLemma))
    {
        // Lemmatize the term's source pattern on first use and store the result on the term.
        if (term.SourceLemmas == null)
        {
            term.SourceLemmas = this.Lemmatize(term.SourcePattern).Select(x => x.Item3).ToList();
        }
        var sourceLemmas = term.SourceLemmas;

        // A term without lemmas (e.g. an empty source pattern) cannot match anything;
        // indexing sourceLemmas[0] would throw.
        if (sourceLemmas == null || sourceLemmas.Count == 0 || sourceLemmas[0] == null)
        {
            continue;
        }

        // The first lemma of the term has to occur in the sentence at all.
        List<int> firstLemmaPositions;
        if (!lemmaToPositionDict.TryGetValue(sourceLemmas[0], out firstLemmaPositions))
        {
            continue;
        }

        foreach (var startPos in firstLemmaPositions)
        {
            // Position of the word that would be matched by the term's last lemma.
            var lastPos = startPos + sourceLemmas.Count - 1;
            if (lastPos >= lemmatizedInput.Count)
            {
                // The term would run past the end of the sentence.
                continue;
            }

            // The first lemma is known to match; compare the remaining ones in order.
            var termLemmaFound = true;
            for (var termLemmaIndex = 1; termLemmaIndex < sourceLemmas.Count; termLemmaIndex++)
            {
                if (lemmatizedInput[startPos + termLemmaIndex].Item3 != sourceLemmas[termLemmaIndex])
                {
                    termLemmaFound = false;
                    break;
                }
            }

            if (!termLemmaFound)
            {
                continue;
            }

            // The match spans from the start of the first word to the end of the last word.
            var startChar = lemmatizedInput[startPos].Item1;
            var endChar = lemmatizedInput[lastPos].Item2;

            List<TermMatch> matchesAtStart;
            if (!termMatches.TryGetValue(startChar, out matchesAtStart))
            {
                matchesAtStart = new List<TermMatch>();
                termMatches[startChar] = matchesAtStart;
            }
            matchesAtStart.Add(new TermMatch(term, startChar, endChar - startChar, true));
        }
    }

    return termMatches;
}

public Task<TranslationPair> Translate(
string input,
IsoLanguage sourceLang,
Expand Down Expand Up @@ -197,64 +341,12 @@ public Task<TranslationPair> Translate(
//Use a simple method of removing overlapping matches of different terms:
//For each position record only the longest term match, then when annotating term data,
//start from the term closest to edge and skip overlapping terms.
var termMatches = new Dictionary<int, List<TermMatch>>();
var termMatches = this.GetTermMatches(input);

//Get lemmatized input and find lemmatized term matches. Prioritize normal term matches
//in case of overlap
var lemmatizedInput = PythonNetHelper.Lemmatize(this.sourceLanguages.First().ShortestIsoCode, input);

//Make dicts out of
var lemmaToPositionDict = new Dictionary<string, List<int>>();
int lemmaCounter = 0;
foreach (var lemma in lemmatizedInput.Select(x => x.Item3))
//Match term at lemma level, if specified
if (this.Terminology.Terms.Any(x => x.MatchSourceLemma))
{
if (lemmaToPositionDict.ContainsKey(lemma))
{
lemmaToPositionDict[lemma].Add(lemmaCounter);
}
else
{
lemmaToPositionDict[lemma] = new List<int>() { lemmaCounter };
}
lemmaCounter++;
}

foreach (var term in this.Terminology.Terms)
{
var thisTermMatches = term.SourcePatternRegex.Matches(input);
foreach (Match termMatch in thisTermMatches)
{

if (termMatches.ContainsKey(termMatch.Index))
{
termMatches[termMatch.Index].Add(
new TermMatch(term,termMatch));
}
else
{
termMatches[termMatch.Index] = new List<TermMatch>() {
new TermMatch(term,termMatch)};
}
}

//Match term at lemma level, if specified
if (term.MatchSourceLemma)
{
var sourceLemma = term.SourceLemmas;

//Check if first lemma in term found in sentence
if (lemmaToPositionDict.ContainsKey(sourceLemma[0]))
{
var firstLemmaPositions = lemmaToPositionDict[sourceLemma[0]];

//Then check if the other lemmas of the term follow in the source sentence
foreach (var startPos in firstLemmaPositions)
{

}
}

}
termMatches = this.GetLemmaMatches(input, termMatches);
}

int lastEditStart = input.Length;
Expand Down Expand Up @@ -768,6 +860,14 @@ public MTModel(
this.ModelConfig.TerminologyGuid = this.Terminology.TerminologyGuid;
this.SaveModelConfig();
}

using (Py.GIL())
{
dynamic stanza = PythonEngine.ImportModule("stanza");
this.lemmatizer = stanza.Pipeline(
this.SourceLanguages[0].ShortestIsoCode, processors: "tokenize,mwt,pos,lemma");
}

}

this.ModelConfig.ModelTags.CollectionChanged += ModelTags_CollectionChanged;
Expand Down
3 changes: 3 additions & 0 deletions OpusCatMTEngine/OpusCatMTEngine.csproj.user
Original file line number Diff line number Diff line change
Expand Up @@ -3,4 +3,7 @@
<PropertyGroup>
<ProjectView>ProjectFiles</ProjectView>
</PropertyGroup>
<!-- Applies only when building Debug|x64: sets EnableUnmanagedDebugging to true. -->
<PropertyGroup Condition="'$(Configuration)|$(Platform)' == 'Debug|x64'">
<EnableUnmanagedDebugging>true</EnableUnmanagedDebugging>
</PropertyGroup>
</Project>
32 changes: 13 additions & 19 deletions OpusCatMTEngine/PythonNetHelper.cs
Original file line number Diff line number Diff line change
Expand Up @@ -10,37 +10,31 @@ namespace OpusCatMTEngine
/// <summary>
/// Lemmatization helper that creates one Stanza pipeline per language code on first
/// use and keeps it for later calls.
/// </summary>
static class PythonNetHelper
{
    // Language code -> Stanza pipeline object. Only read and written inside Lemmatize,
    // while the Python GIL is held.
    private static Dictionary<string, dynamic> LemmatizerScopes = new Dictionary<string, dynamic>();

    /// <summary>
    /// Lemmatizes <paramref name="input"/> with the pipeline for <paramref name="lang"/>.
    /// </summary>
    /// <param name="lang">Language code passed to stanza.Pipeline; also the cache key.</param>
    /// <param name="input">Text to lemmatize.</param>
    /// <returns>One (start_char, end_char, lemma) tuple per word, in pipeline output order.</returns>
    internal static List<Tuple<int,int,string>> Lemmatize(string lang, string input)
    {
        // All access to Python objects has to happen while holding the GIL.
        using (Py.GIL())
        {
            var output = new List<Tuple<int, int, string>>();

            if (!LemmatizerScopes.ContainsKey(lang))
            {
                // First request for this language: build the pipeline and cache it.
                dynamic stanza = Py.Import("stanza");
                LemmatizerScopes[lang] = stanza.Pipeline(lang, processors: "tokenize, pos, lemma, depparse");
            }

            dynamic processed = LemmatizerScopes[lang](input);
            foreach (var sentence in processed.sentences)
            {
                foreach (var word in sentence.words)
                {
                    output.Add(new Tuple<int, int, string>((int)word.start_char, (int)word.end_char, (string)word.lemma));
                }
            }

            return output;
        }
    }
}
}
7 changes: 4 additions & 3 deletions OpusCatMTEngine/Terminology/Term.cs
Original file line number Diff line number Diff line change
Expand Up @@ -53,18 +53,19 @@ public bool MatchSourceLemma
}
}

[YamlIgnore]
public List<string> SourceLemmas
{
get
{
if (_sourceLemmas == null)
if (_sourceLemmas == null || _sourceLemmas.Count == 0)
{
if (this.SourceLanguageCode != null)
/*if (this.SourceLanguageCode != null)
{
_sourceLemmas = PythonNetHelper.Lemmatize(
this.SourceLanguageCode,
this.SourcePattern).Select(x => x.Item3).ToList();
}
}*/
}
return _sourceLemmas;
}
Expand Down
3 changes: 3 additions & 0 deletions OpusCatMTEngine/Terminology/TermMatch.cs
Original file line number Diff line number Diff line change
Expand Up @@ -11,6 +11,9 @@ public class TermMatch
/// <summary>
/// Creates a term match from a regular-expression match; the result is flagged as
/// not being a lemma match.
/// </summary>
/// <param name="term">The term that matched.</param>
/// <param name="termMatch">Regex match whose Index and Length give the matched span.</param>
public TermMatch(Term term, Match termMatch)
{
    Term = term;

    // Copy the span of the regex match.
    Start = termMatch.Index;
    Length = termMatch.Length;

    LemmaMatch = false;
}

Expand Down
1 change: 1 addition & 0 deletions OpusCatMTEngine/UI/TerminologyView.xaml
Original file line number Diff line number Diff line change
Expand Up @@ -38,6 +38,7 @@
<DataGridTextColumn Header="Target lemma" Binding="{Binding TargetLemma}" />
<DataGridCheckBoxColumn Header="Regex?" Binding="{Binding SourcePatternIsRegex}" />
<DataGridCheckBoxColumn Header="Case sensitive?" Binding="{Binding SourcePatternIsCaseSensitive}" />
<DataGridCheckBoxColumn Header="Match lemmas?" Binding="{Binding MatchSourceLemma}" />
</DataGrid.Columns>
</DataGrid>
</DockPanel>
Expand Down
9 changes: 8 additions & 1 deletion OpusCatMTEngine/UI/TerminologyView.xaml.cs
Original file line number Diff line number Diff line change
Expand Up @@ -49,11 +49,18 @@ public TerminologyView(
this.Terminologies = terminologies;

InitializeComponent();
this.TermList.InitializingNewItem += InitializeNewTerm;
this.TermList.ItemsSource = this.Model.Terminology.Terms;

}


/// <summary>
/// Handler for the term list's InitializingNewItem event: stamps the newly created
/// term with the short ISO codes of the model's first source and first target language.
/// </summary>
/// <param name="sender">Event source (unused).</param>
/// <param name="e">Event data; <c>NewItem</c> is cast to <see cref="Term"/>.</param>
private void InitializeNewTerm(object sender, InitializingNewItemEventArgs e)
{
    var newTerm = (Term)e.NewItem;

    var sourceCode = this.model.SourceLanguages.First().ShortestIsoCode;
    newTerm.SourceLanguageCode = sourceCode;

    var targetCode = this.model.TargetLanguages.First().ShortestIsoCode;
    newTerm.TargetLanguageCode = targetCode;
}

/// <summary>The MT model shown in this view; reads and writes the <c>model</c> field.</summary>
public MTModel Model { get => model; set => model = value; }
/// <summary>Title text; can only be set from inside this class.</summary>
public string Title { get; private set; }
/// <summary>The terminology collection received by the constructor; can only be set from inside this class.</summary>
public ObservableCollection<Terminology> Terminologies { get; private set; }
Expand Down
Loading

0 comments on commit b644792

Please sign in to comment.