Skip to content

Commit

Permalink
adding lemmatization to term matching
Browse files Browse the repository at this point in the history
  • Loading branch information
TommiNieminen committed Dec 14, 2022
1 parent ea23dfd commit 42b4c6b
Show file tree
Hide file tree
Showing 9 changed files with 162 additions and 27 deletions.
9 changes: 6 additions & 3 deletions OpusCatMTEngine/App.xaml.cs
Original file line number Diff line number Diff line change
Expand Up @@ -100,6 +100,8 @@ private void Application_Startup(object sender, StartupEventArgs e)
Log.Information("Setting Tls12 as security protocol (required for accessing online model storage");
ServicePointManager.SecurityProtocol = SecurityProtocolType.Tls12;

this.InitializePythonEngine();

Log.Information("Opening OPUS-CAT MT Engine window");

// Create the startup window
Expand All @@ -117,10 +119,11 @@ private void Application_Startup(object sender, StartupEventArgs e)
App.CloseOverlay();
}

this.InitializePythonEngine();

//The update check is used to keep track of use counts, so disable it in DEBUG mode to keep counts
//more accurate
#if !DEBUG
this.CheckForUpdatesAsync();

#endif
}

private async void CheckForUpdatesAsync()
Expand Down
61 changes: 50 additions & 11 deletions OpusCatMTEngine/MTModel.cs
Original file line number Diff line number Diff line change
Expand Up @@ -192,43 +192,82 @@ public Task<TranslationPair> Translate(

if (this.SupportsTerminology && applyTerminology)
{

var lemmatizedInput = PythonNetHelper.Lemmatize(this.sourceLanguages.First(), input);


//Apply terminology
//Use a simple method of removing overlapping matches of different terms:
//For each position record only the longest term match, then when annotating term data,
//start from the term closest to edge and skip overlapping terms.
var termMatches = new Dictionary<int, List<Tuple<Term, Match>>>();
var termMatches = new Dictionary<int, List<TermMatch>>();

//Get lemmatized input and find lemmatized term matches. Prioritize normal term matches
//in case of overlap
var lemmatizedInput = PythonNetHelper.Lemmatize(this.sourceLanguages.First().ShortestIsoCode, input);

//Make dicts out of
var lemmaToPositionDict = new Dictionary<string, List<int>>();
int lemmaCounter = 0;
foreach (var lemma in lemmatizedInput.Select(x => x.Item3))
{
if (lemmaToPositionDict.ContainsKey(lemma))
{
lemmaToPositionDict[lemma].Add(lemmaCounter);
}
else
{
lemmaToPositionDict[lemma] = new List<int>() { lemmaCounter };
}
lemmaCounter++;
}

foreach (var term in this.Terminology.Terms)
{
var thisTermMatches = term.SourcePatternRegex.Matches(input);
foreach (Match termMatch in thisTermMatches)
{

if (termMatches.ContainsKey(termMatch.Index))
{
termMatches[termMatch.Index].Add(new Tuple<Term, Match>(term, termMatch));
termMatches[termMatch.Index].Add(
new TermMatch(term,termMatch));
}
else
{
termMatches[termMatch.Index] = new List<Tuple<Term, Match>>() {
new Tuple<Term, Match>(term, termMatch)};
termMatches[termMatch.Index] = new List<TermMatch>() {
new TermMatch(term,termMatch)};
}
}

//Match term at lemma level, if specified
if (term.MatchSourceLemma)
{
var sourceLemma = term.SourceLemmas;

//Check if first lemma in term found in sentence
if (lemmaToPositionDict.ContainsKey(sourceLemma[0]))
{
var firstLemmaPositions = lemmaToPositionDict[sourceLemma[0]];

//Then check if the other lemmas of the term follow in the source sentence
foreach (var startPos in firstLemmaPositions)
{

}
}

}
}

int lastEditStart = input.Length;
foreach (var index in termMatches.Keys.ToList().OrderByDescending(x => x))
{
//Start from longest match
var matchesDescending = termMatches[index].OrderByDescending(x => x.Item2.Length);
var matchesDescending = termMatches[index].OrderByDescending(x => x.Length);
foreach (var match in matchesDescending)
{
if (match.Item2.Length + index <= lastEditStart)
if (match.Length + index <= lastEditStart)
{
input = input.Remove(index, match.Item2.Length).Insert(index,
$" <term_start> <term_mask> <term_end> {match.Item1.TargetLemma} <trans_end>");
input = input.Remove(index, match.Length).Insert(index,
$" <term_start> <term_mask> <term_end> {match.Term.TargetLemma} <trans_end>");
lastEditStart = index;
continue;
}
Expand Down
1 change: 1 addition & 0 deletions OpusCatMTEngine/OpusCatMTEngine.csproj
Original file line number Diff line number Diff line change
Expand Up @@ -273,6 +273,7 @@
<Compile Include="Preprocessing\SentencePiecePreprocessor.cs" />
<Compile Include="Preprocessing\TmxToTxtParser.cs" />
<Compile Include="PythonNetHelper.cs" />
<Compile Include="Terminology\TermMatch.cs" />
<Compile Include="Terminology\Term.cs" />
<Compile Include="Terminology\Terminology.cs" />
<Compile Include="UI\ActionTabItem.cs" />
Expand Down
6 changes: 3 additions & 3 deletions OpusCatMTEngine/Properties/Resources.Designer.cs

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

6 changes: 3 additions & 3 deletions OpusCatMTEngine/Properties/Resources.resx
Original file line number Diff line number Diff line change
Expand Up @@ -485,9 +485,9 @@ class StanzaWrapper:

def lemmatize(self,input):
doc = self.stanza(input)
lemma_string = ""
lemma_tuples = []
for sentence in doc.sentences:
lemma_string += " ".join([x.lemma for x in sentence.words])
return lemma_string</value>
lemma_tuples +=[(x.start_char,x.end_char,x.lemma) for x in sentence.words]
return lemma_tuples</value>
</data>
</root>
15 changes: 11 additions & 4 deletions OpusCatMTEngine/PythonNetHelper.cs
Original file line number Diff line number Diff line change
Expand Up @@ -9,10 +9,11 @@ namespace OpusCatMTEngine
{
static class PythonNetHelper
{
private static Dictionary<IsoLanguage, dynamic> LemmatizerScopes = new Dictionary<IsoLanguage, dynamic>();
private static Dictionary<string, dynamic> LemmatizerScopes = new Dictionary<string, dynamic>();

internal static string Lemmatize(IsoLanguage lang, string input)
internal static List<Tuple<int,int,string>> Lemmatize(string lang, string input)
{
List<Tuple<int, int, string>> lemmaList = new List<Tuple<int, int, string>>();
using (Py.GIL())
{
if (!PythonNetHelper.LemmatizerScopes.ContainsKey(lang))
Expand All @@ -27,12 +28,18 @@ internal static string Lemmatize(IsoLanguage lang, string input)
scope.Import(moduleScope, "stanza_wrapper");

PythonNetHelper.LemmatizerScopes[lang] = scope.Eval<dynamic>(
$"stanza_wrapper.StanzaWrapper('{lang.ShortestIsoCode}', processors='tokenize, pos, lemma, depparse')");
$"stanza_wrapper.StanzaWrapper('{lang}', processors='tokenize, pos, lemma, depparse')");
}
}
}

return PythonNetHelper.LemmatizerScopes[lang].lemmatize(input);
var lemmatized = PythonNetHelper.LemmatizerScopes[lang].lemmatize(input);
var output = new List<Tuple<int, int, string>>();
foreach (var lemma in lemmatized)
{
output.Add(new Tuple<int,int,string>((int)lemma[0],(int)lemma[1],(string)lemma[2]));
}
return output;
}
}
}
Expand Down
54 changes: 52 additions & 2 deletions OpusCatMTEngine/Terminology/Term.cs
Original file line number Diff line number Diff line change
@@ -1,6 +1,8 @@


using System;
using System.Collections.Generic;
using System.Linq;
using System.Text.RegularExpressions;
using YamlDotNet.Serialization;

Expand All @@ -15,10 +17,16 @@ public Term()

}

public Term(string sourcePattern, string targetLemma)
/// <summary>
/// Creates a term from a source pattern and a target lemma, and records the
/// language codes of both sides of the term pair.
/// </summary>
/// <param name="sourcePattern">Pattern that is searched for in the source text.</param>
/// <param name="targetLemma">Target-side lemma to be used when the source pattern matches.</param>
/// <param name="sourceLang">Source language; its shortest ISO code is stored in <see cref="SourceLanguageCode"/>.</param>
/// <param name="targetLang">Target language; its shortest ISO code is stored in <see cref="TargetLanguageCode"/>.</param>
public Term(
    string sourcePattern,
    string targetLemma,
    IsoLanguage sourceLang,
    IsoLanguage targetLang)
{
    // Assigning SourcePattern also rebuilds the source pattern regex (see the property setter).
    this.SourcePattern = sourcePattern;
    this.TargetLemma = targetLemma;
    this.SourceLanguageCode = sourceLang.ShortestIsoCode;
    // Fixed: the target code was previously copied from sourceLang, so both codes
    // always ended up identical.
    this.TargetLanguageCode = targetLang.ShortestIsoCode;
}

[YamlMember(Alias = "source-pattern", ApplyNamingConventions = false)]
Expand All @@ -28,13 +36,49 @@ public string SourcePattern
set
{
_sourcePattern = value;

this.UpdateSourcePatternRegex();
}
}

/// <summary>
/// Flag stored under the YAML key "match-source-lemma". Assigning it also
/// discards any cached source lemmas.
/// </summary>
[YamlMember(Alias = "match-source-lemma", ApplyNamingConventions = false)]
public bool MatchSourceLemma
{
    get
    {
        return _matchSourceLemma;
    }
    set
    {
        // Clear the cached lemmas so that fresh ones are generated the next time they are requested.
        this.SourceLemmas = null;
        _matchSourceLemma = value;
    }
}

/// <summary>
/// Lemmas of <see cref="SourcePattern"/>, generated on first access and cached.
/// Returns null if no lemmas are cached and <see cref="SourceLanguageCode"/> is null.
/// Assigning null clears the cache, so the lemmas are regenerated on the next read.
/// </summary>
public List<string> SourceLemmas
{
    get
    {
        // Only lemmatize when nothing is cached and the source language is known.
        if (_sourceLemmas == null && this.SourceLanguageCode != null)
        {
            var lemmatizedPattern = PythonNetHelper.Lemmatize(
                this.SourceLanguageCode,
                this.SourcePattern);

            // Item3 of each tuple is the lemma string; the other items are not needed here.
            _sourceLemmas = lemmatizedPattern.Select(token => token.Item3).ToList();
        }

        return _sourceLemmas;
    }
    set
    {
        _sourceLemmas = value;
    }
}

[YamlMember(Alias = "target-lemma", ApplyNamingConventions = false)]
public string TargetLemma { get; set; }

[YamlMember(Alias = "source-language-code", ApplyNamingConventions = false)]
public string SourceLanguageCode { get; set; }

[YamlMember(Alias = "target-language-code", ApplyNamingConventions = false)]
public string TargetLanguageCode { get; set; }

[YamlMember(Alias = "source-pattern-is-regex", ApplyNamingConventions = false)]
public bool SourcePatternIsRegex
{
Expand Down Expand Up @@ -69,11 +113,14 @@ private void UpdateSourcePatternRegex()

if (this.SourcePatternIsRegex)
{
this.sourcePatternRegex = new Regex($"\\b{this.SourcePattern}\\b",sourcePatternOptions);
this.sourcePatternRegex = new Regex($"\\b{this.SourcePattern}\\b", sourcePatternOptions);
}
else
{
this.sourcePatternRegex = new Regex($"\\b{Regex.Escape(this.SourcePattern)}\\b", sourcePatternOptions);

//Nullify source lemma (it will be generated when requested)
this.SourceLemmas = null;
}
}
}
Expand Down Expand Up @@ -101,6 +148,8 @@ public string Description
private bool _sourcePatternIsRegex;
private string _sourcePattern;
private bool _sourcePatternIsCaseSensitive;
private bool _matchSourceLemma;
private List<string> _sourceLemmas;

[YamlIgnore]
public Regex SourcePatternRegex
Expand All @@ -111,5 +160,6 @@ public Regex SourcePatternRegex
}
}


}
}
30 changes: 30 additions & 0 deletions OpusCatMTEngine/Terminology/TermMatch.cs
Original file line number Diff line number Diff line change
@@ -0,0 +1,30 @@
using System;
using System.Text.RegularExpressions;
using YamlDotNet.Serialization;

namespace OpusCatMTEngine
{

/// <summary>
/// Describes a single occurrence of a <see cref="Term"/> in a text: which term matched,
/// where the match starts, how long it is, and whether it was found by lemma matching.
/// </summary>
public class TermMatch
{
    /// <summary>
    /// Creates a term match from a regular expression match.
    /// </summary>
    /// <param name="term">The term whose pattern produced the match.</param>
    /// <param name="termMatch">The regex match; its index and length become <see cref="Start"/> and <see cref="Length"/>.</param>
    public TermMatch(Term term, Match termMatch)
    {
        this.Term = term;

        // Fixed: Start and Length were never populated from the regex match, so every
        // regex-based match reported a length of zero.
        // A null match keeps the previous behavior (Start and Length stay 0).
        if (termMatch != null)
        {
            this.Start = termMatch.Index;
            this.Length = termMatch.Length;
        }

        // This constructor takes a regex match rather than a lemma match.
        this.LemmaMatch = false;
    }

    /// <summary>
    /// Creates a term match from explicit position information.
    /// </summary>
    /// <param name="term">The term that matched.</param>
    /// <param name="start">Start position of the match.</param>
    /// <param name="length">Length of the match.</param>
    /// <param name="lemmaMatch">Whether the match was made at lemma level.</param>
    public TermMatch(Term term, int start, int length, bool lemmaMatch)
    {
        this.Term = term;
        this.Length = length;
        this.Start = start;
        this.LemmaMatch = lemmaMatch;
    }

    /// <summary>The term that matched.</summary>
    public Term Term { get; private set; }

    /// <summary>Length of the match.</summary>
    public int Length { get; private set; }

    /// <summary>Start position of the match.</summary>
    public int Start { get; private set; }

    /// <summary>Whether the match was made at lemma level.</summary>
    public bool LemmaMatch { get; private set; }
}
}
7 changes: 6 additions & 1 deletion OpusCatMTEngine/UI/TerminologyView.xaml.cs
Original file line number Diff line number Diff line change
Expand Up @@ -98,7 +98,12 @@ private void ImportTbx_Click(object sender, RoutedEventArgs e)
var sourceTerm = sourceLangSet.Descendants("term").First().Value;
var targetTerm = targetLangSet.Descendants("term").First().Value;

importedTerms.Add(new Term(sourceTerm, targetTerm));
importedTerms.Add(
new Term(
sourceTerm,
targetTerm,
this.model.SourceLanguages.First(),
this.model.TargetLanguages.First()));
}
}

Expand Down

0 comments on commit 42b4c6b

Please sign in to comment.