Skip to content

Commit

Permalink
fix to edit rules handling, added stanza lemmatizer
Browse files Browse the repository at this point in the history
  • Loading branch information
TommiNieminen committed Dec 9, 2022
1 parent 1abf423 commit ea23dfd
Show file tree
Hide file tree
Showing 6 changed files with 105 additions and 6 deletions.
19 changes: 16 additions & 3 deletions OpusCatMTEngine/AutoEditRules/AutoEditRuleCollection.cs
Original file line number Diff line number Diff line change
Expand Up @@ -221,7 +221,7 @@ public AutoEditResult ProcessPreEditRules(string unedited)
string edited = unedited;

List<AutoEditRuleMatch> appliedReplacements = new List<AutoEditRuleMatch>();
List<Tuple<int, int>> coveredUneditedSourceSpans = new List<Tuple<int, int>>();

//Collect matches for all rules
Dictionary<int, List<AutoEditRuleMatch>> uneditedSourceMatches = this.GetAllSourceMatches(unedited);

Expand All @@ -230,6 +230,12 @@ public AutoEditResult ProcessPreEditRules(string unedited)
int editingOffset = 0;
foreach (var matchesAtPosition in uneditedSourceMatches.OrderBy(x => x.Key))
{
//If the previous replacement has overwritten this position, skip over the match
if (endOfLastMatchIndex > matchesAtPosition.Key)
{
continue;
}

//Select the longest match (selection could be based on other factors, but this is
//the simplest)
var longestMatch = matchesAtPosition.Value.OrderBy(x => x.Match.Length).Last();
Expand Down Expand Up @@ -273,15 +279,22 @@ public AutoEditResult ProcessPostEditRules(string source, string unedited)
string edited = unedited;

List<AutoEditRuleMatch> appliedReplacements = new List<AutoEditRuleMatch>();
List<Tuple<int, int>> coveredUneditedOutputSpans = new List<Tuple<int, int>>();

//Collect matches for all rules
Dictionary<int, List<AutoEditRuleMatch>> uneditedSourceMatches = this.GetAllOutputMatches(source,unedited);

int endOfLastMatchIndex = -1;
//How much the length of the edited source has changed in comparison with unedited source

//How much the length of the edited mt has changed in comparison with unedited mt
int editingOffset = 0;
foreach (var matchesAtPosition in uneditedSourceMatches.OrderBy(x => x.Key))
{
//If the previous replacement has overwritten this position, skip over the match
if (endOfLastMatchIndex > matchesAtPosition.Key)
{
continue;
}

//Select the longest match (selection could be based on other factors, but this is
//the simplest)
var longestMatch = matchesAtPosition.Value.OrderBy(x => x.Match.Length).Last();
Expand Down
10 changes: 8 additions & 2 deletions OpusCatMTEngine/MTModel.cs
Original file line number Diff line number Diff line change
@@ -1,4 +1,5 @@
using Serilog;
using Python.Runtime;
using Serilog;
using System;
using System.Collections;
using System.Collections.Generic;
Expand Down Expand Up @@ -188,14 +189,18 @@ public Task<TranslationPair> Translate(
input = preEditRuleCollection.ProcessPreEditRules(input).Result;
}
}

if (this.SupportsTerminology && applyTerminology)
{

var lemmatizedInput = PythonNetHelper.Lemmatize(this.sourceLanguages.First(), input);

//Apply terminology
//Use a simple method of removing overlapping matches of different terms:
//For each position record only the longest term match, then when annotating term data,
//start from the term closest to edge and skip overlapping terms.
var termMatches = new Dictionary<int, List<Tuple<Term, Match>>>();

foreach (var term in this.Terminology.Terms)
{
var thisTermMatches = term.SourcePatternRegex.Matches(input);
Expand Down Expand Up @@ -1166,6 +1171,7 @@ public Terminology Terminology
get;
internal set;
}
private dynamic SourceLemmatizer { get; set; }

private MTModelStatus status;
private MTModelConfig modelConfig;
Expand Down
3 changes: 2 additions & 1 deletion OpusCatMTEngine/OpusCatMTEngine.csproj
Original file line number Diff line number Diff line change
Expand Up @@ -272,6 +272,7 @@
<Compile Include="Preprocessing\MosesBpePreprocessor.cs" />
<Compile Include="Preprocessing\SentencePiecePreprocessor.cs" />
<Compile Include="Preprocessing\TmxToTxtParser.cs" />
<Compile Include="PythonNetHelper.cs" />
<Compile Include="Terminology\Term.cs" />
<Compile Include="Terminology\Terminology.cs" />
<Compile Include="UI\ActionTabItem.cs" />
Expand Down Expand Up @@ -598,6 +599,6 @@
<Import Project="..\packages\EntityFramework.6.3.0\build\EntityFramework.targets" Condition="Exists('..\packages\EntityFramework.6.3.0\build\EntityFramework.targets')" />
<Import Project="..\packages\System.Data.SQLite.Core.1.0.112.2\build\net40\System.Data.SQLite.Core.targets" Condition="Exists('..\packages\System.Data.SQLite.Core.1.0.112.2\build\net40\System.Data.SQLite.Core.targets')" />
<PropertyGroup>
<PreBuildEvent>xcopy "$(ProjectDir)python-3.8.10-embed-amd64\*.*" "$(TargetDir)python-3.8.10-embed-amd64" /Y /I /E</PreBuildEvent>
<PreBuildEvent>REM xcopy "$(ProjectDir)python-3.8.10-embed-amd64\*.*" "$(TargetDir)python-3.8.10-embed-amd64" /Y /I /E</PreBuildEvent>
</PropertyGroup>
</Project>
23 changes: 23 additions & 0 deletions OpusCatMTEngine/Properties/Resources.Designer.cs

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

17 changes: 17 additions & 0 deletions OpusCatMTEngine/Properties/Resources.resx
Original file line number Diff line number Diff line change
Expand Up @@ -473,4 +473,21 @@
<data name="Terminology_TerminologyTitle" xml:space="preserve">
<value>Terminology</value>
</data>
<data name="StanzaWrapperCode" xml:space="preserve">
<value>import stanza

class StanzaWrapper:

def __init__(self,
lang,
processors):
self.stanza = stanza.Pipeline(lang, processors='tokenize,pos,lemma,depparse')

def lemmatize(self,input):
doc = self.stanza(input)
lemma_string = ""
for sentence in doc.sentences:
lemma_string += " ".join([x.lemma for x in sentence.words])
return lemma_string</value>
</data>
</root>
39 changes: 39 additions & 0 deletions OpusCatMTEngine/PythonNetHelper.cs
Original file line number Diff line number Diff line change
@@ -0,0 +1,39 @@
using System;
using System.Collections.Generic;
using System.Linq;
using System.Text;
using System.Threading.Tasks;
using Python.Runtime;

namespace OpusCatMTEngine
{
    /// <summary>
    /// Helper for running Python code through Python.NET. Provides lemmatization by
    /// calling the Python StanzaWrapper class stored in the StanzaWrapperCode resource.
    /// </summary>
    static class PythonNetHelper
    {
        //Python lemmatizer objects created so far, keyed by language. The dictionary is
        //only read and written inside the Py.GIL() block in Lemmatize.
        private static readonly Dictionary<IsoLanguage, dynamic> LemmatizerScopes = new Dictionary<IsoLanguage, dynamic>();

        /// <summary>
        /// Lemmatizes the input with the lemmatizer of the given language, creating and
        /// caching the lemmatizer on first use.
        /// </summary>
        /// <param name="lang">Language whose ShortestIsoCode selects the stanza pipeline.</param>
        /// <param name="input">Text to lemmatize.</param>
        /// <returns>
        /// The string returned by the Python lemmatize method, or an empty string when
        /// the input is null or empty (Python is not invoked in that case).
        /// </returns>
        internal static string Lemmatize(IsoLanguage lang, string input)
        {
            //Nothing to lemmatize, so avoid taking the GIL and loading a pipeline
            if (string.IsNullOrEmpty(input))
            {
                return string.Empty;
            }

            using (Py.GIL())
            {
                //Single lookup instead of ContainsKey followed by the indexer
                dynamic lemmatizer;
                if (!PythonNetHelper.LemmatizerScopes.TryGetValue(lang, out lemmatizer))
                {
                    lemmatizer = PythonNetHelper.CreateLemmatizer(lang);
                    PythonNetHelper.LemmatizerScopes[lang] = lemmatizer;
                }

                return lemmatizer.lemmatize(input);
            }
        }

        /// <summary>
        /// Creates a Python StanzaWrapper instance for the language. The caller must
        /// hold the GIL.
        /// </summary>
        private static dynamic CreateLemmatizer(IsoLanguage lang)
        {
            //Execute the wrapper code in its own scope, so that it can be imported
            //as a module named stanza_wrapper
            using (var moduleScope = Py.CreateScope())
            {
                moduleScope.Exec(OpusCatMTEngine.Properties.Resources.StanzaWrapperCode);

                //Scope in which the wrapper instance is constructed
                using (PyScope scope = Py.CreateScope())
                {
                    scope.Import(moduleScope, "stanza_wrapper");

                    return scope.Eval<dynamic>(
                        $"stanza_wrapper.StanzaWrapper('{lang.ShortestIsoCode}', processors='tokenize, pos, lemma, depparse')");
                }
            }
        }
    }
}

0 comments on commit ea23dfd

Please sign in to comment.