From 3a0a22fd63f38886984620a2579a3c331003df48 Mon Sep 17 00:00:00 2001 From: Andriy Rysin Date: Sat, 27 Jan 2024 12:46:58 -0500 Subject: [PATCH] generateReplacements task cleanup --- distr/morfologik-ukrainian/build.gradle | 126 ++++-------------- .../dict_uk/morfologik/Replacements.groovy | 54 ++++++++ 2 files changed, 79 insertions(+), 101 deletions(-) create mode 100644 distr/morfologik-ukrainian/buildSrc/src/main/groovy/org/dict_uk/morfologik/Replacements.groovy diff --git a/distr/morfologik-ukrainian/build.gradle b/distr/morfologik-ukrainian/build.gradle index 99907d26..c1c6ad63 100644 --- a/distr/morfologik-ukrainian/build.gradle +++ b/distr/morfologik-ukrainian/build.gradle @@ -1,6 +1,6 @@ plugins { id 'java' -// id 'groovy' + id 'groovy' id 'eclipse' id 'base' id 'maven-publish' @@ -214,48 +214,33 @@ task createOutRulesDir { } +def headText = +'''# Simple replace table%s +# Format: word=suggestion1|suggestion2|suggestion3... + +''' + +import org.dict_uk.morfologik.Replacements + task createReplacementDict(dependsOn: createOutRulesDir) { def srcDir="${inputDir}/../data/dict" def outFile="$outRulesDir/replace.txt" + def allFiles = Arrays.asList(new File(srcDir).listFiles()) + def srcFiles = allFiles.findAll{ it.name =~ /(twisters|invalid|subst).*\.lst/ } - inputs.files "$srcDir/twisters.lst", "$srcDir/invalid.lst", "$srcDir/invalid-compound.lst", "$srcDir/invalid-auto-replace.txt", "$srcDir/subst.lst", "$srcDir/invalid-composite.lst" + inputs.files srcFiles outputs.file outFile + doLast { - - def headText = -'''# Simple replace table -# Format: word=suggestion1|suggestion2|suggestion3... - -''' - def outLines = [] - inputs.files.each { File file -> - file.eachLine "UTF-8", { - if( it.startsWith('#') || ! it.contains(' #>') ) - return - - it = it.replaceFirst(/\s*# rv[^\s]+/, '') - - if( file.name.contains('composite') ) { - it = it.replaceFirst(/^([а-яіїєґА-ЯІЇЄҐ'-]+).*? - ([а-яіїєґА-ЯІЇЄҐ'-]+).*#> *(.*)/, '$1-$2=$3') - outLines << it - } - else { - it = it.replace(' +cs=', '') - it = it.replaceFirst(/^([а-яіїєґА-ЯІЇЄҐ'-]+).*#> *(.*)(#ok:.*)?/, '$1=$2') - outLines << it - } - } - } - + def outLines = Replacements.getReplacements(srcDir, srcFiles, { it.contains(' #>') && ! it.contains('ua_1992')}) + new File(outFile).text = headText + outLines.join("\n") + '\n' println "Wrote ${outLines.size()} replacements" } } - - task createSoftReplacementDict(dependsOn: createOutRulesDir) { def srcDir="${inputDir}/../data/dict" def outFile="${outRulesDir}/replace_soft.txt" @@ -267,16 +252,10 @@ task createSoftReplacementDict(dependsOn: createOutRulesDir) { doLast { + def outLines = Replacements.getReplacements(srcDir, srcFiles, { it.contains(' #> ') && ! it.contains('ua_1992')}) - def headText = -'''# Simple replace table for soft suggestions -# Format: word=suggestion1|suggestion2|suggestion3... - -''' - def outLines = getReplacements(srcDir, srcFiles, { ! it.contains('ua_1992')}) - - println "Wrote ${outLines.size()} replacements" - new File(outFile).text = headText + outLines.join('\n') + "\n" + println "Wrote ${outLines.size()} soft replacements" + new File(outFile).text = String.format(headText, " for soft suggestions") + outLines.join('\n') + "\n" } } @@ -293,43 +272,14 @@ task createNewSpellingReplacementDict(dependsOn: createOutRulesDir) { inputs.files srcFiles outputs.file outFile - doLast { + def outLines = Replacements.getReplacements(srcDir, srcFiles, { it.contains(' #> ') && it.contains('ua_1992') }) - def headText = -'''# Simple replace table for 2019 spelling suggestions -# Format: word=suggestion1|suggestion2|suggestion3... - -''' - def outLines = getReplacements(srcDir, srcFiles, {it.contains('ua_1992')}) - - println "Wrote ${outLines.size} replacements" - new File(outFile).text = headText + outLines.join('\n') + "\n" + println "Wrote ${outLines.size()} ua_2019 replacements" + new File(outFile).text = String.format(headText, " for 2019 spelling suggestions") + outLines.join('\n') + "\n" } } -List getReplacements(String srcDir, List files, Closure filter) { - def outLines = [] - - files.each{ srcFile -> - def rvLines = new File("$srcDir/$srcFile.name").readLines() - .findAll { - ! it.startsWith('#') && it.contains(' #> ') && filter(it) - }.collect{ - it = it.replace(' +cs=', '') - it = it.replaceFirst(/^([а-яіїєґА-ЯІЇЄҐ'-]+).*#> *(.*)(#ok:.*)?/, '$1=$2') - it = it.replaceFirst(/ *# rv_...(\|rv_...)* */, '') - } - - outLines.addAll(rvLines) - } - - java.text.Collator coll = java.text.Collator.getInstance(new Locale("uk", "UA")) - Collections.sort(outLines, coll) - - outLines -} - task createRenamedReplacementDict(dependsOn: [processResources, createOutRulesDir]) { def srcDir="${inputDir}/../data/dict" @@ -340,37 +290,11 @@ task createRenamedReplacementDict(dependsOn: [processResources, createOutRulesDi inputs.files srcFiles outputs.file outFile - - def headText = -'''# Simple replace table for soft suggestions -# Format: name=replacemet|(optional) explanation - -''' doLast { - def outLines = [] - - srcFiles.each{ File srcFile -> - def rvLines = new File("$srcDir/$srcFile.name").text - .split('\n') - .findAll { - ! it.startsWith('#') && it.contains(' #>> ') - }.collect{ - if( srcFile.name.contains('composite') ) { - it = it.replaceFirst(/ \/.* - /, '-') - } -// it = it.replace(' +cs=', '') - it = it.replaceFirst(/^([а-яіїєґА-ЯІЇЄҐ'-]+).*#>> *(.*)(#ok:.*)?/, '$1=$2') - it = it.replaceFirst(/ *# rv_...(\|rv_...)* */, '') - } - - outLines.addAll(rvLines) - } - - java.text.Collator coll = java.text.Collator.getInstance(new Locale("uk", "UA")) - Collections.sort(outLines, coll) - - println "Wrote ${outLines.size()} replacements" - new File(outFile).text = headText + outLines.join('\n') + "\n" + def outLines = Replacements.getReplacements(srcDir, srcFiles, { it.contains(' #>> ')}) + + println "Wrote ${outLines.size()} rename replacements" + new File(outFile).text = String.format(headText, " for toponim renaming") + outLines.join('\n') + "\n" } } diff --git a/distr/morfologik-ukrainian/buildSrc/src/main/groovy/org/dict_uk/morfologik/Replacements.groovy b/distr/morfologik-ukrainian/buildSrc/src/main/groovy/org/dict_uk/morfologik/Replacements.groovy new file mode 100644 index 00000000..b7f11a18 --- /dev/null +++ b/distr/morfologik-ukrainian/buildSrc/src/main/groovy/org/dict_uk/morfologik/Replacements.groovy @@ -0,0 +1,54 @@ +package org.dict_uk.morfologik + +import groovy.transform.CompileStatic + +public class Replacements { + private static final int MAX_REPLACEMENTS = 5 + + @CompileStatic + public static List getReplacements(String srcDir, List files, Closure filter) { + List outLines = [] + + files.each{ srcFile -> + int tooManyReplacementsCount = 0 + List rvLines = new File("$srcDir/$srcFile.name").readLines() + .findAll { + ! it.startsWith('#') && filter(it) + }.collect{ + it = it.replaceFirst(/\s*# rv[^\s]+/, '') + + if( srcFile.name.contains('composite') ) { + it = it.replaceFirst(/^([а-яіїєґА-ЯІЇЄҐ'-]+).*? - ([а-яіїєґА-ЯІЇЄҐ'-]+).*#>>? *(.*)/, '$1-$2=$3') + } + else { + it = it.replace(' +cs=', '') + it = it.replaceFirst(/^([а-яіїєґА-ЯІЇЄҐ'-]+).*#>>? *(.*)(#ok:.*)?/, '$1=$2') + } + + String[] lineParts = it.split("=") + String replStr = lineParts[1] + String[] parts = replStr.split(/\|/) + if( parts.length > MAX_REPLACEMENTS ) { + it = lineParts[0] + "=" + parts[0..3].join("|") + "|" + parts[4..-1].join("; ") + tooManyReplacementsCount++ +// if( srcFile.name == "base.lst") +// println "Adjusted to $it" + } + it + } + + outLines.addAll(rvLines) + if( tooManyReplacementsCount ) { + println "INFO: merged ${tooManyReplacementsCount} replacements to fit into 5 for ${srcFile.name}" + } + } + + java.text.Collator coll = java.text.Collator.getInstance(new Locale("uk", "UA")) + coll.setStrength(java.text.Collator.IDENTICAL) + coll.setDecomposition(java.text.Collator.NO_DECOMPOSITION) + Collections.sort(outLines, coll) + + outLines + } + +} \ No newline at end of file