From f0e18fea72a296c72708477abbe7601fc7f95d93 Mon Sep 17 00:00:00 2001 From: Andriy Rysin Date: Wed, 17 Jan 2024 16:41:53 -0500 Subject: [PATCH] code refactoring --- build.gradle | 112 +++--------------- .../groovy/org/dict_uk/check/CheckAnim.groovy | 23 ++++ .../dict_uk/check/CheckReplacements.groovy | 48 ++++++++ .../groovy/org/dict_uk/check/CheckXps.groovy | 40 +++++++ 4 files changed, 127 insertions(+), 96 deletions(-) create mode 100644 src/test/groovy/org/dict_uk/check/CheckAnim.groovy create mode 100644 src/test/groovy/org/dict_uk/check/CheckReplacements.groovy create mode 100644 src/test/groovy/org/dict_uk/check/CheckXps.groovy diff --git a/build.gradle b/build.gradle index 9b5b4986..6324bf13 100644 --- a/build.gradle +++ b/build.gradle @@ -142,100 +142,25 @@ task checkDups(type: JavaExec) { workingDir = projectDir } -task checkAnim { - doLast { - def dictDir = file("data/dict") - - def files = dictDir.listFiles().findAll { it.name =~ /.*anim.*\.lst/ } - - def lines = files.collect { File f -> - f.readLines() - .findAll { it =~ /^[А-ЯІЇЄҐ]'?[а-яіїєґ](?!.*\/adj)/ && ! (it =~ /[.\/]<|-(фонд|фест)/) } - } - .flatten() - .grep { it } - - java.text.Collator coll = java.text.Collator.getInstance(new Locale("uk", "UA")); - coll.setStrength(java.text.Collator.IDENTICAL) - coll.setDecomposition(java.text.Collator.NO_DECOMPOSITION) - - println lines.toSorted(coll).join("\n") - println "Found ${lines.size} suspicious anim" - } - } - +task checkAnim(type: JavaExec) { + classpath = sourceSets.test.runtimeClasspath + mainClass = "org.dict_uk.check.CheckAnim" -task xps { - doLast { - def xps = [:].withDefault{ [] } - def dictDir = file("data/dict") - - def files = dictDir.listFiles().findAll { it.name.endsWith('.lst') } - - def lines = files.collect { it.readLines() }.flatten() \ - .collect { it.replaceFirst(/#.*/, '') } - .findAll{ it.contains(":xp") && ! it.startsWith("+cs") && ! it.startsWith('#') } - .collect { it - it.replaceAll(/^([^ ]+)\h.*?(:xp.).*/, '$1 $2') - } - - lines.each{ - def (base, xp) = it.split(/ /) - xps[base] << xp - } - - xps.each { k,v -> v.sort() } - - xps.each { k,v -> - if( v[0] != ':xp1' ) println "out of order: $k: $v" - else if( v.size() == 1 ) println "single: $k: $v" - else { - def dups = v.countBy{it}.grep{it.value > 1}.collect{it.key} - if( dups ) println "dups: $k: $dups" - } - } - -// java.text.Collator coll = java.text.Collator.getInstance(new Locale("uk", "UA")); -// coll.setStrength(java.text.Collator.IDENTICAL) -// coll.setDecomposition(java.text.Collator.NO_DECOMPOSITION) - - } -} + workingDir = projectDir +} -task checkReplacements { - doLast { - def dictDir = file("data/dict") - - def files = dictDir.listFiles().findAll { it.name.endsWith('.lst') || it.name.endsWith('replace.txt') } - - def replWords = files.collect { it.text.split("\n") } \ - .flatten() \ - .findAll { it.contains(" #> ") } - .collect { - def repls = it.split(" #> ")[1].trim().split(/[;, \|]+/) - def word = it.split(' ', 2)[0] - if( word in repls ) { - println "Replacement is same as word: $word" - } - repls - } - .flatten() - .collect { it.replaceAll(/[()]/, '').replaceFirst(/-таки$/, '') } - .findAll { it =~ /[а-яіїєґ]/ } - .unique().sort() - - def spellWords = new File("out/words_spell.txt").text.split('\n') - spellWords += new File("data/dict/slang.lst").text.split('\n').collect{ it.replaceFirst(/ .*/, '') } - spellWords += new File("data/dict/arch.lst").text.split('\n').collect{ it.replaceFirst(/ .*/, '') } - - println "Unique replacement words: ${replWords.size}" - - replWords.removeAll(spellWords) +task checkXps(type: JavaExec) { + classpath = sourceSets.test.runtimeClasspath + mainClass = "org.dict_uk.check.CheckXps" + + workingDir = projectDir +} +task checkReplacements(type: JavaExec) { + classpath = sourceSets.test.runtimeClasspath + mainClass = "org.dict_uk.check.CheckReplacements" - println "Unknown:\n" + replWords.join("\n") - println "Total uknown: ${replWords.size}" - } + workingDir = projectDir } task checkSemtags(type: JavaExec){ @@ -243,12 +168,7 @@ task checkSemtags(type: JavaExec){ mainClass = "org.dict_uk.check.CheckSemtags" workingDir = projectDir - -// def opts = " --indent --mfl --stats --wordlist" -// args "--aff", "../data/affix" -// args "--dict", "dict" -// args opts.split() - } +} task showExpandCommand { diff --git a/src/test/groovy/org/dict_uk/check/CheckAnim.groovy b/src/test/groovy/org/dict_uk/check/CheckAnim.groovy new file mode 100644 index 00000000..d29f0f0c --- /dev/null +++ b/src/test/groovy/org/dict_uk/check/CheckAnim.groovy @@ -0,0 +1,23 @@ +#!/bin/env groovy + +package org.dict_uk.check; + +def dictDir = new File("data/dict") + +def files = dictDir.listFiles().findAll { it.name =~ /.*anim.*\.lst/ } + +assert files + +def lines = files.collect { File f -> + f.readLines() + .findAll { it =~ /^[А-ЯІЇЄҐ]'?[а-яіїєґ](?!.*\/adj)/ && ! (it =~ /[.\/]<|-(фонд|фест)/) } +} +.flatten() +.grep { it } + +java.text.Collator coll = java.text.Collator.getInstance(new Locale("uk", "UA")); +coll.setStrength(java.text.Collator.IDENTICAL) +coll.setDecomposition(java.text.Collator.NO_DECOMPOSITION) + +println lines.toSorted(coll).join("\n") +println "Found ${lines.size()} suspicious anim" diff --git a/src/test/groovy/org/dict_uk/check/CheckReplacements.groovy b/src/test/groovy/org/dict_uk/check/CheckReplacements.groovy new file mode 100644 index 00000000..f4371e94 --- /dev/null +++ b/src/test/groovy/org/dict_uk/check/CheckReplacements.groovy @@ -0,0 +1,48 @@ +#!/bin/env groovy + +package org.dict_uk.check; + +def dictDir = new File("data/dict") + +def files = dictDir.listFiles().findAll { it.name.endsWith('.lst') || it.name.endsWith('replace.txt') } + +assert files + +def replWords = files.collect { it.text.split("\n") } \ + .flatten() \ + .findAll { it.contains(" #> ") } + .collect { + def replStr = it.split(" #> ")[1].trim() + def replItems = replStr.split(/\|/) + if( replItems.size() > 5 && false ) + println "WARNING: Too many replacements ${replItems.size()} > 5 for\n\t$it" + + def repls = replStr.split(/[;, \|]+/) + def word = it.split(' ', 2)[0] + if( word in repls && ! (it =~ /(?iu) - [а-яіїєґ].* #> /) ) { + println "WARNING: Replacement is same as word: $word:\n\t$it" + } + + def dups = replItems.countBy{it}.grep{it.value > 1}.collect{it.key} + if( dups ) { + println "WARNING: Duplicate replacements: $dups:\n\t$it" + } + + repls + } + .flatten() + .collect { it.replaceAll(/[()]/, '').replaceFirst(/-таки$/, '') } + .findAll { it =~ /[а-яіїєґ]/ } + .unique().sort() + +def spellWords = new File("out/words_spell.txt").text.split('\n') +spellWords += new File("data/dict/slang.lst").text.split('\n').collect{ it.replaceFirst(/ .*/, '') } +spellWords += new File("data/dict/arch.lst").text.split('\n').collect{ it.replaceFirst(/ .*/, '') } + +println "Unique replacement words: ${replWords.size()}" + +replWords.removeAll(spellWords) + + +println "Unknown:\n" + replWords.join("\n") +println "Total uknown: ${replWords.size()}" diff --git a/src/test/groovy/org/dict_uk/check/CheckXps.groovy b/src/test/groovy/org/dict_uk/check/CheckXps.groovy new file mode 100644 index 00000000..2beca97a --- /dev/null +++ b/src/test/groovy/org/dict_uk/check/CheckXps.groovy @@ -0,0 +1,40 @@ +#!/bin/env groovy + +package org.dict_uk.check; + +def dictDir = new File("data/dict") + +def files = dictDir.listFiles().findAll { it.name.endsWith('.lst') } + +assert files + +def xps = [:].withDefault{ [] } + +def lines = files.collect { it.readLines() }.flatten() \ + .collect { it.replaceFirst(/#.*/, '') } +.findAll{ it.contains(":xp") \ + && ! it.startsWith("+cs") && ! it.startsWith('#') \ + && ! (it =~ /verb(?!.*inf)|noun(?!.*(:[mnf]:v_naz|:p:v_naz:ns))/) } +.collect { it + it.replaceAll(/^([^ ]+)\h.*?(:xp.).*/, '$1 $2') +} + +lines.each{ + def (base, xp) = it.split(/ /) + xps[base] << xp +} + +xps.each { k,v -> v.sort() } + +xps.each { k,v -> + if( v[0] != ':xp1' ) println "out of order: $k: $v" + else if( v.size() == 1 ) println "single: $k: $v" + else { + def dups = v.countBy{it}.grep{it.value > 1}.collect{it.key} + if( dups ) println "dups: $k: $dups" + } +} + +// java.text.Collator coll = java.text.Collator.getInstance(new Locale("uk", "UA")); +// coll.setStrength(java.text.Collator.IDENTICAL) +// coll.setDecomposition(java.text.Collator.NO_DECOMPOSITION)