From 1952a2efecf30873afa8198e8a63a6643ff5bed4 Mon Sep 17 00:00:00 2001 From: Patrick Tang Date: Mon, 30 Oct 2023 02:35:12 -0700 Subject: [PATCH] update scripts to update lexicon and augmented strong --- .../1.5_deleteExtraAugStrong.pl | 48 +++++++ ..._checkAugStrong.pl => 1_checkAugStrong.pl} | 0 .../1b_checkAugStrong.pl | 119 ----------------- .../2.5_deleteExtraLexicon.pl | 123 ++++++++++++++++++ .../2_convertDStrongNum.pl | 2 - .../readme.txt | 24 ++-- 6 files changed, 187 insertions(+), 129 deletions(-) create mode 100644 step-tools/src/main/resources/scripts_to_prep_lexicon_augmented_strong/1.5_deleteExtraAugStrong.pl rename step-tools/src/main/resources/scripts_to_prep_lexicon_augmented_strong/{1a_checkAugStrong.pl => 1_checkAugStrong.pl} (100%) delete mode 100644 step-tools/src/main/resources/scripts_to_prep_lexicon_augmented_strong/1b_checkAugStrong.pl create mode 100644 step-tools/src/main/resources/scripts_to_prep_lexicon_augmented_strong/2.5_deleteExtraLexicon.pl diff --git a/step-tools/src/main/resources/scripts_to_prep_lexicon_augmented_strong/1.5_deleteExtraAugStrong.pl b/step-tools/src/main/resources/scripts_to_prep_lexicon_augmented_strong/1.5_deleteExtraAugStrong.pl new file mode 100644 index 0000000000..48a44ce38f --- /dev/null +++ b/step-tools/src/main/resources/scripts_to_prep_lexicon_augmented_strong/1.5_deleteExtraAugStrong.pl @@ -0,0 +1,48 @@ +#!/usr/bin/perl + +use warnings; +use strict; +use List::MoreUtils qw(firstidx); + +my $number_args = $#ARGV + 1; +if ($number_args != 2) { + print "Please provide the file name of the augstrongs file and a list of Strong to ignore as command line arguments.\n"; + exit; +} +my $lastStrong = ""; +my $inputFile = $ARGV[0]; +my %augStrong; +my %allAugInAStrong; +my %refs; +open (FH, '<', $inputFile) or die "Could not open input file: $inputFile"; +my $outputFile = $inputFile . 
".out"; +open (OUT, '>', $outputFile) or die "Could not open output file"; +$inputFile = $ARGV[1]; +open (FH2, '<', $inputFile) or die "Could not open input file: $inputFile"; +my @strongToSkip = (); +while () { + chomp($_); + push(@strongToSkip, $_); +} +close (FH2); +my $skip = 0; +while () { + chomp($_); + $_ =~ s/\r//; + my $line = $_; + if ($line =~ m/^\@AugmentedStrong=\t/) { + my $currentStrong = $'; + if ( grep( /^$currentStrong$/, @strongToSkip ) ) { + print "found it $currentStrong\n"; + $skip = 1; + } + else { + $skip = 0; + } + } + if (!($skip)) { + print OUT $line . "\n"; + } +} +close (FH); +close(OUT); diff --git a/step-tools/src/main/resources/scripts_to_prep_lexicon_augmented_strong/1a_checkAugStrong.pl b/step-tools/src/main/resources/scripts_to_prep_lexicon_augmented_strong/1_checkAugStrong.pl similarity index 100% rename from step-tools/src/main/resources/scripts_to_prep_lexicon_augmented_strong/1a_checkAugStrong.pl rename to step-tools/src/main/resources/scripts_to_prep_lexicon_augmented_strong/1_checkAugStrong.pl diff --git a/step-tools/src/main/resources/scripts_to_prep_lexicon_augmented_strong/1b_checkAugStrong.pl b/step-tools/src/main/resources/scripts_to_prep_lexicon_augmented_strong/1b_checkAugStrong.pl deleted file mode 100644 index 8be5ade504..0000000000 --- a/step-tools/src/main/resources/scripts_to_prep_lexicon_augmented_strong/1b_checkAugStrong.pl +++ /dev/null @@ -1,119 +0,0 @@ -#!/usr/bin/perl - -use warnings; -use strict; -use List::MoreUtils qw(firstidx); - -my $number_args = $#ARGV + 1; -if ($number_args != 1) { - print "Please provide the file name of the augstrongs file as a command line argument.\n"; - exit; -} -my $lastStrong = ""; -my $inputFile = $ARGV[0]; -my %augStrong; -my %allAugInAStrong; -my %refs; -open (FH, '<', $inputFile) or die "Could not open input file: $inputFile"; -while () { - chomp($_); - $_ =~ s/\r//; - my $line = $_; - if ($line =~ m/^\@AugmentedStrong=\t/) { - if (!defined $augStrong{$'}) { - 
$augStrong{$'} = 1; - $lastStrong = $'; - } - else { - print "$' shows up more than one time\n"; - #exit; - } - } - if ($line =~ m/^\@LXXRefs=\t/) { - $refs{$lastStrong} = $'; - my $nonAugStrong = $lastStrong; - $nonAugStrong =~ s/.{1}$//; - if (!defined $allAugInAStrong{$nonAugStrong}) { - $allAugInAStrong{$nonAugStrong} = $lastStrong; - } - else { - $allAugInAStrong{$nonAugStrong} .= "," . $lastStrong; - } - } -} -close (FH); - -my $countWithSuffix = 0; -my $countWithAddedSuffix = 0; -my $countWithMoreThanOneAddedSuffix = 0; -my %strongHasSuffix; -foreach my $key (sort keys %allAugInAStrong) { - my %passageAlreadyProsessedForStrong; - my @spl = split(',', $allAugInAStrong{$key}); - foreach my $i (@spl) { - my @refsToCheck = split(' ', $refs{$i}); - my %passageAlreadyProsessed; - foreach my $currentRef (@refsToCheck) { - if (defined $passageAlreadyProsessed{$currentRef}) { - print "issue: $i has $currentRef more than once\n"; - next; - } - if (defined $passageAlreadyProsessedForStrong{$currentRef}) { - print "issue: $key $i has $currentRef more than once\n"; - print "Please correct issue and then re-run this script\n"; - #exit; - } - $passageAlreadyProsessed{$currentRef} = 1; - $passageAlreadyProsessedForStrong{$currentRef} = 1; - if ($currentRef =~ /[A-Za-z]$/) { - $countWithSuffix ++; - my $noSuffix = $currentRef; - $strongHasSuffix{$i} = 1; - my $suffix = chop($noSuffix); - my $withDash = $noSuffix . '-'; - foreach my $k (@spl) { - my @refsToCheck2 = split(' ', $refs{$k}); - my $idx = firstidx { (($_ eq $noSuffix) || (index($_, $withDash) == 0)) } @refsToCheck2; - if ($idx > -1) { -# print "$i $k $idx $currentRef $noSuffix\n"; - if ($refsToCheck2[$idx] =~ /-/) { - $refsToCheck2[$idx] = $refsToCheck2[$idx] . $suffix; - $countWithMoreThanOneAddedSuffix ++; - } - else { - $refsToCheck2[$idx] = $refsToCheck2[$idx] . '-' . 
$suffix; - $countWithAddedSuffix ++; - } - $refs{$k} = join " ", @refsToCheck2; - } - } - } - } - foreach $key (keys %passageAlreadyProsessed) { - if ($key =~ /[A-Za-z]$/) { - my $noSuffix = $key; - chop($noSuffix); - if (defined $passageAlreadyProsessed{$noSuffix}) { - print "issue: $i has $key and $noSuffix. Please fix before processing\n"; - exit; - } - } - } - } -} -my $outputFileName = $inputFile . '.out'; -open (OF, '>', $outputFileName); -foreach my $key (sort keys %allAugInAStrong) { - my @spl = split(',', $allAugInAStrong{$key}); - foreach my $l (@spl) { - print OF "===============================\n" . - "\@AugmentedStrong=\t" . $l . "\n" . - "\@LXXRefs=\t" . $refs{$l} . "\n"; - } -} -close(OF); -print "Updated augmented Strong information is in new file: $outputFileName\n"; -#print "Statistics: references with suffix of an alpha character: $countWithSuffix added suffix: $countWithAddedSuffix More than one suffix: $countWithMoreThanOneAddedSuffix\n"; -#foreach my $key (sort keys %strongHasSuffix) { -# print "$key\n"; -#} diff --git a/step-tools/src/main/resources/scripts_to_prep_lexicon_augmented_strong/2.5_deleteExtraLexicon.pl b/step-tools/src/main/resources/scripts_to_prep_lexicon_augmented_strong/2.5_deleteExtraLexicon.pl new file mode 100644 index 0000000000..e1c274dc57 --- /dev/null +++ b/step-tools/src/main/resources/scripts_to_prep_lexicon_augmented_strong/2.5_deleteExtraLexicon.pl @@ -0,0 +1,123 @@ +#!/usr/bin/perl + +use warnings; +use strict; + +my $number_args = $#ARGV + 1; +if ($number_args != 2) { + print "Please provide the file name of the lexicon file and a list of Strong to ignore as command line arguments.\n"; + exit; +} + +my %lastVerse; +my %processedStrongNum; +my $inputFile = $ARGV[0]; +my $outFile = $inputFile . 
".out"; +open (FH, '<', $inputFile) or die "Could not open input file: $inputFile"; +open (OF, '>', $outFile) or die "Could not open output file: $outFile"; +$inputFile = $ARGV[1]; +open (FH2, '<', $inputFile) or die "Could not open input file: $inputFile"; +my @strongToSkip = (); +while () { + chomp($_); + push(@strongToSkip, $_); +} + +my $lastStrongNum = ""; +my $currNum = ""; +my @out = (); +my $foundStrNo = 0; +my $founddStrNo = 0; +my $dStrNoLine = ""; +my $strNoLine = ""; +my $lineNum = 0; +while () { + $lineNum ++; + chomp($_); + $_ =~ s/\r//; + my $line = $_; + if ($line =~ /==============$/) { + my $err = ""; + if ((!$foundStrNo) && ($lineNum > 1)) { $err = " StrNo"; print "Did not find $err around line $lineNum\n";} + if (!$founddStrNo) { $err .= " dStrNo";} + my $currentStrong = $dStrNoLine; + $currentStrong =~ s/^\@dStrNo=\t//; + if ( grep( /^$currentStrong$/, @strongToSkip ) ) { + print "found it $currentStrong\n"; + } + else { + for (@out) { + my $outLine = $_; + if ($outLine =~ /==============$/) { + print OF $outLine . "\n"; + my $lineToPrint = $dStrNoLine; + if ($lineToPrint ne "") { + $lineToPrint =~ s/^\@dStrNo=\t/\@StrNo=\t/; + } + else { $lineToPrint = $strNoLine; } + print OF $lineToPrint . "\n"; + } + elsif ($outLine !~ /^\@StrNo=\t/) { + print OF $outLine . "\n"; + } + } + } + @out = (); + $foundStrNo = 0; + $founddStrNo = 0; + $dStrNoLine = ""; + $strNoLine = ""; + $currNum = ""; + } + elsif ($line =~ /^\@StrNo=\t/) { + my $newLine = $'; + if ($currNum ne "") { + print "unmatch StrNo: " . $currNum . 
", linenum: $lineNum\n"; + $currNum = ""; + exit; + } + $foundStrNo = 1; + $currNum = $newLine; + $strNoLine = $line; + } + elsif ($line =~ m/^\@dStrNo=\t/) { + my $newNum = $'; + my $newNumChop = $newNum; + if ($newNum =~ /([GH]\d+)[A-Z]$/) { + $newNumChop = $1; + } + $dStrNoLine = $line; + if (($currNum ne $newNumChop) && + (uc $currNum ne uc $newNum) && + ($currNum !~ /^[GH]\d+[abcdefg]$/) && + ($currNum !~ /^[GH]\d+$/)) { + print "different num at: " . $lineNum . " " . $currNum . " " . $newNum . "\n"; + } + $currNum = ""; + $founddStrNo = 1; + } + push @out, $line; +} +my $currentStrong = $dStrNoLine; +$currentStrong =~ s/^\@dStrNo=\t//; +if ( grep( /^$currentStrong$/, @strongToSkip ) ) { + print "found it $currentStrong\n"; +} +else { + for (@out) { + my $outLine = $_; + if ($outLine =~ /==============$/) { + print OF $outLine . "\n"; + my $lineToPrint = $dStrNoLine; + if ($lineToPrint ne "") { + $lineToPrint =~ s/^\@dStrNo=\t/\@StrNo=\t/; + } + else { $lineToPrint = $strNoLine; } + print OF $lineToPrint . "\n"; + } + elsif ($outLine !~ /^\@StrNo=\t/) { + print OF $outLine . "\n"; + } + } +} +print "Updated lexicon information is in: $outFile\n"; \ No newline at end of file diff --git a/step-tools/src/main/resources/scripts_to_prep_lexicon_augmented_strong/2_convertDStrongNum.pl b/step-tools/src/main/resources/scripts_to_prep_lexicon_augmented_strong/2_convertDStrongNum.pl index 365234f69a..56a1aeadcb 100644 --- a/step-tools/src/main/resources/scripts_to_prep_lexicon_augmented_strong/2_convertDStrongNum.pl +++ b/step-tools/src/main/resources/scripts_to_prep_lexicon_augmented_strong/2_convertDStrongNum.pl @@ -45,7 +45,6 @@ $lineToPrint = $strNoLine; } my $lastChar = chop($lineToPrint); - # $lastChar = lc $lastChar; print OF $lineToPrint . $lastChar . "\n"; } elsif ($outLine !~ /^\@StrNo=\t/) { @@ -105,7 +104,6 @@ } else { $lineToPrint = $strNoLine; } my $lastChar = chop($lineToPrint); - $lastChar = lc $lastChar; print OF $lineToPrint . $lastChar . 
"\n"; } elsif ($outLine !~ /^\@StrNo=\t/) { diff --git a/step-tools/src/main/resources/scripts_to_prep_lexicon_augmented_strong/readme.txt b/step-tools/src/main/resources/scripts_to_prep_lexicon_augmented_strong/readme.txt index febd01af65..56703bfe25 100644 --- a/step-tools/src/main/resources/scripts_to_prep_lexicon_augmented_strong/readme.txt +++ b/step-tools/src/main/resources/scripts_to_prep_lexicon_augmented_strong/readme.txt @@ -1,20 +1,24 @@ -1. The 1a_checkAugStrong.pl updates the augmented_strong.txt file from David Instone-Brewer +1. The 1_checkAugStrong.pl updates the augmented_strong.txt file from David Instone-Brewer to a format which can be read by the STEP Java code. -2. The 1b_checkAugStrong.pl is only needed when Patrick added the LXXRefs in September 2023. +2. 1.5_deleteExtraAugStrong.pl will delete augstrong numbers that are not found in all Bibles. + You need to run 3_getWordFreq.pl to find out which words have no occurrences. 3. merge_augstrong.pl is only needed when Patrick added the LXXRefs in September 2023. 4. The 2_convertDStrongNum.pl updates the lexicon file from David to a format which can be read by the STEP Java code. + +5. 2.5_deleteExtraAugStrong.pl will delete augstrong numbers that are not found in all Bibles. + You need to run 3_getWordFreq.pl to find out which words have no occurrences. -5. 3_getWordFreq.pl will get the frequency count of all the Greek and Hebrew words. +6. 3_getWordFreq.pl will get the frequency count of all the Greek and Hebrew words. -6. 4_addFreqList.pl will add the new frequency count back to the lexicon files. +7. 4_addFreqList.pl will add the new frequency count back to the lexicon files. -7. 5_buildDetailLexicalTag.pl adds the detail lexical tag to the lexicon files. +8. 5_buildDetailLexicalTag.pl adds the detail lexical tag to the lexicon files. -8. The 6_createSearchRange.pl adds the search range to the lexicon files. +9. The 6_createSearchRange.pl adds the search range to the lexicon files. 
When I get the lexicon and augmented_strong files from David, I would run the above steps 1-4. @@ -23,5 +27,9 @@ of the augmented_strongs.txt from David might already have the LXXRefs informati steps 2 and 3 might not be necessary or might need to be updated. I will then use the updated augmented_strong file and the lexicon files on the dev servers. I will -then run the 5th step (3_getWordFreq.pl). The URL used in the 3_getWordFreq script should be the URL running STEPBible with the -updated files. \ No newline at end of file +then run the 6th step (3_getWordFreq.pl). The URL used in the 3_getWordFreq script should be the URL +running STEPBible with the updated files. + +After running 3_getWordFreq, check to see if there are any words which are not found in any of the Bibles. +Those words should be deleted with 1.5_deleteExtraAugStrong and 2.5_deleteExtraLexicon. If you need +to do that, you need to run the process again from step 1. \ No newline at end of file