update scripts to update lexicon and augmented strong

STEPBible · Oct 30, 2023 · 1952a2e · 1952a2e
1 parent 703317b
commit 1952a2e
Show file tree

Hide file tree

Showing 6 changed files with 187 additions and 129 deletions.
diff --git a/...s/src/main/resources/scripts_to_prep_lexicon_augmented_strong/1.5_deleteExtraAugStrong.pl b/...s/src/main/resources/scripts_to_prep_lexicon_augmented_strong/1.5_deleteExtraAugStrong.pl
@@ -0,0 +1,48 @@
+#!/usr/bin/perl
+
+use warnings;
+use strict;
+use List::MoreUtils qw(firstidx);
+
+# Copy an augmented-Strong file to "<input>.out", leaving out every entry
+# whose "@AugmentedStrong=<TAB>number" header carries a number that appears
+# in the skip list.
+#
+# Usage: 1.5_deleteExtraAugStrong.pl <augstrong file> <skip-list file>
+#   <augstrong file>  file made of entries that each start with an
+#                     "@AugmentedStrong=<TAB>..." line
+#   <skip-list file>  one Strong number per line; blank lines are ignored
+#
+# An entry runs from its "@AugmentedStrong=" line up to (not including) the
+# next one.  Lines before the first header are always copied.
+# Exits with status 1 on a usage error and dies if a file cannot be opened
+# or the output cannot be flushed.
+if (@ARGV != 2) {
+    print "Please provide the file name of the augstrongs file and a list of Strong to ignore as command line arguments.\n";
+    exit 1;
+}
+my ($augStrongFile, $skipListFile) = @ARGV;
+my $outputFile = $augStrongFile . ".out";
+
+# Read the skip list first, so the output file is not created/truncated when
+# the skip list turns out to be unreadable.  A hash gives exact-string
+# lookups: no regex metacharacter surprises and no rescan of the whole list
+# for every entry.
+my %strongToSkip;
+open(my $skipFh, '<', $skipListFile) or die "Could not open input file: $skipListFile: $!";
+while (my $entry = <$skipFh>) {
+    chomp($entry);
+    $entry =~ s/\r//;          # same CR handling as the data lines below
+    next if $entry eq "";      # a blank line is not a Strong number
+    $strongToSkip{$entry} = 1;
+}
+close($skipFh);
+
+open(my $inFh, '<', $augStrongFile) or die "Could not open input file: $augStrongFile: $!";
+open(my $outFh, '>', $outputFile) or die "Could not open output file: $outputFile: $!";
+
+# $skip stays set from a skipped header until the next header is seen, so
+# the whole entry is dropped, not only its first line.
+my $skip = 0;
+while (my $line = <$inFh>) {
+    chomp($line);
+    $line =~ s/\r//;
+    if ($line =~ m/^\@AugmentedStrong=\t(.*)/s) {
+        my $currentStrong = $1;
+        if (exists $strongToSkip{$currentStrong}) {
+            print "found it $currentStrong\n";
+            $skip = 1;
+        }
+        else {
+            $skip = 0;
+        }
+    }
+    if (!$skip) {
+        print {$outFh} $line . "\n";
+    }
+}
+close($inFh);
+# Buffered write errors only surface at close time, so check it.
+close($outFh) or die "Could not close output file: $outputFile: $!";
diff --git a/...con_augmented_strong/1a_checkAugStrong.pl → ...icon_augmented_strong/1_checkAugStrong.pl b/...con_augmented_strong/1a_checkAugStrong.pl → ...icon_augmented_strong/1_checkAugStrong.pl
diff --git a/step-tools/src/main/resources/scripts_to_prep_lexicon_augmented_strong/1b_checkAugStrong.pl b/step-tools/src/main/resources/scripts_to_prep_lexicon_augmented_strong/1b_checkAugStrong.pl
diff --git a/...ols/src/main/resources/scripts_to_prep_lexicon_augmented_strong/2.5_deleteExtraLexicon.pl b/...ols/src/main/resources/scripts_to_prep_lexicon_augmented_strong/2.5_deleteExtraLexicon.pl
@@ -0,0 +1,123 @@
+#!/usr/bin/perl
+
+use warnings;
+use strict;
+
+# Copy a lexicon file to "<input>.out", leaving out every entry whose
+# "@dStrNo=<TAB>number" value appears in the skip list.  In the entries that
+# are kept, the original "@StrNo=" lines are removed and a single "@StrNo="
+# line (taken from the entry's "@dStrNo=" line when it has one, otherwise its
+# original "@StrNo=" line) is written right after the separator line.
+#
+# Usage: 2.5_deleteExtraLexicon.pl <lexicon file> <skip-list file>
+#   <lexicon file>    entries separated by lines ending in "=============="
+#   <skip-list file>  one Strong number per line; blank lines are ignored
+#
+# Exits with status 1 on a usage error or on two "@StrNo=" lines with no
+# "@dStrNo=" line between them; dies if a file cannot be opened or the
+# output cannot be flushed.
+if (@ARGV != 2) {
+    print "Please provide the file name of the lexicon file and a list of Strong to ignore as command line arguments.\n";
+    exit 1;
+}
+my ($lexiconFile, $skipListFile) = @ARGV;
+my $outFile = $lexiconFile . ".out";
+
+# Read the skip list first, so the output file is not created/truncated when
+# the skip list turns out to be unreadable.  A hash gives exact-string
+# lookups instead of interpolating each number into a regex.
+my %strongToSkip;
+open(my $skipFh, '<', $skipListFile) or die "Could not open input file: $skipListFile: $!";
+while (my $entry = <$skipFh>) {
+    chomp($entry);
+    $entry =~ s/\r//;          # same CR handling as the data lines below
+    # Ignore blank lines: otherwise every entry without a @dStrNo= line
+    # (whose number is the empty string) would be deleted by accident.
+    next if $entry eq "";
+    $strongToSkip{$entry} = 1;
+}
+close($skipFh);
+
+open(my $inFh, '<', $lexiconFile) or die "Could not open input file: $lexiconFile: $!";
+open(my $outFh, '>', $outFile) or die "Could not open output file: $outFile: $!";
+
+# Write one buffered entry to $fh unless its dStrNo number is in the skip list.
+#   $fh          output filehandle
+#   $skipRef     hash ref whose keys are the Strong numbers to drop
+#   $linesRef    array ref with the entry's lines, separator line included
+#   $dStrNoLine  the entry's "@dStrNo=" line, or "" when it has none
+#   $strNoLine   the entry's "@StrNo=" line, or "" when it has none
+sub write_entry {
+    my ($fh, $skipRef, $linesRef, $dStrNoLine, $strNoLine) = @_;
+
+    my $currentStrong = $dStrNoLine;
+    $currentStrong =~ s/^\@dStrNo=\t//;
+    if (exists $skipRef->{$currentStrong}) {
+        print "found it $currentStrong\n";
+        return;
+    }
+
+    # The replacement @StrNo= line: the dStrNo line relabelled, or the
+    # original StrNo line when the entry had no dStrNo line.
+    my $strNoToPrint = $strNoLine;
+    if ($dStrNoLine ne "") {
+        $strNoToPrint = $dStrNoLine;
+        $strNoToPrint =~ s/^\@dStrNo=\t/\@StrNo=\t/;
+    }
+
+    for my $outLine (@{$linesRef}) {
+        if ($outLine =~ /==============$/) {
+            print {$fh} $outLine . "\n";
+            print {$fh} $strNoToPrint . "\n";
+        }
+        elsif ($outLine !~ /^\@StrNo=\t/) {
+            print {$fh} $outLine . "\n";
+        }
+    }
+    return;
+}
+
+my $currNum = "";       # number of a @StrNo= line still waiting for its @dStrNo= line
+my @out = ();           # lines of the entry currently being read
+my $foundStrNo = 0;
+my $dStrNoLine = "";
+my $strNoLine = "";
+my $lineNum = 0;
+while (my $line = <$inFh>) {
+    $lineNum++;
+    chomp($line);
+    $line =~ s/\r//;
+    if ($line =~ /==============$/) {
+        # A separator closes the previous entry: report a missing @StrNo=
+        # (except for a separator on line 1, which has no entry before it),
+        # flush the entry and reset the per-entry state.
+        if ((!$foundStrNo) && ($lineNum > 1)) {
+            print "Did not find  StrNo around line $lineNum\n";
+        }
+        write_entry($outFh, \%strongToSkip, \@out, $dStrNoLine, $strNoLine);
+        @out = ();
+        $foundStrNo = 0;
+        $dStrNoLine = "";
+        $strNoLine = "";
+        $currNum = "";
+    }
+    elsif ($line =~ /^\@StrNo=\t(.*)/s) {
+        my $newNum = $1;
+        if ($currNum ne "") {
+            # Two @StrNo= lines without a @dStrNo= line in between.
+            print "unmatch StrNo: " . $currNum . ", linenum: $lineNum\n";
+            exit 1;
+        }
+        $foundStrNo = 1;
+        $currNum = $newNum;
+        $strNoLine = $line;
+    }
+    elsif ($line =~ m/^\@dStrNo=\t(.*)/s) {
+        my $newNum = $1;
+        # Same number with one trailing upper-case letter removed.
+        my $newNumChop = $newNum;
+        if ($newNum =~ /([GH]\d+)[A-Z]$/) {
+            $newNumChop = $1;
+        }
+        $dStrNoLine = $line;
+        if (($currNum ne $newNumChop) &&
+            (uc $currNum ne uc $newNum) &&
+            ($currNum !~ /^[GH]\d+[abcdefg]$/) &&
+            ($currNum !~ /^[GH]\d+$/)) {
+            print "different num at: " . $lineNum . " " . $currNum . " " . $newNum . "\n";
+        }
+        $currNum = "";
+    }
+    # The separator line itself starts the buffer of the next entry.
+    push @out, $line;
+}
+# The last entry has no separator after it, so flush it here.
+write_entry($outFh, \%strongToSkip, \@out, $dStrNoLine, $strNoLine);
+close($inFh);
+# Buffered write errors only surface at close time, so check it.
+close($outFh) or die "Could not close output file: $outFile: $!";
+print "Updated lexicon information is in: $outFile\n";
diff --git a/...-tools/src/main/resources/scripts_to_prep_lexicon_augmented_strong/2_convertDStrongNum.pl b/...-tools/src/main/resources/scripts_to_prep_lexicon_augmented_strong/2_convertDStrongNum.pl
@@ -45,7 +45,6 @@
 					$lineToPrint = $strNoLine;
 				}
 				my $lastChar = chop($lineToPrint);
-				# $lastChar = lc $lastChar;
 				print OF $lineToPrint . $lastChar . "\n"; 
 			}
 			elsif ($outLine !~ /^\@StrNo=\t/) {
@@ -105,7 +104,6 @@
         }
         else { $lineToPrint = $strNoLine; }
         my $lastChar = chop($lineToPrint);
-        $lastChar = lc $lastChar;
         print OF $lineToPrint . $lastChar . "\n"; 
     }
     elsif ($outLine !~ /^\@StrNo=\t/) {

diff --git a/step-tools/src/main/resources/scripts_to_prep_lexicon_augmented_strong/readme.txt b/step-tools/src/main/resources/scripts_to_prep_lexicon_augmented_strong/readme.txt
@@ -1,20 +1,24 @@
-1. The 1a_checkAugStrong.pl updates the augmented_strong.txt file from David Instone-Brewer
+1. The 1_checkAugStrong.pl updates the augmented_strong.txt file from David Instone-Brewer
    to a format which can be read by the STEP Java code.
 
-2. The 1b_checkAugStrong.pl is only needed when Patrick added the LXXRefs in September 2023.
+2. 1.5_deleteExtraAugStrong.pl deletes the augmented Strong numbers that are not found in any Bible.
+   You need to run 3_getWordFreq.pl first to find out which words have no occurrences.
 
 3. merge_augstrong.pl is only needed when Patrick added the LXXRefs in September 2023.
 
 4. The 2_convertDStrongNum.pl updates the lexicon file from David to a format which can be
    read by the STEP Java code.
+
+5. 2.5_deleteExtraLexicon.pl deletes the lexicon entries for Strong numbers that are not found in any Bible.
+   You need to run 3_getWordFreq.pl first to find out which words have no occurrences.
 
-5. 3_getWordFreq.pl will get the frequency count of all the Greek and Hebrew words.
+6. 3_getWordFreq.pl will get the frequency count of all the Greek and Hebrew words.
 
-6. 4_addFreqList.pl will add the new frequency count back to the lexicon files.
+7. 4_addFreqList.pl will add the new frequency count back to the lexicon files.
 
-7. 5_buildDetailLexicalTag.pl adds the detail lexical tag to the lexicon files.
+8. 5_buildDetailLexicalTag.pl adds the detail lexical tag to the lexicon files.
 
-8. The 6_createSearchRange.pl adds the search range to the lexicon files.
+9. The 6_createSearchRange.pl adds the search range to the lexicon files.
 
 When I get the lexicon and augmented_strong files from David, I would run the above steps 1-4.
 
@@ -23,5 +27,9 @@ of the augmented_strongs.txt from David might already have the LXXRefs informati
 steps 2 and 3 might not be necessary or might need to be updated. 
 
 I will then use the updated augmented_strong file and the lexicon files on the dev servers.  I will 
-then run the 5th step (3_getWordFreq.pl).  The URL used in the 3_getWordFreq script should be the URL running STEPBible with the
-updated files.
+then run the 6th step (3_getWordFreq.pl).  The URL used in the 3_getWordFreq script should be the URL
+running STEPBible with the updated files.
+
+After running 3_getWordFreq, check to see if there are any words which are not found in any Bible.  
+Those words should be deleted with 1.5_deleteExtraAugStrong and 2.5_deleteExtraLexicon.  If you need
+to do that, you need to run the process again from step 1.