From 1952a2efecf30873afa8198e8a63a6643ff5bed4 Mon Sep 17 00:00:00 2001
From: Patrick Tang <patricksptang@gmail.com>
Date: Mon, 30 Oct 2023 02:35:12 -0700
Subject: [PATCH] update scripts to update lexicon and augmented strong

---
 .../1.5_deleteExtraAugStrong.pl               |  48 +++++++
 ..._checkAugStrong.pl => 1_checkAugStrong.pl} |   0
 .../1b_checkAugStrong.pl                      | 119 -----------------
 .../2.5_deleteExtraLexicon.pl                 | 123 ++++++++++++++++++
 .../2_convertDStrongNum.pl                    |   2 -
 .../readme.txt                                |  24 ++--
 6 files changed, 187 insertions(+), 129 deletions(-)
 create mode 100644 step-tools/src/main/resources/scripts_to_prep_lexicon_augmented_strong/1.5_deleteExtraAugStrong.pl
 rename step-tools/src/main/resources/scripts_to_prep_lexicon_augmented_strong/{1a_checkAugStrong.pl => 1_checkAugStrong.pl} (100%)
 delete mode 100644 step-tools/src/main/resources/scripts_to_prep_lexicon_augmented_strong/1b_checkAugStrong.pl
 create mode 100644 step-tools/src/main/resources/scripts_to_prep_lexicon_augmented_strong/2.5_deleteExtraLexicon.pl
diff --git a/step-tools/src/main/resources/scripts_to_prep_lexicon_augmented_strong/1.5_deleteExtraAugStrong.pl b/step-tools/src/main/resources/scripts_to_prep_lexicon_augmented_strong/1.5_deleteExtraAugStrong.pl
new file mode 100644
index 0000000000..48a44ce38f
--- /dev/null
+++ b/step-tools/src/main/resources/scripts_to_prep_lexicon_augmented_strong/1.5_deleteExtraAugStrong.pl
@@ -0,0 +1,48 @@
+#!/usr/bin/perl
+
+# Copies an augmented Strong file to "<augstrongs file>.out", leaving out every
+# record whose @AugmentedStrong number appears in a skip list (one number per
+# line).  Usage: 1.5_deleteExtraAugStrong.pl <augstrongs file> <skip list file>
+
+use warnings;
+use strict;
+use List::MoreUtils qw(firstidx);  # NOTE(review): firstidx is not used below
+
+if (@ARGV != 2) {
+    print "Please provide the file name of the augstrongs file and a list of Strong to ignore as command line arguments.\n";
+    exit 1;
+}
+my ($inputFile, $skipFile) = @ARGV;
+my $outputFile = $inputFile . ".out";
+
+# Load the skip list into a hash so each lookup is an exact string comparison
+# (no regex metacharacter surprises).  Trailing whitespace, including the \r of
+# a CRLF file, is stripped and blank lines are ignored.
+my %strongToSkip;
+open (my $skipFh, '<', $skipFile) or die "Could not open input file: $skipFile: $!";
+while (my $entry = <$skipFh>) {
+    $entry =~ s/\s+$//;
+    $strongToSkip{$entry} = 1 if ($entry ne "");
+}
+close ($skipFh);
+
+open (my $inFh, '<', $inputFile) or die "Could not open input file: $inputFile: $!";
+open (my $outFh, '>', $outputFile) or die "Could not open output file: $outputFile: $!";
+
+my $skip = 0;
+while (my $line = <$inFh>) {
+    chomp($line);
+    $line =~ s/\r//;
+    # The skip decision is made on each @AugmentedStrong line and stays in
+    # effect for every following line until the next @AugmentedStrong line.
+    if ($line =~ m/^\@AugmentedStrong=\t(.*)/) {
+        my $currentStrong = $1;
+        $skip = exists $strongToSkip{$currentStrong} ? 1 : 0;
+        print "found it $currentStrong\n" if ($skip);
+    }
+    print {$outFh} $line . "\n" if (!$skip);
+}
+
+close ($inFh);
+# Buffered write errors only surface at close, so check it.
+close ($outFh) or die "Could not write output file: $outputFile: $!";
diff --git a/step-tools/src/main/resources/scripts_to_prep_lexicon_augmented_strong/1a_checkAugStrong.pl b/step-tools/src/main/resources/scripts_to_prep_lexicon_augmented_strong/1_checkAugStrong.pl
similarity index 100%
rename from step-tools/src/main/resources/scripts_to_prep_lexicon_augmented_strong/1a_checkAugStrong.pl
rename to step-tools/src/main/resources/scripts_to_prep_lexicon_augmented_strong/1_checkAugStrong.pl
diff --git a/step-tools/src/main/resources/scripts_to_prep_lexicon_augmented_strong/1b_checkAugStrong.pl b/step-tools/src/main/resources/scripts_to_prep_lexicon_augmented_strong/1b_checkAugStrong.pl
deleted file mode 100644
index 8be5ade504..0000000000
--- a/step-tools/src/main/resources/scripts_to_prep_lexicon_augmented_strong/1b_checkAugStrong.pl
+++ /dev/null
@@ -1,119 +0,0 @@
-#!/usr/bin/perl
-
-use warnings;
-use strict;
-use List::MoreUtils qw(firstidx);
-
-my $number_args = $#ARGV + 1;  
-if ($number_args != 1) {  
-    print "Please provide the file name of the augstrongs file as a command line argument.\n";  
-    exit;  
-}
-my $lastStrong = "";
-my $inputFile = $ARGV[0];  
-my %augStrong;
-my %allAugInAStrong;
-my %refs;
-open (FH, '<', $inputFile) or die "Could not open input file: $inputFile";
-while (<FH>) {
-    chomp($_);
-    $_ =~ s/\r//;
-	my $line = $_;
-	if ($line =~ m/^\@AugmentedStrong=\t/) {
-		if (!defined $augStrong{$'}) {
-			$augStrong{$'} = 1;
-            $lastStrong = $';
-		}
-		else {
-            print "$' shows up more than one time\n";
-            #exit;
-		}
-	}
-  	if ($line =~ m/^\@LXXRefs=\t/) {
-        $refs{$lastStrong} = $';
-        my $nonAugStrong = $lastStrong;
-        $nonAugStrong =~ s/.{1}$//;
-		if (!defined $allAugInAStrong{$nonAugStrong}) {
-			$allAugInAStrong{$nonAugStrong} = $lastStrong;
-		}
-		else {
-			$allAugInAStrong{$nonAugStrong} .= "," . $lastStrong;
-		}
-    }
-}
-close (FH);
-
-my $countWithSuffix = 0;
-my $countWithAddedSuffix = 0;
-my $countWithMoreThanOneAddedSuffix = 0;
-my %strongHasSuffix;
-foreach my $key (sort keys %allAugInAStrong) {
-    my %passageAlreadyProsessedForStrong;
-    my @spl = split(',', $allAugInAStrong{$key});
-    foreach my $i (@spl) {
-        my @refsToCheck = split(' ', $refs{$i});
-        my %passageAlreadyProsessed;
-        foreach my $currentRef (@refsToCheck) {
-            if (defined $passageAlreadyProsessed{$currentRef}) {
-                print "issue: $i has $currentRef more than once\n";
-                next;
-            }
-            if (defined $passageAlreadyProsessedForStrong{$currentRef}) {
-                print "issue: $key $i has $currentRef more than once\n";
-                print "Please correct issue and then re-run this script\n";
-                #exit;
-            }
-            $passageAlreadyProsessed{$currentRef} = 1;
-            $passageAlreadyProsessedForStrong{$currentRef} = 1;
-            if ($currentRef =~ /[A-Za-z]$/) {
-                $countWithSuffix ++;
-                my $noSuffix = $currentRef;
-                $strongHasSuffix{$i} = 1;
-                my $suffix = chop($noSuffix);
-                my $withDash = $noSuffix . '-';
-                foreach my $k (@spl) {
-                    my @refsToCheck2 = split(' ', $refs{$k});
-                    my $idx = firstidx { (($_ eq $noSuffix) || (index($_, $withDash) == 0)) } @refsToCheck2;
-                    if ($idx > -1) {
-#                        print "$i $k $idx $currentRef $noSuffix\n";
-                        if ($refsToCheck2[$idx] =~ /-/) {
-                            $refsToCheck2[$idx] = $refsToCheck2[$idx] . $suffix;
-                            $countWithMoreThanOneAddedSuffix ++;
-                        }
-                        else {
-                            $refsToCheck2[$idx] = $refsToCheck2[$idx] . '-' . $suffix;
-                            $countWithAddedSuffix ++;
-                        }
-                        $refs{$k} = join " ", @refsToCheck2;
-                    }
-                }
-            }
-        }
-        foreach $key (keys %passageAlreadyProsessed) {
-            if ($key =~ /[A-Za-z]$/) {
-                my $noSuffix = $key;
-                chop($noSuffix);
-                if (defined $passageAlreadyProsessed{$noSuffix}) {
-                    print "issue: $i has $key and $noSuffix.  Please fix before processing\n";
-                    exit;
-                }
-            }
-        }
-    }
-}
-my $outputFileName = $inputFile . '.out';
-open (OF, '>', $outputFileName);
-foreach my $key (sort keys %allAugInAStrong) {
-    my @spl = split(',', $allAugInAStrong{$key});
-    foreach my $l (@spl) {
-        print OF "===============================\n" .
-            "\@AugmentedStrong=\t" . $l . "\n" . 
-            "\@LXXRefs=\t" . $refs{$l} . "\n";
-    }
-}
-close(OF);
-print "Updated augmented Strong information is in new file: $outputFileName\n";
-#print "Statistics: references with suffix of an alpha character: $countWithSuffix added suffix: $countWithAddedSuffix More than one suffix: $countWithMoreThanOneAddedSuffix\n";
-#foreach my $key (sort keys %strongHasSuffix) {
-#        print "$key\n";
-#}        
diff --git a/step-tools/src/main/resources/scripts_to_prep_lexicon_augmented_strong/2.5_deleteExtraLexicon.pl b/step-tools/src/main/resources/scripts_to_prep_lexicon_augmented_strong/2.5_deleteExtraLexicon.pl
new file mode 100644
index 0000000000..e1c274dc57
--- /dev/null
+++ b/step-tools/src/main/resources/scripts_to_prep_lexicon_augmented_strong/2.5_deleteExtraLexicon.pl
@@ -0,0 +1,123 @@
+#!/usr/bin/perl
+
+# Copies a lexicon file to "<lexicon file>.out", leaving out every record whose
+# @dStrNo number appears in a skip list (one number per line).  In each record
+# that is kept, the @StrNo line is replaced by one built from the @dStrNo line.
+# Usage: 2.5_deleteExtraLexicon.pl <lexicon file> <skip list file>
+
+use warnings;
+use strict;
+
+if (@ARGV != 2) {
+    print "Please provide the file name of the lexicon file and a list of Strong to ignore as command line arguments.\n";
+    exit 1;
+}
+my ($inputFile, $skipFile) = @ARGV;
+my $outFile = $inputFile . ".out";
+
+# Load the skip list into a hash so each lookup is an exact string comparison
+# (no regex metacharacter surprises).  Trailing whitespace, including the \r of
+# a CRLF file, is stripped and blank lines are ignored, so an empty line in the
+# list can never cause a record without a @dStrNo line to be dropped.
+my %strongToSkip;
+open (my $skipFh, '<', $skipFile) or die "Could not open input file: $skipFile: $!";
+while (my $entry = <$skipFh>) {
+    $entry =~ s/\s+$//;
+    $strongToSkip{$entry} = 1 if ($entry ne "");
+}
+close ($skipFh);
+
+open (my $inFh, '<', $inputFile) or die "Could not open input file: $inputFile: $!";
+open (my $outFh, '>', $outFile) or die "Could not open output file: $outFile: $!";
+
+# Writes one buffered record to the output file unless its number is skipped.
+#   $fh          - output file handle
+#   $skipList    - reference to the hash of Strong numbers to leave out
+#   $recordLines - reference to the array of lines buffered for the record
+#   $dStrNoLine  - the record's @dStrNo line, or "" if none was seen
+#   $strNoLine   - the record's @StrNo line, or "" if none was seen
+sub flushRecord {
+    my ($fh, $skipList, $recordLines, $dStrNoLine, $strNoLine) = @_;
+    my $currentStrong = $dStrNoLine;
+    $currentStrong =~ s/^\@dStrNo=\t//;
+    if (exists $skipList->{$currentStrong}) {
+        print "found it $currentStrong\n";
+        return;
+    }
+    for my $outLine (@$recordLines) {
+        if ($outLine =~ /==============$/) {
+            # Right after the separator, emit the @StrNo line: built from the
+            # @dStrNo line when there is one, otherwise the original @StrNo line.
+            print {$fh} $outLine . "\n";
+            my $lineToPrint = $dStrNoLine;
+            if ($lineToPrint ne "") {
+                $lineToPrint =~ s/^\@dStrNo=\t/\@StrNo=\t/;
+            }
+            else { $lineToPrint = $strNoLine; }
+            print {$fh} $lineToPrint . "\n";
+        }
+        elsif ($outLine !~ /^\@StrNo=\t/) {
+            # Original @StrNo lines are dropped; every other line is copied.
+            print {$fh} $outLine . "\n";
+        }
+    }
+}
+
+my $currNum = "";     # value of a @StrNo line not yet followed by a @dStrNo line
+my @out = ();         # lines of the record currently being read
+my $foundStrNo = 0;
+my $dStrNoLine = "";
+my $strNoLine = "";
+my $lineNum = 0;
+while (my $line = <$inFh>) {
+    $lineNum ++;
+    chomp($line);
+    $line =~ s/\r//;
+    if ($line =~ /==============$/) {
+        # A separator line ends the previous record: report a missing @StrNo
+        # line, write the record out and reset the per-record state.
+        if ((!$foundStrNo) && ($lineNum > 1)) {
+            print "Did not find  StrNo around line $lineNum\n";
+        }
+        flushRecord($outFh, \%strongToSkip, \@out, $dStrNoLine, $strNoLine);
+        @out = ();
+        $foundStrNo = 0;
+        $dStrNoLine = "";
+        $strNoLine = "";
+        $currNum = "";
+    }
+    elsif ($line =~ /^\@StrNo=\t(.*)/) {
+        my $newNum = $1;
+        if ($currNum ne "") {
+            # A second @StrNo line arrived before any @dStrNo line: give up.
+            print "unmatch StrNo: " . $currNum . ", linenum: $lineNum\n";
+            exit 1;
+        }
+        $foundStrNo = 1;
+        $currNum = $newNum;
+        $strNoLine = $line;
+    }
+    elsif ($line =~ m/^\@dStrNo=\t(.*)/) {
+        my $newNum = $1;
+        my $newNumChop = $newNum;
+        if ($newNum =~ /([GH]\d+)[A-Z]$/) {
+            $newNumChop = $1;
+        }
+        $dStrNoLine = $line;
+        # Report a pending @StrNo value that is not a G/H number (optionally with
+        # an a-g suffix) and does not agree with this @dStrNo value either.
+        if (($currNum ne $newNumChop) &&
+            (uc $currNum ne uc $newNum) &&
+            ($currNum !~ /^[GH]\d+[abcdefg]$/) &&
+            ($currNum !~ /^[GH]\d+$/)) {
+                print "different num at: " . $lineNum . " " . $currNum . " " . $newNum . "\n";
+        }
+        $currNum = "";
+    }
+    push @out, $line;
+}
+close ($inFh);
+# Lines read after the last separator have not been written yet; do it here.
+flushRecord($outFh, \%strongToSkip, \@out, $dStrNoLine, $strNoLine);
+close ($outFh) or die "Could not write output file: $outFile: $!";
+print "Updated lexicon information is in: $outFile\n";
\ No newline at end of file
diff --git a/step-tools/src/main/resources/scripts_to_prep_lexicon_augmented_strong/2_convertDStrongNum.pl b/step-tools/src/main/resources/scripts_to_prep_lexicon_augmented_strong/2_convertDStrongNum.pl
index 365234f69a..56a1aeadcb 100644
--- a/step-tools/src/main/resources/scripts_to_prep_lexicon_augmented_strong/2_convertDStrongNum.pl
+++ b/step-tools/src/main/resources/scripts_to_prep_lexicon_augmented_strong/2_convertDStrongNum.pl
@@ -45,7 +45,6 @@
 					$lineToPrint = $strNoLine;
 				}
 				my $lastChar = chop($lineToPrint);
-				# $lastChar = lc $lastChar;
 				print OF $lineToPrint . $lastChar . "\n"; 
 			}
 			elsif ($outLine !~ /^\@StrNo=\t/) {
@@ -105,7 +104,6 @@
         }
         else { $lineToPrint = $strNoLine; }
         my $lastChar = chop($lineToPrint);
-        $lastChar = lc $lastChar;
         print OF $lineToPrint . $lastChar . "\n"; 
     }
     elsif ($outLine !~ /^\@StrNo=\t/) {
diff --git a/step-tools/src/main/resources/scripts_to_prep_lexicon_augmented_strong/readme.txt b/step-tools/src/main/resources/scripts_to_prep_lexicon_augmented_strong/readme.txt
index febd01af65..56703bfe25 100644
--- a/step-tools/src/main/resources/scripts_to_prep_lexicon_augmented_strong/readme.txt
+++ b/step-tools/src/main/resources/scripts_to_prep_lexicon_augmented_strong/readme.txt
@@ -1,20 +1,24 @@
-1. The 1a_checkAugStrong.pl updates the augmented_strong.txt file from David Instone-Brewer
+1. The 1_checkAugStrong.pl updates the augmented_strong.txt file from David Instone-Brewer
    to a format which can be read by the STEP Java code.
 
-2. The 1b_checkAugStrong.pl is only needed when Patrick added the LXXRefs in September 2023.
+2. 1.5_deleteExtraAugStrong.pl will delete augmented Strong numbers that do not occur in any Bible.
+   You need to run 3_getWordFreq.pl to find out which words have no occurrences.
 
 3. merge_augstrong.pl is only needed when Patrick added the LXXRefs in September 2023.
 
 4. The 2_convertDStrongNum.pl updates the lexicon file from David to a format which can be
    read by the STEP Java code.
+   
+5. 2.5_deleteExtraLexicon.pl will delete lexicon entries whose Strong numbers do not occur in any Bible.
+   You need to run 3_getWordFreq.pl to find out which words have no occurrences.
 
-5. 3_getWordFreq.pl will get the frequency count of all the Greek and Hebrew words.
+6. 3_getWordFreq.pl will get the frequency count of all the Greek and Hebrew words.
 
-6. 4_addFreqList.pl will add the new frequency count back to the lexicon files.
+7. 4_addFreqList.pl will add the new frequency count back to the lexicon files.
 
-7. 5_buildDetailLexicalTag.pl adds the detail lexical tag to the lexicon files.
+8. 5_buildDetailLexicalTag.pl adds the detail lexical tag to the lexicon files.
 
-8. The 6_createSearchRange.pl adds the search range to the lexicon files.
+9. The 6_createSearchRange.pl adds the search range to the lexicon files.
 
 When I get the lexicon and augmented_strong files from David, I would run the above steps 1-4.
 
@@ -23,5 +27,9 @@ of the augmented_strongs.txt from David might already have the LXXRefs informati
 steps 2 and 3 might not be necessary or might need to be updated. 
 
 I will then use the updated augmented_strong file and the lexicon files on the dev servers.  I will 
-then run the 5th step (3_getWordFreq.pl).  The URL used in the 3_getWordFreq script should be the URL running STEPBible with the
-updated files.
\ No newline at end of file
+then run the 6th step (3_getWordFreq.pl).  The URL used in the 3_getWordFreq script should be the URL
+running STEPBible with the updated files.
+
+After running 3_getWordFreq, check whether there are any words that do not occur in any Bible.
+Those words should be deleted with 1.5_deleteExtraAugStrong and 2.5_deleteExtraLexicon.  If you need
+to do that, you need to run the process again from step 1.
\ No newline at end of file