Skip to content


script to generate cache lexicon files
Browse files Browse the repository at this point in the history
  • Loading branch information
patricksptang committed Jan 4, 2024
1 parent ce809c1 commit 46a33bd
Show file tree
Hide file tree
Showing 3 changed files with 331 additions and 0 deletions.
Original file line number Diff line number Diff line change
@@ -0,0 +1,49 @@
#This program takes in file directories for a language lexicon, strong directory,
#and the STEP Bible website in order to output .json files of all the words to a
#specified folder for future use.

#STEPBIBLE website
stepWebsite = ""

#output folder
#requires ending /
outFolder = "jsonfiles/"

import requests
from os.path import exists
import sys
from requests.adapters import HTTPAdapter
from requests.packages.urllib3.util.retry import Retry
import json

# Script entry point. Reads the Robinson morphology CSV named on the command
# line and, for each row, requests stepWebsite + <first column> unless
# outFolder + <first column> + ".json" already exists.
# NOTE(review): this view shows no indentation and several statements appear to
# be missing (nothing follows the end-of-file check or the final `with`);
# confirm against the complete script before relying on these notes.
if __name__ == "__main__":
# Exactly one command-line argument is expected.
# NOTE(review): only a message is printed here in the visible lines; no exit is shown.
if len(sys.argv) != 2:
print("This program needs one argument (name of robinson morphology file)")
morphFile = sys.argv[1] # name of the Robinson morphology file (per the usage message above)

# HTTP session whose adapter retries connection errors (connect=3) with a
# back-off factor of 0.5. The adapter is mounted for the 'http://' prefix only.
session = requests.Session()
retry = Retry(connect=3, backoff_factor=0.5)
adapter = HTTPAdapter(max_retries=retry)
session.mount('http://', adapter)

# NOTE(review): count, words and strWords are not referenced again in the visible lines.
count = 0
words = []
strWords = []
# Read the morphology file one line at a time.
with open(morphFile,'r', encoding="utf8") as file:
while True:
line = file.readline()
# readline() returns an empty string at end of file.
# NOTE(review): the statement guarded by this check is not visible (presumably a loop exit).
if not line:
# The first comma-separated field is used as both the request suffix and the file name.
csv = line.split(",")
# Skip entries that already have a cached .json file.
if not exists(outFolder + csv[0] + ".json"):
rdata = session.get(stepWebsite + csv[0], timeout=30)
# Only responses longer than 17 characters are kept.
# NOTE(review): 17 is presumably the size of an empty/placeholder reply - confirm.
if (len(rdata.text) > 17):
# print(outFolder + csv[0] + ".json",rdata.text)
# NOTE(review): the body of this `with` (the actual write) is not visible in this view.
with open(outFolder + csv[0] + ".json", 'w', encoding="utf8") as outfile:

Original file line number Diff line number Diff line change
@@ -0,0 +1,274 @@
#This program takes in file directories for a language lexicon, strong directory,
#and the STEP Bible website in order to output .json files of all the words to a
#specified folder for future use.

#STEPBIBLE website
stepWebsite = ""
#output folder
#requires ending /
outFolder = "jsonfiles/"

import requests
from tqdm import tqdm
import time
from os.path import exists
import sys
from requests.adapters import HTTPAdapter
from requests.packages.urllib3.util.retry import Retry
import json

# Field names of a "vocabInfos" entry. The position of a name in this list is
# the position its value takes in the flattened list built by shortenKey.
# NOTE(review): the closing bracket was missing in the text this was recovered
# from; confirm that no further keys were lost along with it.
vocabKeys = [
    "strongNumber", "stepGloss", "stepTransliteration", "count",
    "_es_Gloss", "_zh_Gloss", "_zh_tw_Gloss",
    "shortDef", "mediumDef", "lsjDefs",
    "_es_Definition", "_vi_Definition", "_zh_Definition", "_zh_tw_Definition",
    "accentedUnicode", "rawRelatedNumbers", "relatedNos",
    "_stepDetailLexicalTag", "_step_Link", "_step_Type", "_searchResultRange",
    "freqList", "defaultDStrong",
]

# Field names of one "relatedNos" entry, in the order used for its flattened row.
relatedKeys = [
    "strongNumber", "gloss", "_es_Gloss", "_zh_Gloss", "_zh_tw_Gloss",
    "stepTransliteration", "matchingForm", "_searchResultRange",
]

def checkDupStrings(currentValue, strings):
    """Return the index of *currentValue* in *strings*, or the value itself.

    Args:
        currentValue: The value to look up (any type comparable with ``==``).
        strings: List of values that have been assigned an index.

    Returns:
        The position of the first element of *strings* equal to
        *currentValue*; if there is none, *currentValue* unchanged.
    """
    # A single index() call replaces the `in` test followed by index(),
    # so the list is scanned once instead of twice.
    try:
        return strings.index(currentValue)
    except ValueError:
        return currentValue

# Update stringsFreq (a dict mapping value -> occurrence count) with the values
# found in the first entry of orig["vocabInfos"], and return the same dict.
# Values that are ints or empty strings are tested for separately; entries of
# the "relatedNos" field are walked key by key.
# NOTE(review): indentation and some statements (likely `continue` / `else:`
# lines) are missing from this view, so the exact branch structure cannot be
# confirmed here; check the complete file.
def countDupStrings(orig, stringsFreq):
vocabs = orig["vocabInfos"]
# Exactly one vocabInfos entry is expected; other lengths are only reported.
if len(vocabs) != 1:
print("vocab length is not one:", len(vocabs))
# Only the first entry is examined.
for key in vocabs[0].keys():
currentValue = vocabs[0][key]
# NOTE(review): the statement guarded by this test is not visible (presumably skips the value).
if isinstance(currentValue, int) or currentValue == "":
# "relatedNos" holds a list of dicts whose keys should all be in relatedKeys.
if key == "relatedNos":
for relatedNumEntry in currentValue:
for key2 in relatedNumEntry:
# Unknown keys are reported but still processed.
if key2 not in relatedKeys:
print("Key not found", key2)
# Count every non-int, non-empty value of the related entry.
if not isinstance(relatedNumEntry[key2], int) and relatedNumEntry[key2] != "":
if relatedNumEntry[key2] in stringsFreq.keys():
stringsFreq[relatedNumEntry[key2]] = stringsFreq[relatedNumEntry[key2]] + 1
# NOTE(review): an `else:` appears to be missing before the next line.
stringsFreq[relatedNumEntry[key2]] = 1
# Count the top-level value itself.
if currentValue in stringsFreq.keys():
stringsFreq[currentValue] += 1
# NOTE(review): an `else:` appears to be missing before the next line.
stringsFreq[currentValue] = 1
return stringsFreq

# Build and return the list `strings` from stringsFreq (value -> count).
# Distinct counts are visited from highest to lowest and only counts greater
# than 1 are considered.
# NOTE(review): indentation is lost and the body of the last `if` is not
# visible in this view (presumably it appends `key` to `strings`) - confirm.
def buildDupStrings(stringsFreq):
strings = []
# Distinct frequency values, largest first.
uniqueFreq = (list(set(stringsFreq.values())))
sorted_values = sorted(uniqueFreq, reverse=True)
for i in sorted_values:
if i > 1:
for key, value in stringsFreq.items():
if i == value and key not in strings:
# Number of digits of the index the key would receive (the current list length).
numOfCharInIndex = len(str(len(strings)))
numOfCharInString = len(key) + 2 # add 2 because there is a begin and end quote
# Compare the quoted string's length with the length of its index.
if numOfCharInString > numOfCharInIndex:
return strings

# Flatten the first entry of orig["vocabInfos"] into a list positioned by
# vocabKeys, passing values through checkDupStrings so that values present in
# `strings` are replaced by their index.
# Parameters:
#   orig                - dict with a "vocabInfos" list (one entry expected)
#   relatedNums         - list of related-number rows; the first element of a
#                         row is a strong number or an int index into `strings`
#   strings             - list of duplicated values (see checkDupStrings)
#   defaultAugStrong    - dict: strong number minus its last character -> strong
#                         number; a match adds "*" to the "defaultDStrong" slot
#   lxxDefaultAugStrong - dict of the same shape; a match adds "L"
# Returns {"vocabInfo": <flattened list>, "relatedNums": relatedNums}.
# NOTE(review): indentation and some statements are missing from this view
# (e.g. nothing visible stores currentRelatedNum or uses `index`); confirm
# against the complete file.
def shortenKey(orig, relatedNums, strings, defaultAugStrong, lxxDefaultAugStrong):
vocabs = orig["vocabInfos"]
# Exactly one vocabInfos entry is expected; other lengths are only reported.
if len(vocabs) != 1:
print("vocab length is not one:", len(vocabs))
# One slot per known key, defaulting to the empty string.
vocabResult = [""] * len(vocabKeys)
for key in vocabs[0].keys():
# NOTE(review): list.index raises ValueError for a missing item and never
# returns -1, so the check below cannot fire; an unknown key raises instead.
key1Index = vocabKeys.index(key)
if key1Index == -1:
print("Key not found", key)
currentValue = vocabs[0][key]
if key == "strongNumber":
# A strong number whose last character is not numeric is handled as an augmented one.
if not currentValue[-1].isnumeric():
vocabResult[vocabKeys.index("defaultDStrong")] = ""
currentStrongWithoutAugment = currentValue[:-1]
if currentStrongWithoutAugment in defaultAugStrong and defaultAugStrong[currentStrongWithoutAugment] == currentValue:
vocabResult[vocabKeys.index("defaultDStrong")] += "*"
if currentStrongWithoutAugment in lxxDefaultAugStrong and lxxDefaultAugStrong[currentStrongWithoutAugment] == currentValue:
vocabResult[vocabKeys.index("defaultDStrong")] += "L"
if key == "relatedNos":
currentValue = []
for relatedNumEntry in vocabs[0][key]:
# Look for this related strong number among the rows already collected.
found = False
index = 0
for existingRelatedNum in relatedNums:
### This is not working
checkStrongNum = existingRelatedNum[0]
if isinstance(checkStrongNum, int): # in duplicate string array
checkStrongNum = strings[checkStrongNum]
if checkStrongNum == relatedNumEntry["strongNumber"]:
found = True
index += 1
if not found:
# Build a new row positioned by relatedKeys.
currentRelatedNum = [""] * len(relatedKeys)
for key2 in relatedNumEntry:
# NOTE(review): as above, index() raises ValueError rather than returning -1.
key2Index = relatedKeys.index(key2)
if key2Index == -1:
print("Key not found", key2)
currentRelatedNum[key2Index] = checkDupStrings(relatedNumEntry[key2], strings)
if key == "_stepDetailLexicalTag":
# The field is a JSON string; each decoded item contributes its first seven elements.
detailLexArray = []
for detailLexicalTags in json.loads(vocabs[0][key]):
detailLexArray.append([detailLexicalTags[0], detailLexicalTags[1], detailLexicalTags[2], detailLexicalTags[3], detailLexicalTags[4], detailLexicalTags[5], detailLexicalTags[6]])
currentValue = detailLexArray
vocabResult[key1Index] = checkDupStrings(currentValue, strings)

return { "vocabInfo": vocabResult,
"relatedNums": relatedNums }

# Script entry point. Takes a lexicon file and an augmented Strong file, reads
# strong numbers from them, then requests stepWebsite + <strong number> for
# each word and writes compacted JSON files under outFolder, skipping files
# that already exist.
# NOTE(review): this view shows no indentation and many statements appear to be
# missing (loop exits, list appends, one file write); the notes below describe
# only what is visible - confirm against the complete script.
if __name__ == "__main__":
# Exactly two command-line arguments are expected.
# NOTE(review): only a message is printed here in the visible lines; no exit is shown.
if len(sys.argv) != 3:
print("This program needs two arguments (name of lexicon file and name of augmented strong file)")
lexiconFile = sys.argv[1] # name of lexicon file
augStrongFile = sys.argv[2] # name of augmented Strong file

# HTTP session whose adapter retries connection errors (connect=3) with a
# back-off factor of 0.5. The adapter is mounted for the 'http://' prefix only.
session = requests.Session()
retry = Retry(connect=3, backoff_factor=0.5)
adapter = HTTPAdapter(max_retries=retry)
session.mount('http://', adapter)

starttime = time.perf_counter()
count = 0
strongNumbers = []
augStrongNumbers = []
#read in all strong numbers
with open(lexiconFile,'r', encoding="utf8") as file:
while True:
line = file.readline()
# readline() returns an empty string at end of file.
# NOTE(review): the statement guarded by this check is not visible (presumably a loop exit).
if not line:
# Lines starting with "@StrNo=" and a tab carry the strong number after that prefix.
if line[0:8] == "@StrNo=\t":
currentStrong = line[8:].strip('\n')
# Strong numbers ending in a digit are counted.
# NOTE(review): the appends to strongNumbers / augStrongNumbers are not visible.
if currentStrong[-1].isnumeric():
count += 1

#group strong words [[a,b,c],[a,b],[g,h,i],[g,h,i,j]]
mainStrWords = []
temp = []
# The following code will create an array like this:
# ['H8649A', 'H8649B'], ['H8656G', 'H8656H'], ['H8659G', 'H8659H', 'H8659I'], ['H8668G', 'H8668H'], ...
# Neighbouring entries are compared with their last character removed.
# NOTE(review): the statements that fill `temp` and `mainStrWords` are not visible.
for n in range(len(augStrongNumbers)-1):
buffer = augStrongNumbers[n]
if augStrongNumbers[n+1][:-1] == buffer[:-1]:
temp = []

print("Total words: " , count + len(mainStrWords))
print("Single def words: ", count)
print("Mult def words: ", len(mainStrWords))

augstr = []
defaultAugStrong = {}
lxxDefaultAugStrong = {}
#get references
#form linked list with words
lastAugStr = ""
with open(augStrongFile, 'r', encoding="utf8") as file:
while True:
line = file.readline()
# NOTE(review): the statement guarded by this check is not visible (presumably a loop exit).
if not line:
# A line containing "@A" supplies the current augmented strong number (its last 7 characters).
if "@A" in line:
currentAugStr = line[-7:].strip('\n')
# A "@R" line whose text from column 13 on is "*" records the current number
# under its base (last character removed) in defaultAugStrong.
if "@R" in line and line[13:].strip('\n') == "*":
defaultAugStrong[currentAugStr[:-1]] = currentAugStr
# Likewise for "@L" lines (text from column 10 on) and lxxDefaultAugStrong.
if "@L" in line and line[10:].strip('\n') == "*":
lxxDefaultAugStrong[currentAugStr[:-1]] = currentAugStr

print("\nPacking Single Definiton Words:")
#package single definition words
for n in tqdm(range(len(strongNumbers))):
#get data
# Skip words that already have a cached .json file.
if not exists(outFolder + strongNumbers[n] + ".json"):
rdata = session.get(stepWebsite + strongNumbers[n], timeout=30)
tmp = rdata.json()
#print("word",strongNumbers[n], rdata.text)
#vocabInfo, relatedNums, strings =
if len(tmp['vocabInfos']) != 1:
print("wrong len of vocabInfos, should be 1, but got:", len(tmp['vocabInfos']), strongNumbers[n])
# Duplicate-string table for this single response.
strings = buildDupStrings(countDupStrings(tmp, {}))
vocabInfos = []
relatedNums = []
r = shortenKey(tmp, relatedNums, strings, defaultAugStrong, lxxDefaultAugStrong)
# Output keys: "v" = vocab infos, "d" = duplicate strings, "r" = related numbers.
# NOTE(review): nothing visible adds r["vocabInfo"] to vocabInfos before this line.
jsonOutput = {"v": vocabInfos}
if (len(strings)) > 0:
jsonOutput["d"] = strings
if len(r["relatedNums"]) > 0:
jsonOutput["r"] = r["relatedNums"]
outResult = json.dumps(jsonOutput, ensure_ascii=False, sort_keys=True, separators=(',', ':')) #, indent=4)
#write to json file
# NOTE(review): the body of this `with` (the write of outResult) is not visible in this view.
with open(outFolder + strongNumbers[n] + ".json", 'w', encoding = "utf8") as file:

#package multidefinition words
#1. group augmented strong with references
#2. group vocabInfos
#3. group augmented tag + vocabInfos
#4. write to .json
print( "\nPackaging Multi-Definition Words")
for n in tqdm(range(len(mainStrWords))):
#adds a dictionary of {strong: number, references: text} to the end of the main array
# The output file is named after the group's base strong number (last character removed).
if not exists(outFolder + mainStrWords[n][0][:-1] + ".json"):
mainarr = []
if len(mainStrWords[n]) > 1:
for l in range(len(mainStrWords[n])):
curAugStr = mainStrWords[n][l]
# NOTE(review): nothing visible ever adds to augstr, so as shown this always reports.
if curAugStr not in augstr:
print("something wrong, in lexicon, not in augstrong", l, mainStrWords[n])

vocabInfos = []
relatedNums = []
stringsFreq = {}
jsonFromServer = []
# First pass: fetch every word of the group and accumulate string frequencies.
# NOTE(review): nothing visible appends tmp to jsonFromServer.
for word in mainStrWords[n]:
rdata = session.get(stepWebsite + word, timeout=30)
tmp = rdata.json()
stringsFreq = countDupStrings(tmp, stringsFreq)
strings = buildDupStrings(stringsFreq)
# Second pass: flatten each response against the shared duplicate-string table.
for tmp in jsonFromServer:
r = shortenKey(tmp, relatedNums, strings, defaultAugStrong, lxxDefaultAugStrong)
relatedNums = r["relatedNums"]
jsonOutput = {"v": vocabInfos}
if len(strings) > 0:
jsonOutput["d"] = strings
if len(relatedNums) > 0:
jsonOutput["r"] = relatedNums

# write to json file
with open(outFolder + mainStrWords[n][0][:-1] + ".json", 'w', encoding="utf8") as file:
file.write( json.dumps(jsonOutput, ensure_ascii=False, sort_keys=True, separators=(',', ':'))) #, indent=4) )

finishtime = time.perf_counter()
print(f"\nFinished in {round((finishtime - starttime)/60, 2)} minute(s)")
Original file line number Diff line number Diff line change
@@ -0,0 +1,8 @@
# Extract the lexicon, augmented Strong and morphology data files from the
# step-core-data jar, then run the Python cache generators on them.

# Remove any previous extraction so the files below are recreated.
rm -rf com
# Extract the named entry from the jar supplied on standard input
# (no `f` option is given, so jar reads the archive from stdin).
jar -x com/tyndalehouse/step/core/data/create/lexicon/lexicon_greek.txt < /var/lib/tomcat9/webapps/step-web/WEB-INF/lib/step-core-data-24.1.2.jar
jar -x com/tyndalehouse/step/core/data/create/lexicon/lexicon_hebrew.txt < /var/lib/tomcat9/webapps/step-web/WEB-INF/lib/step-core-data-24.1.2.jar
jar -x com/tyndalehouse/step/core/data/create/augmentedStrongs/augmented_strongs.txt < /var/lib/tomcat9/webapps/step-web/WEB-INF/lib/step-core-data-24.1.2.jar
jar -x com/tyndalehouse/step/core/data/create/morphology/robinson_morphology.csv < /var/lib/tomcat9/webapps/step-web/WEB-INF/lib/step-core-data-24.1.2.jar
# NOTE(review): in the three lines below python3 receives a data file as its
# first argument, so it would try to run that file as the script. A script name
# appears to be missing before the data-file arguments (one argument for the
# morphology run, two for each lexicon run) - confirm the intended script names.
python3 com/tyndalehouse/step/core/data/create/morphology/robinson_morphology.csv
python3 com/tyndalehouse/step/core/data/create/lexicon/lexicon_greek.txt com/tyndalehouse/step/core/data/create/augmentedStrongs/augmented_strongs.txt
python3 com/tyndalehouse/step/core/data/create/lexicon/lexicon_hebrew.txt com/tyndalehouse/step/core/data/create/augmentedStrongs/augmented_strongs.txt

0 comments on commit 46a33bd

Please sign in to comment.