Skip to content
This repository has been archived by the owner on Jan 24, 2024. It is now read-only.

Commit

Permalink
logging? pls?
Browse files Browse the repository at this point in the history
  • Loading branch information
natsukashiixo committed Oct 5, 2023
1 parent aee1794 commit 1edfeb6
Show file tree
Hide file tree
Showing 14 changed files with 622 additions and 495 deletions.
2 changes: 1 addition & 1 deletion app/src/auto_specs.spec
Original file line number Diff line number Diff line change
Expand Up @@ -9,7 +9,7 @@ a = Analysis(
pathex=['./', './app/src/modules'],
binaries=[],
datas=[],
hiddenimports = ['app.src.modules.functions_ui', 'app.src.modules.delete_files', 'app.src.modules.rewrite_docx', 'app.src.modules.fix_mistakes', 'app.src.modules.hocr_parser', 'app.src.modules.run_tesseract', 'app.src.modules.rotate_and_split_image', 'app.src.modules.doublepage_img_rename', 'app.src.modules.singlepage_img_rename', 'app.src.modules.verify_folders', 'app.src.modules.setup_functions', 'app.src.modules.is_image'],
hiddenimports = ['app.src.modules.functions_ui', 'app.src.modules.delete_files', 'app.src.modules.rewrite_docx', 'app.src.modules.fix_mistakes', 'app.src.modules.hocr_parser', 'app.src.modules.run_tesseract', 'app.src.modules.rotate_and_split_image', 'app.src.modules.doublepage_img_rename', 'app.src.modules.singlepage_img_rename', 'app.src.modules.verify_folders', 'app.src.modules.setup_functions', 'app.src.modules.is_image', 'app.src.modules.logger_mod'],
hookspath=[],
hooksconfig={},
runtime_hooks=[],
Expand Down
45 changes: 25 additions & 20 deletions app/src/modules/delete_files.py
Original file line number Diff line number Diff line change
@@ -1,32 +1,37 @@
import os
from pathlib import Path
from app.src.modules.is_image import is_image as IsImage
from app.src.modules.logger_mod import write_log as WriteLog

# Default arguments for delete_irrelevant_files().
# Folder that is walked recursively when looking for files to delete.
rootfolder = './'
# Intermediate text output that is removed before the walk starts.
txt_file = Path('./TesseractOutput/less_mistakes.txt')
# Intermediate docx output that is removed before the walk starts.
docx_file = Path('./TesseractOutput/to_be_parsed.docx')

def delete_irrelevant_files(rootfolder=rootfolder, txt_file=txt_file, docx_file=docx_file):
    """Remove the intermediate outputs plus every image and ``.xml`` file under *rootfolder*.

    Args:
        rootfolder: Directory walked recursively with ``os.walk``.
        txt_file: ``Path`` of the text output to delete if it exists.
        docx_file: ``Path`` of the docx output to delete if it exists.

    Sub-folders whose name starts with ``.``, ``assets`` or ``__`` are not
    descended into, and files whose name starts with ``.`` are ignored.

    Any exception is handed to ``WriteLog`` instead of propagating; in that
    case the remaining files are left untouched and the success message is
    not printed.
    """
    # Built once instead of once per sub-folder inside the walk.
    excluded_prefixes = ('.', 'assets', '__')
    try:
        for leftover in (txt_file, docx_file):
            if leftover.is_file():
                leftover.unlink(missing_ok=True)

        # Collect first, delete afterwards, so the walk never sees a
        # directory that is changing underneath it.
        visible_files = []
        for foldername, subfolders, filenames in os.walk(rootfolder):
            # In-place slice assignment prunes the walk itself.
            subfolders[:] = [
                subfolder for subfolder in subfolders
                if not subfolder.startswith(excluded_prefixes)
            ]
            visible_files.extend(
                Path(foldername) / filename
                for filename in filenames
                if not filename.startswith('.')
            )

        for file in visible_files:
            # A single combined test: a file that satisfies both criteria
            # must only be unlinked once, otherwise the second unlink raises
            # FileNotFoundError and aborts the rest of the cleanup.
            if IsImage(file) or str(file).endswith('.xml'):
                file.unlink()

        print('Images and hOCR data deleted.')
    except Exception as e:
        WriteLog(e)


# Script entry point: run the cleanup with the module-level defaults.
if __name__ == "__main__":
    delete_irrelevant_files(rootfolder, txt_file, docx_file)
107 changes: 56 additions & 51 deletions app/src/modules/doublepage_img_rename.py
Original file line number Diff line number Diff line change
@@ -1,68 +1,73 @@
from pathlib import Path
from app.src.modules.is_image import is_image as IsImage
import re
from natsort import os_sorted
import os
import shutil
from app.src.modules.is_image import is_image as IsImage
from app.src.modules.functions_ui import ProgressCounter
import re
from natsort import os_sorted
from app.src.modules.logger_mod import write_log as WriteLog

# Default arguments for double_rename().
# Folder that is searched recursively for the images to rename.
rootfolder = './ImportFolder'
# Folder the renamed images end up in.
destination = Path('./SplitterInput/')

def double_rename(rootfolder=rootfolder, destination=destination):
    """Copy every image under *rootfolder* into *destination* with a sequential name.

    The images are ordered with ``os_sorted`` when every path found contains
    a digit, otherwise by ``os.path.getctime``. They are then copied as
    ``0001<ext>``, ``0002<ext>``, ... The originals are left in place.

    Args:
        rootfolder: Folder searched recursively via ``rglob('*.*')``.
        destination: Folder that receives the renamed copies; it is created
            when missing.

    A failure on a single file is printed and that file is skipped. Any
    other exception is handed to ``WriteLog`` instead of propagating.
    """
    try:
        allfiles = list(Path(rootfolder).rglob('*.*'))
        has_numbers = re.compile(r'[0-9]')

        if all(has_numbers.search(str(file)) for file in allfiles):
            print('All images are numbered, using Windows sorting')
            image_list = os_sorted([file for file in allfiles if IsImage(file)])
        else:
            print("Images aren't ordered, ordering based on file creation date")
            timed_images = []
            for file in allfiles:
                # Handled per file so one unreadable entry does not drop
                # every image that comes after it.
                try:
                    if IsImage(file):
                        timed_images.append((os.path.getctime(file), file))
                    else:
                        print(file, "is not an image")
                except Exception as error:
                    print('An exception occurred while processing {}: {}'.format(file, error))
            # Stable sort on the timestamp only, read once per file.
            timed_images.sort(key=lambda pair: pair[0])
            image_list = [file for _, file in timed_images]

        target_dir = Path(destination)
        target_dir.mkdir(parents=True, exist_ok=True)

        renamed_files = []
        counter = 1
        with ProgressCounter(len(image_list)) as progress:
            for file in image_list:
                try:
                    ext = os.path.splitext(file)[1]
                    # Copy straight into the destination. Staging the copy
                    # inside rootfolder could overwrite an original that is
                    # already called e.g. 0001.jpg.
                    new_file = target_dir / f"{counter:04d}{ext}"
                    if new_file.exists():
                        # The number is taken: skip it, never overwrite.
                        counter += 1
                        raise FileExistsError(f"{new_file} already exists")
                    shutil.copy(file, new_file)
                    counter += 1
                    renamed_files.append(new_file)
                    progress.update_progress()
                except Exception as error:
                    # Exception rather than BaseException, so Ctrl+C and
                    # SystemExit still stop the run.
                    print('An exception occurred while processing {}: {}'.format(file, error))
            progress.finalize()

        print(f'{len(renamed_files)} images renamed and moved')
    except Exception as e:
        WriteLog(e)


# Script entry point: run the rename with the module-level defaults.
if __name__ == "__main__":
    double_rename(rootfolder, destination)
107 changes: 56 additions & 51 deletions app/src/modules/fix_mistakes.py
Original file line number Diff line number Diff line change
@@ -1,74 +1,79 @@
import docx2txt
from pathlib import Path
import re
from app.src.modules.logger_mod import write_log as WriteLog

# Default arguments for regex_corrector().
# Docx file whose text is extracted and corrected.
input_file = Path('./TesseractOutput/to_be_parsed.docx')
# Text file the corrected text is written to.
output_file = Path('./TesseractOutput/less_mistakes.txt')

def regex_corrector(input_file=input_file, output_file=output_file):
    """Extract the text of *input_file*, clean it up and write it to *output_file*.

    Three corrections are applied:

    * a hyphen followed by a space directly after a lowercase letter is removed;
    * a line holding only a one- to three-digit number and a trailing space
      is removed together with its surrounding newlines;
    * ``è`` / ``È`` are replaced by ``e`` / ``E``.

    Args:
        input_file: Docx file read with ``docx2txt.process``.
        output_file: Path of the text file to (over)write.

    Prints the number of corrections made. Any exception is handed to
    ``WriteLog`` instead of propagating.
    """
    try:
        print('Starting regex correction of hOCR data')
        # The look-behind keeps the letter and drops only "- ", so a dash
        # that is not preceded by a lowercase letter is left alone.
        linebreak = re.compile(r'(?<=[a-z])- ')
        pagenumber = re.compile(r'\n\d{1,3} \n')
        accent_map = str.maketrans({'è': 'e', 'È': 'E'})

        text = docx2txt.process(input_file)

        # None of the patterns can span a '.', so the whole text is
        # processed at once. This also avoids appending an extra '.' to
        # the output, which splitting and re-joining on '.' did.
        text, joined_words = linebreak.subn('', text)
        text, page_numbers = pagenumber.subn('', text)
        accents = text.count('è') + text.count('È')
        text = text.translate(accent_map)

        # Every replaced occurrence counts as one mistake.
        mistakes_no = joined_words + page_numbers + accents

        # NOTE(review): no encoding is given, so the platform default is
        # used - confirm what the consumers of this file expect.
        with open(output_file, 'wt') as f:
            f.write(text)

        print(f'{mistakes_no} mistakes found and corrected')
    except Exception as e:
        WriteLog(e)


# Script entry point: run the correction with the module-level defaults.
if __name__ == "__main__":
    regex_corrector(input_file, output_file)
41 changes: 27 additions & 14 deletions app/src/modules/functions_ui.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,23 +2,32 @@
from typing import Union
import os
import math
from app.src.modules.logger_mod import write_log as WriteLog

def clicked():
    """Print the literal text ``buttonclick``.

    Presumably a placeholder button handler - confirm against the UI code.
    """
    print("buttonclick")

def open_folder(path: Union[str, os.PathLike]):
    """Open *path* with ``os.startfile`` after resolving it with ``os.path.realpath``.

    Args:
        path: Folder (or file) to open.

    Any exception is handed to ``WriteLog`` instead of propagating. That
    includes the ``AttributeError`` raised on platforms where
    ``os.startfile`` does not exist (it is Windows-only).
    """
    try:
        os.startfile(os.path.realpath(path))
    except Exception as e:
        WriteLog(e)


def exit_button():
    """Do nothing; placeholder for an exit handler."""
    # sys.exit(app.exec_()) probably needs to be in the actual UI file.
    return None

def round_seconds(seconds):
    """Split a duration in seconds into whole hours, minutes and seconds.

    Args:
        seconds: Anything ``int()`` accepts; fractions are truncated.

    Returns:
        A ``(hours, minutes, seconds)`` tuple of ints. When the conversion
        fails the exception is handed to ``WriteLog`` and ``None`` is
        returned implicitly.
    """
    try:
        whole_seconds = int(seconds)
        hours, remainder = divmod(whole_seconds, 3600)
        minutes, secs = divmod(remainder, 60)
        return hours, minutes, secs
    except Exception as e:
        WriteLog(e)


class ProgressCounter:
def __init__(self, total):
Expand All @@ -33,14 +42,18 @@ def __exit__(self, exc_type, exc_val, exc_tb):
pass

def update_progress(self):
self.counter += 1
percent_done = (self.counter / self.total) * 100
floored_percent = math.floor(percent_done)
if self.total < 100:
print(f"{percent_done:.2f}% done")
elif floored_percent%5 == 0 and floored_percent != self.previous_percent:
self.previous_percent = floored_percent
print(f"{percent_done:.2f}% done")
try:
self.counter += 1
percent_done = (self.counter / self.total) * 100
floored_percent = math.floor(percent_done)
if self.total < 100:
print(f"{percent_done:.2f}% done")
elif floored_percent%5 == 0 and floored_percent != self.previous_percent:
self.previous_percent = floored_percent
print(f"{percent_done:.2f}% done")
except Exception as e:
WriteLog(e)


def finalize(self):
print('Operation completed')
Expand Down
Loading

0 comments on commit 1edfeb6

Please sign in to comment.