This repository has been archived by the owner on Jan 24, 2024. It is now read-only.
Commit 1edfeb6 (1 parent: aee1794). This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
Showing 14 changed files with 622 additions and 495 deletions.
@@ -1,32 +1,37 @@ (updated delete_irrelevant_files module)

import os
from pathlib import Path
from app.src.modules.is_image import is_image as IsImage
from app.src.modules.logger_mod import write_log as WriteLog


rootfolder = './'
txt_file = Path('./TesseractOutput/less_mistakes.txt')
docx_file = Path('./TesseractOutput/to_be_parsed.docx')


def delete_irrelevant_files(rootfolder=rootfolder, txt_file=txt_file, docx_file=docx_file):
    try:
        if txt_file.is_file():
            txt_file.unlink(missing_ok=True)
        if docx_file.is_file():
            docx_file.unlink(missing_ok=True)
        # Create a list of all visible files
        allfiles = []
        excluded_folders = {'.', 'assets', '__'}
        for foldername, subfolders, filenames in os.walk(rootfolder):
            subfolders[:] = [subfolder for subfolder in subfolders if not subfolder.startswith(tuple(excluded_folders))]
            for filename in filenames:
                #print(f'cwd is {foldername}') # debug statement
                if not filename.startswith('.'):
                    allfiles.append(Path(foldername) / filename)
        for file in allfiles:
            if IsImage(file):
                Path.unlink(file)
            if str(file).endswith('.xml'):
                Path.unlink(file)

        print('Images and hOCR data deleted.')
    except Exception as e:
        WriteLog(e)


if __name__ == "__main__":
    delete_irrelevant_files(rootfolder, txt_file, docx_file)
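The write_log helper imported above (app.src.modules.logger_mod) is not included in this diff, so its implementation is an assumption; a minimal sketch matching the call sites, where WriteLog(e) receives an exception object, could be a thin wrapper around the standard logging module:

import logging

logging.basicConfig(
    filename='app.log',  # assumed log location, not confirmed by the diff
    level=logging.ERROR,
    format='%(asctime)s %(levelname)s %(message)s',
)

def write_log(error):
    # Record the exception type and message so a failed run leaves a trace.
    logging.error('%s: %s', type(error).__name__, error)

if __name__ == '__main__':
    try:
        1 / 0
    except Exception as e:
        write_log(e)  # mirrors how delete_irrelevant_files calls WriteLog(e)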
@@ -1,68 +1,73 @@ (updated double_rename module)

from pathlib import Path
import os
import shutil
from app.src.modules.is_image import is_image as IsImage
from app.src.modules.functions_ui import ProgressCounter
import re
from natsort import os_sorted
from app.src.modules.logger_mod import write_log as WriteLog


rootfolder = './ImportFolder'
destination = Path('./SplitterInput/')


def double_rename(rootfolder=rootfolder, destination=destination):
    try:
        allfiles = list(Path(rootfolder).rglob('*.*'))
        ImageList = []
        RenamedFiles = []
        Counter = '0001'
        has_numbers = re.compile(r'[0-9]')

        if all(has_numbers.search(str(file)) for file in allfiles):
            print('All images are numbered, using Windows sorting')
            ImageList = [file for file in allfiles if IsImage(file)]
            ImageList = os_sorted(ImageList)

        else:
            print("Images aren't ordered, ordering based on file creation date")
            try:
                for file in allfiles:
                    if IsImage(file):
                        os.path.getctime(file)
                        #print(file, 'was created at:', os.path.getctime(file), 'in UNIX time') # Read file creation time for each image file using os.path.getctime
                        ImageList.append(file)
                    else:
                        print(file, "is not an image")
            except BaseException as error:
                print('An exception occurred while processing {}: {}'.format(file, error))
            ImageList = sorted(ImageList, key=os.path.getctime)

        # Rename each file sequentially using the order found in ImageList, incrementing Counter by 1 with each file read

        files = list(filter(IsImage, allfiles))

        with ProgressCounter(len(files)) as progress:
            for file in ImageList:
                try:
                    #print(file, os.path.getctime(file)) #Debug statement
                    ext = os.path.splitext(file)[1]
                    NewFile = os.path.join(rootfolder, f"{Counter}{ext}")
                    #print(file, 'saved as:', NewFile)
                    shutil.copy(file, NewFile) #Creates a copy of the original file with a new name and metadata
                    Counter = '{:04d}'.format(int(Counter) + 1)
                    RenamedFiles.append(NewFile)
                    progress.update_progress()
                except BaseException as error:
                    print('An exception occurred while processing {}: {}'.format(file, error))
            progress.finalize()

        # Move processed files to the image splitting folder
        for file in RenamedFiles:
            try:
                #print(RenamedFiles) #Debug to make sure the right file was added to this list
                #print(file, 'saved to:', destination)
                shutil.move(file, destination)
            except BaseException as error:
                print('An exception occurred while processing {}: {}'.format(file, error))
        print(f'{len(RenamedFiles)} images renamed and moved')
    except Exception as e:
        WriteLog(e)


if __name__ == "__main__":
    double_rename(rootfolder, destination)
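ProgressCounter comes from app.src.modules.functions_ui, which this commit does not touch, so the class below is only a guess at its behaviour: double_rename uses it as a context manager, calling update_progress() once per file and finalize() after the loop. A stand-in with that shape might look like:

class ProgressCounter:
    # Prints a simple 'done/total' counter; purely illustrative.
    def __init__(self, total):
        self.total = total
        self.done = 0

    def __enter__(self):
        return self

    def __exit__(self, exc_type, exc_value, traceback):
        return False  # never swallow exceptions raised inside the with-block

    def update_progress(self):
        self.done += 1
        print(f'\rProcessed {self.done}/{self.total} files', end='', flush=True)

    def finalize(self):
        print()  # finish the counter line once the loop is done

if __name__ == '__main__':
    with ProgressCounter(3) as progress:
        for _ in range(3):
            progress.update_progress()
        progress.finalize()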
@@ -1,74 +1,79 @@ (updated regex_corrector module)

import docx2txt
from pathlib import Path
import re
from app.src.modules.logger_mod import write_log as WriteLog


input_file = Path('./TesseractOutput/to_be_parsed.docx')
output_file = Path('./TesseractOutput/less_mistakes.txt')


def regex_corrector(input_file=input_file, output_file=output_file):
    try:
        print('Starting regex correction of hOCR data')
        linebreak = re.compile('[a-z]- ')
        pagenumber = re.compile('(\n\d \n)|(\n\d\d \n)|(\n\d\d\d \n)')
        stupid_french_e_l = re.compile('è')
        stupid_french_e_u = re.compile('È')

        text = docx2txt.process(input_file)

        mistakes_no = 0

        updated_text = ""

        with open(output_file, 'wt') as f:
            f.write(text)

        with open(output_file, 'rt') as f: #Running a loop to count the number of mistakes because I'm inefficient
            text = f.read()
            for sentence in re.split('[.]', text):
                lm = linebreak.search(sentence)
                pm = pagenumber.search(sentence)
                sfelm = stupid_french_e_l.search(sentence)
                sfeum = stupid_french_e_u.search(sentence)
                if lm:
                    mistakes_no += 1
                if pm:
                    mistakes_no += 1
                if sfelm:
                    mistakes_no += 1
                if sfeum:
                    mistakes_no += 1
                else:
                    pass
            f.close()

        with open(output_file, 'rt') as f:
            text = f.read()
            for sentence in re.split('[.]', text): #Iterates through sentences and replaces regex matches
                lm = linebreak.search(sentence)
                pm = pagenumber.search(sentence)
                sfelm = stupid_french_e_l.search(sentence)
                sfeum = stupid_french_e_u.search(sentence)
                if lm:
                    re_match = lm.group()
                    sentence = sentence.replace('- ', '')
                if pm:
                    re_match = pm.group()
                    sentence = sentence.replace(re_match, '')
                if sfelm:
                    re_match = sfelm.group()
                    sentence = sentence.replace(re_match, 'e')
                if sfeum:
                    re_match = sfeum.group()
                    sentence = sentence.replace(re_match, 'E')
                else:
                    pass
                updated_text += sentence + "."
            f.close()

        with open(output_file, 'wt') as f:
            f.write(updated_text)
            f.close()

        print(f'{mistakes_no} mistakes found and corrected')
    except Exception as e:
        WriteLog(e)


if __name__ == "__main__":
    regex_corrector(input_file, output_file)
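The substitutions that regex_corrector applies can be exercised in isolation; the snippet below reuses the same four patterns on a made-up OCR fragment (the sample text and expected output are illustrative only):

import re

linebreak = re.compile('[a-z]- ')                                # hyphenated line-break artefacts, e.g. 'exam- ple'
pagenumber = re.compile(r'(\n\d \n)|(\n\d\d \n)|(\n\d\d\d \n)')  # lone one- to three-digit page numbers
stupid_french_e_l = re.compile('è')
stupid_french_e_u = re.compile('È')

sample = 'Ètude of an exam- ple sentence.\n12 \nIt continues hère.'

fixed = sample
if linebreak.search(fixed):
    fixed = fixed.replace('- ', '')
pm = pagenumber.search(fixed)
if pm:
    fixed = fixed.replace(pm.group(), '')
fixed = stupid_french_e_l.sub('e', fixed)
fixed = stupid_french_e_u.sub('E', fixed)

print(fixed)  # Etude of an example sentence.It continues here.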