-
Notifications
You must be signed in to change notification settings - Fork 0
/
create_textfiles.py
64 lines (50 loc) · 1.62 KB
/
create_textfiles.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
# -*- coding: utf-8 -*-
"""
Created on Wed Jul 17 11:10:03 2024
@author: dnelson
"""
from bs4 import BeautifulSoup
import os
def extract_lines(soup):
    """Collect the ``CONTENT`` attribute of every ``String`` element.

    Args:
        soup: Parsed document exposing ``find_all``; every element it
            returns must expose ``get``.

    Returns:
        list: One entry per ``String`` element, in the order ``find_all``
        yields them. An element lacking the attribute contributes whatever
        its ``get`` returns for a missing key.
    """
    return [element.get('CONTENT') for element in soup.find_all('String')]
def parse_xml(file_path):
    """Parse the XML file at ``file_path`` into a BeautifulSoup tree.

    Args:
        file_path: Path of the XML file to read.

    Returns:
        BeautifulSoup: The document parsed with the ``lxml-xml`` parser.
    """
    # Binary mode hands the raw bytes to the parser, which can then work out
    # the document's encoding itself. Text mode would first decode with the
    # platform default encoding and can garble or reject non-ASCII content.
    with open(file_path, 'rb') as f:
        return BeautifulSoup(f, 'lxml-xml')
def get_xml_files(direc):
    """Map each immediate subdirectory of ``direc`` to its XML file names.

    Only one level is inspected: entries of ``direc`` that are not
    directories are skipped, and nested subdirectories are not descended
    into.

    Args:
        direc: Path of the directory whose subdirectories are scanned.

    Returns:
        dict: ``{subdirectory_name: [xml_file_name, ...]}``. Subdirectories
        containing no ``.xml`` file are omitted. Directory and file names
        are sorted so the result does not depend on the listing order of
        the file system.
    """
    transcriptions = {}
    for d in sorted(os.listdir(direc)):
        sub_path = os.path.join(direc, d)
        # Calling listdir() on a regular file raises NotADirectoryError, so
        # stray files sitting next to the subdirectories must be skipped.
        if not os.path.isdir(sub_path):
            continue
        # Match the real extension; a bare 'xml' suffix would also accept
        # names such as 'notxml'.
        xml_files = sorted(
            f for f in os.listdir(sub_path) if f.endswith('.xml')
        )
        if xml_files:
            transcriptions[d] = xml_files
    return transcriptions
def generate_plain_transcriptions(direc):
    """Extract the text lines of every XML file found under ``direc``.

    The XML files are discovered with ``get_xml_files``, parsed with
    ``parse_xml`` and reduced to their lines with ``extract_lines``.

    Args:
        direc: Directory whose subdirectories hold the XML files.

    Returns:
        dict: ``{file_name_without_xml_suffix: lines}``. The key does not
        include the subdirectory, so two files with the same name in
        different subdirectories share a key and the one processed last
        wins.
    """
    suffix = '.xml'
    texts = {}
    for subdir, files in get_xml_files(direc).items():
        for f in files:
            # Strip only a trailing '.xml'; str.replace would also remove
            # the sequence from the middle of a name like 'a.xml_v2.xml'.
            file_name = f[:-len(suffix)] if f.endswith(suffix) else f
            soup = parse_xml(os.path.join(direc, subdir, f))
            texts[file_name] = extract_lines(soup)
    return texts
def write_textfiles(transcriptions, direc):
    """Write each transcription to ``<direc>/<name>.txt``, one entry per line.

    Args:
        transcriptions: Mapping of output file name (without extension) to
            an iterable of text lines (strings without trailing newline).
        direc: Directory receiving the ``.txt`` files; it is created,
            including missing parents, when it does not exist yet.

    Raises:
        TypeError: If an entry of a transcription is not a string.
    """
    # Without this, open() fails with FileNotFoundError on a fresh target.
    os.makedirs(direc, exist_ok=True)
    for name, lines in transcriptions.items():
        path = os.path.join(direc, f'{name}.txt')
        # An explicit encoding keeps the output independent of the platform
        # default, which cannot represent every character on some systems.
        with open(path, 'w', encoding='utf-8') as f:
            f.writelines(line + '\n' for line in lines)
def cli(source_dir='U:/htr_revcity/image_files',
        target_dir='U:/htr_revcity/textfiles'):
    """Convert the XML transcriptions under ``source_dir`` to text files.

    Args:
        source_dir: Directory whose subdirectories contain the XML files.
            Defaults to the location the script was originally written for.
        target_dir: Directory that receives one ``.txt`` file per XML file.
            Defaults to the location the script was originally written for.
    """
    transcriptions = generate_plain_transcriptions(source_dir)
    write_textfiles(transcriptions, target_dir)


# Run the conversion with the default directories when executed as a script.
if __name__ == '__main__':
    cli()