-
Notifications
You must be signed in to change notification settings - Fork 1
/
Copy pathods-to-anki.py
147 lines (110 loc) · 5.65 KB
/
ods-to-anki.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
import zipfile
from bs4 import BeautifulSoup
import pandas as pd
from timeis import timeis, green, yellow, line, white, tic, toc
class ReadOds:
    """Load sheets of an OpenDocument spreadsheet (.ods) into pandas DataFrames.

    The spreadsheet's ``content.xml`` is parsed with BeautifulSoup. Every cell
    is rendered to a string in which bold text runs are wrapped in ``<b>`` tags
    and paragraphs are separated by ``<br/>``. The first row of each sheet is
    used as the column header.

    Attributes set by ``__init__``:
        soup: the parsed ``content.xml`` document.
        sheets_name: list (or other iterable) of requested sheet names.
        df: dict mapping sheet name -> DataFrame of that sheet's data rows.
        bold_names: names of the automatic styles whose font weight is bold.
    """

    def __init__(self, filename, sheets_name):
        """Open *filename* and read every sheet listed in *sheets_name*.

        Args:
            filename: path of the .ods file.
            sheets_name: a single sheet name, or an iterable of sheet names.

        Sheets that cannot be found are skipped (``get_rows_in_sheet`` reports
        them) and get no entry in ``self.df``.
        """
        print(f"{timeis()} {green}opening ods")
        # get content xml data from OpenDocument file; the context manager
        # closes the archive even when reading fails
        with zipfile.ZipFile(filename, "r") as ziparchive:
            xmldata = ziparchive.read("content.xml")

        self.soup = BeautifulSoup(xmldata, 'xml')
        # normalise a lone name to a list, otherwise the loop below would
        # walk over the characters of the string
        if isinstance(sheets_name, str):
            sheets_name = [sheets_name]
        self.sheets_name = sheets_name
        self.df = {}
        # find bold styles
        self.bold_names = self.get_bold_styles()

        for sheet_name in self.sheets_name:
            sheet_rows = self.get_rows_in_sheet(sheet_name)
            if sheet_rows is None:
                # sheet not present in the document
                continue
            if not sheet_rows:
                # sheet exists but holds no rows at all
                self.df[sheet_name] = pd.DataFrame()
                continue
            header = self.get_columns_in_row(sheet_rows[0])
            n_columns = len(header)
            data = [
                dict(zip(header, self.get_columns_in_row(row)[:n_columns]))
                for row in sheet_rows[1:]
            ]
            self.df[sheet_name] = pd.DataFrame(data)

    def process_and_save_csv(self):
        """Filter the 'analysis' sheet for Anki and write it as a tab-separated CSV.

        Keeps the rows whose '#' column equals "1" and whose 'meaning' column
        is not empty, drops the '#', 'x' and 'comments' columns, drops every
        column from position 20 onwards, appends a 'GoogleForm' column holding
        a feedback link built from the 'pali' column, stores the result back
        in ``self.df['analysis']`` and saves it.

        Raises:
            KeyError: if the 'analysis' sheet or one of the columns named
                above is missing.
        """
        print(f"{timeis()} {green}processing for anki")
        analysis = self.df['analysis']
        keep = (analysis['#'] == "1") & (analysis['meaning'] != "")
        # work on an independent copy so the edits below never act on a view
        # of the unfiltered frame
        analysis = analysis[keep].copy()
        analysis = analysis.drop(columns=["#", "x", "comments"])
        analysis = analysis.drop(columns=analysis.columns[20:])

        link_head = """Spot a mistake? <a class="link" href="https://docs.google.com/forms/d/e/1FAIpQLSdG6zKDtlwibtrX-cbKVn4WmIs8miH4VnuJvb7f94plCDKJyA/viewform?usp=pp_url&entry.438735500="""
        link_tail = """&entry.1433863141=Anki">Fix it here</a>."""
        analysis['GoogleForm'] = link_head + analysis['pali'] + link_tail
        self.df['analysis'] = analysis

        rows, columns = analysis.shape
        # quoting=1 is csv.QUOTE_ALL
        analysis.to_csv('Pātimokkha Word by Word.csv', sep='\t', index=False, header=True, quoting=1)
        print(f"{timeis()} {green}saving {white}{rows} {green}rows {white}{columns} {green}columns")

    def get_bold_styles(self):
        """Return the names of the automatic styles that render text in bold.

        In the xml, office:automatic-styles holds the automatic styles of the
        document; each style name is under
        style:style > style:text-properties [@fo:font-weight="bold"].

        Returns:
            list of style names; empty when the document has no
            office:automatic-styles element.
        """
        auto_styles = self.soup.find('office:automatic-styles')
        if auto_styles is None:
            return []
        bold_names = []
        for item in auto_styles.find_all('style:text-properties'):
            if item.get('fo:font-weight') != 'bold':
                continue
            parent = item.parent
            # the style name lives on the enclosing style element
            if parent is not None and parent.has_attr('style:name'):
                bold_names.append(parent['style:name'])
        return bold_names

    def get_rows_in_sheet(self, sheet_name):
        """Return all table:table-row elements of the sheet named *sheet_name*.

        The header row is included; row repetition attributes are not expanded.

        Returns:
            list of row elements, or None (after printing a message) when the
            sheet does not exist.
        """
        print(f"{timeis()} {green}processing cell data")
        current_sheet = self.soup.find('table:table', {'table:name': sheet_name})
        if current_sheet is None:
            print('could not find sheet', sheet_name)
            return None
        return list(current_sheet.find_all('table:table-row'))

    def get_columns_in_row(self, row):
        """Return the rendered text of every cell in *row*, one entry per column.

        A cell carrying table:number-columns-repeated="N" contributes N equal
        entries. A count that cannot be parsed is reported and treated as 1.
        """
        ret_cells = []
        for cell in row.find_all('table:table-cell'):
            cell_value = self.process_text(cell)
            repeat = 1
            if cell.has_attr('table:number-columns-repeated'):
                try:
                    repeat = int(cell['table:number-columns-repeated'])
                except ValueError:
                    print('failed to parse repeated cell under', cell)
            # every cell yields at least one column, whatever the count says
            ret_cells.extend([cell_value] * max(repeat, 1))
        return ret_cells

    def process_text(self, cell):
        """Render *cell* as a string; text processing for each column goes here.

        Per text:p paragraph: text:s elements become literal spaces
        (text:c gives the count, default one), text:span elements whose style
        is in ``self.bold_names`` become ``<b>`` tags and all other spans
        become plain text. Paragraphs are joined with ``<br/>``.

        Note: the elements of *cell* are rewritten in place in ``self.soup``.
        """
        paragraphs = []
        for p_text in cell.find_all('text:p'):
            # expand <text:s text:c="5"/> (5 spaces) first, so that spaces
            # nested inside a span are still there when the span is flattened
            for space in p_text.find_all('text:s'):
                count = int(space['text:c']) if space.has_attr('text:c') else 1
                space.replace_with(' ' * count)

            for styled_text in p_text.find_all('text:span'):
                # get_text() also copes with empty spans and spans holding
                # several children, where .string would be None
                text = styled_text.get_text()
                is_bold = (
                    styled_text.has_attr('text:style-name')
                    and styled_text['text:style-name'] in self.bold_names
                )
                if is_bold:
                    # find bold styles and replace with b tag
                    new_b_tag = self.soup.new_tag('b')
                    new_b_tag.string = text
                    styled_text.replace_with(new_b_tag)
                else:
                    styled_text.replace_with(text)

            # convert tags to text
            paragraphs.append(''.join(str(it) for it in p_text.contents))
        return '<br/>'.join(paragraphs)
if __name__ == '__main__':
    # Script entry point: convert the Pātimokkha spreadsheet to an Anki CSV,
    # timing the run with tic()/toc().
    tic()
    # banner: rule, title, rule - each line prefixed with the current time
    for banner_text in (line, f"{yellow}converting ods to csv", line):
        print(f"{timeis()} {banner_text}")

    source_path = "original_sources/Pātimokkha Word by Word.ods"
    wanted_sheets = ['analysis']
    reader = ReadOds(source_path, wanted_sheets)
    reader.process_and_save_csv()
    toc()