-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathconvert_pinyin.py
executable file
·90 lines (86 loc) · 3.51 KB
/
convert_pinyin.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
#! /usr/bin/env python
# convert_pinyin.py
# David Prager Branner
# 20150125
import os
import gzip
import re
import sys
import subprocess
def main():
    """Try to convert every entry found in the "data" directory.

    Each entry name is passed to ``convert`` and a progress message is
    printed for it.  ``.DS_Store`` is skipped, and so are the ``old_files``
    and ``converted_files`` directories that ``convert`` itself creates
    inside ``data``: they hold backups and finished output, not documents,
    and feeding them back to ``convert`` on a second run would fail because
    they contain no ``index.xml.gz``.
    """
    print('Files to be converted will be looked for in the "data" directory.')
    if not os.path.exists('data'):
        print('No directory "data" found.')
        return
    # Names inside "data" that are never documents to be converted.
    not_documents = ('.DS_Store', 'old_files', 'converted_files')
    for filename in os.listdir('data'):
        if filename in not_documents:
            continue
        print('\nTrying file\n "{}"'.format(filename))
        if convert(filename):
            print(' Conversion successful.')
    print('\nNo more files found.\n')
# Characters under which the Shyrbaw font displayed pinyin vowels with tone
# marks, mapped to the standard Unicode characters they stand for.  The
# table is kept as text and encoded to UTF-8 when used, because Python 3
# bytes literals may contain only ASCII characters.
_SHYRBAW_TO_UNICODE = {
    # Five vowels, tone 1.
    '¡': 'ā', '™': 'ē', '£': 'ī', '¢': 'ō', '∞': 'ū',
    # Five vowels, tone 3.
    '§': 'ǎ', '¶': 'ě', '•': 'ǐ', 'ª': 'ǒ', 'º': 'ǔ',
    # u-umlaut, tones 2, 3 and 4.
    # NOTE(review): an earlier comment listed '∞' for ǘ, but '∞' is already
    # used for ū above and the code has always used '≈' -- confirm.
    '≈': 'ǘ', '√': 'ǚ', 'π': 'ǜ',
    # Note: five vowels, tones 2 and 4, were originally handled
    # correctly in standard upper ASCII and so are the same as current
    # Unicode.
    #
    # Note 2: I do not currently have a way to learn how ǖ or the
    # capitalized vowels with diacritics were mapped.
}


def convert(filename):
    """Convert tonal diacritics from font Shyrbaw to standard Unicode.

    ``filename`` is the name of an entry inside the ``data`` directory.
    Only old-style documents, which are directories holding a gzip-ed
    ``index.xml.gz``, can be converted.  For such a document this function:

    1. copies the untouched document into ``data/old_files`` as a backup;
    2. rewrites ``index.xml.gz`` in place with the characters replaced;
    3. moves the document into ``data/converted_files`` and appends
       ``_converted_to_unicode.pages`` to its name.

    The replacement is applied to every occurrence of the mapped characters
    in the document, not only to text set in the Shyrbaw font.
    NOTE(review): the XML is assumed to be UTF-8 encoded -- confirm.

    Returns:
        True when the document was converted; None when ``filename`` is
        None.

    Raises:
        SystemExit: if the entry is not a directory (new-style document).
        subprocess.CalledProcessError: if the backup copy or the final move
            fails.
    """
    if filename is None:
        return None
    document = os.path.join('data', filename)
    # Old-version pages "files" are actually directories; new ones are not.
    if not os.path.isdir(document):
        # Not sure yet how to deal with the new version (v. 9 and later).
        sys.exit('File\n {}\n cannot be converted.'.format(filename))

    # In the old version, the actual content is a gzip-ed XML file.
    gz_filename = os.path.join(document, 'index.xml.gz')
    with gzip.open(gz_filename, 'rb') as f:
        contents = f.read()

    # Plain byte replacement: the keys are literal characters, not regular
    # expressions, and UTF-8 sequences cannot match across character
    # boundaries.
    for shyrbaw, standard in _SHYRBAW_TO_UNICODE.items():
        contents = contents.replace(
            shyrbaw.encode('utf-8'), standard.encode('utf-8'))

    # This is done inefficiently because of apparent filesystem issues.
    old_files = os.path.join('data', 'old_files')
    os.makedirs(old_files, exist_ok=True)
    # check_call, not call: if the backup fails we must stop before the
    # only copy of the document is overwritten below.
    subprocess.check_call(['/bin/cp', '-R', document, old_files])

    with gzip.open(gz_filename, 'wb') as f:
        f.write(contents)

    converted_files = os.path.join('data', 'converted_files')
    os.makedirs(converted_files, exist_ok=True)
    subprocess.check_call(['/bin/mv', document, converted_files])
    os.rename(os.path.join(converted_files, filename),
              os.path.join(converted_files,
                           filename + '_converted_to_unicode.pages'))
    return True
# Run the conversion only when this file is executed as a script, so that
# importing the module has no side effects.
if __name__ == "__main__":
    main()