forked from unicode-org/unicodetools
-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathsplitunihan.py
executable file
·75 lines (67 loc) · 2.39 KB
/
splitunihan.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
#!/usr/bin/python3 -B
# -*- coding: utf-8 -*-
# © 2019 and later: Unicode, Inc. and others.
# License & terms of use: http://www.unicode.org/copyright.html
# This script reads Unihan*.txt files and writes one new file per property.
# Invoke with one argument, the path to the Unihan data folder.
import glob
import os.path
import sys
# Per-property state shared by ParseFile() and main().
# Maps from a property like "kTotalStrokes" to a list of data describing the
# range collected so far for that property that has not been written yet:
# - [0] output file object
# - [1] first range code point (string, hex digits without the "U+" prefix)
# - [2] last range code point (string, hex digits without the "U+" prefix)
# - [3] last range code point (int)
# - [4] last value
_prop_data = {}
def ParseFile(in_file, root_path):
    """Splits the lines of one Unihan data file into per-property range files.

    Each data line must consist of three TAB-delimited fields: a code point
    with "U+" prefix, a property name, and a value. Empty lines and lines
    starting with "#" are skipped.

    For each property, runs of consecutive code points that share the same
    value are merged into a single output line "first..last<TAB>value"
    (or "cp<TAB>value" for a single code point). The output file
    "<property>.txt" is created in root_path the first time a property is seen.

    The most recent range of each property is only remembered in _prop_data;
    it is not written here, so the caller has to emit it once all input
    has been parsed.

    Args:
      in_file: iterable of text lines, such as an open text file.
      root_path: folder in which the per-property output files are created.

    Raises:
      AssertionError: if a data line does not have exactly three fields,
        or if its first field does not start with "U+".
    """
    for line in in_file:
        line = line.strip()
        if not line or line.startswith("#"):
            continue  # empty line or comment line
        fields = line.split("\t")
        assert len(fields) == 3, "expect three TAB-delimited fields\n %s\n" % line
        assert fields[0].startswith("U+"), "expect a U+ code point\n %s\n" % line
        (cp_str, prop, value) = fields
        cp_str = cp_str[2:]  # Strip off the "U+" prefix.
        cp = int(cp_str, 16)

        data = _prop_data.get(prop)
        if data is None:
            print("new file for " + prop)
            out_path = os.path.join(root_path, prop + ".txt")
            # Write UTF-8 explicitly rather than the locale-dependent default,
            # so that non-ASCII property values work on every platform.
            out_file = open(out_path, "w", encoding="utf-8")
            _prop_data[prop] = [out_file, cp_str, cp_str, cp, value]
            continue

        (out_file, first_cp_str, last_cp_str, last_cp, last_value) = data
        if last_cp + 1 == cp and last_value == value:
            # Extend the range.
            data[2] = cp_str
            data[3] = cp
            continue

        # Write the last range.
        if first_cp_str == last_cp_str:
            out_file.write(first_cp_str + "\t" + last_value + "\n")
        else:
            out_file.write(
                first_cp_str + ".." + last_cp_str + "\t" + last_value + "\n")
        # Start a new range.
        data[1:] = (cp_str, cp_str, cp, value)
def main():
    """Splits all Unihan*.txt files in a folder into one file per property.

    Takes the folder path from the first command-line argument, passes every
    matching input file to ParseFile(), then writes the final range that is
    still pending for each property and closes the output files.

    Exits with a usage message if the folder argument is missing.
    """
    if len(sys.argv) < 2:
        sys.exit("usage: splitunihan.py <path to Unihan data folder>")
    root_path = sys.argv[1]
    pattern = os.path.join(root_path, "Unihan*.txt")
    try:
        # glob returns matches in arbitrary order; sort them so that
        # repeated runs process the input files in the same order.
        for path in sorted(glob.glob(pattern)):
            print(path)
            # Read UTF-8 explicitly rather than the locale-dependent default.
            with open(path, "r", encoding="utf-8") as in_file:
                ParseFile(in_file, root_path)
        # Emit the last range for each file.
        for data in _prop_data.values():
            (out_file, first_cp_str, last_cp_str, _, last_value) = data
            if first_cp_str == last_cp_str:
                out_file.write(first_cp_str + "\t" + last_value + "\n")
            else:
                out_file.write(
                    first_cp_str + ".." + last_cp_str + "\t" + last_value + "\n")
    finally:
        # Close every output file, even if parsing or writing failed part-way.
        for data in _prop_data.values():
            data[0].close()


if __name__ == "__main__":
    main()