# pinyin_freq_db.py
import os
import glob
from collections import defaultdict

from wrapped_json import json
from pinyin import normalize_pinyin, extract_normalized_pinyin
from han import filter_text_hanzi, align_hanzi_and_pinyin, CEDICT
from transformer_segmentation import segment_sentences
from merkl import task, Eval, FileRef
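
# Builds two pinyin frequency databases from hanzi/pinyin-aligned sentences:
# `pinyin_all` maps each single character to its observed readings, and
# `pinyin_single` maps each segmented word to readings confirmed by CEDICT,
# each reading paired with the excerpts it occurs in.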
@task(serializer=json, deps=[FileRef('data/git/pinyin_freqs.txt')])
def make_pinyin_freq_db():
    # reading -> excerpts, keyed by segmented word / by single character
    pinyin_single = defaultdict(lambda: defaultdict(list))
    pinyin_all = defaultdict(lambda: defaultdict(list))

    # The frequency file alternates hanzi lines with pinyin lines;
    # lines starting with '#' are comments
    sentences = []
    with open('data/git/pinyin_freqs.txt', 'r') as f:
        lines = f.read().split('\n')
        lines = [line for line in lines if not line.startswith('#')]
        hzs = lines[::2]
        pys = lines[1::2]
        for hz, py in zip(hzs, pys):
            sentences.append({'hanzi': hz, 'pinyin': py})

    # Add sentences from the ChinesePod lesson transcripts
    for transcript_file in glob.glob('data/remote/private/chinesepod/transcripts/*.json'):
        filename = os.path.basename(transcript_file)
        print('Processing:', filename)
        with open(transcript_file, 'r') as f:
            transcript = json.loads(f.read())
        sentences += transcript['sentences']

    # Segment all sentences in one batch; the merkl Eval context evaluates
    # the segment_sentences task eagerly
    hanzis = [s['hanzi'] for s in sentences]
    with Eval():
        segmentation = segment_sentences(hanzis, join_compound_words=False)
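
    # Walk each segmented sentence, collecting compound-word translations and
    # aligning hanzi with pinyin to accumulate reading statistics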
    compounds_out = []
    for sentence, (segs, poss, ners, people, compounds) in zip(sentences, segmentation):
        if 'pinyin' not in sentence:
            continue

        hanzi = sentence['hanzi']

        # Collect CEDICT translations for any compound words in the sentence
        if len(compounds) > 0:
            compounds_translations = []
            for c, check_c in compounds:
                if check_c is None:
                    continue
                translations = ''
                for _, _, transl, _, _ in CEDICT.v[check_c][1]:
                    translations += transl
                compounds_translations.append((c, translations, True))
            compounds_out.append((hanzi, sentence.get('translation', ''), compounds_translations))

        # Normalize the pinyin and align each hanzi character with its pinyin
        # syllable (None where no alignment was found)
        hanzi = filter_text_hanzi(hanzi)
        pinyin = sentence['pinyin']
        pys = extract_normalized_pinyin(normalize_pinyin(pinyin))
        hanzi_pinyin_indices = align_hanzi_and_pinyin(hanzi, pys)

        # Word-level statistics: walk the segments, tracking each segment's
        # character offset within the sentence
        i = 0
        next_i = 0
        for seg_i, (seg, pos) in enumerate(zip(segs, poss)):
            hz = filter_text_hanzi(seg[-1])
            if len(hz) == 0:
                continue

            i = next_i
            next_i = i + len(hz)
            try:
                pinyin_idx = hanzi_pinyin_indices[i]
            except IndexError:
                # Alignment produced fewer indices than hanzi characters
                continue
            if pinyin_idx is None:
                # Was no levenshtein match
                continue

            pinyin_end_idx = hanzi_pinyin_indices[i + len(hz) - 1]
            if pinyin_end_idx is None:
                continue

            t_pys = pys[pinyin_idx : pinyin_end_idx + 1]
            if len(t_pys) != len(hz):
                # Skipped across missing stuff
                continue

            py = ''.join(t_pys).lower()
            excerpt = (hanzi, i, ''.join(pys), seg_i, segs, poss)
            if hz not in CEDICT.v:
                continue

            # Only keep readings that CEDICT actually lists for this word
            cedict_readings = [c[1].replace(' ', '') for c in CEDICT.v[hz][1]]
            if py not in cedict_readings:
                continue
            pinyin_single[hz][py].append(excerpt)

        # Character-level statistics: count every aligned hanzi/pinyin pair,
        # regardless of segmentation or CEDICT membership
        for i, pinyin_idx in enumerate(hanzi_pinyin_indices):
            if pinyin_idx is None:
                # Was no levenshtein match
                continue
            hz = hanzi[i]
            py = pys[pinyin_idx].lower()
            excerpt = (hanzi, i, ''.join(pys), seg_i, segs, poss)
            pinyin_all[hz][py].append(excerpt)

    return pinyin_all, pinyin_single
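

# A minimal usage sketch (an assumption, not part of the original file):
# under merkl, the @task-decorated function should evaluate eagerly inside an
# Eval() context, mirroring how segment_sentences is invoked above.
if __name__ == '__main__':
    with Eval():
        pinyin_all, pinyin_single = make_pinyin_freq_db()
    print(f'{len(pinyin_all)} characters / {len(pinyin_single)} words with readings')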