-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathadd_samples.py
85 lines (62 loc) · 2.34 KB
/
add_samples.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
import sys
import os
from random import shuffle
import pympi
import pandas as pd
def process(file, aclew_data, t=5, start=30, n=5):
global selected
length = int(float(aclew_data.length_of_recording))
eaf = pympi.Eaf(file)
existing = [(x / 60000, y / 60000)
for x, y, _ in
eaf.get_annotation_data_for_tier('context')]
minute_range = range(start, length - t)
shuffle(minute_range)
i = 0
for x in minute_range:
if i == n:
break
if not overlaps(x, existing, t):
existing.append((x, x + 5))
i += 1
rand_ints = list(range(6, 6 + n))
shuffle(rand_ints)
for x in zip(existing[-n:], rand_ints):
ts = ((x[0][0] + 2) * 60000), ((x[0][1] - 1) * 60000)
eaf.add_annotation("code", ts[0], ts[1])
eaf.add_annotation("code_num", ts[0], ts[1], value=str(x[1]))
eaf.add_annotation(
"on_off", ts[0], ts[1], value="{}_{}".format(ts[0], ts[1]))
eaf.add_annotation("context", ts[0] - 120000, ts[1] + 60000)
eaf.add_tier("notes")
selected = selected.append({'aclew_id': aclew_data.aclew_id,
'corpus': aclew_data.corpus,
'clip_num': x[1],
'onset': ts[0],
'offset': ts[1]},
ignore_index=True)
eaf.to_file(os.path.join(output_dir, os.path.basename(file)))
def overlaps(onset, existing, t):
# print
if any(overlap(onset, existing_onset, t) for existing_onset, _ in existing):
return True
return False
def overlap(x, y, t):
if y <= x < y + t:
return True
elif y - t <= x < y:
return True
return False
if __name__ == "__main__":
start_dir = sys.argv[1]
output_dir = sys.argv[2]
aclew_df = pd.read_csv(sys.argv[3])
selected = pd.DataFrame(
columns=['aclew_id', 'corpus', 'clip_num', 'onset', 'offset'], dtype=int)
for root, dirs, files in os.walk(start_dir):
for file in files:
if file.endswith(".eaf"):
aclew_data = aclew_df.query(
'aclew_id == {}'.format(file[:5])).iloc[0]
process(os.path.join(root, file), aclew_data)
selected.to_csv('selected_regions.csv', index=False)