-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathraw_data_extractor.py
71 lines (57 loc) · 2.69 KB
/
raw_data_extractor.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
import numpy as np
import pandas as pd
# Load the initial sample; its 'Variable' and 'context' columns are read
# by the module-level statements further down.
file = pd.read_csv('data/pa_initial.csv')
def string_processor(s: str) -> str:
    """Return *s* as single-spaced, ASCII-only text.

    Runs of whitespace (including Unicode whitespace such as non-breaking
    spaces) collapse to one space, characters outside ASCII are dropped, and
    leading/trailing whitespace is removed.

    Args:
        s: Text to normalise.

    Returns:
        The normalised string.
    """
    # Split on Unicode whitespace *before* the ASCII filter so that, e.g., a
    # non-breaking space still separates words instead of being discarded.
    single_spaced = ' '.join(s.split())
    ascii_only = single_spaced.encode('ascii', errors='ignore').decode('ascii')
    # A token made up solely of non-ASCII characters vanishes in the filter
    # and leaves two adjacent spaces behind, so collapse once more.
    return ' '.join(ascii_only.split())
# Cleaned text of the initial sample, one entry per row of 'Variable'.
non_normalized_initial_300 = np.array(
    [string_processor(line) for line in file['Variable']]
)

# Raw 'context' annotation -> coarse label. Built once here instead of
# re-creating the membership sets for every row.
_CONTEXT_TO_LABEL = {
    'fitness': 'active lifestyle',
    'active life': 'active lifestyle',
    'active life; fitness': 'active lifestyle',
    'arrangement': 'environment',
    'barrier': 'environment',
}

# Any context value not listed in the table falls through to
# 'physical capacity' (same catch-all as before).
labels_initial_300 = np.array(
    [_CONTEXT_TO_LABEL.get(x.strip(), 'physical capacity')
     for x in file['context']]
)
print(non_normalized_initial_300.shape, labels_initial_300.shape)
def get_initial_data(file, column='Variable/Variable Question'):
    """Return the cleaned text of one column as a NumPy array.

    Args:
        file: Table (e.g. the result of ``pd.read_csv``) holding the text
            column.
        column: Name of the column to read. Defaults to
            ``'Variable/Variable Question'``, so existing single-argument
            calls behave as before.

    Returns:
        ``np.ndarray`` with one ``string_processor``-normalised entry per
        row, in row order.
    """
    return np.array([string_processor(line) for line in file[column]])
def extract_dataset(file):
    """Extract the annotated rows of an annotation table as (text, label).

    A row counts as annotated when at least one of the 'Activity',
    'Environment' or 'Physical Function' columns holds a value. Rows with
    none of the three filled in are dropped from both outputs.

    Args:
        file: Table (e.g. the result of ``pd.read_csv``) providing the text
            column read by ``get_initial_data`` and the three annotation
            columns named above.

    Returns:
        Tuple ``(non_normalized, labels_full)`` of equal-length arrays: the
        cleaned text of each annotated row and its label.
    """
    non_normalized = get_initial_data(file)

    # Applied in order, so when a row is filled in for several columns the
    # LAST matching entry wins (Physical Function > Environment > Activity).
    column_to_label = (
        ('Activity', 'active lifestyle'),
        ('Environment', 'environment'),
        ('Physical Function', 'physical capacity'),
    )

    # Explicit string dtype wide enough for the longest label, so the array
    # stays a string array even when the table has no rows.
    width = max(len(label) for _, label in column_to_label)
    labels_full = np.full(len(non_normalized), '', dtype='<U%d' % width)
    labelled = np.zeros(len(non_normalized), dtype=bool)

    for column, label in column_to_label:
        # notna() recognises every missing-value marker pandas uses, instead
        # of casting cells to text and comparing against the string 'nan'.
        annotated = file[column].notna().values
        labels_full[annotated] = label
        labelled |= annotated

    return non_normalized[labelled], labels_full[labelled]
# The three annotation sheets all go through extract_dataset; the HK / JQ /
# JO suffixes are presumably annotator initials (not confirmed here).
file = pd.read_csv('data/pa_annotations_HK.csv')
non_normalized_HK, labels_HK = extract_dataset(file)
print(labels_HK)

file = pd.read_csv('data/pa_annotations_JQ.csv')
non_normalized_JQ, labels_JQ = extract_dataset(file)

file = pd.read_csv('data/pa_annotations_JO.csv')
non_normalized_JO, labels_JO = extract_dataset(file)

# Every row of the non-activity sheet receives the label 'other'.
file = pd.read_csv('data/nonactivity.csv')
non_normalized_nonactivity = [
    string_processor(line) for line in file['variable descriptions']
]
non_normalized_nonactivity_labels = np.array(
    ['other'] * len(non_normalized_nonactivity)
)

# Texts and labels are concatenated in the same source order so that
# position i of `text` pairs with position i of `labels`.
text = np.hstack((
    non_normalized_initial_300,
    non_normalized_HK,
    non_normalized_JQ,
    non_normalized_JO,
    non_normalized_nonactivity,
))
labels = np.hstack((
    labels_initial_300,
    labels_HK,
    labels_JQ,
    labels_JO,
    non_normalized_nonactivity_labels,
))

# Row 0 holds the texts, row 1 the labels; the transpose written below has
# one (text, label) pair per line.
data = np.vstack((text, labels))
print(data[1, :])

# NOTE(review): fields are joined with '|' without quoting, so a text that
# itself contains '|' would add an extra column in data.csv.
np.savetxt('data.csv', data.T, delimiter='|', newline='\n', fmt="%s",
           comments=None)
np.savez('data.npz', data.T)
print(data.shape)