-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathaggregate_gz4.py
104 lines (81 loc) · 2.72 KB
/
aggregate_gz4.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
import pandas
from collections import Counter, OrderedDict
from pandas.io.json.normalize import nested_to_record
import progressbar
import json
# Load the subject catalogue and the three classification dumps. All paths
# are relative to the directory the script is run from.
subject_data = pandas.read_csv('../GZ_data_dumps/galaxy_zoo_subjects.csv')
sdss_data = pandas.read_csv('../GZ_data_dumps/2017-10-29_galaxy_zoo_sloan_classifications.csv')
sdss_lost_data = pandas.read_csv('../GZ_data_dumps/2017-10-29_galaxy_zoo_sdss_lost_set_classifications.csv')
missing_manga_data = pandas.read_csv('../GZ_data_dumps/2017-10-29_galaxy_zoo_missing_manga_classifications.csv')

# Derive a bare subject id by dropping the first nine characters and the last
# character of every `_id` value.
# NOTE(review): presumably `_id` looks like `ObjectId(<hex>)` -- confirm
# against the subjects dump.
subject_data['subject_id'] = [raw_id[9:-1] for raw_id in subject_data['_id']]
# Classification columns whose answers are tallied per subject.
# NOTE(review): 'sloan-6' is not in this list -- confirm the omission is
# intentional.
keys = [
    'sloan-0', 'sloan-1', 'sloan-2', 'sloan-3', 'sloan-4',
    'sloan-5', 'sloan-7', 'sloan-8', 'sloan-9', 'sloan-10',
]

# Column-oriented accumulator: the subject ids first, then one (initially
# empty) list per question column, in the same order as `keys`.
output = OrderedDict(
    [('subject_id', [])] + [(column, []) for column in keys]
)
def counter_without_nan(group, key):
    """Count how often each non-missing value occurs in one column.

    Parameters
    ----------
    group : pandas.DataFrame
        Table (typically one ``groupby`` group) containing column ``key``.
    key : str
        Label of the column to tally.

    Returns
    -------
    collections.OrderedDict
        Mapping of value -> number of occurrences, ordered by value.
        Missing entries (NaN / None) are never included.
    """
    # Drop missing entries *before* counting.  Looking up the ``nan``
    # singleton in the Counter afterwards is unreliable: NaN != NaN, so NaNs
    # read out of a float column are distinct keys that the lookup misses.
    # It also avoids ``pandas.np``, which newer pandas releases no longer
    # provide.
    counts = Counter(group[key].dropna())
    return OrderedDict(sorted(counts.items()))
# Progress-bar layout shared by every aggregate() call:
# the label, a percentage, the bar itself and an ETA, separated by spaces.
widgets = ['Reducing: ', progressbar.Percentage()]
widgets += [' ', progressbar.Bar()]
widgets += [' ', progressbar.ETA()]
def aggregate(data):
    """Tally the answers of every subject in ``data`` into ``output``.

    ``data`` is grouped by its ``subject_id`` column.  For each group the
    subject id is appended to ``output['subject_id']`` and, for every column
    in ``keys``, the result of ``counter_without_nan`` is appended to the
    matching list in ``output``.  A progress bar is advanced once per
    subject.

    The function returns nothing; it only extends the module-level
    ``output`` accumulator.
    """
    n_subjects = len(data.subject_id.unique())
    pbar = progressbar.ProgressBar(widgets=widgets, max_value=n_subjects)
    pbar.start()
    grouped = data.groupby(data.subject_id)
    for done, (name, group) in enumerate(grouped, start=1):
        output['subject_id'].append(name)
        for k in keys:
            output[k].append(counter_without_nan(group, k))
        pbar.update(done)
    pbar.finish()
# Aggregate the three classification dumps one after another; every call
# appends its subjects to the shared `output` accumulator.
for label, frame in (
    ('missing manga', missing_manga_data),
    ('lost set', sdss_lost_data),
    ('sdss', sdss_data),
):
    print(label)
    aggregate(frame)
# Single-column frame with the subject ids, in the order they were aggregated.
subject_id = pandas.DataFrame(output['subject_id'])
subject_id.columns = ['subject_id']

# Turn each question's list of per-subject counters into a frame with one
# column per answer, named "<question>.<answer>".
flat_output = [subject_id]
for k in keys:
    flat_keys = pandas.DataFrame(nested_to_record(output[k]))
    prefixed = ['{0}.{1}'.format(k, answer) for answer in flat_keys.columns.values]
    flat_keys.columns = prefixed
    flat_output.append(flat_keys)

# Side-by-side concatenation: one row per subject, one column per answer.
vote_counts = pandas.concat(flat_output, axis=1)
vote_counts.to_csv('gz4_aggregate_with_subject_id.csv', index=False)

# Attach the subject catalogue columns (left join on subject_id, which is
# DataFrame.join's default).
votes_by_subject = vote_counts.set_index('subject_id')
subjects_by_id = subject_data.set_index('subject_id')
join_table = votes_by_subject.join(subjects_by_id)
# Replace the subject_id index with a plain 0..n-1 index.
join_table.index = pandas.RangeIndex(len(join_table))
def json_or_nan(s):
    """Decode ``s`` as JSON, or return an empty dict for non-string input.

    Anything that is not a ``str`` (for example the NaN pandas uses for a
    missing cell) yields ``{}`` rather than an error.
    """
    if not isinstance(s, str):
        return {}
    return json.loads(s)
def id_to_number(s):
    """Return the integer that follows the first underscore in ``s``.

    Parameters
    ----------
    s : str or missing value
        Identifier of the form ``<prefix>_<number>``, or a missing value
        (``None`` / NaN).

    Returns
    -------
    int or float
        The number after the first ``_`` as an ``int``; NaN when ``s`` is
        missing.

    Raises
    ------
    IndexError
        If ``s`` contains no underscore.
    ValueError
        If the text after the first underscore is not an integer.
    """
    if pandas.isnull(s):
        # Built-in float NaN instead of ``pandas.np.nan``: the ``pandas.np``
        # alias is not available in newer pandas releases.
        return float('nan')
    return int(s.split('_')[1])
# Parse each row's JSON metadata string into a dict ({} where it is missing),
# then drop the raw column from the joined table.
metadata = join_table['metadata'].apply(json_or_nan)
del join_table['metadata']

# Expand the parsed dicts into their own table and reduce `nsa_id` to its
# numeric part (NaN where absent).
metadata_table = pandas.DataFrame(nested_to_record(metadata))
metadata_table['nsa_id'] = metadata_table['nsa_id'].apply(id_to_number)

# Both tables are positionally indexed 0..n-1 here, so a column-wise concat
# lines the rows up.
combined = pandas.concat([join_table, metadata_table], axis=1)
combined.to_csv('gz4_aggregate_with_metadata.csv', index=False)