-
Notifications
You must be signed in to change notification settings - Fork 0
/
tag_merge_metadata.py
79 lines (65 loc) · 2.57 KB
/
tag_merge_metadata.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
import csv
import pandas as pd
import numpy as np
import sys
from pipeline_utils import transient_fieldnames, variables_fieldnames
import pickle as pkl
def tag_merge_metadata(
    output,
    variables_path="data/metadata/variables_lc_metadata.dat",
    transients_path="data/metadata/transient_lc_metadata.csv",
):
    """Merge the variables and transients metadata tables and pickle the result.

    Each table is loaded with ``pandas.read_csv``, reduced by
    ``process_variables`` / ``process_transients``, and the two results are
    stacked into one frame with a fresh 0..n-1 index. The merged frame is
    printed and then written with ``DataFrame.to_pickle``.

    Args:
        output: Destination handed to ``DataFrame.to_pickle``.
        variables_path: Space-separated variables metadata file; its first
            three lines are skipped. Defaults to the path that used to be
            hard-coded here.
        transients_path: Comma-separated transients metadata file; its first
            line is skipped and the columns are named from
            ``transient_fieldnames``. Defaults to the path that used to be
            hard-coded here.

    Returns:
        None. The result is only printed and written to ``output``.
    """
    # No header=None is passed, so the first line after the three skipped
    # ones ends up as the column labels; process_variables() re-inserts it
    # as a data row.
    variables = pd.read_csv(variables_path, sep=" ", skiprows=3)
    variables = process_variables(variables)

    transients = pd.read_csv(
        transients_path, sep=",", names=transient_fieldnames, skiprows=1
    )
    transients = process_transients(transients)

    data = pd.concat([variables, transients], ignore_index=True)
    print(data)
    data.to_pickle(output)
def process_transients(data):
    """Reduce the transients table to ID/RA/Dec/SubType/Type with coded subtypes.

    Keeps the "CSS images", "RA", "Dec" and "Classification" columns,
    relabels them as "ID", "RA", "Dec" and "SubType", adds a constant
    ``Type`` column equal to 1, and then passes the frame through
    ``fewer_tags`` and ``tags_to_numbers``.

    Args:
        data: DataFrame containing at least the four source columns above.

    Returns:
        The DataFrame returned by ``tags_to_numbers``.
    """
    selected = data[["CSS images", "RA", "Dec", "Classification"]]
    relabelled = selected.rename(
        columns={"CSS images": "ID", "Classification": "SubType"}
    )
    # Type == 1 marks rows coming from the transients table
    # (process_variables uses 0).
    flagged = relabelled.assign(Type=1)
    collapsed, tags = fewer_tags(flagged)
    return tags_to_numbers(collapsed, tags)
def process_variables(data):
    """Reduce the variables table to ID/RA/Dec/SubType/Type.

    The incoming frame's column labels are really the first data record
    (it was loaded as the header), so they are appended back as a row.
    Only the first eight columns are kept; they are renamed to
    ``variables_fieldnames``, narrowed to "ID", "RA", "Dec" and "SubType",
    and a constant ``Type`` column equal to 0 is added.

    The caller's frame is left untouched: all work happens on a copy.

    Args:
        data: DataFrame whose column labels hold the first record's values.

    Returns:
        A new DataFrame with columns "ID", "RA", "Dec", "SubType", "Type".
    """
    # Copy the columns we need first, so appending the recovered row does
    # not add a row to (or warn about writing into a slice of) the
    # caller's frame.
    trimmed = data.iloc[:, :8].copy()

    # Add the first line that was mistakenly loaded as header.
    # NOTE(review): the recovered values are the column labels themselves
    # (typically strings), so this row is not numerically typed - confirm
    # downstream code tolerates that.
    trimmed.loc[len(trimmed)] = list(trimmed.columns)

    trimmed.columns = variables_fieldnames
    # Type == 0 marks rows coming from the variables table
    # (process_transients uses 1).
    return trimmed[["ID", "RA", "Dec", "SubType"]].assign(Type=0)
def fewer_tags(data):
    """Collapse every subtype outside a fixed short list into 'Unknown/Other'.

    The "SubType" column of ``data`` is overwritten in place: values found
    in the kept list stay as they are, everything else (including missing
    values) becomes 'Unknown/Other'.

    Args:
        data: DataFrame with a "SubType" column.

    Returns:
        A ``(data, tags)`` tuple: the same DataFrame object and a new list
        of the kept class names. 'Unknown/Other' is not part of that list.
    """
    # The kept classes are fixed here; an earlier approach that derived
    # them from per-class counts is no longer used.
    tags = ["AGN", "Blazar", "CV", "Flare", "SN"]

    subtype = data['SubType']
    is_other = ~subtype.isin(tags)
    data['SubType'] = np.where(is_other, 'Unknown/Other', subtype)
    return data, tags
def tags_to_numbers(data, tags):
    """Replace each "SubType" label with its integer position in the tag list.

    The labels are ``tags`` followed by 'Unknown/Other' (added only if it is
    not already present). Each value in ``data['SubType']`` is replaced, in
    place, by the index of its first occurrence in that label list.

    Args:
        data: DataFrame with a "SubType" column holding label values.
        tags: Sequence of known labels. It is not modified.

    Returns:
        The same DataFrame object with "SubType" converted to positions.

    Raises:
        ValueError: If a "SubType" value (including a missing value) is not
            one of the labels.
    """
    # Build the label list locally: appending to ``tags`` itself would grow
    # the caller's list by one entry on every call.
    labels = list(tags)
    if 'Unknown/Other' not in labels:
        labels.append('Unknown/Other')

    # First occurrence wins, matching list.index() semantics.
    positions = {}
    for position, label in enumerate(labels):
        positions.setdefault(label, position)

    def encode(tag):
        try:
            return positions[tag]
        except KeyError:
            # Keep the exception type list.index() would have raised.
            raise ValueError(f"{tag!r} is not in list") from None

    data['SubType'] = data['SubType'].map(encode, na_action=None)
    return data
# def consider_1st_class(data, tags):
# # consider all classes containing "/" as 1st class
# data['SubType'] = np.where(
# (data['SubType'].str.contains("/")) &
# (data['SubType'].str.split("/").str[0].isin(tags)),
# data['SubType'].str.split("/").str[0], data["SubType"]
# )
# return data
# def ignore_question_marks(data, tags):
# # consider all classes containing "?" as if they didn't
# data['SubType'] = np.where(
# (data['SubType'].str.contains('\\?',na=False)) &
# (~ data['SubType'].str.contains('/', na=False))&
# (data['SubType'].str.split("?").str[0].isin(tags)),
# data['SubType'].str.split("?").str[0], data["SubType"]
# )
# return data