forked from raunaks42/StarQuasarClassifier

pcaSklearn.py
import warnings

import pandas as pd
from sklearn.preprocessing import MinMaxScaler
from sklearn.decomposition import PCA

warnings.filterwarnings('ignore')  # silence sklearn/pandas warnings

filenames = ['cat1.csv', 'cat1_correct_r1.csv', 'cat1_r1.csv',
'cat2.csv', 'cat2_correct_r1.csv',
'cat2_correct_r3.csv', 'cat2_misclassified_r1.csv', 'cat2_misclassified_r3.csv', 'cat2_r1.csv', 'cat2_r3.csv',
'cat3.csv', 'cat3_correct_r1.csv',
'cat3_correct_r3.csv', 'cat3_misclassified_r1.csv', 'cat3_misclassified_r3.csv', 'cat3_r1.csv', 'cat3_r3.csv',
'cat4.csv', 'cat4_correct_r1.csv',
'cat4_correct_r3.csv', 'cat4_misclassified_r1.csv', 'cat4_misclassified_r3.csv', 'cat4_r1.csv', 'cat4_r3.csv'
]


def loadDataset(filename, crossVal=False):
    """Load a catalog CSV and return scaled features x and class labels y."""
    dataset = pd.read_csv(filename)
    if crossVal:
        # Cross-validation mode: use only the spectrometric redshift as the feature.
        y = dataset['class']
        x = dataset['spectrometric_redshift'].to_numpy().reshape(-1, 1)  # 2D array for the scaler
    else:
        if len(dataset.columns) == 38:
            # Drop all columns after the class column.
            dataset = dataset.drop(dataset.columns[31:], axis=1)
            y = dataset['class']
            dataset = dataset.drop(dataset.columns[13:16], axis=1)  # drop FUV columns
            x = dataset.drop(dataset.columns[0:7], axis=1)  # drop identifier columns
        else:  # 31 columns, i.e. all catalog 4 CSVs
            # Drop all columns after the class column.
            dataset = dataset.drop(dataset.columns[30:], axis=1)
            y = dataset['class']
            dataset = dataset.drop(dataset.columns[13:15], axis=1)
            x = dataset.drop(dataset.columns[0:7], axis=1)  # drop identifier columns
    sc = MinMaxScaler(feature_range=(0, 1))
    x = sc.fit_transform(x)  # scale features to [0, 1]
    return x, y
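

# Hypothetical usage sketch (not exercised by the original pipeline): with
# crossVal=True, loadDataset returns the scaled spectrometric redshift as a
# single-column feature matrix, e.g.
#
#     x_z, y_z = loadDataset('cat1.csv', crossVal=True)
#     # x_z.shape == (n_rows, 1)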

# Project each catalog onto its first two principal components and save the result.
pca = PCA(n_components=2)
for filename in filenames:
    totSetx, totSety = loadDataset(filename)
    principalComponents = pca.fit_transform(totSetx)
    principalDf = pd.DataFrame(data=principalComponents,
                               columns=['principal component 1', 'principal component 2'])
    finalDf = pd.concat([principalDf, totSety], axis=1)
    newFileName = 'pca_' + filename
    finalDf.to_csv(newFileName, index=False)
    print(newFileName)
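
# Optional visual check (a minimal sketch, assuming matplotlib is available;
# it is not imported or used by the original script). Each saved pca_*.csv can
# be scatter-plotted on its two principal components, coloured by class:
#
#     import matplotlib.pyplot as plt
#     df = pd.read_csv('pca_cat1.csv')
#     for label in df['class'].unique():
#         subset = df[df['class'] == label]
#         plt.scatter(subset['principal component 1'],
#                     subset['principal component 2'], s=5, label=str(label))
#     plt.xlabel('principal component 1')
#     plt.ylabel('principal component 2')
#     plt.legend()
#     plt.show()
#
# The fraction of variance captured by the two components is available from
# pca.explained_variance_ratio_ after each fit_transform call.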