-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathsrc.py
132 lines (104 loc) · 4.22 KB
/
src.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
#!/usr/bin/env python
# coding: utf-8
# In[3]:
# Notebook cell converted to a comment: a bare `pip` command is not valid
# Python and makes the whole file fail to parse. Install the dependency from
# a shell before running this script:
#     pip install -U scikit-learn
# In[8]:
from imblearn.over_sampling import SMOTE
import numpy as np
from scipy.sparse import csr_matrix
from sklearn.model_selection import GridSearchCV
from sklearn.naive_bayes import BernoulliNB
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import f1_score, make_scorer
from sklearn.preprocessing import MaxAbsScaler
from sklearn.feature_selection import SelectKBest, chi2
from imblearn.pipeline import Pipeline
# Data helpers: load_dataset() reads the text files and
# transform_to_sparse_matrix() converts the parsed rows into a CSR matrix.
#########################
def load_dataset(filename, is_train=True):
    """Load a whitespace-separated dataset of active feature indices.

    Every line of the file describes one sample as a sequence of integer
    tokens. When ``is_train`` is true, the first token is the integer label
    and the remaining tokens are feature indices; otherwise every token is a
    feature index.

    Args:
        filename: Path of the text file to read.
        is_train: Whether each line starts with a label.

    Returns:
        ``(data, labels)`` when ``is_train`` is true, otherwise just ``data``.
        ``data`` holds one list per sample, made of ``(feature_index, 1)``
        tuples; ``labels`` is a list of ints parallel to ``data``.

    Raises:
        ValueError: If a token cannot be parsed as an integer.
    """
    data = []
    labels = [] if is_train else None
    with open(filename) as file:
        for line in file:
            parts = line.split()
            if is_train:
                if not parts:
                    # A blank line (typically a trailing newline) has no
                    # label, so it cannot be a sample. Skip it instead of
                    # failing with IndexError on parts[0].
                    continue
                labels.append(int(parts[0]))
                parts = parts[1:]
            # In test mode a blank line is kept as an empty row so that the
            # number of rows still matches the number of input lines.
            data.append([(int(feat), 1) for feat in parts])
    return (data, labels) if is_train else data
# Function to transform data into a sparse matrix
def transform_to_sparse_matrix(data, num_features):
    """Build a CSR matrix from per-sample ``(column, value)`` pairs.

    Args:
        data: Sized sequence of samples; each sample is an iterable of
            ``(feature_index, value)`` pairs.
        num_features: Number of columns of the resulting matrix.

    Returns:
        A ``csr_matrix`` of shape ``(len(data), num_features)`` in which row
        ``i`` carries the values listed for sample ``i``.
    """
    # Three parallel coordinate lists: one entry per (sample, pair).
    rows = [idx for idx, sample in enumerate(data) for _pair in sample]
    cols = [column for sample in data for column, _value in sample]
    vals = [value for sample in data for _column, value in sample]
    shape = (len(data), num_features)
    return csr_matrix((vals, (rows, cols)), shape=shape)
#########################
# Read the labelled training samples and the unlabelled test samples.
train_data, train_labels = load_dataset('Train_data.txt', is_train=True)
test_data = load_dataset('Test_data.txt', is_train=False)

# The matrix width is one more than the largest feature index seen in either
# file, so both matrices share the same column space.
train_features = [pair[0] for sample in train_data for pair in sample]
test_features = [pair[0] for sample in test_data for pair in sample]
num_features = 1 + max(train_features + test_features)

X_train = transform_to_sparse_matrix(train_data, num_features)
X_test = transform_to_sparse_matrix(test_data, num_features)
y_train = np.array(train_labels)
# Scorer handed to the grid search: F1 with f1_score's default arguments.
f1_scorer = make_scorer(f1_score)


def _build_pipeline(estimator):
    """Return the shared scale -> select -> oversample chain ending in ``estimator``."""
    steps = [
        ('maxabsscaler', MaxAbsScaler()),
        ('feature_selection', SelectKBest(chi2)),
        ('smote', SMOTE(random_state=42)),
        ('model', estimator),
    ]
    # Pipeline is the imblearn variant, which accepts the SMOTE sampler step.
    return Pipeline(steps)


# Candidate models, each paired with the hyper-parameter grid to search.
model_params = [
    dict(
        name='DecisionTree',
        pipeline=_build_pipeline(DecisionTreeClassifier(random_state=42)),
        params={
            # Example values; adjust based on dataset size and characteristics
            'feature_selection__k': [100, 200, 300],
            'model__max_depth': [10, 20, None],
            'model__min_samples_split': [2, 5, 10],
            'model__min_samples_leaf': [1, 2, 4],
            'model__criterion': ['gini', 'entropy'],
        },
    ),
    dict(
        name='NaiveBayes',
        pipeline=_build_pipeline(BernoulliNB()),
        params={
            'feature_selection__k': [100, 200, 300],
            'model__alpha': np.logspace(-3, 3, 7),
        },
    ),
]
# Run one 5-fold grid search per candidate and remember its best CV score.
results = []
for spec in model_params:
    search = GridSearchCV(
        spec['pipeline'],
        spec['params'],
        scoring=f1_scorer,
        cv=5,
        n_jobs=-1,
        verbose=1,
    )
    search.fit(X_train, y_train)
    score = search.best_score_
    print(f"{spec['name']} best score: {score}")
    results.append((score, spec['name'], search))

# Keep the candidate with the highest cross-validated F1 score.
winner = max(results, key=lambda entry: entry[0])
best_score, best_model_name, best_model_gscv = winner
print(f"Best Model: {best_model_name} with F1 score: {best_score}")

# Label the test matrix with the winning search object.
predictions = best_model_gscv.predict(X_test)

# One prediction per line, in test-row order.
with open('best_model_improved_predictions_final_28.txt', 'w') as out_file:
    out_file.writelines(f"{label}\n" for label in predictions)
print("Predictions have been written to 'best_model_improved_predictions_final_28.txt'")
# In[ ]:
# The test file is loaded without labels (is_train=False), so this script never
# defines y_test and the unguarded call always raised NameError. Score the
# predictions only when the user has supplied y_test (the actual test labels);
# otherwise leave the result as None.
_test_labels = globals().get("y_test")
test_f1_score = (
    f1_score(_test_labels, predictions) if _test_labels is not None else None
)