-
Notifications
You must be signed in to change notification settings - Fork 1
/
log_regression_experiment.py
72 lines (63 loc) · 2.26 KB
/
log_regression_experiment.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
import csv

import numpy as np
import pandas as pd
from sklearn.feature_extraction import DictVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import zero_one_loss

try:
    # scikit-learn >= 0.18 exposes the splitter here.
    from sklearn.model_selection import train_test_split
except ImportError:
    # Fallback for old releases; sklearn.cross_validation was removed in 0.20.
    from sklearn.cross_validation import train_test_split
# Load the raw training data from CSV.
df = pd.read_csv("./clean_loan_training.csv", low_memory=False)

# Columns excluded from the feature set.
UNUSED_COLUMNS = [
    'id',
    'member_id',
    'emp_title',
    'issue_d',
    'url',
    'desc',
    'title',
    'last_pymnt_d',
    'next_pymnt_d',
    'last_credit_pull_d',
]

# Removed because they are not available in the Lending Club API.
API_UNAVAILABLE_COLUMNS = [
    'pymnt_plan',
    'out_prncp',
    'out_prncp_inv',
    'total_pymnt',
    'total_pymnt_inv',
    'total_rec_prncp',
    'total_rec_int',
    'total_rec_late_fee',
    'recoveries',
    'collection_recovery_fee',
]

# Like the individual `del` statements this replaces, drop() raises KeyError
# if any listed column is missing from the CSV.
df = df.drop(UNUSED_COLUMNS + API_UNAVAILABLE_COLUMNS, axis=1)

# One {column: value} dict per row, in row order. Building the records
# directly avoids transposing the whole frame and keeps the rows in a list,
# so their order (which must line up with the target file) does not depend
# on dict ordering or on index labels being unique.
loan_training_data = df.to_dict(orient='records')
# Load the target data. For now we naively categorize anything late (or
# charged off) as a default, encoded as 0; every other status is encoded as 1.
DEFAULT_STATUSES = frozenset([
    'Late (31-120 days)',
    'Charged Off',
    'Late (16-30 days)',
])

# Only the first column of each row is read. The file is opened in a `with`
# block so the handle is always closed; plain text mode is used because the
# 'U' flag was removed in Python 3.11 and Python 3 text mode already
# translates newlines.
with open('./clean_loan_target.csv') as target_file:
    target = np.array([
        0 if row[0] in DEFAULT_STATUSES else 1
        for row in csv.reader(target_file)
    ])
# Vectorize the raw training data -- take string features and encode them
# with one-hot encoding, producing a dense 2-D array.
vec = DictVectorizer()
training_data = vec.fit_transform(loan_training_data).toarray()

# The vectorizer can produce feature vectors containing NaN (only 116 when
# counted; presumably from missing values in the CSV -- not verified here).
# A NaN or infinity anywhere in a row makes that row's sum non-finite, so one
# vectorized check on the row sums finds every affected sample.
row_is_finite = np.isfinite(training_data.sum(axis=1))

# Drop the same rows from the labels and the features so they stay aligned.
target = target[row_is_finite]
training_data = training_data[row_is_finite]
# Split the data into training and test sets. The seed is fixed so that the
# reported loss is reproducible from run to run.
training_data, test_data, training_target, test_target = train_test_split(
    training_data, target, random_state=0)

# Finally train the model.
clf = LogisticRegression()
clf.fit(training_data, training_target)

# Predict using the held-out test data.
predicted = clf.predict(test_data)

# Print the zero-one loss. Arguments follow the documented
# (y_true, y_pred) order.
print(zero_one_loss(test_target, predicted))