-
Notifications
You must be signed in to change notification settings - Fork 1
/
eda.py
executable file
·162 lines (115 loc) · 4.24 KB
/
eda.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
### Importing the Libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sn
from dateutil import parser
# Read the source CSV into the working DataFrame.
dataset = pd.read_csv('appdata10.csv')

### EDA ###
# Bare expressions: their results are only shown in an interactive session.
dataset.head()
dataset.describe()

## Data Cleaning
# `hour` is handled as text here: keep the characters at positions 1 and 2
# of each value and store them back as integers.
dataset['hour'] = dataset['hour'].str[1:3].astype(int)
## Plotting
# Copy of the data without the columns that are not plotted below.
dataset2 = dataset.copy().drop(
    ['user', 'screen_list', 'enrolled_date', 'first_open', 'enrolled'],
    axis='columns',
)
# Histograms
plt.figure(figsize=(10, 6))
for tick_group in ('xtick', 'ytick'):
    plt.rc(tick_group, labelsize=8)
plt.suptitle('Histograms of Numerical Columns', fontsize=14)

# One panel per column of dataset2 on a 3x3 grid; each histogram gets as many
# bins as the column has distinct values.
for panel, column_name in enumerate(dataset2.columns.values, start=1):
    axes = plt.subplot(3, 3, panel)
    axes.set_title(column_name, fontsize=8)
    column_data = dataset2.iloc[:, panel - 1]
    plt.hist(column_data, bins=np.size(column_data.unique()), color='#3F5D7D')

# Fit the panels inside the given rectangle of the figure.
plt.tight_layout(rect=[0, 0.03, 1, 0.95])
# Close the figure without displaying it.
plt.close()
# Correlation with Response
# Bar chart of the correlation of every dataset2 column with `enrolled`.
dataset2.corrwith(dataset['enrolled']).plot(
    kind='bar',
    figsize=(20, 10),
    title='Correlation with Response Variable',
    fontsize=15,
    rot=45,
    grid=True,
    color=['#32a852', '#005c1f'],
)
# Close the figure without displaying it.
plt.close()
# Correlation Matrix
sn.set(style="white", font_scale=2)

# Pairwise correlations between the columns of dataset2.
corr = dataset2.corr()

# Boolean mask that is True on the upper triangle (diagonal included); the
# heatmap leaves masked cells blank, so each pair is drawn only once.
# The builtin `bool` is used as the dtype because the `np.bool` alias was
# deprecated in NumPy 1.20 and removed in NumPy 1.24.
mask = np.zeros_like(corr, dtype=bool)
mask[np.triu_indices_from(mask)] = True

# Set up the matplotlib figure.
f, ax = plt.subplots(figsize=(18, 15))
f.suptitle("Correlation Matrix", fontsize=40)

# Custom diverging colormap.
cmap = sn.diverging_palette(220, 10, as_cmap=True)

# Draw the heatmap with the mask and square cells.
sn.heatmap(
    corr,
    mask=mask,
    cmap=cmap,
    vmax=.3,
    center=0,
    square=True,
    linewidths=.5,
    cbar_kws={"shrink": .5},
)
# Close the figure without displaying it.
plt.close()
### Feature Engineering
# Bare expression: only displays the column dtypes in an interactive session.
dataset.dtypes

# Parse the timestamp strings with dateutil. Values of `enrolled_date` that
# are not strings are passed through unchanged.
dataset["first_open"] = [
    parser.parse(row_data) for row_data in dataset["first_open"]
]
dataset["enrolled_date"] = [
    parser.parse(row_data) if isinstance(row_data, str) else row_data
    for row_data in dataset["enrolled_date"]
]

# Whole hours between `first_open` and `enrolled_date`, as floats (NaN where
# the difference is missing).
# `.astype('timedelta64[h]')` is not used because pandas >= 2.0 raises
# TypeError for resolutions other than s/ms/us/ns. Flooring the hour count
# keeps the whole-hour result that the cast produced on older pandas.
dataset["difference"] = np.floor(
    (dataset["enrolled_date"] - dataset["first_open"]).dt.total_seconds() / 3600
)

plt.figure(figsize=(10, 6))
plt.rc('xtick', labelsize=8)
plt.rc('ytick', labelsize=8)

# Full distribution of the hour differences.
plt.hist(dataset["difference"].dropna(), color='#3F5D7D')
plt.title('Distribution of Time-Since Enrolled', fontsize=12)
plt.show()

# Same distribution restricted to the 0-100 hour range.
plt.hist(dataset["difference"].dropna(), color='#3F5D7D', range=[0, 100])
plt.title('Distribution of Time-Since Enrolled', fontsize=12)
plt.show()

# Rows whose difference exceeds 48 hours get `enrolled` reset to 0.
dataset.loc[dataset["difference"] > 48, 'enrolled'] = 0

# The helper column and the two timestamp columns are not kept.
dataset = dataset.drop(columns=['difference', 'enrolled_date', 'first_open'])
### Formatting the screen_list Field
# The list of top screens is read from a separate CSV (column `top_screens`).
top_screens = pd.read_csv('top_screens.csv').top_screens.values

# Split every comma-separated `screen_list` value into its individual screen
# names once, instead of repeatedly searching and editing the raw string.
screen_tokens = dataset['screen_list'].astype(str).str.split(',')
screen_sets = screen_tokens.map(set)

# One 0/1 column per top screen: 1 when the row's list contains that screen.
# Whole names are compared rather than substrings (the previous
# `str.contains(sc)` was a substring/regex match), so a screen such as
# "Saving1" is not also flagged for rows that only contain "Saving10", and
# the result does not depend on the `regex` default of the pandas string
# methods.
for sc in top_screens:
    dataset[sc] = screen_sets.map(lambda seen: sc in seen).astype(int)

# `Other` counts the entries of each list that are not top screens
# (repeated entries are counted every time they occur).
top_screen_names = set(top_screens)
dataset['Other'] = screen_tokens.map(
    lambda names: sum(name not in top_screen_names for name in names)
).astype(int)

dataset = dataset.drop(columns=['screen_list'])
# Funnels: groups of highly correlated screen columns are each collapsed into
# a single count column, and the individual screen columns are dropped.
savings_screens = [
    "Saving1", "Saving2", "Saving2Amount", "Saving4", "Saving5",
    "Saving6", "Saving7", "Saving8", "Saving9", "Saving10",
]
cm_screens = [
    "Credit1", "Credit2", "Credit3", "Credit3Container", "Credit3Dashboard",
]
cc_screens = ["CC1", "CC1Category", "CC3"]
loan_screens = ["Loan", "Loan2", "Loan3", "Loan4"]

# Processed in this order so the count columns are appended in the same
# sequence: SavingsCount, CMCount, CCCount, LoansCount.
for count_column, funnel_screens in (
    ("SavingsCount", savings_screens),
    ("CMCount", cm_screens),
    ("CCCount", cc_screens),
    ("LoansCount", loan_screens),
):
    # Row-wise sum of the 0/1 screen columns in this funnel.
    dataset[count_column] = dataset[funnel_screens].sum(axis=1)
    dataset = dataset.drop(columns=funnel_screens)
#### Saving Results ####
# Bare expressions: they only display output in an interactive session.
dataset.head()
dataset.describe()
dataset.columns

# Write the engineered table to disk without the index column.
dataset.to_csv(path_or_buf='new_appdata10.csv', index=False)