-
Notifications
You must be signed in to change notification settings - Fork 1
/
data_explore_library.py
188 lines (156 loc) · 4.9 KB
/
data_explore_library.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
'''
Vedika Ahuja, Bhargavi Ganesh, and Pete Rodrigue
Library of functions used for data exploration.
'''
import os
import pandas as pd
import numpy as np
import math
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn import (svm, ensemble, tree,
linear_model, neighbors, naive_bayes, dummy)
from sklearn.model_selection import train_test_split
from sklearn import metrics
from sklearn.tree import export_graphviz
from sklearn.metrics import precision_recall_curve
from datetime import date, datetime, timedelta
from dateutil.relativedelta import relativedelta
from sklearn.model_selection import ParameterGrid
# Plotting code adapted from: https://machinelearningmastery.com/visualize-machine-learning-data-python-pandas/
def file_to_dataframe(filename):
    '''
    Loads a CSV file into a pandas dataframe.

    Input:
        filename: path of the CSV file to read
    Returns:
        pandas dataframe with the file contents, or None when
        no file exists at the given path
    '''
    # Guard clause: a missing path yields None rather than an error.
    if not os.path.exists(filename):
        return None
    return pd.read_csv(filename)
def na_summary(df):
    '''
    Counts the missing (NA) values in each column of a dataframe.

    Input:
        df: pandas dataframe
    Returns:
        pandas series indexed by column name holding the number
        of NA entries found in that column
    '''
    missing_mask = df.isna()
    # Summing the boolean mask down the rows gives one count per column.
    na_counts = missing_mask.sum(axis=0)
    return na_counts
def describe_data(df, vars_to_describe=None):
    '''
    Produces the descriptive statistics table that pandas'
    DataFrame.describe() builds (count, mean, std, min,
    quartiles, max, ...).

    Input:
        df: pandas dataframe
        vars_to_describe: (optional) list of column names to restrict
            the summary to; when omitted or empty the whole dataframe
            is described
    Returns:
        pandas dataframe of summary statistics, one column per
        described variable
    '''
    selected = df[vars_to_describe] if vars_to_describe else df
    return selected.describe()
def histograms(df, vars_to_describe=None):
    '''
    Draws histograms for the columns of df with DataFrame.hist(),
    saves the figure to 'output/histogram.png' and then displays it.

    Input:
        df: pandas dataframe
        vars_to_describe: (optional) list of variables to describe;
            when omitted or empty, the whole dataframe is plotted
    Returns:
        None. Side effects: sets the global matplotlib figure size
        to 16 x 12, writes the image file, and shows the figure.
    '''
    if vars_to_describe:
        df = df[vars_to_describe]
    plt.rcParams['figure.figsize'] = 16, 12
    df.hist()
    out_path = 'output/histogram.png'
    # Create the destination folder if needed so savefig cannot fail
    # with FileNotFoundError on a fresh checkout.
    os.makedirs(os.path.dirname(out_path), exist_ok=True)
    # Save BEFORE show(): once show() returns, the figure has been
    # released and a later savefig would write a blank image.
    plt.savefig(out_path)
    plt.show()
def correlations(df, vars_to_describe=None):
    '''
    Computes the pairwise correlation matrix of the numeric
    columns of a dataframe.

    Input:
        df: pandas dataframe
        vars_to_describe: (optional) list of variables to describe;
            when omitted or empty, all columns of df are considered
    Returns:
        pandas dataframe holding the correlation between every pair
        of numeric columns (non-numeric columns are left out)
    '''
    if vars_to_describe:
        df = df[vars_to_describe]
    try:
        # pandas >= 2.0 no longer drops non-numeric columns by default
        # and raises on them, so ask for numeric columns explicitly.
        return df.corr(numeric_only=True)
    except TypeError:
        # Older pandas releases do not accept the numeric_only keyword;
        # there, corr() already ignores non-numeric columns on its own.
        return df.corr()
def correlation_matrix(correlations):
    '''
    Plots a correlation table as a colored matrix, saves the figure
    to 'output/plots/corr_matrix.png' and then displays it.

    Input:
        correlations: correlation table (pandas dataframe whose
            columns name the variables)
    Returns:
        None. Side effects: sets the global matplotlib figure size
        to 10 x 10, writes the image file, and shows the figure.
    '''
    plt.rcParams['figure.figsize'] = 10, 10
    names = correlations.columns
    fig = plt.figure()
    ax = fig.add_subplot(111)
    # Pin the color scale to the full [-1, 1] correlation range so
    # colors are comparable between different tables.
    cax = ax.matshow(correlations, vmin=-1, vmax=1)
    fig.colorbar(cax)
    # One tick per variable on both axes, labelled with its name.
    ticks = np.arange(0, len(names), 1)
    ax.set_xticks(ticks)
    ax.set_yticks(ticks)
    ax.set_xticklabels(names, rotation=30, rotation_mode='anchor', ha='left')
    ax.set_yticklabels(names)
    out_path = 'output/plots/corr_matrix.png'
    # Create the destination folder if needed so savefig cannot fail
    # with FileNotFoundError on a fresh checkout.
    os.makedirs(os.path.dirname(out_path), exist_ok=True)
    # Save BEFORE show(): once show() returns, the figure has been
    # released and a later savefig would write a blank image.
    plt.savefig(out_path)
    plt.show()
def pairplot(df, vars_to_describe=None):
    '''
    Draws a seaborn pairplot showing the pairwise relationships
    between variables of a dataframe and displays it.

    Inputs:
        df: pandas dataframe
        vars_to_describe: (optional) list of variables to describe;
            it is forwarded as seaborn's `vars` argument, so None
            leaves the choice of columns to seaborn
    Returns:
        None. Side effects: sets the global matplotlib figure size
        to 20 x 10 and shows the figure.
    '''
    # vars_to_describe now defaults to None, matching the docstring and
    # the other plotting helpers; existing positional callers still work.
    plt.rcParams['figure.figsize'] = (20, 10)
    sns.pairplot(df, vars=vars_to_describe, dropna=True, height=3.5)
    plt.show()
def boxplots(df, vars_to_describe=None):
    '''
    Draws one box plot per column of a dataframe, each in its own
    subplot, saves the figure to 'output/plots/boxplot.png' and
    then displays it.

    Inputs:
        df: pandas dataframe
        vars_to_describe: (optional) list of variables to describe;
            when omitted or empty, all columns of df are used
    Returns:
        None. Side effects: sets the global matplotlib figure size
        to 16 x 12, writes the image file, and shows the figure.
    '''
    if vars_to_describe:
        df = df[vars_to_describe]
    plt.rcParams['figure.figsize'] = 16, 12
    # Grid of 5 rows; the column count grows so that every variable
    # gets its own independent (unshared-axis) subplot.
    df.plot(kind='box', subplots=True,
            layout=(5, math.ceil(len(df.columns) / 5)),
            sharex=False, sharey=False)
    out_path = 'output/plots/boxplot.png'
    # Create the destination folder if needed so savefig cannot fail
    # with FileNotFoundError on a fresh checkout.
    os.makedirs(os.path.dirname(out_path), exist_ok=True)
    # Save BEFORE show(): once show() returns, the figure has been
    # released and a later savefig would write a blank image.
    plt.savefig(out_path)
    plt.show()
def identify_ol(df, vars_to_describe=None):
    '''
    Returns the rows of a dataframe that contain at least one
    outlier, using the 1.5 * IQR rule: a value is an outlier when it
    lies below Q1 - 1.5 * IQR or above Q3 + 1.5 * IQR of its column.

    Inputs:
        df: pandas dataframe (not modified)
        vars_to_describe: (optional) list of variables to describe;
            when omitted or empty, all columns of df are considered
    Returns:
        pandas dataframe with the rows (restricted to the selected
        columns) in which any numeric column holds an outlier
    '''
    subset_df = df[vars_to_describe] if vars_to_describe else df
    # Quantiles and </> comparisons are only meaningful for numeric
    # data; computing the fences on numeric columns alone keeps the
    # function from raising when df also carries text columns.
    numeric_df = subset_df.select_dtypes(include='number')
    Q1 = numeric_df.quantile(0.25)
    Q3 = numeric_df.quantile(0.75)
    IQR = Q3 - Q1
    lower_fence = Q1 - 1.5 * IQR
    upper_fence = Q3 + 1.5 * IQR
    is_outlier = (numeric_df < lower_fence) | (numeric_df > upper_fence)
    # Boolean-mask indexing builds a new dataframe, so the caller's df
    # stays untouched without an up-front deep copy.
    df_out = subset_df[is_outlier.any(axis=1)]
    return df_out
def col_aggregation(df, col_to_group):
    '''
    Builds a frequency table for the values of a column.

    Inputs:
        df: pandas dataframe
        col_to_group: name of the column to group by
    Returns:
        pandas dataframe with one row per group, holding the group
        value, its row 'count', and its 'percent' (count divided by
        the total number of rows in df), sorted by percent in
        descending order
    '''
    group_sizes = df.groupby(col_to_group).size()
    table = group_sizes.reset_index(name='count')
    total_rows = len(df)
    table['percent'] = table['count'] / total_rows
    ordered = table.sort_values(by='percent', ascending=False)
    return ordered