-
Notifications
You must be signed in to change notification settings - Fork 0
/
testAllSignificance.py
128 lines (102 loc) · 4.35 KB
/
testAllSignificance.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
# -*- coding: utf-8 -*-
"""
Created on Thu Jul 13 17:28:59 2017
@author: jasplund
"""
#same header/import information as before.
import matplotlib.pyplot as plt
#matplotlib inline
import random
import numpy as np
import pandas as pd
from sklearn import datasets, svm, cross_validation, tree, preprocessing, metrics
from sklearn.neural_network import MLPClassifier
import sklearn.ensemble as ske
from scipy.stats import ttest_ind
from scipy import stats
from matplotlib import style
style.use('fivethirtyeight')
def ttest(df, val, group1, group2, output, significance_level=0.05):
    """Check whether two groups of rows differ in the mean of a column.

    Rows of ``df`` are selected by the value held in column ``val``; the
    ``output`` values of the rows equal to ``group1`` are compared with
    those of the rows equal to ``group2`` using ``scipy.stats.ttest_ind``
    with its default settings.

    Args:
        df: DataFrame holding both the grouping and the output column.
        val: Name of the column whose values define the groups.
        group1: Value of ``df[val]`` identifying the first group.
        group2: Value of ``df[val]`` identifying the second group.
        output: Name of the numeric column whose means are compared.
        significance_level: Largest p-value still treated as significant.

    Returns:
        1 if the p-value is at most ``significance_level`` (the null
        hypothesis of equal means is rejected), otherwise 0.  An undefined
        (NaN) p-value, e.g. when a group has no rows, also yields 0.
    """
    # Missing observations are dropped first: a single NaN would otherwise
    # propagate into the p-value and silently report "not significant".
    sample1 = df.loc[df[val] == group1, output].dropna()
    sample2 = df.loc[df[val] == group2, output].dropna()
    _, p = ttest_ind(sample1, sample2)
    # NaN compares False against everything, so an undefined test gives 0.
    return 1 if p <= significance_level else 0
def ttest_test():
    """Smoke-test ``ttest`` against the Ames housing training data.

    Loads 'train.csv' from the working directory, compares 'SalePrice'
    between the 'Pave' and 'Grvl' values of the 'Alley' column, and prints
    the outcome together with a verdict.  A result of 1 is the one this
    check reports as correct.
    """
    print("We are testing the ttest function on the Ames housing data.")
    housing = pd.read_csv('train.csv', index_col=None)
    outcome = ttest(housing, 'Alley', 'Pave', 'Grvl', 'SalePrice')
    print("The answer on the Ames Housing Data with Gravel vs. Paving is:")
    print(str(outcome))
    verdict = "This is correct." if outcome == 1 else "This is incorrect."
    print(verdict)
#data.boxplot('weight', by='group', figsize=(12, 8))
#ctrl = data['weight'][data.group == 'ctrl']
#grps = pd.unique(data.group.values)
#d_data = {grp:data['weight'][data.group == grp] for grp in grps}
#k = len(pd.unique(data.group)) # number of conditions
#N = len(data.values) # conditions times participants
#n = data.groupby('group').size()[0] #Participants in each condition
#SSbetween = (sum(data.groupby('group').sum()['weight']**2)/n) - (data['weight'].sum()**2)/N
#sum_y_squared = sum([value**2 for value in data['weight'].values])
#SSwithin = sum_y_squared - sum(data.groupby('group').sum()['weight']**2)/n
#SStotal = sum_y_squared - (data['weight'].sum()**2)/N
#MSbetween = SSbetween/DFbetween
#F = MSbetween/MSwithin
#p = stats.f.sf(F, DFbetween, DFwithin)
#eta_sqrd = SSbetween/SStotal
#om_sqrd = (SSbetween - (DFbetween * MSwithin))/(SStotal + MSwithin)
def anova_assumptions():
    """Placeholder for checking the assumptions of an ANOVA test.

    Not implemented yet: the function performs no checks and returns None.
    """
    # TODO: test assumptions for ANOVA test here.
    return None
def anova_test():
    """Placeholder for the ANOVA significance test.

    Not implemented yet: the function performs no test and returns None.
    """
    # TODO: If assumptions for ANOVA are satisfied use ANOVA test. This will
    # test if there is a significant difference between some pair of
    # samples. We do not need to test to find which one it is.
    return None
def best_fit_slope_and_intercept(xs, ys):
    """Return the least-squares slope and intercept of ``ys`` against ``xs``.

    Args:
        xs: One-dimensional sequence or array of x values.
        ys: Sequence or array of y values with the same shape as ``xs``.

    Returns:
        Tuple ``(m, b)`` such that ``m * x + b`` is the best-fit line.

    Raises:
        ValueError: If the inputs are empty, differ in shape, or every x
            value is identical (the slope is undefined in that case).
    """
    # Converting up front lets plain lists be passed as well as arrays.
    xs = np.asarray(xs, dtype=float)
    ys = np.asarray(ys, dtype=float)
    if xs.shape != ys.shape:
        raise ValueError("xs and ys must have the same shape")
    if xs.size == 0:
        raise ValueError("xs and ys must not be empty")
    if np.all(xs == xs.flat[0]):
        raise ValueError("slope is undefined when all xs are identical")

    x_mean = xs.mean()
    y_mean = ys.mean()
    # Centred form of the usual formula: algebraically the same as
    # (mean(x)mean(y) - mean(xy)) / (mean(x)^2 - mean(x^2)) but it avoids
    # subtracting two large, nearly equal quantities.
    x_dev = xs - x_mean
    m = np.sum(x_dev * (ys - y_mean)) / np.sum(x_dev * x_dev)
    b = y_mean - m * x_mean
    return m, b
def squared_error(ys_orig, ys_line):
    """Return the sum of squared differences between two sets of y values.

    Args:
        ys_orig: Observed y values (sequence or array).
        ys_line: Predicted y values, same shape as ``ys_orig``.

    Returns:
        The sum over all points of ``(ys_line - ys_orig) ** 2``.
    """
    # Plain lists do not support element-wise subtraction, so both inputs
    # are viewed as arrays before differencing.
    residuals = np.asarray(ys_line, dtype=float) - np.asarray(ys_orig, dtype=float)
    return np.sum(residuals ** 2)
def coefficient_of_determination(ys_orig, ys_line):
    """Return the coefficient of determination (R squared) of a fitted line.

    Args:
        ys_orig: Observed y values (sequence or array).
        ys_line: y values predicted by the fitted line, same shape.

    Returns:
        ``1 - SE(line) / SE(mean)``, where ``SE`` is ``squared_error`` and
        the second term measures the error of always predicting the mean of
        ``ys_orig``.  The ratio is undefined when ``ys_orig`` is constant,
        because that error is then zero.
    """
    ys_orig = np.asarray(ys_orig, dtype=float)
    ys_line = np.asarray(ys_line, dtype=float)
    # Baseline "model": the mean of the observations repeated at every point.
    ys_mean_line = np.full_like(ys_orig, ys_orig.mean())
    squared_error_reg = squared_error(ys_orig, ys_line)
    squared_error_y_mean = squared_error(ys_orig, ys_mean_line)
    return 1 - (squared_error_reg / squared_error_y_mean)
def reg_line(xdata, ydata):
    """Return the best-fit line evaluated at every x value.

    Args:
        xdata: Sequence of x values.
        ydata: Sequence of y values, same length as ``xdata``.

    Returns:
        List with one predicted y value ``m * x + b`` per entry of
        ``xdata``, where ``m`` and ``b`` come from
        ``best_fit_slope_and_intercept``.
    """
    # create best fit line
    xs = np.array(xdata)
    ys = np.array(ydata)
    m, b = best_fit_slope_and_intercept(xs, ys)
    return [(m * x) + b for x in xs]
def reg_assumptions():
    """Placeholder for checking the assumptions of regression analysis.

    Not implemented yet: the function performs no checks and returns None.
    """
    # TODO: Test assumptions for regression analysis.
    # First test that the scatterplot is close to linear.
    # Next, test that the residual plot shows a random pattern. The residual
    # value is the observed value minus the predicted value.
    return None
def reg_test(xdata, ydata):
    """Fit a line to the data and return its coefficient of determination.

    Args:
        xdata: Sequence of values of the independent variable X.
        ydata: Sequence of values of the dependent variable Y.

    Returns:
        R squared of the least-squares line through ``(xdata, ydata)``, as
        computed by ``coefficient_of_determination``.
    """
    # TODO: run hypothesis test to determine whether there is a significant
    # linear relationship between an independent variable X and a dependent
    # variable Y; for now only the goodness of fit is reported.
    ys = np.array(ydata)
    regression_line = reg_line(xdata, ydata)
    r_squared = coefficient_of_determination(ys, regression_line)
    return r_squared
def sig_test():
    """Run the alley t-test once for every non-target column of the data.

    Loads 'train.csv' from the working directory and, for each column other
    than 'SalePrice', calls ``ttest`` comparing 'SalePrice' between the
    'Pave' and 'Grvl' values of 'Alley'.  Nothing is returned or printed.
    """
    housing = pd.read_csv('train.csv', index_col=None)
    predictors = [column for column in housing if column != 'SalePrice']
    # NOTE(review): the column name is never passed to ttest, so the same
    # comparison is repeated for every column and its result is discarded.
    # Presumably each column was meant to be tested in turn -- confirm.
    for _ in predictors:
        ttest(housing, 'Alley', 'Pave', 'Grvl', 'SalePrice')
def main():
    """Script entry point; currently performs no work and returns None."""
    # ttest_test()
    # The following code will help reshape a list of lists prediction in any
    # sklearn algorithm
    # example_measures = np.array([[prediction 1], [prediction 2]])
    # example_measures = example_measures.reshape(len(example_measures,-1))
    # prediction = clf.predict(example_measures)
    return None
# Run main() only when the file is executed as a script, not on import.
if __name__ == "__main__":
    main()