-
Notifications
You must be signed in to change notification settings - Fork 22
/
ci_within.py
109 lines (94 loc) · 3.64 KB
/
ci_within.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
# Author Denis A. Engemann <[email protected]>
# Adjustments: Josef Perktold, Per Sederberg
#
# License: BSD (3-clause)
import numpy as np
from scipy import stats
import pandas as pd
def ci_within(df, indexvar, withinvars, measvar, confint=0.95,
copy=True):
""" Compute CI / SEM correction factor
Morey 2008, Cousinaueu 2005, Loftus & Masson, 1994
Also see R-cookbook http://goo.gl/QdwJl
Note. This functions helps to generate appropriate confidence
intervals for repeated measure designs.
Standard confidence intervals are are computed on normalized data
and a correction factor is applied that prevents insanely small values.
df : instance of pandas.DataFrame
The data frame objetct.
indexvar : str
The column name of of the identifier variable that representing
subjects or repeated measures
withinvars : str | list of str
The column names of the categorial data identifying random effects
measvar : str
The column name of the response measure
confint : float
The confidence interval
copy : bool
Whether to copy the data frame or not.
"""
if copy:
df = df.copy()
# Apply Cousinaueu's method:
# compute grand mean
mean_ = df[measvar].mean()
# compute subject means
subj_means = df.groupby(indexvar)[measvar].mean().values
for subj, smean_ in zip(df[indexvar].unique(), subj_means):
# center
#df[measvar][df[indexvar] == subj] -= smean_
df.loc[df[indexvar] == subj, measvar] -= smean_
# add grand average
#df[measvar][df[indexvar] == subj] += mean_
df.loc[df[indexvar] == subj, measvar] += mean_
def sem(x):
return x.std() / np.sqrt(len(x))
def ci(x):
se = sem(x)
return se * stats.t.interval(confint, len(x - 1))[1]
aggfuncs = [np.mean, np.std, sem, ci, len]
out = df.groupby(withinvars)[measvar].agg(aggfuncs)
# compute & apply correction factor
n_within = np.prod([len(df[k].unique()) for k in withinvars],
dtype= df[measvar].dtype)
cf = np.sqrt(n_within / (n_within - 1.))
for k in ['sem', 'std', 'ci']:
out[k] *= cf
out['ci'] = stats.t.isf((1 - confint) / 2., out['len'] - 1) * out['sem']
return out
if __name__ == '__main__':
ss = '''
subject condition value
1 pretest 59.4
2 pretest 46.4
3 pretest 46.0
4 pretest 49.0
5 pretest 32.5
6 pretest 45.2
7 pretest 60.3
8 pretest 54.3
9 pretest 45.4
10 pretest 38.9
1 posttest 64.5
2 posttest 52.4
3 posttest 49.7
4 posttest 48.7
5 posttest 37.4
6 posttest 49.5
7 posttest 59.9
8 posttest 54.1
9 posttest 49.6
10 posttest 48.5'''
import StringIO
df = pd.read_fwf(StringIO.StringIO(ss), widths=[8, 10, 6], header=1)
res = ci_within(df2, 'subject', ['condition'], 'value', confint=0.95)
print(res)
print(res[['len', 'mean', 'std', 'sem', 'ci']])
#ci is different from R
#http://www.cookbook-r.com/Graphs/Plotting_means_and_error_bars_%28ggplot2%29/#error-bars-for-within-subjects-variables
#dfwc <- summarySEwithin(dfw.long, measurevar="value", withinvars="condition",
# idvar="subject", na.rm=FALSE, conf.interval=.95)
# condition N value value_norm sd se ci
# posttest 10 51.43 51.43 2.262361 0.7154214 1.618396
# pretest 10 47.74 47.74 2.262361 0.7154214 1.618396