# app.py (forked from rjjfox/ab-test-calculator)
import streamlit as st
import matplotlib.pyplot as plt

from functions import create_plotly_table, local_css, percentage_format
from bayesian import Bayesian
from frequentist import Frequentist

st.set_page_config(
    page_title="AB Test Calculator",
    page_icon="https://rfoxdata.co.uk/assets/favicon/favicon-32x32.png",
)

roboto = {"fontname": "Roboto", "size": "12"}
roboto_title = {"fontname": "Roboto", "size": "14", "weight": "bold"}
roboto_bold = {"fontname": "Roboto", "size": "12", "weight": "bold"}
roboto_small = {"fontname": "Roboto", "size": "10"}

local_css("style.css")

font = {"family": "sans-serif", "sans-serif": "roboto", "size": 11}
plt.rc("font", **font)

"""
# AB test calculator
_Enter your test data into the sidebar and choose either a Bayesian or
Frequentist testing approach. Below is Bayesian by default._
---
"""
# Sidebar
st.sidebar.markdown(
    """
## Approach
"""
)

method = st.sidebar.radio("Bayesian vs. Frequentist", ["Bayesian", "Frequentist"])

st.sidebar.markdown(
    """
## Test data
"""
)

visitors_A = st.sidebar.number_input("Visitors A", value=50000, step=100)
conversions_A = st.sidebar.number_input("Conversions A", value=1500, step=10)
visitors_B = st.sidebar.number_input("Visitors B", value=50000, step=100)
conversions_B = st.sidebar.number_input("Conversions B", value=1560, step=10)

st.sidebar.markdown(
    """
## Frequentist settings
"""
)

alpha_input = 1 - st.sidebar.slider(
    "Significance level", value=0.95, min_value=0.5, max_value=0.99
)
tails_input = st.sidebar.selectbox(
    "One vs. two tail", ["One-tail", "Two-tail"], index=1
)

if tails_input == "One-tail":
    two_tails_bool = False
else:
    two_tails_bool = True
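
# Note: alpha_input is the Type I error rate implied by the slider (a 0.95
# significance level gives alpha = 0.05), and two_tails_bool switches between a
# one- and a two-sided test; both are only used by the Frequentist branch below.
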
b = Bayesian(visitors_A, conversions_A, visitors_B, conversions_B)
# Bayesian Method
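# The Bayesian class is assumed to model each variant's conversion rate with a
# Beta posterior (the conjugate prior for binomial data) and to estimate the
# probability that B beats A by Monte Carlo sampling from those posteriors. The
# method names below suggest this, but the exact priors live in bayesian.py.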
if method == "Bayesian":

    try:
        b.generate_posterior_samples()
        b.calculate_probabilities()
        b.plot_bayesian_probabilities()

        st.text("")

        bayesian_data = {
            "<b>Variant</b>": ["A", "B"],
            "<b>Visitors</b>": [f"{b.visitors_A:,}", f"{b.visitors_B:,}"],
            "<b>Conversions</b>": [b.conversions_A, b.conversions_B],
            "<b>Conversion rate</b>": [f"{b.control_cr:.2%}", f"{b.variant_cr:.2%}"],
            "<b>Uplift</b>": ["", f"{b.relative_difference:.2%}"],
            "<b>Likelihood of being better</b>": [f"{b.prob_A:.2%}", f"{b.prob_B:.2%}"],
        }

        create_plotly_table(bayesian_data)

"""
The below graph plots the simulated difference between the two
posterior distributions for the variants. It highlights the potential
range of difference between the two variants. More data will reduce
the range.
"""
st.text("")
b.plot_simulation_of_difference()
"""
---
### Recommended Reading
* [Bayesian Methods for Hackers by Cameron Davidson-Pilon]\
(https://github.com/CamDavidsonPilon/Probabilistic-Programming-and-Bayesian-Methods-for-Hackers)
* [Bayesian AB test calculator by AB Testguide]\
(https://abtestguide.com/bayesian/)
* [Beta distribution Wikipedia]\
(https://en.wikipedia.org/wiki/Beta_distribution)
"""
    except ValueError:
        t = """
        <img class='error'
            src='https://www.flaticon.com/svg/static/icons/svg/595/595067.svg'>
        """
        st.markdown(t, unsafe_allow_html=True)

        """
        An error occurred. Please check the test data input and try again. For
        Bayesian calculations, the conversion rate must be between 0 and 1.
        """

else:  # Frequentist
    f = Frequentist(
        visitors_A,
        conversions_A,
        visitors_B,
        conversions_B,
        alpha=alpha_input,
        two_tails=two_tails_bool,
    )

    z_score, p_value = f.z_test()
    power = f.get_power()
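
    # f.z_test() is assumed to run a pooled two-proportion z-test, i.e.
    #     z = (variant_cr - control_cr) / sqrt(p_pool * (1 - p_pool) * (1/n_A + 1/n_B))
    # with p_pool = (conversions_A + conversions_B) / (visitors_A + visitors_B);
    # the exact implementation lives in frequentist.py.
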
    if p_value < alpha_input:
        t = """
        <h3 class='frequentist_title'>Significant</h3>
        <img class='frequentist_icon'
            src='https://raw.githubusercontent.com/rjjfox/ab-test-calculator/master/img/positive-vote.png'>
        """
        st.markdown(t, unsafe_allow_html=True)

        if f.relative_difference < 0:
            t = (
                """
                <p>B's conversion rate is <span class='lower'>"""
                + "{:.2%}".format(abs(f.relative_difference))
                + """ lower</span> than A's CR."""
            )
            st.markdown(t, unsafe_allow_html=True)
        else:
            t = (
                """
                <p>B's conversion rate is <span class='higher'>"""
                + "{:.2%}".format(abs(f.relative_difference))
                + """ higher</span> than A's CR."""
            )
            st.markdown(t, unsafe_allow_html=True)
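
        # Note: f.relative_difference above is taken to be the relative uplift,
        # (variant_cr - control_cr) / control_cr, hence negative when B converts
        # worse than A; see frequentist.py for the actual definition.
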
f"""
You can be {1-alpha_input:.0%} confident that the result is true and
due to the changes made. There is a {alpha_input:.0%} chance that the result
is a false positive or type I error meaning the result is due to
random chance.
"""
    else:
        t = """
        <h3 class='frequentist_title'>Not significant</h3>
        <img class='frequentist_icon'
            src='https://raw.githubusercontent.com/rjjfox/ab-test-calculator/master/img/negative-vote.png'>
        """
        st.markdown(t, unsafe_allow_html=True)

        f"""
        There is not enough evidence to conclude that the observed
        {f.relative_difference:.2%} difference in conversion rates between
        variants A and B is real.
        """

        """
        Either collect more data to achieve greater precision in your test, or
        call the test inconclusive.
        """

    frequentist_data = {
        "<b>Variant</b>": ["A", "B"],
        "<b>Visitors</b>": [f"{f.visitors_A:,}", f"{f.visitors_B:,}"],
        "<b>Conversions</b>": [f.conversions_A, f.conversions_B],
        "<b>Conversion rate</b>": [f"{f.control_cr:.2%}", f"{f.variant_cr:.2%}"],
        "<b>Uplift</b>": ["", f"{f.relative_difference:.2%}"],
        "<b>Power</b>": ["", f"{power:.4f}"],
        "<b>Z-score</b>": ["", f"{z_score:.4f}"],
        "<b>P-value</b>": ["", f"{p_value:.4f}"],
    }

    create_plotly_table(frequentist_data)

    z = f.get_z_value()

    """
    According to the null hypothesis, there is no difference between the means.
    The plot below shows the distribution of the difference of the means that
    we would expect under the null hypothesis.
    """
    f.plot_test_visualisation()

    if p_value < alpha_input:
        f"""
        The shaded areas cover {alpha_input:.0%} of the distribution. Because the
        observed mean of the variant falls within this region, we can reject the
        null hypothesis with {1-alpha_input:.0%} confidence.
        """
    else:
        f"""
        The shaded areas cover {alpha_input:.0%} of the distribution. Because the
        observed mean of the variant does not fall within this region, we are
        unable to reject the null hypothesis and the result is not significant.
        A relative difference greater than {f.se_difference*z/f.control_cr:.2%}
        would be needed.
        """

"""
#### Statistical Power
"""
f"""
Power is a measure of how likely we are to detect a difference when there
is one with 80% being the generally accepted threshold for statistical
validity. **The power for your test is {power:.2%}**
"""
f"""
An alternative way of defining power is that it is our likelihood of
avoiding a Type II error or a false negative. Therefore the inverse of
power is 1 - {power:.2%} = {1-power:.2%} which is our likelihood of a
type II error.
"""
    f.plot_power()

    """
    ---

    ### Recommended reading

    * [Z-test Wikipedia](https://en.wikipedia.org/wiki/Z-test)
    * [The Math Behind AB Testing by Nguyen Ngo](https://towardsdatascience.com/the-math-behind-a-b-testing-with-example-code-part-1-of-2-7be752e1d06f)
    * [AB test calculator by AB Testguide](https://www.abtestguide.com/calc/)
    """

"""
### See also

* [Sample size calculator](https://abtestsamplesize.herokuapp.com/)
* [Github Repository](https://github.com/rjjfox/ab-test-calculator)
"""