-
Notifications
You must be signed in to change notification settings - Fork 1
/
pareto_gen.py
101 lines (84 loc) · 3.32 KB
/
pareto_gen.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
import pandas as pd
import numpy as np
from bokeh.plotting import figure
from bokeh.layouts import gridplot
from bokeh.models import FuncTickFormatter, LinearAxis
from bokeh.models.ranges import Range1d
def pareto_gen(df,
               field,
               bins=4,
               labels=None,
               include_lowest=False,
               width=600,
               height=600,
               tools='pan,box_zoom,wheel_zoom,reset,save',
               title=None,
               x_axis_label=None,
               y_axis_label=None,
               **kwargs):
    """Return a Bokeh Pareto chart for one column of a DataFrame.

    The values of ``df[field]`` are binned with ``pandas.cut``; the bins are
    ordered by descending count and drawn as bars, with the cumulative
    percentage drawn as a line against a second 0-100 'Percent' axis.
    ``df`` itself is not modified.

    Arguments:
        df - must be a Pandas DataFrame
        field - name of the column to analyze

    Keyword Arguments:
        bins - must satisfy requirements of pandas.cut, default 4
        labels - must satisfy requirements of pandas.cut, default None
        include_lowest - bool, default is False, passed on to pandas.cut
        width - chart width in pixels, default 600
        height - chart height in pixels, default 600
        tools - figure tools, default 'pan,box_zoom,wheel_zoom,reset,save'
        title - figure title
        x_axis_label - figure x-axis label
        y_axis_label - figure y-axis label
        **kwargs - accepted for call compatibility; currently ignored

    Returns:
        The Bokeh figure holding the bars, the cumulative line and the
        relabelled x-axis.

    Example Usage for a DataFrame object 'merged' with a field 'sessions',
    user-defined bins, and labels:

        bins = [0, 30, 60, 90, 180, 270, merged.sessions.max()]
        labels = ['0-1', '2', '3', '4', '5', '5+']
        plot = pareto_gen(merged,
                          'sessions',
                          bins=bins,
                          labels=labels,
                          include_lowest=True,
                          title='My Title',
                          x_axis_label='My X-Axis',
                          y_axis_label='My Y-Axis')

        # Display the plots
        from bokeh.plotting import output_notebook, show
        output_notebook()
        show(plot)
    """
    import json  # local import: only needed to serialise the tick labels

    # Bin into a standalone Series instead of writing a 'categories' column
    # into the caller's DataFrame.
    categories = pd.cut(df[field],
                        bins=bins,
                        labels=labels,
                        include_lowest=include_lowest)

    # observed=False keeps empty bins on the chart regardless of the pandas
    # version's default for categorical groupers.
    counts = (df[field]
              .groupby(categories, observed=False)
              .count()
              .sort_values(ascending=False))
    cumulative_pct = 100 * (counts.cumsum() / counts.sum())
    x = np.arange(len(counts))

    # JSON guarantees a valid JavaScript object literal even when the labels
    # are pandas Interval objects (labels=None) or contain quotes.
    labels_json = json.dumps(
        {str(position): str(label)
         for position, label in enumerate(counts.index)})

    # Create the plots
    plot = figure(plot_width=width,
                  plot_height=height,
                  tools=tools,
                  x_axis_label=x_axis_label,
                  y_axis_label=y_axis_label,
                  title=title)
    # Second y-range so the cumulative percentage is always read on 0-100.
    plot.extra_y_ranges = {'pct': Range1d(start=0, end=100)}
    plot.add_layout(LinearAxis(y_range_name='pct',
                               axis_label='Percent'),
                    'left')
    plot.vbar(x=x,
              top=counts,
              width=1,
              line_color='red',
              fill_color='red',
              alpha=0.5)
    plot.line(x,
              cumulative_pct,
              line_color='blue',
              y_range_name='pct')

    # Reformat the x-axis: show the bin label under each bar and nothing for
    # ticks that do not coincide with a bar.
    plot.xaxis.formatter = FuncTickFormatter(code="""
var labels = {};
return (tick in labels) ? labels[tick] : "";
""".format(labels_json))
    return plot