-
Notifications
You must be signed in to change notification settings - Fork 0
/
knockouts.py
131 lines (109 loc) · 4.59 KB
/
knockouts.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
""" Modelling the knockout stage of the 2018 World Cup. """
import os
import pickle
import datetime as dt
import matplotlib.pyplot as plt
import numpy as np
from scipy.stats import poisson
import pandas as pd
import pystan
def wrangle():
data = pd.read_csv("results.csv", index_col=None, parse_dates=['date'])
team_map = {
name: idx + 1
for idx, name in enumerate(set(data.home_team) | set(data.away_team))
}
stan_data = (
data
[data.date > dt.datetime.today() - dt.timedelta(days=2*365)]
.assign(team1=lambda df: list(map(team_map.__getitem__, df.home_team)),
team2=lambda df: list(map(team_map.__getitem__, df.away_team)),
goals1=lambda df: df.home_score,
goals2=lambda df: df.away_score)
[['team1', 'team2', 'goals1', 'goals2', 'worldcup', 'neutral']]
.to_dict(orient='list')
)
stan_data['n_teams'] = len(team_map)
stan_data['n_games'] = len(stan_data['team1'])
return stan_data, team_map
def fit(data, force_compile=False, **kwargs):
if not force_compile and os.path.exists("cache/knockouts.pickle"):
model = pickle.load(open("cache/knockouts.pickle", 'rb'))
else:
model = pystan.StanModel("knockouts.stan")
pickle.dump(model, open("cache/knockouts.pickle", 'wb'))
# Starting the chains from 0, as they struggle otherwise.
# The model is in log space, making 0 a convenient starting point anyway.
return model.sampling(data, init=0, **kwargs)
def _dixoncoles_pmf(goals, mus, rho):
""" The probability mass function of the Dixon-Coles distribution. """
def tau():
if goals[0] == goals[1] == 0:
return 1.0 - rho*mus[0]*mus[1]
elif goals[0] == 0 and goals[1] == 1:
return 1.0 + rho*mus[0]
elif goals[0] == 1 and goals[1] == 0:
return 1.0 + rho*mus[1]
elif goals[0] == goals[1] == 1:
return 1.0 - rho
else:
return 1.0
return tau() * poisson.pmf(goals[0], mus[0]) * poisson.pmf(goals[1], mus[1])
def graph(samples, team_map):
""" Plot model fit for quick inspection. """
fig = plt.figure()
ax = plt.subplot(111)
for name, idx in team_map.items():
off_ = np.mean(np.exp(samples['off'][:, idx-1]))
def_ = np.mean(np.exp(samples['def'][:, idx-1]))
ax.scatter([off_], [def_])
ax.annotate(name, (off_, def_))
ax.set_xlabel("Attack")
ax.set_ylabel("Defence")
plt.show()
def predict(samples, team_map, team1, team2, neutral=True, aet=True, max_goals=4):
""" Display the prediction for an arbitrary match-up. """
idx1 = team_map[team1] - 1
idx2 = team_map[team2] - 1
off1 = np.mean(np.exp(samples['off'][:, idx1]))
off2 = np.mean(np.exp(samples['off'][:, idx2]))
def1 = np.mean(np.exp(samples['def'][:, idx1]))
def2 = np.mean(np.exp(samples['def'][:, idx2]))
hfa = np.mean(np.exp(samples['hfa']))
rho = np.mean(samples['rho'])
mus = (off1 * def2 * (1.0 if neutral else hfa), off2 * def1)
probs = np.zeros((max_goals + 1, max_goals + 1))
for h in range(max_goals + 1):
for a in range(max_goals + 1):
probs[h, a] = _dixoncoles_pmf((h, a), mus, rho)
if aet:
# Compute probabilities of scoring in extra time by taking
# the DC pmf with the scoring rates divided by 3.
extra = np.zeros((max_goals + 1, max_goals + 1))
for h in range(max_goals + 1):
for a in range(max_goals + 1):
extra[h, a] = _dixoncoles_pmf((h, a), (0.33*mus[0], 0.33*mus[1]), rho)
# Copy the probabilities of draws in 90 mins and zero the entries
base = [probs[g, g] for g in range(max_goals + 1)]
for g in range(max_goals + 1):
probs[g, g] = 0.0
# Redistribute the original probabilities of draws
for g in range(max_goals + 1):
for h in range(max_goals + 1):
for a in range(max_goals + 1):
if (g + h <= max_goals) and (g + a) <= max_goals:
probs[g + h, g + a] += base[g] * extra[h, a]
fig = plt.figure()
ax = plt.subplot(111)
img = ax.imshow(probs.T, origin='upper')
for h in range(max_goals + 1):
for a in range(max_goals + 1):
plt.text(h, a, "%.1f%%" % (100.0 * probs[h, a]),
horizontalalignment='center',
verticalalignment='center')
print("%d-%d: %.1f%%" % (h, a, 100.0 * probs[h, a]))
ax.set_title("FT/AET score prediction")
ax.set_xlabel(team1)
ax.set_ylabel(team2)
plt.colorbar(img)
plt.show()