From 2850a6f26d7e9c813679e121ed78ecbf980959c7 Mon Sep 17 00:00:00 2001 From: KatrionaGoldmann Date: Thu, 28 Mar 2024 17:18:49 +0000 Subject: [PATCH] Add xgboost model --- stories/2024-01-01-Eurovision/story.ipynb | 776 ++++++++++++++++++++-- 1 file changed, 722 insertions(+), 54 deletions(-) diff --git a/stories/2024-01-01-Eurovision/story.ipynb b/stories/2024-01-01-Eurovision/story.ipynb index f423791..0d9ee0a 100644 --- a/stories/2024-01-01-Eurovision/story.ipynb +++ b/stories/2024-01-01-Eurovision/story.ipynb @@ -176,9 +176,9 @@ "def standardise_country(c):\n", " replacements = [('-', ' '), ('&', 'and'), ('Netherands', 'Netherlands'),\n", " # FYR Macedonia was formally renamed as North Macedonia in 2019\n", - " ('F.Y.R. Macedonia', 'North Macedonia'), \n", - " ('Russia', 'Russian Federation'), \n", - " ('The Netherlands', 'Netherlands'), \n", + " ('F.Y.R. Macedonia', 'North Macedonia'),\n", + " ('Russia', 'Russian Federation'),\n", + " ('The Netherlands', 'Netherlands'),\n", " ('Czech Republic', 'Czechia'),\n", " # Yugoslavia dissolved in 2002; most of it became 'Serbia and Montenegro', until 2006, when Serbia and Montenegro split ways.\n", " ('Serbia and Montenegro', 'yugoslavia'),\n", @@ -552,11 +552,11 @@ "migration = (pd.read_csv('data/migration-flows.csv')\n", " .pipe(pd.melt, id_vars=['Country', 'Year'], var_name='Migration', value_name='Count') # to long format\n", " .loc[lambda x: x['Migration'].str.contains('Emigrants')] # filter for emigrant rows\n", - " .pipe(lambda x: x.rename(columns = {col: col.lower() for col in x.columns})) # lowercase column names \n", - " .assign(migration = lambda x: x.migration.str.replace('Emigrants from ', '')) # filter for emigrant rows \n", + " .pipe(lambda x: x.rename(columns = {col: col.lower() for col in x.columns})) # lowercase column names\n", + " .assign(migration = lambda x: x.migration.str.replace('Emigrants from ', '')) # filter for emigrant rows\n", " .rename(columns={'migration': 'emigrated_from', 'country': 'emigrated_to'}) # boil down to country name\n", " .query('count >= 0') # negative counts are just total emigrants from country\n", - " .pipe(lambda x: x.assign(count = x['count'].astype(int))) # convert count to int \n", + " .pipe(lambda x: x.assign(count = x['count'].astype(int))) # convert count to int\n", ")\n", "\n", "# Clean up country names\n", @@ -918,9 +918,9 @@ "plot_country_history('united kingdom', ax, df)\n", "\n", "legend_elements = [Line2D([0], [0], marker='*', color='white', label='Winner',\n", - " markerfacecolor='gold', markersize=12, markeredgecolor='black'), \n", + " markerfacecolor='gold', markersize=12, markeredgecolor='black'),\n", " Line2D([0], [0], marker='x', color='white', label='Did not perform in final',\n", - " markerfacecolor='grey', markersize=8, markeredgecolor='grey'), \n", + " markerfacecolor='grey', markersize=8, markeredgecolor='grey'),\n", " Line2D([0], [0], marker='_', color='white', label='Competition cancelled',\n", " markerfacecolor='grey', markersize=8, markeredgecolor='grey')]\n", "fig.legend(handles=legend_elements, loc='right', ncol=1, bbox_to_anchor=(0.9, -0.05))\n", @@ -928,7 +928,7 @@ "sm = plt.cm.ScalarMappable(cmap=my_cmap, norm=plt.Normalize(vmin=1, vmax=26))\n", "cbaxes = fig.add_axes([0.2, -0.05, 0.2, 0.05]) # x y deltax deltay\n", "\n", - "fig.colorbar(sm, ax=ax, orientation='horizontal', fraction=0.02, pad=0.1, label='Position', \n", + "fig.colorbar(sm, ax=ax, orientation='horizontal', fraction=0.02, pad=0.1, label='Position',\n", " cax = cbaxes)\n", "\n", "plt.show()" @@ -958,7 +958,7 @@ " ):\n", " axs = subfig.subplots(1, 4, sharey=True, sharex=True, squeeze=True)\n", " subfig.suptitle(title, fontsize=14, fontweight='bold')\n", - " for i, country in enumerate(country_group.index[:4]): \n", + " for i, country in enumerate(country_group.index[:4]):\n", " plot_country_history(country, axs[i], df)\n", "\n", "# Add in colorbar and legend\n", @@ -1042,17 +1042,17 @@ "\n", " row_dism = 1 - df_heatmap.T.corr()\n", " row_linkage = hc.linkage(sp.distance.squareform(row_dism), method='complete')\n", - " plot = sns.clustermap(df_heatmap, row_linkage=row_linkage, col_linkage=row_linkage, \n", + " plot = sns.clustermap(df_heatmap, row_linkage=row_linkage, col_linkage=row_linkage,\n", " figsize=(9, 8),\n", - " mask=df_heatmap.isnull(), \n", + " mask=df_heatmap.isnull(),\n", " dendrogram_ratio=[0.15, 0.01],\n", " cbar_pos=(0.8, 1.01, 0.1, 0.019),\n", " cbar_kws={'orientation': 'horizontal'},\n", - " cmap=cmap, \n", + " cmap=cmap,\n", " xticklabels=1,\n", " yticklabels=1,\n", " **{'center': center} if center is not None else {})\n", - " plot.ax_col_dendrogram.set_visible(False) \n", + " plot.ax_col_dendrogram.set_visible(False)\n", "\n", " plot.fig.suptitle(suptitle, fontsize=16, y=1.02)\n", " plot.ax_heatmap.set_xlabel('Performing country')\n", @@ -1275,7 +1275,7 @@ "votes = (votes.assign(highest_average_point=lambda x: x.groupby(['to_country'])['average_points'].transform('max'))\n", " .sort_values(by='highest_average_point', ascending=True))\n", "\n", - "fig = px.scatter(votes, x='average_points', y='to_country', color='vote_deviation', \n", + "fig = px.scatter(votes, x='average_points', y='to_country', color='vote_deviation',\n", " color_continuous_scale=px.colors.diverging.RdBu_r,\n", " color_continuous_midpoint=0)\n", "fig.update_traces(hovertemplate=('Performer: %{customdata[0]}'\n", @@ -1287,7 +1287,7 @@ " customdata=votes)\n", "fig.update_layout(hoverlabel_align='left', width=640, height=640, margin=dict(l=20, r=20, t=20, b=20),\n", " xaxis={'title': 'Average points received from each voter'},\n", - " yaxis={'title': 'Performing country'}, \n", + " yaxis={'title': 'Performing country'},\n", " coloraxis_colorbar=dict(title='Deviation from average points'))\n", "\n", "fig.update_yaxes(tickfont_size=8)\n", @@ -1361,8 +1361,8 @@ "\n", "# Merge original dataframe with its reverse\n", "df_pairs = (df_pairs\n", - " .merge(df_pairs.rename(columns={'from_country': 'to_country', \n", - " 'to_country': 'from_country'}), \n", + " .merge(df_pairs.rename(columns={'from_country': 'to_country',\n", + " 'to_country': 'from_country'}),\n", " on=['from_country', 'to_country'])\n", " .drop_duplicates()\n", " .query('from_country != to_country')\n", @@ -1414,7 +1414,7 @@ "top_one_sided = (df_pairs\n", " .sort_values('votes_diff', ascending=False).head(N))\n", "\n", - "# combine \n", + "# combine\n", "top_relationships = (pd.concat([top_highs, top_lows, top_one_sided]))\n", "# add grouping\n", "top_relationships['group'] = ['high'] * N + ['low'] * N + ['one-sided'] * N\n", @@ -1640,8 +1640,8 @@ "outputs": [], "source": [ "#| code-fold: true\n", - "df_performance = df[['year', 'Artist', 'to_country', \n", - " 'total_points', \n", + "df_performance = df[['year', 'Artist', 'to_country',\n", + " 'total_points',\n", " 'rank', 'to_code2',\n", " 'Official_languages', 'Language_sung', 'Contains_English',\n", " 'Contains_NonEnglish', 'Contains_Multiple_Languages',\n", @@ -1682,7 +1682,7 @@ "df_language = df_performance.copy()\n", "\n", "df_performance['English_only'] = (df_performance['Contains_English']) & (df_performance['Number_of_Languages'] == 1 )\n", - "df_performance['No_English'] = ~df_performance['Contains_English'] \n", + "df_performance['No_English'] = ~df_performance['Contains_English']\n", "df_performance['Some_English'] = (df_performance['Contains_English']) & (df_performance['Number_of_Languages'] > 1 )\n", "\n", "# for each country get the ratio of songs that contain only English, some English and no English\n", @@ -1703,7 +1703,7 @@ "\n", "df_language.plot(kind='bar', figsize=(15, 6), stacked=True, color=colours)\n", "\n", - "plt.legend(['English only', 'Partly English', 'No English'], title=\"Performance languages\", loc=[1, 1], \n", + "plt.legend(['English only', 'Partly English', 'No English'], title=\"Performance languages\", loc=[1, 1],\n", " fontsize=14, bbox_to_anchor=(0.51, 0., 0.5, 0.5), title_fontsize=16)\n", "\n", "plt.title('How frequently countries sing in English', fontsize=20, pad=30)\n", @@ -1743,13 +1743,13 @@ "from statannot import add_stat_annotation\n", "\n", "# boxplots for each language type\n", - "ax = sns.boxplot(x='language', y='total_points', \n", - " data=df_long.loc[df_long['contains_language'] > 0], \n", - " palette=colours, showfliers=False, \n", + "ax = sns.boxplot(x='language', y='total_points',\n", + " data=df_long.loc[df_long['contains_language'] > 0],\n", + " palette=colours, showfliers=False,\n", " order=['English_only', 'Some_English', 'No_English'])\n", - "sns.stripplot(x='language', y='total_points', \n", + "sns.stripplot(x='language', y='total_points',\n", " order=['English_only', 'Some_English', 'No_English'],\n", - " data=df_long.loc[df_long['contains_language'] > 0], \n", + " data=df_long.loc[df_long['contains_language'] > 0],\n", " jitter=0.25, size=2, color=\".3\", linewidth=0)\n", "\n", "plt.title('Average points for performances in different languages', fontsize=20, pad=30)\n", @@ -1757,7 +1757,7 @@ "plt.ylabel('Total points')\n", "\n", "add_stat_annotation(ax, data=df_long.loc[df_long['contains_language'] > 0],\n", - " x='language', y='total_points', \n", + " x='language', y='total_points',\n", " order=['English_only', 'Some_English', 'No_English'],\n", " box_pairs=[(\"English_only\", \"No_English\")],\n", " test='Mann-Whitney', text_format='star', verbose=0)\n", @@ -1774,8 +1774,8 @@ "#| code-fold: true\n", "df_language = df_performance.copy()\n", "\n", - "df_performance['Own_language'] = (df_performance['Contains_Own_Language']) \n", - "df_performance['Other_language'] = ~df_performance['Contains_Own_Language'] \n", + "df_performance['Own_language'] = (df_performance['Contains_Own_Language'])\n", + "df_performance['Other_language'] = ~df_performance['Contains_Own_Language']\n", "\n", "# for each country get the ratio of songs that contain only English, some English and no English\n", "# then sort by the ratio of songs that contain only English\n", @@ -1795,7 +1795,7 @@ "\n", "df_language.plot(kind='bar', figsize=(15, 6), stacked=True, color=colours)\n", "\n", - "plt.legend(['Other language', 'Own language'], title=\"Performance languages\", loc=[1, 1], \n", + "plt.legend(['Other language', 'Own language'], title=\"Performance languages\", loc=[1, 1],\n", " fontsize=14, bbox_to_anchor=(0.51, 0., 0.5, 0.5), title_fontsize=16)\n", "\n", "plt.title('How frequently countries sing in their official languages', fontsize=20, pad=30)\n", @@ -2022,7 +2022,7 @@ "metadata": {}, "outputs": [], "source": [ - "# TODO: check if male get higher average votes. " + "# TODO: check if male get higher average votes." ] }, { @@ -2051,8 +2051,8 @@ "outputs": [], "source": [ "#| code-fold: true\n", - "df_performance = df[['year', 'Artist', 'to_country', \n", - " 'total_points', 'rank', 'to_code2', \n", + "df_performance = df[['year', 'Artist', 'to_country',\n", + " 'total_points', 'rank', 'to_code2',\n", " 'Official_languages', 'Language_sung', 'Contains_English',\n", " 'Contains_NonEnglish', 'Contains_Multiple_Languages',\n", " 'prop_emigrants_v2p', 'prop_emigrants_p2v', 'has_border',\n", @@ -2075,10 +2075,10 @@ " 'Contains_Voting_Language': 'binary',\n", " 'Contains_English': 'binary',\n", " 'Contains_NonEnglish': 'binary',\n", - " 'prop_emigrants_v2p': 'numeric', \n", - " 'prop_emigrants_p2v': 'numeric', \n", + " 'prop_emigrants_v2p': 'numeric',\n", + " 'prop_emigrants_p2v': 'numeric',\n", " 'has_border': 'binary',\n", - " 'gender': 'categorical', \n", + " 'gender': 'categorical',\n", " 'comps_without_win': 'numeric'\n", "}\n", "\n", @@ -2097,8 +2097,8 @@ " elif value == 'binary':\n", " sns.violinplot(ax=axes[j, k], x=key, y='points', data=df, color='tab:blue', inner=None, showmeans=True)\n", " else:\n", - " sns.regplot(ax=axes[j, k], x=key, y='points', data=df, ci=95, \n", - " color='tab:blue', scatter_kws={'alpha': 0.4, 'edgecolor': 'none', 's': 20}, \n", + " sns.regplot(ax=axes[j, k], x=key, y='points', data=df, ci=95,\n", + " color='tab:blue', scatter_kws={'alpha': 0.4, 'edgecolor': 'none', 's': 20},\n", " line_kws={'color': 'tab:orange'})\n", "\n", " # if key contains Prop then log scale x axis\n", @@ -2262,7 +2262,7 @@ "def train_baseline(excluded_years=None):\n", " if excluded_years is None:\n", " excluded_years = []\n", - " \n", + "\n", " # Get maximum rank in each year (= number of countries participating)\n", " df_rank_max = (df[['year', 'rank']]\n", " .query('year not in @excluded_years')\n", @@ -2270,7 +2270,7 @@ " .agg('max')\n", " .rename({'rank': 'rank_max'}, axis=1)\n", " )\n", - " \n", + "\n", " # Rescale rank to go from 0 = last place (rank = rank_max) to 1 = first place (rank = 1).\n", " df_rank = (df[['to_country', 'year', 'rank']]\n", " .query('year <= 2022')\n", @@ -2281,9 +2281,9 @@ " .agg({'rescaled_rank': 'mean'})\n", " .sort_values('rescaled_rank', ascending=False)\n", " )\n", - " \n", + "\n", " return df_rank\n", - " \n", + "\n", "ranks_without_2022 = train_baseline(excluded_years=[2022])\n", "ranks_without_2022.head(n=10)" ] @@ -2315,22 +2315,22 @@ " predictions['predicted_rank'] = predictions.index + 1\n", " predictions = predictions.set_index('to_country')\n", " return predictions\n", - " \n", + "\n", "def get_actual_ranks(year):\n", " return (df.query('year == @year')[['to_country', 'rank']]\n", " .drop_duplicates()\n", " .set_index('to_country')\n", " .sort_values('rank'))\n", - " \n", + "\n", "def join_predictions_and_actual(predicted_ranks, actual_ranks):\n", " both = predicted_ranks.join(actual_ranks, validate='one_to_one').astype(int)\n", " both = both.rename({'rank': 'actual_rank'}, axis=1)\n", " return both[['predicted_rank', 'actual_rank']]\n", - " \n", + "\n", "def get_spearman(predicted_ranks, actual_ranks):\n", " both_ranks = join_predictions_and_actual(predicted_ranks, actual_ranks)\n", " return both_ranks['predicted_rank'].corr(both_ranks['actual_rank'], method='spearman')\n", - " \n", + "\n", "predictions_baseline = predict_baseline(ranks_without_2022, 2022)\n", "actual = get_actual_ranks(2022)\n", "both = join_predictions_and_actual(predictions_baseline, actual)\n", @@ -2373,7 +2373,7 @@ " predictions_baseline = predictions_baseline.query('to_country != \"slovakia\"')\n", " actual = actual.query('to_country != \"slovakia\"')\n", " return(get_spearman(predictions_baseline, actual))\n", - " \n", + "\n", "spearmans = [get_spearman_baseline_without(year) for year in all_years]\n", "\n", "print(f'Mean Spearman coefficient across {len(all_years)} years: {np.mean(spearmans)}')\n", @@ -2658,7 +2658,7 @@ " sigmaAlpha ~ cauchy(0,1);\n", "\n", " alpha ~ normal( xphi * phi, sigmaAlpha );\n", - " \n", + "\n", " // remembering that vp is 0-indexed and alpha is 1-indexed\n", " y ~ ordered_logistic( gamma + alpha[ add(vp,1) ] + (xbeta * beta), lambda );\n", "\n", @@ -2695,7 +2695,7 @@ "# build xbeta matrix\n", "xbeta_train = df_train.loc[:,['Contains_English_bin','Contains_Own_Language_bin','male','female','comps_without_win']].values\n", "# minmax scaling of 'comps_since_last_win'\n", - "scaler = MinMaxScaler() \n", + "scaler = MinMaxScaler()\n", "xbeta_train_norm = scaler.fit_transform(xbeta_train)\n", "\n", "xbeta_test = df_test.loc[:,['Contains_English_bin','Contains_Own_Language_bin','male','female','comps_without_win']].values\n", @@ -2798,10 +2798,10 @@ "outputs": [], "source": [ "az_fit = az.from_pystan(\n", - " posterior=fit, \n", - " observed_data=\"y\", \n", + " posterior=fit,\n", + " observed_data=\"y\",\n", " posterior_predictive=\"y_hat\",\n", - " predictions=\"y_pred\", \n", + " predictions=\"y_pred\",\n", " posterior_model=posterior)\n", "\n", "az.plot_trace(az_fit, [\"beta\",\"lambda\"], figsize=(20,8), legend=True)\n", @@ -2840,6 +2840,674 @@ "Overall, a ranked XGBoost model is a powerful machine learning tool that can be used to predict rankings or orders of items, based on various factors that may influence their positions in the ranking. This technique is often used to train XGBoost models for ranking tasks, such as search engine ranking or recommendation systems." ] }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "import xgboost as xgb\n", + "from sklearn.model_selection import train_test_split\n", + "from sklearn.metrics import accuracy_score\n", + "from sklearn.metrics import confusion_matrix\n", + "from sklearn.preprocessing import MinMaxScaler\n", + "from sklearn.preprocessing import StandardScaler" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "df['received_vote'] = df['points'].apply(lambda x: 1 if x > 0 else 0)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "model_columns = [\n", + " 'from_code2',\n", + " 'points',\n", + " 'to_code2',\n", + " 'Contains_English',\n", + " 'Contains_NonEnglish',\n", + " 'Contains_Multiple_Languages',\n", + " 'Number_of_Languages',\n", + " 'Contains_Own_Language',\n", + " 'Contains_Voting_Language',\n", + " 'gender',\n", + " 'prop_emigrants_v2p',\n", + " 'prop_emigrants_p2v',\n", + " 'has_border',\n", + " 'comps_without_win',\n", + " 'received_vote',\n", + "]" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# Bar chart for vote occurrences\n", + "fig, (ax1, ax2) = plt.subplots(1, 2, figsize=(10, 4))\n", + "\n", + "sns.countplot(x=\"points\", data=df, ax=ax1)\n", + "ax1.set_ylabel('Count', fontsize=12)\n", + "ax1.set_xlabel('Score', fontsize=12)\n", + "ax1.set_title('Number of votes', fontsize=12)\n", + "\n", + "sns.countplot(x=\"received_vote\", data=df, ax=ax2)\n", + "ax2.set_ylabel('Count', fontsize=12)\n", + "ax2.set_xlabel('Received vote', fontsize=12)\n", + "ax2.set_title('Number of votes', fontsize=12)\n", + "\n", + "plt.show()" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "df_xgboost = df[['year'] + model_columns]\n", + "\n", + "df_xgboost['has_border'] = df_xgboost['has_border'].fillna(0)\n", + "\n", + "# log10 scale the prop_emigrants column\n", + "df_xgboost['prop_emigrants_v2p'] = df_xgboost['prop_emigrants_v2p'].apply(\n", + " lambda x: np.log10(x+4e-8)\n", + ")\n", + "df_xgboost['prop_emigrants_p2v'] = df_xgboost['prop_emigrants_p2v'].apply(\n", + " lambda x: np.log10(x+4e-8)\n", + ")\n", + "\n", + "# apply the standard scaler to prop_emigrants and comps_without_win\n", + "scaler = StandardScaler()\n", + "df_xgboost[['prop_emigrants_v2p']] = scaler.fit_transform(\n", + " df_xgboost[['prop_emigrants_v2p']]\n", + ")\n", + "df_xgboost[['prop_emigrants_p2v']] = scaler.fit_transform(\n", + " df_xgboost[['prop_emigrants_p2v']]\n", + ")\n", + "\n", + "scaler = MinMaxScaler()\n", + "df_xgboost[['comps_without_win']] = scaler.fit_transform(\n", + " df_xgboost[['comps_without_win']]\n", + ")" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "fig, (ax1, ax2, ax3) = plt.subplots(1, 3, figsize=(10, 4))\n", + "\n", + "# histogram of the log scaled prop_emigrants column\n", + "df_xgboost['comps_without_win'].hist(bins=20, ax=ax1)\n", + "df_xgboost['prop_emigrants_v2p'].hist(bins=20, ax=ax2)\n", + "df_xgboost['prop_emigrants_p2v'].hist(bins=20, ax=ax3)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "def basic_xgboost(df_input, seed, test_size, predictor_column='received_vote'):\n", + " X = df_input[model_columns]\n", + " Y = df_input[predictor_column]\n", + "\n", + " # one hot encode the gender, from country and code\n", + " for j in ['gender', 'from_code2', 'to_code2']:\n", + " j_text = '_voting' if j == 'from_code2' else ''\n", + " for i in X[j].unique():\n", + " output_binary = X[j].apply(lambda x: 1 if x == i else 0)\n", + " X[i+j_text] = output_binary\n", + "\n", + " X = X[X.columns.drop(j)]\n", + "\n", + " # convert Y to int\n", + " Y = Y.astype(int)\n", + "\n", + " X_train, X_test, y_train, y_test = train_test_split(X, Y, test_size=test_size, random_state=seed, stratify=X['points'])\n", + "\n", + " test_all = X_test\n", + "\n", + " # remove 'Votes' and 'received_vote' columns from X_train\n", + " X_train = X_train[X_train.columns.drop('points')]\n", + " X_train = X_train[X_train.columns.drop('received_vote')]\n", + " X_test = X_test[X_test.columns.drop('points')]\n", + " X_test = X_test[X_test.columns.drop('received_vote')]\n", + "\n", + " model = xgb.XGBClassifier()\n", + " model.fit(X_train, y_train)\n", + "\n", + " return model, X_test, y_test, test_all\n", + "\n", + "def model_predictions(model, X_test, y_test):\n", + " Xt = X_test.copy()\n", + " y_pred = model.predict(Xt)\n", + " predictions = [round(value) for value in y_pred]\n", + "\n", + " Xt['prob'] = model.predict_proba(Xt)[:,1]\n", + " Xt['predictions'] = predictions\n", + " Xt['actual'] = y_test\n", + " return Xt\n", + "\n", + "def model_evalutation(df_pred):\n", + " # evaluate predictions\n", + " accuracy = accuracy_score(df_pred['actual'], df_pred['predictions'])\n", + " print(\"Accuracy: %.2f%%\" % (accuracy * 100.0))\n", + "\n", + " # print a confusion matrix heatmap\n", + " cm = confusion_matrix(df_pred['actual'], df_pred['predictions'])\n", + " sns.heatmap(cm, annot=True, fmt=\"d\", cmap=\"Blues\")\n", + "\n", + " # add title and axis labels\n", + " plt.title('Confusion matrix' + \"(Accuracy={:.3f})\".format(accuracy))\n", + " plt.ylabel('True label')\n", + " plt.xlabel('Predicted label')" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "model_basic, X_test, y_test, test_all = basic_xgboost(df_xgboost, seed=7, test_size=0.33)\n", + "predictions = model_predictions(model_basic, X_test, y_test)\n", + "model_evalutation(predictions)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "graph = xgb.to_graphviz(model_basic, num_trees=1, rankdir='LR')\n", + "\n", + "fig, ax = plt.subplots(figsize=(30, 50))\n", + "xgb.plot_tree(model_basic, num_trees=1, ax=ax)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "import xgboost as xgb\n", + "from scipy.stats import rankdata\n", + "from sklearn.metrics import confusion_matrix\n", + "import seaborn as sns\n", + "from sklearn.model_selection import GroupShuffleSplit\n", + "from scipy.stats import spearmanr\n", + "import numpy as np" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "def xgboost_rank_model(df_input, test_size=0.33, seed=7):\n", + "\n", + " df2 = df_input[['year', 'from_code2', 'to_code2', 'points',\n", + " 'Contains_English',\n", + " 'Contains_NonEnglish', 'Contains_Multiple_Languages',\n", + " 'Number_of_Languages', 'Contains_Own_Language', 'Contains_Voting_Language', 'gender',\n", + " 'prop_emigrants_v2p', 'prop_emigrants_p2v','has_border',\n", + " 'comps_without_win']]\n", + "\n", + " df2 = df2.rename(columns={\"points\": 'rank'})\n", + "\n", + " df2['rank'] = df2['rank'].astype(int) - 1\n", + "\n", + " df2['id'] = df2['from_code2'].astype(str) + df2['year'].astype(str)\n", + "\n", + " # one hot encode the gender, from country and code\n", + " for j in ['gender', 'to_code2', 'from_code2']:\n", + " j_text = \"_voting\" if j == 'from_code2' else ''\n", + " for i in df2[j].unique():\n", + " df2[i+j_text] = df2[j].apply(lambda x: 1 if x == i else 0)\n", + "\n", + " df2 = df2[df2.columns.drop(j)]\n", + "\n", + " df2= df2[df2.columns.drop('year')]\n", + "\n", + " gss = GroupShuffleSplit(test_size=test_size, n_splits=1, random_state = seed).split(df2, groups=df2['id'])\n", + "\n", + " X_train_inds, X_test_inds = next(gss)\n", + "\n", + " train_data= df2.iloc[X_train_inds]\n", + " X_train = train_data.loc[:, ~train_data.columns.isin(['id','rank'])]\n", + " y_train = train_data.loc[:, train_data.columns.isin(['rank'])]\n", + "\n", + " groups = train_data.groupby('id').size().to_frame('size')['size'].to_numpy()\n", + "\n", + " test_data= df2.iloc[X_test_inds]\n", + " test_data = test_data[X_train.columns.tolist() + ['rank', 'id']]\n", + "\n", + " model = xgb.XGBRanker(\n", + " tree_method='hist',\n", + " booster='gbtree',\n", + " objective='rank:pairwise',\n", + " random_state=7,\n", + " learning_rate=0.1,\n", + " colsample_bytree=0.9,\n", + " eta=0.05,\n", + " max_depth=6,\n", + " n_estimators=110,\n", + " subsample=0.75\n", + " )\n", + "\n", + " model.fit(X_train, y_train, group=groups, verbose=True)\n", + "\n", + " return model, test_data, train_data\n", + "\n", + "\n", + "\n", + "def ranked_model_predictions(model, test_set):\n", + " test = test_set.copy()\n", + "\n", + " test['prediction_rel'] = np.nan\n", + " test['predictions'] = np.nan\n", + "\n", + " for i in test['id'].unique():\n", + " sub_test = test.loc[test['id'] == i, ~test.columns.isin(['id', 'rank', 'prediction_rel', 'predictions'])]\n", + "\n", + " preds = model.predict(sub_test)\n", + "\n", + " # get the order of the predictions\n", + " res = rankdata(preds, method='ordinal')\n", + " test.loc[test['id'] == i, 'prediction_rel'] = res\n", + "\n", + " # create a score for top 10 predictions getting 1:10 and others 0\n", + " top10 = test.loc[test['id'] == i, 'prediction_rel'].nlargest(10).values\n", + "\n", + "\n", + " test.loc[test['id'] == i, 'predictions'] = test.loc[test['id'] == i, 'prediction_rel'].apply(lambda x: x if x in top10 else 0)\n", + "\n", + " # if not zero subtract min(top10)\n", + " test.loc[test['id'] == i, 'predictions'] = test.loc[test['id'] == i, 'predictions'].apply(lambda x: x - max(top10) + 10 if x != 0 else 0)\n", + "\n", + " # if 10 set to 12\n", + " test.loc[test['id'] == i, 'predictions'] = test.loc[test['id'] == i, 'predictions'].apply(lambda x: 12 if x == 10 else x)\n", + " test.loc[test['id'] == i, 'predictions'] = test.loc[test['id'] == i, 'predictions'].apply(lambda x: 10 if x == 9 else x)\n", + "\n", + " test['actual'] = test['rank'] + 1\n", + "\n", + "\n", + " accuracy = accuracy_score(test['actual'], test['predictions'])\n", + " print(\"Accuracy: %.2f%%\" % (accuracy * 100.0))\n", + "\n", + " return test" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "model_ranked, test_data, train_data = xgboost_rank_model(df_xgboost.loc[df_xgboost['points'] > 0], seed=7, test_size=0.33)\n", + "out = ranked_model_predictions(model_ranked, test_data)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "model_ranked_all, test_data_all, train_data_all = xgboost_rank_model(df_xgboost, seed=7, test_size=0.33)\n", + "out_all = ranked_model_predictions(model_ranked_all, test_data_all)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "out_all['predictions'].value_counts()" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "def violins(pred):\n", + " prediction_df = pred.copy()\n", + " fig, (ax1, ax2) = plt.subplots(1, 2, figsize=(20, 7))\n", + "\n", + " if(max(prediction_df['actual']) < 12):\n", + " if(max(prediction_df['actual']) == 11):\n", + " prediction_df['predictions'] = prediction_df['predictions'] + 1\n", + " prediction_df['actual'] = prediction_df['actual'] + 1\n", + " prediction_df['predictions'] = prediction_df['predictions'].apply(lambda x: 12 if x == 10 else x)\n", + " prediction_df['predictions'] = prediction_df['predictions'].apply(lambda x: 10 if x == 9 else x)\n", + " prediction_df['actual'] = prediction_df['actual'].apply(lambda x: 12 if x == 10 else x)\n", + " prediction_df['actual'] = prediction_df['actual'].apply(lambda x: 10 if x == 9 else x)\n", + "\n", + " # violin plot of predictions for each rank\n", + " sns.violinplot(x=\"actual\", y=\"predictions\", data=prediction_df, order=range(0, 13), ax=ax1)\n", + " sns.violinplot(y=\"actual\", x=\"predictions\", data=prediction_df, order=range(0, 13), ax=ax2)\n", + "\n", + " # add a best fit line - scale is wrong because numeric on top of categorical\n", + " sns.regplot(x=\"actual\", y=\"predictions\", data=prediction_df, scatter=False, color='black', ax=ax1)\n", + " sns.regplot(y=\"actual\", x=\"predictions\", data=prediction_df, scatter=False, color='black', ax=ax2)\n", + "\n", + "\n", + " # spearman correlation\n", + " corr, _ = spearmanr(prediction_df['actual'], prediction_df['predictions'])\n", + "\n", + " fig.suptitle('Predicted Score vs Actual Score (r = ' + str(round(corr, 4)) + ')', fontsize=15)\n", + "\n", + " # set x label\n", + " ax1.set_xlabel('Actual Score')\n", + " ax1.set_ylabel('Predicted Score')\n", + " ax2.set_ylabel('Actual Score')\n", + " ax2.set_xlabel('Predicted Score')\n", + "\n", + "def cm_heatmap (prediction_df, title='Confusion matrix', ax=None):\n", + " cm = confusion_matrix(prediction_df['actual'], prediction_df['predictions'])\n", + "\n", + " plot = sns.heatmap(cm, annot=True, fmt='g', ax=ax,\n", + " vmin=0, vmax=100, cmap='Blues')\n", + "\n", + " if(len(cm[[0]][0]) == 11) :\n", + " # change the tick labels\n", + " plot.set_xticks(np.arange(0, 11, 1)+0.5, np.arange(0, 9, 1).tolist() + [10, 12])\n", + " plot.set_yticks(np.arange(0, 11, 1)+0.5, np.arange(0, 9, 1).tolist() + [10, 12])\n", + " else:\n", + " plot.set_xticks(np.arange(0, 10, 1)+0.5, np.arange(1, 9, 1).tolist() + [10, 12])\n", + " plot.set_yticks(np.arange(0, 10, 1)+0.5, np.arange(1, 9, 1).tolist() + [10, 12])\n", + "\n", + " # add tick marks\n", + " plot.tick_params(axis='both', which='both', length=5, color='black')\n", + "\n", + " plot.set_title(title)\n", + " plot.set_ylabel('Actual Score')\n", + " plot.set_xlabel('Predicted Score')" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "violins(out)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "violins(out_all.loc[out_all['actual'] > 0])" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "cm_heatmap(out)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "cm_heatmap(out_all)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Predicting the 2023 results" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "df_2023 = future\n", + "\n", + "df_2023['id'] = df_2023['from_code2'] + df_2023['year'].astype(str)\n", + "df_2023['received_vote'] = 0\n", + "\n", + "df_2023 = df_2023[df_xgboost.columns]\n", + "\n", + "# remove the points and received_vote columns\n", + "df_2023 = df_2023[df_2023.columns.drop('points')]\n", + "df_2023 = df_2023[df_2023.columns.drop('received_vote')]\n", + "\n", + "df_2023.head()" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "df_2023['from_code2'].value_counts().shape" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Remove russia from the voting, since they did not participate in 2023" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "df_2023 = df_2023.loc[df_2023['from_code2'] != 'RU', ]" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "df_2023['id'] = df_2023['from_code2'] + df_2023['year'].astype(str)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# Check\n", + "print(df_2023['to_code2'].value_counts().value_counts())\n", + "print(df_2023['from_code2'].value_counts().value_counts())" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "df_hurdle = df_2023.copy()\n", + "\n", + "# one hot encode the gender, from country and code\n", + "for j in ['gender', 'to_code2', 'from_code2']:\n", + " j_text = \"_voting\" if j == 'from_code2' else ''\n", + " for i in df_hurdle[j].unique():\n", + " df_hurdle[i+j_text] = df_hurdle[j].apply(lambda x: 1 if x == i else 0)\n", + "\n", + " df_hurdle = df_hurdle[df_hurdle.columns.drop(j)]\n", + "\n", + "df_hurdle= df_hurdle[df_hurdle.columns.drop('year')]\n", + "\n", + "df_hurdle.head()" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# print the test_data_all columns which are not in df_hurdle\n", + "print([col for col in df_hurdle.columns if col not in test_data_all.columns])\n", + "print([col for col in test_data_all.columns if col not in df_hurdle.columns])\n", + "\n", + "# Add the missing columns\n", + "for col in test_data_all.columns:\n", + " if col not in df_hurdle.columns:\n", + " df_hurdle[col] = 0\n", + "\n", + "# add the missing columns\n", + "for col in test_data_all.columns:\n", + " if col not in df_hurdle.columns:\n", + " df_hurdle[col] = 0\n", + "\n", + "df_hurdle = df_hurdle[test_data_all.columns]\n", + "\n", + "df_hurdle['rank'] = 1\n", + "\n", + "df_hurdle.head()" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "out_all = ranked_model_predictions(model_ranked_all, df_hurdle)\n", + "out_all['predictions'].value_counts()" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "p1 = out_all.copy()\n", + "\n", + "cols = [col for col in p1.columns if '_voting' in col]\n", + "cols2 = [col for col in p1.columns if '_voting' not in col]\n", + "\n", + "\n", + "# wide format _voting columns to long format\n", + "p1 = pd.melt(p1, id_vars=cols2,\n", + " value_vars= cols, var_name='from_code2')\n", + "p1 = p1.loc[p1['value'] == 1]\n", + "\n", + "\n", + "p1['from_code2'] = p1['from_code2'].str.replace('_voting', '').tolist()\n", + "\n", + "p1 = p1.loc[p1['value'] == 1]\n", + "p1 = p1.drop('value', axis=1)\n", + "\n", + "cols2 = [\n", + " 'id',\n", + " 'from_code2',\n", + " 'Contains_English',\n", + " 'Contains_NonEnglish',\n", + " 'Contains_Multiple_Languages',\n", + " 'Number_of_Languages',\n", + " 'Contains_Own_Language',\n", + " 'Contains_Voting_Language',\n", + " 'has_border',\n", + " 'prediction_rel',\n", + " 'prop_emigrants_v2p',\n", + " 'prop_emigrants_p2v',\n", + " 'group',\n", + " 'female',\n", + " 'male',\n", + " 'rank',\n", + " 'comps_without_win',\n", + " 'predictions',\n", + " 'actual']\n", + "\n", + "cols = [col for col in p1.columns if col not in cols2]\n", + "\n", + "# # print p1.columns not in cols2\n", + "print([col for col in p1.columns if col not in cols2])\n", + "\n", + "p1 = pd.melt(p1,\n", + " id_vars=cols2,\n", + " value_vars= cols,\n", + " var_name='to_code2')\n", + "p1 = p1.loc[p1['value'] == 1]\n", + "# # drop value column\n", + "p1 = p1.drop('value', axis=1)\n", + "\n", + "p1.head()" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "#| code-fold: true\n", + "#| code-summary: \"Code to print the results\"\n", + "\n", + "p1['total_points'] = p1.groupby('to_code2')['predictions'].transform('sum')\n", + "\n", + "# results\n", + "res = p1[['total_points', 'to_code2']].drop_duplicates()\n", + "\n", + "# match the country name to the country code\n", + "res = res.merge(df[['from_code2', 'from_country']],\n", + " left_on='to_code2',\n", + " right_on='from_code2',\n", + " how='left').drop_duplicates()\n", + "\n", + "res['position'] = res['total_points'].rank(ascending=False)\n", + "\n", + "# sort by total_points\n", + "res = res.sort_values(by=['total_points'], ascending=False)\n", + "\n", + "# rename from country to country\n", + "res = res.rename(columns={'from_country': 'country'})\n", + "\n", + "# remove from_code2 column\n", + "res = res.drop('from_code2', axis=1)\n", + "\n", + "# print without index\n", + "res" + ] + }, { "attachments": {}, "cell_type": "markdown",