From 2850a6f26d7e9c813679e121ed78ecbf980959c7 Mon Sep 17 00:00:00 2001
From: KatrionaGoldmann <kgoldmann@turing.ac.uk>
Date: Thu, 28 Mar 2024 17:18:49 +0000
Subject: [PATCH] Add xgboost model

---
 stories/2024-01-01-Eurovision/story.ipynb | 776 ++++++++++++++++++++--
 1 file changed, 722 insertions(+), 54 deletions(-)

diff --git a/stories/2024-01-01-Eurovision/story.ipynb b/stories/2024-01-01-Eurovision/story.ipynb
index f423791..0d9ee0a 100644
--- a/stories/2024-01-01-Eurovision/story.ipynb
+++ b/stories/2024-01-01-Eurovision/story.ipynb
@@ -176,9 +176,9 @@
                 "def standardise_country(c):\n",
                 "    replacements = [('-', ' '), ('&', 'and'), ('Netherands', 'Netherlands'),\n",
                 "                    # FYR Macedonia was formally renamed as North Macedonia in 2019\n",
-                "                    ('F.Y.R. Macedonia', 'North Macedonia'), \n",
-                "                    ('Russia', 'Russian Federation'), \n",
-                "                    ('The Netherlands', 'Netherlands'), \n",
+                "                    ('F.Y.R. Macedonia', 'North Macedonia'),\n",
+                "                    ('Russia', 'Russian Federation'),\n",
+                "                    ('The Netherlands', 'Netherlands'),\n",
                 "                    ('Czech Republic', 'Czechia'),\n",
                 "                    # Yugoslavia dissolved in 2002; most of it became 'Serbia and Montenegro', until 2006, when Serbia and Montenegro split ways.\n",
                 "                    ('Serbia and Montenegro', 'yugoslavia'),\n",
@@ -552,11 +552,11 @@
                 "migration = (pd.read_csv('data/migration-flows.csv')\n",
                 "    .pipe(pd.melt, id_vars=['Country', 'Year'], var_name='Migration', value_name='Count')  # to long format\n",
                 "    .loc[lambda x: x['Migration'].str.contains('Emigrants')]                               # filter for emigrant rows\n",
-                "    .pipe(lambda x: x.rename(columns = {col: col.lower() for col in x.columns}))           # lowercase column names                                                         \n",
-                "    .assign(migration = lambda x: x.migration.str.replace('Emigrants from ', ''))          # filter for emigrant rows                          \n",
+                "    .pipe(lambda x: x.rename(columns = {col: col.lower() for col in x.columns}))           # lowercase column names\n",
+                "    .assign(migration = lambda x: x.migration.str.replace('Emigrants from ', ''))          # strip the 'Emigrants from ' prefix\n",
                 "    .rename(columns={'migration': 'emigrated_from', 'country': 'emigrated_to'})            # boil down to country name\n",
                 "    .query('count >= 0')                                                                   # negative counts are just total emigrants from country\n",
-                "    .pipe(lambda x: x.assign(count = x['count'].astype(int)))                              # convert count to int     \n",
+                "    .pipe(lambda x: x.assign(count = x['count'].astype(int)))                              # convert count to int\n",
                 ")\n",
                 "\n",
                 "# Clean up country names\n",
@@ -918,9 +918,9 @@
                 "plot_country_history('united kingdom', ax, df)\n",
                 "\n",
                 "legend_elements = [Line2D([0], [0], marker='*', color='white', label='Winner',\n",
-                "                          markerfacecolor='gold', markersize=12, markeredgecolor='black'),                          \n",
+                "                          markerfacecolor='gold', markersize=12, markeredgecolor='black'),\n",
                 "                  Line2D([0], [0], marker='x', color='white', label='Did not perform in final',\n",
-                "                          markerfacecolor='grey', markersize=8, markeredgecolor='grey'), \n",
+                "                          markerfacecolor='grey', markersize=8, markeredgecolor='grey'),\n",
                 "                  Line2D([0], [0], marker='_', color='white', label='Competition cancelled',\n",
                 "                          markerfacecolor='grey', markersize=8, markeredgecolor='grey')]\n",
                 "fig.legend(handles=legend_elements, loc='right', ncol=1, bbox_to_anchor=(0.9, -0.05))\n",
@@ -928,7 +928,7 @@
                 "sm = plt.cm.ScalarMappable(cmap=my_cmap, norm=plt.Normalize(vmin=1, vmax=26))\n",
                 "cbaxes = fig.add_axes([0.2, -0.05, 0.2, 0.05]) # x y deltax deltay\n",
                 "\n",
-                "fig.colorbar(sm, ax=ax, orientation='horizontal', fraction=0.02, pad=0.1, label='Position', \n",
+                "fig.colorbar(sm, ax=ax, orientation='horizontal', fraction=0.02, pad=0.1, label='Position',\n",
                 "             cax = cbaxes)\n",
                 "\n",
                 "plt.show()"
@@ -958,7 +958,7 @@
                 "                                        ):\n",
                 "    axs = subfig.subplots(1, 4, sharey=True, sharex=True, squeeze=True)\n",
                 "    subfig.suptitle(title, fontsize=14, fontweight='bold')\n",
-                "    for i, country in enumerate(country_group.index[:4]): \n",
+                "    for i, country in enumerate(country_group.index[:4]):\n",
                 "        plot_country_history(country, axs[i], df)\n",
                 "\n",
                 "# Add in colorbar and legend\n",
@@ -1042,17 +1042,17 @@
                 "\n",
                 "    row_dism = 1 - df_heatmap.T.corr()\n",
                 "    row_linkage = hc.linkage(sp.distance.squareform(row_dism), method='complete')\n",
-                "    plot = sns.clustermap(df_heatmap, row_linkage=row_linkage, col_linkage=row_linkage, \n",
+                "    plot = sns.clustermap(df_heatmap, row_linkage=row_linkage, col_linkage=row_linkage,\n",
                 "                          figsize=(9, 8),\n",
-                "                          mask=df_heatmap.isnull(), \n",
+                "                          mask=df_heatmap.isnull(),\n",
                 "                          dendrogram_ratio=[0.15, 0.01],\n",
                 "                          cbar_pos=(0.8, 1.01, 0.1, 0.019),\n",
                 "                          cbar_kws={'orientation': 'horizontal'},\n",
-                "                          cmap=cmap, \n",
+                "                          cmap=cmap,\n",
                 "                          xticklabels=1,\n",
                 "                          yticklabels=1,\n",
                 "                          **{'center': center} if center is not None else {})\n",
-                "    plot.ax_col_dendrogram.set_visible(False) \n",
+                "    plot.ax_col_dendrogram.set_visible(False)\n",
                 "\n",
                 "    plot.fig.suptitle(suptitle, fontsize=16, y=1.02)\n",
                 "    plot.ax_heatmap.set_xlabel('Performing country')\n",
@@ -1275,7 +1275,7 @@
                 "votes = (votes.assign(highest_average_point=lambda x: x.groupby(['to_country'])['average_points'].transform('max'))\n",
                 "              .sort_values(by='highest_average_point', ascending=True))\n",
                 "\n",
-                "fig = px.scatter(votes, x='average_points', y='to_country', color='vote_deviation', \n",
+                "fig = px.scatter(votes, x='average_points', y='to_country', color='vote_deviation',\n",
                 "                 color_continuous_scale=px.colors.diverging.RdBu_r,\n",
                 "                 color_continuous_midpoint=0)\n",
                 "fig.update_traces(hovertemplate=('Performer: %{customdata[0]}'\n",
@@ -1287,7 +1287,7 @@
                 "                  customdata=votes)\n",
                 "fig.update_layout(hoverlabel_align='left', width=640, height=640, margin=dict(l=20, r=20, t=20, b=20),\n",
                 "                  xaxis={'title': 'Average points received from each voter'},\n",
-                "                  yaxis={'title': 'Performing country'}, \n",
+                "                  yaxis={'title': 'Performing country'},\n",
                 "                  coloraxis_colorbar=dict(title='Deviation from average points'))\n",
                 "\n",
                 "fig.update_yaxes(tickfont_size=8)\n",
@@ -1361,8 +1361,8 @@
                 "\n",
                 "# Merge original dataframe with its reverse\n",
                 "df_pairs = (df_pairs\n",
-                "    .merge(df_pairs.rename(columns={'from_country': 'to_country', \n",
-                "                               'to_country': 'from_country'}), \n",
+                "    .merge(df_pairs.rename(columns={'from_country': 'to_country',\n",
+                "                               'to_country': 'from_country'}),\n",
                 "           on=['from_country', 'to_country'])\n",
                 "    .drop_duplicates()\n",
                 "    .query('from_country != to_country')\n",
@@ -1414,7 +1414,7 @@
                 "top_one_sided = (df_pairs\n",
                 "            .sort_values('votes_diff', ascending=False).head(N))\n",
                 "\n",
-                "# combine \n",
+                "# combine\n",
                 "top_relationships = (pd.concat([top_highs, top_lows, top_one_sided]))\n",
                 "# add grouping\n",
                 "top_relationships['group'] = ['high'] * N + ['low'] * N + ['one-sided'] * N\n",
@@ -1640,8 +1640,8 @@
             "outputs": [],
             "source": [
                 "#| code-fold: true\n",
-                "df_performance = df[['year', 'Artist', 'to_country', \n",
-                "       'total_points', \n",
+                "df_performance = df[['year', 'Artist', 'to_country',\n",
+                "       'total_points',\n",
                 "       'rank', 'to_code2',\n",
                 "       'Official_languages', 'Language_sung', 'Contains_English',\n",
                 "       'Contains_NonEnglish', 'Contains_Multiple_Languages',\n",
@@ -1682,7 +1682,7 @@
                 "df_language = df_performance.copy()\n",
                 "\n",
                 "df_performance['English_only'] = (df_performance['Contains_English']) & (df_performance['Number_of_Languages'] == 1 )\n",
-                "df_performance['No_English'] = ~df_performance['Contains_English'] \n",
+                "df_performance['No_English'] = ~df_performance['Contains_English']\n",
                 "df_performance['Some_English'] = (df_performance['Contains_English']) & (df_performance['Number_of_Languages'] > 1 )\n",
                 "\n",
                 "# for each country get the ratio of songs that contain only English, some English and no English\n",
@@ -1703,7 +1703,7 @@
                 "\n",
                 "df_language.plot(kind='bar', figsize=(15, 6), stacked=True, color=colours)\n",
                 "\n",
-                "plt.legend(['English only', 'Partly English', 'No English'], title=\"Performance languages\", loc=[1, 1], \n",
+                "plt.legend(['English only', 'Partly English', 'No English'], title=\"Performance languages\", loc=[1, 1],\n",
                 "        fontsize=14,  bbox_to_anchor=(0.51, 0., 0.5, 0.5), title_fontsize=16)\n",
                 "\n",
                 "plt.title('How frequently countries sing in English', fontsize=20, pad=30)\n",
@@ -1743,13 +1743,13 @@
                 "from statannot import add_stat_annotation\n",
                 "\n",
                 "# boxplots for each language type\n",
-                "ax = sns.boxplot(x='language', y='total_points', \n",
-                "    data=df_long.loc[df_long['contains_language'] > 0], \n",
-                "    palette=colours, showfliers=False, \n",
+                "ax = sns.boxplot(x='language', y='total_points',\n",
+                "    data=df_long.loc[df_long['contains_language'] > 0],\n",
+                "    palette=colours, showfliers=False,\n",
                 "    order=['English_only', 'Some_English', 'No_English'])\n",
-                "sns.stripplot(x='language', y='total_points', \n",
+                "sns.stripplot(x='language', y='total_points',\n",
                 "    order=['English_only', 'Some_English', 'No_English'],\n",
-                "    data=df_long.loc[df_long['contains_language'] > 0], \n",
+                "    data=df_long.loc[df_long['contains_language'] > 0],\n",
                 "    jitter=0.25, size=2, color=\".3\", linewidth=0)\n",
                 "\n",
                 "plt.title('Average points for performances in different languages', fontsize=20, pad=30)\n",
@@ -1757,7 +1757,7 @@
                 "plt.ylabel('Total points')\n",
                 "\n",
                 "add_stat_annotation(ax, data=df_long.loc[df_long['contains_language'] > 0],\n",
-                "                    x='language', y='total_points', \n",
+                "                    x='language', y='total_points',\n",
                 "                    order=['English_only', 'Some_English', 'No_English'],\n",
                 "                    box_pairs=[(\"English_only\", \"No_English\")],\n",
                 "                    test='Mann-Whitney', text_format='star', verbose=0)\n",
@@ -1774,8 +1774,8 @@
                 "#| code-fold: true\n",
                 "df_language = df_performance.copy()\n",
                 "\n",
-                "df_performance['Own_language'] = (df_performance['Contains_Own_Language']) \n",
-                "df_performance['Other_language'] = ~df_performance['Contains_Own_Language'] \n",
+                "df_performance['Own_language'] = (df_performance['Contains_Own_Language'])\n",
+                "df_performance['Other_language'] = ~df_performance['Contains_Own_Language']\n",
                 "\n",
                 "# for each country get the ratio of songs that contain only English, some English and no English\n",
                 "# then sort by the ratio of songs that contain only English\n",
@@ -1795,7 +1795,7 @@
                 "\n",
                 "df_language.plot(kind='bar', figsize=(15, 6), stacked=True, color=colours)\n",
                 "\n",
-                "plt.legend(['Other language', 'Own language'], title=\"Performance languages\", loc=[1, 1], \n",
+                "plt.legend(['Other language', 'Own language'], title=\"Performance languages\", loc=[1, 1],\n",
                 "        fontsize=14,  bbox_to_anchor=(0.51, 0., 0.5, 0.5), title_fontsize=16)\n",
                 "\n",
                 "plt.title('How frequently countries sing in their official languages', fontsize=20, pad=30)\n",
@@ -2022,7 +2022,7 @@
             "metadata": {},
             "outputs": [],
             "source": [
-                "# TODO: check if male get higher average votes. "
+                "# TODO: check whether male performers get higher average votes."
             ]
         },
         {
@@ -2051,8 +2051,8 @@
             "outputs": [],
             "source": [
                 "#| code-fold: true\n",
-                "df_performance = df[['year', 'Artist', 'to_country', \n",
-                "       'total_points', 'rank', 'to_code2', \n",
+                "df_performance = df[['year', 'Artist', 'to_country',\n",
+                "       'total_points', 'rank', 'to_code2',\n",
                 "       'Official_languages', 'Language_sung', 'Contains_English',\n",
                 "       'Contains_NonEnglish', 'Contains_Multiple_Languages',\n",
                 "       'prop_emigrants_v2p', 'prop_emigrants_p2v', 'has_border',\n",
@@ -2075,10 +2075,10 @@
                 "    'Contains_Voting_Language': 'binary',\n",
                 "    'Contains_English': 'binary',\n",
                 "    'Contains_NonEnglish': 'binary',\n",
-                "    'prop_emigrants_v2p': 'numeric', \n",
-                "    'prop_emigrants_p2v': 'numeric', \n",
+                "    'prop_emigrants_v2p': 'numeric',\n",
+                "    'prop_emigrants_p2v': 'numeric',\n",
                 "    'has_border': 'binary',\n",
-                "    'gender': 'categorical', \n",
+                "    'gender': 'categorical',\n",
                 "    'comps_without_win': 'numeric'\n",
                 "}\n",
                 "\n",
@@ -2097,8 +2097,8 @@
                 "    elif value == 'binary':\n",
                 "        sns.violinplot(ax=axes[j, k], x=key, y='points', data=df, color='tab:blue', inner=None, showmeans=True)\n",
                 "    else:\n",
-                "        sns.regplot(ax=axes[j, k], x=key, y='points', data=df, ci=95, \n",
-                "                    color='tab:blue', scatter_kws={'alpha': 0.4, 'edgecolor': 'none', 's': 20}, \n",
+                "        sns.regplot(ax=axes[j, k], x=key, y='points', data=df, ci=95,\n",
+                "                    color='tab:blue', scatter_kws={'alpha': 0.4, 'edgecolor': 'none', 's': 20},\n",
                 "                    line_kws={'color': 'tab:orange'})\n",
                 "\n",
                 "        # if key contains Prop then log scale x axis\n",
@@ -2262,7 +2262,7 @@
                 "def train_baseline(excluded_years=None):\n",
                 "    if excluded_years is None:\n",
                 "        excluded_years = []\n",
-                "        \n",
+                "\n",
                 "    # Get maximum rank in each year (= number of countries participating)\n",
                 "    df_rank_max = (df[['year', 'rank']]\n",
                 "                .query('year not in @excluded_years')\n",
@@ -2270,7 +2270,7 @@
                 "                .agg('max')\n",
                 "                .rename({'rank': 'rank_max'}, axis=1)\n",
                 "                )\n",
-                "    \n",
+                "\n",
                 "    # Rescale rank to go from 0 = last place (rank = rank_max) to 1 = first place (rank = 1).\n",
                 "    df_rank = (df[['to_country', 'year', 'rank']]\n",
                 "                .query('year <= 2022')\n",
@@ -2281,9 +2281,9 @@
                 "                .agg({'rescaled_rank': 'mean'})\n",
                 "                .sort_values('rescaled_rank', ascending=False)\n",
                 "    )\n",
-                "    \n",
+                "\n",
                 "    return df_rank\n",
-                "    \n",
+                "\n",
                 "ranks_without_2022 = train_baseline(excluded_years=[2022])\n",
                 "ranks_without_2022.head(n=10)"
             ]
@@ -2315,22 +2315,22 @@
                 "    predictions['predicted_rank'] = predictions.index + 1\n",
                 "    predictions = predictions.set_index('to_country')\n",
                 "    return predictions\n",
-                "    \n",
+                "\n",
                 "def get_actual_ranks(year):\n",
                 "    return (df.query('year == @year')[['to_country', 'rank']]\n",
                 "                .drop_duplicates()\n",
                 "                .set_index('to_country')\n",
                 "                .sort_values('rank'))\n",
-                "                \n",
+                "\n",
                 "def join_predictions_and_actual(predicted_ranks, actual_ranks):\n",
                 "    both = predicted_ranks.join(actual_ranks, validate='one_to_one').astype(int)\n",
                 "    both = both.rename({'rank': 'actual_rank'}, axis=1)\n",
                 "    return both[['predicted_rank', 'actual_rank']]\n",
-                "    \n",
+                "\n",
                 "def get_spearman(predicted_ranks, actual_ranks):\n",
                 "    both_ranks = join_predictions_and_actual(predicted_ranks, actual_ranks)\n",
                 "    return both_ranks['predicted_rank'].corr(both_ranks['actual_rank'], method='spearman')\n",
-                "    \n",
+                "\n",
                 "predictions_baseline = predict_baseline(ranks_without_2022, 2022)\n",
                 "actual = get_actual_ranks(2022)\n",
                 "both = join_predictions_and_actual(predictions_baseline, actual)\n",
@@ -2373,7 +2373,7 @@
                 "        predictions_baseline = predictions_baseline.query('to_country != \"slovakia\"')\n",
                 "        actual = actual.query('to_country != \"slovakia\"')\n",
                 "    return(get_spearman(predictions_baseline, actual))\n",
-                "    \n",
+                "\n",
                 "spearmans = [get_spearman_baseline_without(year) for year in all_years]\n",
                 "\n",
                 "print(f'Mean Spearman coefficient across {len(all_years)} years: {np.mean(spearmans)}')\n",
@@ -2658,7 +2658,7 @@
                 "  sigmaAlpha ~ cauchy(0,1);\n",
                 "\n",
                 "  alpha ~ normal( xphi * phi, sigmaAlpha );\n",
-                "  \n",
+                "\n",
                 "  // remembering that vp is 0-indexed and alpha is 1-indexed\n",
                 "  y ~ ordered_logistic( gamma + alpha[ add(vp,1) ] + (xbeta * beta), lambda );\n",
                 "\n",
@@ -2695,7 +2695,7 @@
                 "# build xbeta matrix\n",
                 "xbeta_train = df_train.loc[:,['Contains_English_bin','Contains_Own_Language_bin','male','female','comps_without_win']].values\n",
                 "# minmax scaling of 'comps_since_last_win'\n",
-                "scaler = MinMaxScaler() \n",
+                "scaler = MinMaxScaler()\n",
                 "xbeta_train_norm = scaler.fit_transform(xbeta_train)\n",
                 "\n",
                 "xbeta_test = df_test.loc[:,['Contains_English_bin','Contains_Own_Language_bin','male','female','comps_without_win']].values\n",
@@ -2798,10 +2798,10 @@
             "outputs": [],
             "source": [
                 "az_fit = az.from_pystan(\n",
-                "    posterior=fit, \n",
-                "    observed_data=\"y\", \n",
+                "    posterior=fit,\n",
+                "    observed_data=\"y\",\n",
                 "    posterior_predictive=\"y_hat\",\n",
-                "    predictions=\"y_pred\", \n",
+                "    predictions=\"y_pred\",\n",
                 "    posterior_model=posterior)\n",
                 "\n",
                 "az.plot_trace(az_fit, [\"beta\",\"lambda\"], figsize=(20,8), legend=True)\n",
@@ -2840,6 +2840,674 @@
                 "Overall, a ranked XGBoost model is a powerful machine learning tool that can be used to predict rankings or orders of items, based on various factors that may influence their positions in the ranking. This technique is often used to train XGBoost models for ranking tasks, such as search engine ranking or recommendation systems."
             ]
         },
+        {
+            "cell_type": "code",
+            "execution_count": null,
+            "metadata": {},
+            "outputs": [],
+            "source": [
+                "import xgboost as xgb\n",
+                "from sklearn.model_selection import train_test_split\n",
+                "from sklearn.metrics import accuracy_score\n",
+                "from sklearn.metrics import confusion_matrix\n",
+                "from sklearn.preprocessing import MinMaxScaler\n",
+                "from sklearn.preprocessing import StandardScaler"
+            ]
+        },
+        {
+            "cell_type": "code",
+            "execution_count": null,
+            "metadata": {},
+            "outputs": [],
+            "source": [
+                "df['received_vote'] = df['points'].apply(lambda x: 1 if x > 0 else 0)"
+            ]
+        },
+        {
+            "cell_type": "code",
+            "execution_count": null,
+            "metadata": {},
+            "outputs": [],
+            "source": [
+                "model_columns = [\n",
+                "        'from_code2',\n",
+                "        'points',\n",
+                "        'to_code2',\n",
+                "        'Contains_English',\n",
+                "        'Contains_NonEnglish',\n",
+                "        'Contains_Multiple_Languages',\n",
+                "        'Number_of_Languages',\n",
+                "        'Contains_Own_Language',\n",
+                "        'Contains_Voting_Language',\n",
+                "        'gender',\n",
+                "        'prop_emigrants_v2p',\n",
+                "        'prop_emigrants_p2v',\n",
+                "        'has_border',\n",
+                "        'comps_without_win',\n",
+                "        'received_vote',\n",
+                "]"
+            ]
+        },
+        {
+            "cell_type": "code",
+            "execution_count": null,
+            "metadata": {},
+            "outputs": [],
+            "source": [
+                "# Bar chart for vote occurrences\n",
+                "fig, (ax1, ax2) = plt.subplots(1, 2, figsize=(10, 4))\n",
+                "\n",
+                "sns.countplot(x=\"points\", data=df, ax=ax1)\n",
+                "ax1.set_ylabel('Count', fontsize=12)\n",
+                "ax1.set_xlabel('Score', fontsize=12)\n",
+                "ax1.set_title('Number of votes', fontsize=12)\n",
+                "\n",
+                "sns.countplot(x=\"received_vote\", data=df, ax=ax2)\n",
+                "ax2.set_ylabel('Count', fontsize=12)\n",
+                "ax2.set_xlabel('Received vote', fontsize=12)\n",
+                "ax2.set_title('Number of votes', fontsize=12)\n",
+                "\n",
+                "plt.show()"
+            ]
+        },
+        {
+            "cell_type": "code",
+            "execution_count": null,
+            "metadata": {},
+            "outputs": [],
+            "source": [
+                "df_xgboost = df[['year'] + model_columns]\n",
+                "\n",
+                "df_xgboost['has_border'] = df_xgboost['has_border'].fillna(0)\n",
+                "\n",
+                "# log10 scale both prop_emigrants columns (the 4e-8 offset keeps log10 defined when the proportion is 0)\n",
+                "df_xgboost['prop_emigrants_v2p'] = df_xgboost['prop_emigrants_v2p'].apply(\n",
+                "    lambda x: np.log10(x+4e-8)\n",
+                ")\n",
+                "df_xgboost['prop_emigrants_p2v'] = df_xgboost['prop_emigrants_p2v'].apply(\n",
+                "    lambda x: np.log10(x+4e-8)\n",
+                ")\n",
+                "\n",
+                "# standardise the log-scaled prop_emigrants columns; comps_without_win is min-max scaled below\n",
+                "scaler = StandardScaler()\n",
+                "df_xgboost[['prop_emigrants_v2p']] = scaler.fit_transform(\n",
+                "    df_xgboost[['prop_emigrants_v2p']]\n",
+                ")\n",
+                "df_xgboost[['prop_emigrants_p2v']] = scaler.fit_transform(\n",
+                "    df_xgboost[['prop_emigrants_p2v']]\n",
+                ")\n",
+                "\n",
+                "scaler = MinMaxScaler()\n",
+                "df_xgboost[['comps_without_win']] = scaler.fit_transform(\n",
+                "    df_xgboost[['comps_without_win']]\n",
+                ")"
+            ]
+        },
+        {
+            "cell_type": "code",
+            "execution_count": null,
+            "metadata": {},
+            "outputs": [],
+            "source": [
+                "fig, (ax1, ax2, ax3) = plt.subplots(1, 3, figsize=(10, 4))\n",
+                "\n",
+                "# histograms of the scaled comps_without_win and prop_emigrants columns\n",
+                "df_xgboost['comps_without_win'].hist(bins=20, ax=ax1)\n",
+                "df_xgboost['prop_emigrants_v2p'].hist(bins=20, ax=ax2)\n",
+                "df_xgboost['prop_emigrants_p2v'].hist(bins=20, ax=ax3)"
+            ]
+        },
+        {
+            "cell_type": "code",
+            "execution_count": null,
+            "metadata": {},
+            "outputs": [],
+            "source": [
+                "def basic_xgboost(df_input, seed, test_size, predictor_column='received_vote'):\n",
+                "    X = df_input[model_columns]\n",
+                "    Y = df_input[predictor_column]\n",
+                "\n",
+                "    # one-hot encode gender, voting country (from_code2) and performing country (to_code2)\n",
+                "    for j in ['gender', 'from_code2', 'to_code2']:\n",
+                "        j_text = '_voting' if j == 'from_code2' else ''\n",
+                "        for i in X[j].unique():\n",
+                "            output_binary = X[j].apply(lambda x: 1 if x == i else 0)\n",
+                "            X[i+j_text] = output_binary\n",
+                "\n",
+                "        X = X[X.columns.drop(j)]\n",
+                "\n",
+                "    # convert Y to int\n",
+                "    Y = Y.astype(int)\n",
+                "\n",
+                "    X_train, X_test, y_train, y_test = train_test_split(X, Y, test_size=test_size, random_state=seed, stratify=X['points'])\n",
+                "\n",
+                "    test_all = X_test\n",
+                "\n",
+                "    # remove the 'points' and 'received_vote' columns from X_train and X_test so the target does not leak into the features\n",
+                "    X_train = X_train[X_train.columns.drop('points')]\n",
+                "    X_train = X_train[X_train.columns.drop('received_vote')]\n",
+                "    X_test = X_test[X_test.columns.drop('points')]\n",
+                "    X_test = X_test[X_test.columns.drop('received_vote')]\n",
+                "\n",
+                "    model = xgb.XGBClassifier()\n",
+                "    model.fit(X_train, y_train)\n",
+                "\n",
+                "    return model, X_test, y_test, test_all\n",
+                "\n",
+                "def model_predictions(model, X_test, y_test):\n",
+                "    Xt = X_test.copy()\n",
+                "    y_pred = model.predict(Xt)\n",
+                "    predictions = [round(value) for value in y_pred]\n",
+                "\n",
+                "    Xt['prob'] = model.predict_proba(Xt)[:,1]\n",
+                "    Xt['predictions'] = predictions\n",
+                "    Xt['actual'] = y_test\n",
+                "    return Xt\n",
+                "\n",
+                "def model_evalutation(df_pred):\n",
+                "    # evaluate predictions\n",
+                "    accuracy = accuracy_score(df_pred['actual'], df_pred['predictions'])\n",
+                "    print(\"Accuracy: %.2f%%\" % (accuracy * 100.0))\n",
+                "\n",
+                "    # print a confusion matrix heatmap\n",
+                "    cm = confusion_matrix(df_pred['actual'], df_pred['predictions'])\n",
+                "    sns.heatmap(cm, annot=True, fmt=\"d\", cmap=\"Blues\")\n",
+                "\n",
+                "    # add title and axis labels\n",
+                "    plt.title('Confusion matrix' + \"(Accuracy={:.3f})\".format(accuracy))\n",
+                "    plt.ylabel('True label')\n",
+                "    plt.xlabel('Predicted label')"
+            ]
+        },
+        {
+            "cell_type": "code",
+            "execution_count": null,
+            "metadata": {},
+            "outputs": [],
+            "source": [
+                "model_basic, X_test, y_test, test_all = basic_xgboost(df_xgboost, seed=7, test_size=0.33)\n",
+                "predictions = model_predictions(model_basic, X_test, y_test)\n",
+                "model_evalutation(predictions)"
+            ]
+        },
+        {
+            "cell_type": "code",
+            "execution_count": null,
+            "metadata": {},
+            "outputs": [],
+            "source": [
+                "graph = xgb.to_graphviz(model_basic, num_trees=1, rankdir='LR')\n",
+                "\n",
+                "fig, ax = plt.subplots(figsize=(30, 50))\n",
+                "xgb.plot_tree(model_basic, num_trees=1, ax=ax)"
+            ]
+        },
+        {
+            "cell_type": "code",
+            "execution_count": null,
+            "metadata": {},
+            "outputs": [],
+            "source": [
+                "import xgboost as xgb\n",
+                "from scipy.stats import rankdata\n",
+                "from sklearn.metrics import confusion_matrix\n",
+                "import seaborn as sns\n",
+                "from sklearn.model_selection import GroupShuffleSplit\n",
+                "from scipy.stats import spearmanr\n",
+                "import numpy as np"
+            ]
+        },
+        {
+            "cell_type": "code",
+            "execution_count": null,
+            "metadata": {},
+            "outputs": [],
+            "source": [
+                "def xgboost_rank_model(df_input, test_size=0.33, seed=7):\n",
+                "\n",
+                "    df2 = df_input[['year', 'from_code2', 'to_code2', 'points',\n",
+                "        'Contains_English',\n",
+                "        'Contains_NonEnglish', 'Contains_Multiple_Languages',\n",
+                "        'Number_of_Languages', 'Contains_Own_Language', 'Contains_Voting_Language', 'gender',\n",
+                "        'prop_emigrants_v2p',  'prop_emigrants_p2v','has_border',\n",
+                "        'comps_without_win']]\n",
+                "\n",
+                "    df2 = df2.rename(columns={\"points\": 'rank'})\n",
+                "\n",
+                "    df2['rank'] = df2['rank'].astype(int) - 1\n",
+                "\n",
+                "    df2['id'] = df2['from_code2'].astype(str) + df2['year'].astype(str)\n",
+                "\n",
+                "    # one-hot encode gender, to_code2 and from_code2 (from_code2 columns get a '_voting' suffix)\n",
+                "    for j in ['gender', 'to_code2', 'from_code2']:\n",
+                "        j_text = \"_voting\" if j == 'from_code2' else ''\n",
+                "        for i in df2[j].unique():\n",
+                "            df2[i+j_text] = df2[j].apply(lambda x: 1 if x == i else 0)\n",
+                "\n",
+                "        df2 = df2[df2.columns.drop(j)]\n",
+                "\n",
+                "    df2= df2[df2.columns.drop('year')]\n",
+                "\n",
+                "    gss = GroupShuffleSplit(test_size=test_size, n_splits=1, random_state = seed).split(df2, groups=df2['id'])\n",
+                "\n",
+                "    X_train_inds, X_test_inds = next(gss)\n",
+                "\n",
+                "    train_data= df2.iloc[X_train_inds]\n",
+                "    X_train = train_data.loc[:, ~train_data.columns.isin(['id','rank'])]\n",
+                "    y_train = train_data.loc[:, train_data.columns.isin(['rank'])]\n",
+                "\n",
+                "    groups = train_data.groupby('id').size().to_frame('size')['size'].to_numpy()\n",
+                "\n",
+                "    test_data= df2.iloc[X_test_inds]\n",
+                "    test_data = test_data[X_train.columns.tolist() + ['rank', 'id']]\n",
+                "\n",
+                "    model = xgb.XGBRanker(\n",
+                "        tree_method='hist',\n",
+                "        booster='gbtree',\n",
+                "        objective='rank:pairwise',\n",
+                "        random_state=7,\n",
+                "        learning_rate=0.1,\n",
+                "        colsample_bytree=0.9,\n",
+                "        eta=0.05,\n",
+                "        max_depth=6,\n",
+                "        n_estimators=110,\n",
+                "        subsample=0.75\n",
+                "    )\n",
+                "\n",
+                "    model.fit(X_train, y_train, group=groups, verbose=True)\n",
+                "\n",
+                "    return model, test_data, train_data\n",
+                "\n",
+                "\n",
+                "\n",
+                "def ranked_model_predictions(model, test_set):\n",
+                "    test = test_set.copy()\n",
+                "\n",
+                "    test['prediction_rel'] = np.nan\n",
+                "    test['predictions'] = np.nan\n",
+                "\n",
+                "    for i in test['id'].unique():\n",
+                "        sub_test = test.loc[test['id'] == i, ~test.columns.isin(['id', 'rank', 'prediction_rel', 'predictions'])]\n",
+                "\n",
+                "        preds = model.predict(sub_test)\n",
+                "\n",
+                "        # get the order of the predictions\n",
+                "        res = rankdata(preds, method='ordinal')\n",
+                "        test.loc[test['id'] == i, 'prediction_rel'] = res\n",
+                "\n",
+                "        # create a score for top 10 predictions getting 1:10 and others 0\n",
+                "        top10 = test.loc[test['id'] == i, 'prediction_rel'].nlargest(10).values\n",
+                "\n",
+                "\n",
+                "        test.loc[test['id'] == i, 'predictions'] = test.loc[test['id'] == i, 'prediction_rel'].apply(lambda x: x if x in top10 else 0)\n",
+                "\n",
+                "        # non-zero scores: shift so the highest-ranked entry gets 10 (x - max(top10) + 10)\n",
+                "        test.loc[test['id'] == i, 'predictions'] = test.loc[test['id'] == i, 'predictions'].apply(lambda x: x - max(top10) + 10 if x != 0 else 0)\n",
+                "\n",
+                "        # remap the top two scores: 10 becomes 12, then 9 becomes 10\n",
+                "        test.loc[test['id'] == i, 'predictions'] = test.loc[test['id'] == i, 'predictions'].apply(lambda x: 12 if x == 10 else x)\n",
+                "        test.loc[test['id'] == i, 'predictions'] = test.loc[test['id'] == i, 'predictions'].apply(lambda x: 10 if x == 9 else x)\n",
+                "\n",
+                "    test['actual'] = test['rank'] + 1\n",
+                "\n",
+                "\n",
+                "    accuracy = accuracy_score(test['actual'], test['predictions'])\n",
+                "    print(\"Accuracy: %.2f%%\" % (accuracy * 100.0))\n",
+                "\n",
+                "    return test"
+            ]
+        },
+        {
+            "cell_type": "code",
+            "execution_count": null,
+            "metadata": {},
+            "outputs": [],
+            "source": [
+                "model_ranked, test_data, train_data = xgboost_rank_model(df_xgboost.loc[df_xgboost['points'] > 0], seed=7, test_size=0.33)\n",
+                "out = ranked_model_predictions(model_ranked, test_data)"
+            ]
+        },
+        {
+            "cell_type": "code",
+            "execution_count": null,
+            "metadata": {},
+            "outputs": [],
+            "source": [
+                "model_ranked_all, test_data_all, train_data_all = xgboost_rank_model(df_xgboost, seed=7, test_size=0.33)\n",
+                "out_all = ranked_model_predictions(model_ranked_all, test_data_all)"
+            ]
+        },
+        {
+            "cell_type": "code",
+            "execution_count": null,
+            "metadata": {},
+            "outputs": [],
+            "source": [
+                "out_all['predictions'].value_counts()"
+            ]
+        },
+        {
+            "cell_type": "code",
+            "execution_count": null,
+            "metadata": {},
+            "outputs": [],
+            "source": [
+                "def violins(pred):\n",
+                "    prediction_df = pred.copy()\n",
+                "    fig, (ax1, ax2) = plt.subplots(1, 2, figsize=(20, 7))\n",
+                "\n",
+                "    if(max(prediction_df['actual']) < 12):\n",
+                "        if(max(prediction_df['actual']) == 11):\n",
+                "            prediction_df['predictions'] = prediction_df['predictions'] + 1\n",
+                "            prediction_df['actual'] = prediction_df['actual'] + 1\n",
+                "        prediction_df['predictions'] = prediction_df['predictions'].apply(lambda x: 12 if x == 10 else x)\n",
+                "        prediction_df['predictions'] = prediction_df['predictions'].apply(lambda x: 10 if x == 9 else x)\n",
+                "        prediction_df['actual'] = prediction_df['actual'].apply(lambda x: 12 if x == 10 else x)\n",
+                "        prediction_df['actual'] = prediction_df['actual'].apply(lambda x: 10 if x == 9 else x)\n",
+                "\n",
+                "    # violin plot of predictions for each rank\n",
+                "    sns.violinplot(x=\"actual\", y=\"predictions\", data=prediction_df, order=range(0, 13), ax=ax1)\n",
+                "    sns.violinplot(y=\"actual\", x=\"predictions\", data=prediction_df, order=range(0, 13), ax=ax2)\n",
+                "\n",
+                "    # add a best fit line - scale is wrong because numeric on top of categorical\n",
+                "    sns.regplot(x=\"actual\", y=\"predictions\", data=prediction_df, scatter=False, color='black', ax=ax1)\n",
+                "    sns.regplot(y=\"actual\", x=\"predictions\", data=prediction_df, scatter=False, color='black', ax=ax2)\n",
+                "\n",
+                "\n",
+                "    # spearman correlation\n",
+                "    corr, _ = spearmanr(prediction_df['actual'], prediction_df['predictions'])\n",
+                "\n",
+                "    fig.suptitle('Predicted Score vs Actual Score (r = ' + str(round(corr, 4)) + ')', fontsize=15)\n",
+                "\n",
+                "    # set axis labels\n",
+                "    ax1.set_xlabel('Actual Score')\n",
+                "    ax1.set_ylabel('Predicted Score')\n",
+                "    ax2.set_ylabel('Actual Score')\n",
+                "    ax2.set_xlabel('Predicted Score')\n",
+                "\n",
+                "def cm_heatmap (prediction_df, title='Confusion matrix', ax=None):\n",
+                "    cm = confusion_matrix(prediction_df['actual'], prediction_df['predictions'])\n",
+                "\n",
+                "    plot = sns.heatmap(cm, annot=True, fmt='g', ax=ax,\n",
+                "                    vmin=0, vmax=100, cmap='Blues')\n",
+                "\n",
+                "    if(len(cm[[0]][0]) == 11) :\n",
+                "    # change the tick labels\n",
+                "        plot.set_xticks(np.arange(0, 11, 1)+0.5, np.arange(0, 9, 1).tolist() + [10, 12])\n",
+                "        plot.set_yticks(np.arange(0, 11, 1)+0.5, np.arange(0, 9, 1).tolist() + [10, 12])\n",
+                "    else:\n",
+                "        plot.set_xticks(np.arange(0, 10, 1)+0.5, np.arange(1, 9, 1).tolist() + [10, 12])\n",
+                "        plot.set_yticks(np.arange(0, 10, 1)+0.5, np.arange(1, 9, 1).tolist() + [10, 12])\n",
+                "\n",
+                "    # add tick marks\n",
+                "    plot.tick_params(axis='both', which='both', length=5, color='black')\n",
+                "\n",
+                "    plot.set_title(title)\n",
+                "    plot.set_ylabel('Actual Score')\n",
+                "    plot.set_xlabel('Predicted Score')"
+            ]
+        },
+        {
+            "cell_type": "code",
+            "execution_count": null,
+            "metadata": {},
+            "outputs": [],
+            "source": [
+                "violins(out)"
+            ]
+        },
+        {
+            "cell_type": "code",
+            "execution_count": null,
+            "metadata": {},
+            "outputs": [],
+            "source": [
+                "violins(out_all.loc[out_all['actual'] > 0])"
+            ]
+        },
+        {
+            "cell_type": "code",
+            "execution_count": null,
+            "metadata": {},
+            "outputs": [],
+            "source": [
+                "cm_heatmap(out)"
+            ]
+        },
+        {
+            "cell_type": "code",
+            "execution_count": null,
+            "metadata": {},
+            "outputs": [],
+            "source": [
+                "cm_heatmap(out_all)"
+            ]
+        },
+        {
+            "cell_type": "markdown",
+            "metadata": {},
+            "source": [
+                "## Predicting the 2023 results"
+            ]
+        },
+        {
+            "cell_type": "code",
+            "execution_count": null,
+            "metadata": {},
+            "outputs": [],
+            "source": [
+                "df_2023 = future\n",
+                "\n",
+                "df_2023['id'] = df_2023['from_code2'] + df_2023['year'].astype(str)\n",
+                "df_2023['received_vote'] = 0\n",
+                "\n",
+                "df_2023 = df_2023[df_xgboost.columns]\n",
+                "\n",
+                "# remove the points and received_vote columns\n",
+                "df_2023 = df_2023[df_2023.columns.drop('points')]\n",
+                "df_2023 = df_2023[df_2023.columns.drop('received_vote')]\n",
+                "\n",
+                "df_2023.head()"
+            ]
+        },
+        {
+            "cell_type": "code",
+            "execution_count": null,
+            "metadata": {},
+            "outputs": [],
+            "source": [
+                "df_2023['from_code2'].value_counts().shape"
+            ]
+        },
+        {
+            "cell_type": "markdown",
+            "metadata": {},
+            "source": [
+                "Remove Russia from the voting, since it did not participate in 2023."
+            ]
+        },
+        {
+            "cell_type": "code",
+            "execution_count": null,
+            "metadata": {},
+            "outputs": [],
+            "source": [
+                "df_2023 = df_2023.loc[df_2023['from_code2'] != 'RU', ]"
+            ]
+        },
+        {
+            "cell_type": "code",
+            "execution_count": null,
+            "metadata": {},
+            "outputs": [],
+            "source": [
+                "df_2023['id'] = df_2023['from_code2'] + df_2023['year'].astype(str)"
+            ]
+        },
+        {
+            "cell_type": "code",
+            "execution_count": null,
+            "metadata": {},
+            "outputs": [],
+            "source": [
+                "# Check\n",
+                "print(df_2023['to_code2'].value_counts().value_counts())\n",
+                "print(df_2023['from_code2'].value_counts().value_counts())"
+            ]
+        },
+        {
+            "cell_type": "code",
+            "execution_count": null,
+            "metadata": {},
+            "outputs": [],
+            "source": [
+                "df_hurdle = df_2023.copy()\n",
+                "\n",
+                "# one-hot encode gender, to_code2 and from_code2 (from_code2 columns get a '_voting' suffix)\n",
+                "for j in ['gender', 'to_code2', 'from_code2']:\n",
+                "    j_text = \"_voting\" if j == 'from_code2' else ''\n",
+                "    for i in df_hurdle[j].unique():\n",
+                "        df_hurdle[i+j_text] = df_hurdle[j].apply(lambda x: 1 if x == i else 0)\n",
+                "\n",
+                "    df_hurdle = df_hurdle[df_hurdle.columns.drop(j)]\n",
+                "\n",
+                "df_hurdle= df_hurdle[df_hurdle.columns.drop('year')]\n",
+                "\n",
+                "df_hurdle.head()"
+            ]
+        },
+        {
+            "cell_type": "code",
+            "execution_count": null,
+            "metadata": {},
+            "outputs": [],
+            "source": [
+                "# print the columns that are in one of df_hurdle / test_data_all but not the other\n",
+                "print([col for col in df_hurdle.columns if col not in test_data_all.columns])\n",
+                "print([col for col in test_data_all.columns if col not in df_hurdle.columns])\n",
+                "\n",
+                "# Add the missing columns\n",
+                "for col in test_data_all.columns:\n",
+                "    if col not in df_hurdle.columns:\n",
+                "        df_hurdle[col] = 0\n",
+                "\n",
+                "# add the missing columns\n",
+                "for col in test_data_all.columns:\n",
+                "    if col not in df_hurdle.columns:\n",
+                "        df_hurdle[col] = 0\n",
+                "\n",
+                "df_hurdle = df_hurdle[test_data_all.columns]\n",
+                "\n",
+                "df_hurdle['rank'] = 1\n",
+                "\n",
+                "df_hurdle.head()"
+            ]
+        },
+        {
+            "cell_type": "code",
+            "execution_count": null,
+            "metadata": {},
+            "outputs": [],
+            "source": [
+                "out_all = ranked_model_predictions(model_ranked_all, df_hurdle)\n",
+                "out_all['predictions'].value_counts()"
+            ]
+        },
+        {
+            "cell_type": "code",
+            "execution_count": null,
+            "metadata": {},
+            "outputs": [],
+            "source": [
+                "p1 = out_all.copy()\n",
+                "\n",
+                "cols = [col for col in p1.columns if '_voting' in col]\n",
+                "cols2 = [col for col in p1.columns if '_voting' not in col]\n",
+                "\n",
+                "\n",
+                "# wide format _voting columns to long format\n",
+                "p1 = pd.melt(p1, id_vars=cols2,\n",
+                "                       value_vars= cols, var_name='from_code2')\n",
+                "p1 = p1.loc[p1['value'] == 1]\n",
+                "\n",
+                "\n",
+                "p1['from_code2'] = p1['from_code2'].str.replace('_voting', '').tolist()\n",
+                "\n",
+                "p1 = p1.loc[p1['value'] == 1]\n",
+                "p1 = p1.drop('value', axis=1)\n",
+                "\n",
+                "cols2 = [\n",
+                "        'id',\n",
+                "        'from_code2',\n",
+                "        'Contains_English',\n",
+                "        'Contains_NonEnglish',\n",
+                "        'Contains_Multiple_Languages',\n",
+                "        'Number_of_Languages',\n",
+                "        'Contains_Own_Language',\n",
+                "        'Contains_Voting_Language',\n",
+                "        'has_border',\n",
+                "        'prediction_rel',\n",
+                "        'prop_emigrants_v2p',\n",
+                "        'prop_emigrants_p2v',\n",
+                "        'group',\n",
+                "        'female',\n",
+                "        'male',\n",
+                "        'rank',\n",
+                "        'comps_without_win',\n",
+                "        'predictions',\n",
+                "        'actual']\n",
+                "\n",
+                "cols = [col for col in p1.columns if col not in cols2]\n",
+                "\n",
+                "# print the p1 columns that are not in cols2\n",
+                "print([col for col in p1.columns if col not in cols2])\n",
+                "\n",
+                "p1 = pd.melt(p1,\n",
+                "            id_vars=cols2,\n",
+                "            value_vars= cols,\n",
+                "            var_name='to_code2')\n",
+                "p1 = p1.loc[p1['value'] == 1]\n",
+                "# drop value column\n",
+                "p1 = p1.drop('value', axis=1)\n",
+                "\n",
+                "p1.head()"
+            ]
+        },
+        {
+            "cell_type": "code",
+            "execution_count": null,
+            "metadata": {},
+            "outputs": [],
+            "source": [
+                "#| code-fold: true\n",
+                "#| code-summary: \"Code to print the results\"\n",
+                "\n",
+                "p1['total_points'] = p1.groupby('to_code2')['predictions'].transform('sum')\n",
+                "\n",
+                "# results\n",
+                "res = p1[['total_points', 'to_code2']].drop_duplicates()\n",
+                "\n",
+                "# match the country name to the country code\n",
+                "res = res.merge(df[['from_code2', 'from_country']],\n",
+                "                left_on='to_code2',\n",
+                "                right_on='from_code2',\n",
+                "                how='left').drop_duplicates()\n",
+                "\n",
+                "res['position'] = res['total_points'].rank(ascending=False)\n",
+                "\n",
+                "# sort by total_points\n",
+                "res = res.sort_values(by=['total_points'], ascending=False)\n",
+                "\n",
+                "# rename from_country to country\n",
+                "res = res.rename(columns={'from_country': 'country'})\n",
+                "\n",
+                "# remove from_code2 column\n",
+                "res = res.drop('from_code2', axis=1)\n",
+                "\n",
+                "# display the results table\n",
+                "res"
+            ]
+        },
         {
             "attachments": {},
             "cell_type": "markdown",