diff --git a/GDP Prediction/Readme.md b/GDP Prediction/Readme.md
new file mode 100644
index 000000000..aafc88948
--- /dev/null
+++ b/GDP Prediction/Readme.md
@@ -0,0 +1,13 @@
+**GDP Prediction Model**
+
+Dataset : https://www.kaggle.com/rutikbhoyar/gdp-prediction-dataset
+
+Four regressors **(Linear Regression, SVM, Random Forest, and Gradient Boosting)** were tested to predict GDP. Ranked from best to worst prediction performance: \
+\
+**Random Forest > Gradient Boosting > Linear Regression > SVM**
+
+The metrics for the best-performing model (Random Forest regressor trained on all features in the dataset) are:
+
+1. MAE: 2125.24
+2. RMSE: 3051.71
+3. R2_Score: 0.8873
diff --git a/GDP Prediction/gdp-prediction-model.ipynb b/GDP Prediction/gdp-prediction-model.ipynb
new file mode 100644
index 000000000..010cc723d
--- /dev/null
+++ b/GDP Prediction/gdp-prediction-model.ipynb
@@ -0,0 +1,3820 @@
+{
+ "metadata": {
+ "kernelspec": {
+ "language": "python",
+ "display_name": "Python 3",
+ "name": "python3"
+ },
+ "language_info": {
+ "name": "python",
+ "version": "3.7.12",
+ "mimetype": "text/x-python",
+ "codemirror_mode": {
+ "name": "ipython",
+ "version": 3
+ },
+ "pygments_lexer": "ipython3",
+ "nbconvert_exporter": "python",
+ "file_extension": ".py"
+ },
+ "colab": {
+ "name": "gdp-prediction-model.ipynb",
+ "provenance": []
+ }
+ },
+ "nbformat_minor": 0,
+ "nbformat": 4,
+ "cells": [
+ {
+ "cell_type": "markdown",
+ "source": [
+ "## Importing necessary libraries"
+ ],
+ "metadata": {
+ "id": "RaiCm2JYQnY6"
+ }
+ },
+ {
+ "cell_type": "code",
+ "source": [
+ "import numpy as np \n",
+ "import pandas as pd\n",
+ "import matplotlib.pyplot as plt\n",
+ "import seaborn as sns"
+ ],
+ "metadata": {
+ "_uuid": "8f2839f25d086af736a60e9eeb907d3b93b6e0e5",
+ "_cell_guid": "b1076dfc-b9ad-4769-8c92-a6c4dae69d19",
+ "execution": {
+ "iopub.status.busy": "2022-03-25T10:07:34.902728Z",
+ "iopub.execute_input": "2022-03-25T10:07:34.903232Z",
+ "iopub.status.idle": "2022-03-25T10:07:35.985950Z",
+ "shell.execute_reply.started": "2022-03-25T10:07:34.903201Z",
+ "shell.execute_reply": "2022-03-25T10:07:35.985035Z"
+ },
+ "trusted": true,
+ "id": "WT18AgP1QnZA"
+ },
+ "execution_count": 6,
+ "outputs": []
+ },
+ {
+ "cell_type": "markdown",
+ "source": [
+ "## Initial Check on Dataset"
+ ],
+ "metadata": {
+ "id": "jz_-ngFkQnZC"
+ }
+ },
+ {
+ "cell_type": "code",
+ "source": [
+ "# Load world.csv (the Kaggle GDP prediction dataset).\n",
+ "# On Google Colab the file is read from the mounted Drive. Anywhere else the\n",
+ "# google.colab package is missing, so the file is expected next to this notebook.\n",
+ "try:\n",
+ "    from google.colab import drive\n",
+ "except ImportError:\n",
+ "    data_path = 'world.csv'\n",
+ "else:\n",
+ "    drive.mount('/content/gdrive')\n",
+ "    data_path = '/content/gdrive/My Drive/Colab Notebooks/world.csv'\n",
+ "\n",
+ "df = pd.read_csv(data_path)\n",
+ "df.head()"
+ ],
+ "metadata": {
+ "execution": {
+ "iopub.status.busy": "2022-03-25T10:07:35.987621Z",
+ "iopub.execute_input": "2022-03-25T10:07:35.987942Z",
+ "iopub.status.idle": "2022-03-25T10:07:36.037531Z",
+ "shell.execute_reply.started": "2022-03-25T10:07:35.987909Z",
+ "shell.execute_reply": "2022-03-25T10:07:36.036542Z"
+ },
+ "trusted": true,
+ "colab": {
+ "base_uri": "https://localhost:8080/",
+ "height": 406
+ },
+ "id": "ykSkzIvCQnZD",
+ "outputId": "f7522154-beca-4d41-ef25-1d30b0f7e8bc"
+ },
+ "execution_count": 7,
+ "outputs": [
+ {
+ "output_type": "stream",
+ "name": "stdout",
+ "text": [
+ "Drive already mounted at /content/gdrive; to attempt to forcibly remount, call drive.mount(\"/content/gdrive\", force_remount=True).\n"
+ ]
+ },
+ {
+ "output_type": "execute_result",
+ "data": {
+ "text/plain": [
+ " Country Region Population \\\n",
+ "0 Afghanistan ASIA (EX. NEAR EAST) 31056997 \n",
+ "1 Albania EASTERN EUROPE 3581655 \n",
+ "2 Algeria NORTHERN AFRICA 32930091 \n",
+ "3 American Samoa OCEANIA 57794 \n",
+ "4 Andorra WESTERN EUROPE 71201 \n",
+ "\n",
+ " Area (sq. mi.) Pop. Density (per sq. mi.) Coastline (coast/area ratio) \\\n",
+ "0 647500 48,0 0,00 \n",
+ "1 28748 124,6 1,26 \n",
+ "2 2381740 13,8 0,04 \n",
+ "3 199 290,4 58,29 \n",
+ "4 468 152,1 0,00 \n",
+ "\n",
+ " Net migration Infant mortality (per 1000 births) GDP ($ per capita) \\\n",
+ "0 23,06 163,07 700.0 \n",
+ "1 -4,93 21,52 4500.0 \n",
+ "2 -0,39 31 6000.0 \n",
+ "3 -20,71 9,27 8000.0 \n",
+ "4 6,6 4,05 19000.0 \n",
+ "\n",
+ " Literacy (%) Phones (per 1000) Arable (%) Crops (%) Other (%) Climate \\\n",
+ "0 36,0 3,2 12,13 0,22 87,65 1 \n",
+ "1 86,5 71,2 21,09 4,42 74,49 3 \n",
+ "2 70,0 78,1 3,22 0,25 96,53 1 \n",
+ "3 97,0 259,5 10 15 75 2 \n",
+ "4 100,0 497,2 2,22 0 97,78 3 \n",
+ "\n",
+ " Birthrate Deathrate Agriculture Industry Service \n",
+ "0 46,6 20,34 0,38 0,24 0,38 \n",
+ "1 15,11 5,22 0,232 0,188 0,579 \n",
+ "2 17,14 4,61 0,101 0,6 0,298 \n",
+ "3 22,46 3,27 NaN NaN NaN \n",
+ "4 8,71 6,25 NaN NaN NaN "
+ ],
+ "text/html": [
+ "\n",
+ "
\n",
+ "
\n",
+ "
\n",
+ "\n",
+ "
\n",
+ " \n",
+ " \n",
+ " | \n",
+ " Country | \n",
+ " Region | \n",
+ " Population | \n",
+ " Area (sq. mi.) | \n",
+ " Pop. Density (per sq. mi.) | \n",
+ " Coastline (coast/area ratio) | \n",
+ " Net migration | \n",
+ " Infant mortality (per 1000 births) | \n",
+ " GDP ($ per capita) | \n",
+ " Literacy (%) | \n",
+ " Phones (per 1000) | \n",
+ " Arable (%) | \n",
+ " Crops (%) | \n",
+ " Other (%) | \n",
+ " Climate | \n",
+ " Birthrate | \n",
+ " Deathrate | \n",
+ " Agriculture | \n",
+ " Industry | \n",
+ " Service | \n",
+ "
\n",
+ " \n",
+ " \n",
+ " \n",
+ " 0 | \n",
+ " Afghanistan | \n",
+ " ASIA (EX. NEAR EAST) | \n",
+ " 31056997 | \n",
+ " 647500 | \n",
+ " 48,0 | \n",
+ " 0,00 | \n",
+ " 23,06 | \n",
+ " 163,07 | \n",
+ " 700.0 | \n",
+ " 36,0 | \n",
+ " 3,2 | \n",
+ " 12,13 | \n",
+ " 0,22 | \n",
+ " 87,65 | \n",
+ " 1 | \n",
+ " 46,6 | \n",
+ " 20,34 | \n",
+ " 0,38 | \n",
+ " 0,24 | \n",
+ " 0,38 | \n",
+ "
\n",
+ " \n",
+ " 1 | \n",
+ " Albania | \n",
+ " EASTERN EUROPE | \n",
+ " 3581655 | \n",
+ " 28748 | \n",
+ " 124,6 | \n",
+ " 1,26 | \n",
+ " -4,93 | \n",
+ " 21,52 | \n",
+ " 4500.0 | \n",
+ " 86,5 | \n",
+ " 71,2 | \n",
+ " 21,09 | \n",
+ " 4,42 | \n",
+ " 74,49 | \n",
+ " 3 | \n",
+ " 15,11 | \n",
+ " 5,22 | \n",
+ " 0,232 | \n",
+ " 0,188 | \n",
+ " 0,579 | \n",
+ "
\n",
+ " \n",
+ " 2 | \n",
+ " Algeria | \n",
+ " NORTHERN AFRICA | \n",
+ " 32930091 | \n",
+ " 2381740 | \n",
+ " 13,8 | \n",
+ " 0,04 | \n",
+ " -0,39 | \n",
+ " 31 | \n",
+ " 6000.0 | \n",
+ " 70,0 | \n",
+ " 78,1 | \n",
+ " 3,22 | \n",
+ " 0,25 | \n",
+ " 96,53 | \n",
+ " 1 | \n",
+ " 17,14 | \n",
+ " 4,61 | \n",
+ " 0,101 | \n",
+ " 0,6 | \n",
+ " 0,298 | \n",
+ "
\n",
+ " \n",
+ " 3 | \n",
+ " American Samoa | \n",
+ " OCEANIA | \n",
+ " 57794 | \n",
+ " 199 | \n",
+ " 290,4 | \n",
+ " 58,29 | \n",
+ " -20,71 | \n",
+ " 9,27 | \n",
+ " 8000.0 | \n",
+ " 97,0 | \n",
+ " 259,5 | \n",
+ " 10 | \n",
+ " 15 | \n",
+ " 75 | \n",
+ " 2 | \n",
+ " 22,46 | \n",
+ " 3,27 | \n",
+ " NaN | \n",
+ " NaN | \n",
+ " NaN | \n",
+ "
\n",
+ " \n",
+ " 4 | \n",
+ " Andorra | \n",
+ " WESTERN EUROPE | \n",
+ " 71201 | \n",
+ " 468 | \n",
+ " 152,1 | \n",
+ " 0,00 | \n",
+ " 6,6 | \n",
+ " 4,05 | \n",
+ " 19000.0 | \n",
+ " 100,0 | \n",
+ " 497,2 | \n",
+ " 2,22 | \n",
+ " 0 | \n",
+ " 97,78 | \n",
+ " 3 | \n",
+ " 8,71 | \n",
+ " 6,25 | \n",
+ " NaN | \n",
+ " NaN | \n",
+ " NaN | \n",
+ "
\n",
+ " \n",
+ "
\n",
+ "
\n",
+ "
\n",
+ " \n",
+ " \n",
+ "\n",
+ " \n",
+ "
\n",
+ "
\n",
+ " "
+ ]
+ },
+ "metadata": {},
+ "execution_count": 7
+ }
+ ]
+ },
+ {
+ "cell_type": "code",
+ "source": [
+ "df.info()"
+ ],
+ "metadata": {
+ "execution": {
+ "iopub.status.busy": "2022-03-25T10:07:36.038969Z",
+ "iopub.execute_input": "2022-03-25T10:07:36.039284Z",
+ "iopub.status.idle": "2022-03-25T10:07:36.067732Z",
+ "shell.execute_reply.started": "2022-03-25T10:07:36.039242Z",
+ "shell.execute_reply": "2022-03-25T10:07:36.066828Z"
+ },
+ "trusted": true,
+ "colab": {
+ "base_uri": "https://localhost:8080/"
+ },
+ "id": "HIWfPm-TQnZE",
+ "outputId": "63fc4d2b-fab2-4fd7-ff31-33c50b49836b"
+ },
+ "execution_count": 8,
+ "outputs": [
+ {
+ "output_type": "stream",
+ "name": "stdout",
+ "text": [
+ "\n",
+ "RangeIndex: 227 entries, 0 to 226\n",
+ "Data columns (total 20 columns):\n",
+ " # Column Non-Null Count Dtype \n",
+ "--- ------ -------------- ----- \n",
+ " 0 Country 227 non-null object \n",
+ " 1 Region 227 non-null object \n",
+ " 2 Population 227 non-null int64 \n",
+ " 3 Area (sq. mi.) 227 non-null int64 \n",
+ " 4 Pop. Density (per sq. mi.) 227 non-null object \n",
+ " 5 Coastline (coast/area ratio) 227 non-null object \n",
+ " 6 Net migration 224 non-null object \n",
+ " 7 Infant mortality (per 1000 births) 224 non-null object \n",
+ " 8 GDP ($ per capita) 226 non-null float64\n",
+ " 9 Literacy (%) 209 non-null object \n",
+ " 10 Phones (per 1000) 223 non-null object \n",
+ " 11 Arable (%) 225 non-null object \n",
+ " 12 Crops (%) 225 non-null object \n",
+ " 13 Other (%) 225 non-null object \n",
+ " 14 Climate 205 non-null object \n",
+ " 15 Birthrate 224 non-null object \n",
+ " 16 Deathrate 223 non-null object \n",
+ " 17 Agriculture 212 non-null object \n",
+ " 18 Industry 211 non-null object \n",
+ " 19 Service 212 non-null object \n",
+ "dtypes: float64(1), int64(2), object(17)\n",
+ "memory usage: 35.6+ KB\n"
+ ]
+ }
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "source": [
+ "Dataset has 20 Columns with 227 Entries"
+ ],
+ "metadata": {
+ "id": "jC9mf5yUQnZE"
+ }
+ },
+ {
+ "cell_type": "code",
+ "source": [
+ "df.describe()"
+ ],
+ "metadata": {
+ "execution": {
+ "iopub.status.busy": "2022-03-25T10:07:36.070511Z",
+ "iopub.execute_input": "2022-03-25T10:07:36.071281Z",
+ "iopub.status.idle": "2022-03-25T10:07:36.100287Z",
+ "shell.execute_reply.started": "2022-03-25T10:07:36.071227Z",
+ "shell.execute_reply": "2022-03-25T10:07:36.099652Z"
+ },
+ "trusted": true,
+ "colab": {
+ "base_uri": "https://localhost:8080/",
+ "height": 300
+ },
+ "id": "FpmDEU8pQnZF",
+ "outputId": "d1c0d069-835e-479f-e646-08ed7845a610"
+ },
+ "execution_count": 9,
+ "outputs": [
+ {
+ "output_type": "execute_result",
+ "data": {
+ "text/plain": [
+ " Population Area (sq. mi.) GDP ($ per capita)\n",
+ "count 2.270000e+02 2.270000e+02 226.000000\n",
+ "mean 2.874028e+07 5.982270e+05 9689.823009\n",
+ "std 1.178913e+08 1.790282e+06 10049.138513\n",
+ "min 7.026000e+03 2.000000e+00 500.000000\n",
+ "25% 4.376240e+05 4.647500e+03 1900.000000\n",
+ "50% 4.786994e+06 8.660000e+04 5550.000000\n",
+ "75% 1.749777e+07 4.418110e+05 15700.000000\n",
+ "max 1.313974e+09 1.707520e+07 55100.000000"
+ ],
+ "text/html": [
+ "\n",
+ " \n",
+ "
\n",
+ "
\n",
+ "\n",
+ "
\n",
+ " \n",
+ " \n",
+ " | \n",
+ " Population | \n",
+ " Area (sq. mi.) | \n",
+ " GDP ($ per capita) | \n",
+ "
\n",
+ " \n",
+ " \n",
+ " \n",
+ " count | \n",
+ " 2.270000e+02 | \n",
+ " 2.270000e+02 | \n",
+ " 226.000000 | \n",
+ "
\n",
+ " \n",
+ " mean | \n",
+ " 2.874028e+07 | \n",
+ " 5.982270e+05 | \n",
+ " 9689.823009 | \n",
+ "
\n",
+ " \n",
+ " std | \n",
+ " 1.178913e+08 | \n",
+ " 1.790282e+06 | \n",
+ " 10049.138513 | \n",
+ "
\n",
+ " \n",
+ " min | \n",
+ " 7.026000e+03 | \n",
+ " 2.000000e+00 | \n",
+ " 500.000000 | \n",
+ "
\n",
+ " \n",
+ " 25% | \n",
+ " 4.376240e+05 | \n",
+ " 4.647500e+03 | \n",
+ " 1900.000000 | \n",
+ "
\n",
+ " \n",
+ " 50% | \n",
+ " 4.786994e+06 | \n",
+ " 8.660000e+04 | \n",
+ " 5550.000000 | \n",
+ "
\n",
+ " \n",
+ " 75% | \n",
+ " 1.749777e+07 | \n",
+ " 4.418110e+05 | \n",
+ " 15700.000000 | \n",
+ "
\n",
+ " \n",
+ " max | \n",
+ " 1.313974e+09 | \n",
+ " 1.707520e+07 | \n",
+ " 55100.000000 | \n",
+ "
\n",
+ " \n",
+ "
\n",
+ "
\n",
+ "
\n",
+ " \n",
+ " \n",
+ "\n",
+ " \n",
+ "
\n",
+ "
\n",
+ " "
+ ]
+ },
+ "metadata": {},
+ "execution_count": 9
+ }
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "source": [
+ "Only three columns have a numeric dtype. The earlier `df.info()` output shows that most of the columns are stored as `object`, so they need to be converted."
+ ],
+ "metadata": {
+ "id": "Jr19pfNfQnZF"
+ }
+ },
+ {
+ "cell_type": "code",
+ "source": [
+ "## Changing the Datatype\n",
+ "\n",
+ "# Text columns that identify a row; stored as pandas categoricals.\n",
+ "categorical_cols = ['Country', 'Region']\n",
+ "\n",
+ "# Columns that were read as strings because the CSV writes decimals with a comma (e.g. '48,0').\n",
+ "decimal_comma_cols = [\n",
+ "    'Pop. Density (per sq. mi.)', 'Coastline (coast/area ratio)', 'Net migration',\n",
+ "    'Infant mortality (per 1000 births)', 'Literacy (%)', 'Phones (per 1000)',\n",
+ "    'Arable (%)', 'Crops (%)', 'Other (%)', 'Climate', 'Birthrate', 'Deathrate',\n",
+ "    'Agriculture', 'Industry', 'Service',\n",
+ "]\n",
+ "\n",
+ "for col in categorical_cols:\n",
+ "    df[col] = df[col].astype('category')\n",
+ "\n",
+ "for col in decimal_comma_cols:\n",
+ "    # astype(str) turns missing values into the text 'nan', which the float conversion maps back to NaN.\n",
+ "    # regex=False makes the comma a literal match regardless of the pandas default for `regex`.\n",
+ "    df[col] = df[col].astype(str).str.replace(',', '.', regex=False).astype(float)\n"
+ ],
+ "metadata": {
+ "execution": {
+ "iopub.status.busy": "2022-03-25T10:07:36.101277Z",
+ "iopub.execute_input": "2022-03-25T10:07:36.101851Z",
+ "iopub.status.idle": "2022-03-25T10:07:36.129797Z",
+ "shell.execute_reply.started": "2022-03-25T10:07:36.101817Z",
+ "shell.execute_reply": "2022-03-25T10:07:36.128887Z"
+ },
+ "trusted": true,
+ "id": "BKfJQx-EQnZG"
+ },
+ "execution_count": 10,
+ "outputs": []
+ },
+ {
+ "cell_type": "markdown",
+ "source": [
+ "The Country and Region columns are converted to the **category** dtype, while the remaining numeric columns are converted to **float**. The category dtype in pandas is a hybrid data type: it looks and behaves like a string in many instances, but internally it is represented by an array of integers. This allows the data to be sorted in a custom order and stored more efficiently."
+ ],
+ "metadata": {
+ "id": "X3eoaUP5QnZH"
+ }
+ },
+ {
+ "cell_type": "code",
+ "source": [
+ "df.info()"
+ ],
+ "metadata": {
+ "execution": {
+ "iopub.status.busy": "2022-03-25T10:07:36.130975Z",
+ "iopub.execute_input": "2022-03-25T10:07:36.131186Z",
+ "iopub.status.idle": "2022-03-25T10:07:36.152527Z",
+ "shell.execute_reply.started": "2022-03-25T10:07:36.131160Z",
+ "shell.execute_reply": "2022-03-25T10:07:36.151532Z"
+ },
+ "trusted": true,
+ "colab": {
+ "base_uri": "https://localhost:8080/"
+ },
+ "id": "QsJmlMi1QnZI",
+ "outputId": "fab75aed-6dde-4005-8a01-915e4e1a3991"
+ },
+ "execution_count": 11,
+ "outputs": [
+ {
+ "output_type": "stream",
+ "name": "stdout",
+ "text": [
+ "\n",
+ "RangeIndex: 227 entries, 0 to 226\n",
+ "Data columns (total 20 columns):\n",
+ " # Column Non-Null Count Dtype \n",
+ "--- ------ -------------- ----- \n",
+ " 0 Country 227 non-null category\n",
+ " 1 Region 227 non-null category\n",
+ " 2 Population 227 non-null int64 \n",
+ " 3 Area (sq. mi.) 227 non-null int64 \n",
+ " 4 Pop. Density (per sq. mi.) 227 non-null float64 \n",
+ " 5 Coastline (coast/area ratio) 227 non-null float64 \n",
+ " 6 Net migration 224 non-null float64 \n",
+ " 7 Infant mortality (per 1000 births) 224 non-null float64 \n",
+ " 8 GDP ($ per capita) 226 non-null float64 \n",
+ " 9 Literacy (%) 209 non-null float64 \n",
+ " 10 Phones (per 1000) 223 non-null float64 \n",
+ " 11 Arable (%) 225 non-null float64 \n",
+ " 12 Crops (%) 225 non-null float64 \n",
+ " 13 Other (%) 225 non-null float64 \n",
+ " 14 Climate 205 non-null float64 \n",
+ " 15 Birthrate 224 non-null float64 \n",
+ " 16 Deathrate 223 non-null float64 \n",
+ " 17 Agriculture 212 non-null float64 \n",
+ " 18 Industry 211 non-null float64 \n",
+ " 19 Service 212 non-null float64 \n",
+ "dtypes: category(2), float64(16), int64(2)\n",
+ "memory usage: 43.0 KB\n"
+ ]
+ }
+ ]
+ },
+ {
+ "cell_type": "code",
+ "source": [
+ "df.describe()"
+ ],
+ "metadata": {
+ "execution": {
+ "iopub.status.busy": "2022-03-25T10:07:36.153826Z",
+ "iopub.execute_input": "2022-03-25T10:07:36.154198Z",
+ "iopub.status.idle": "2022-03-25T10:07:36.221684Z",
+ "shell.execute_reply.started": "2022-03-25T10:07:36.154160Z",
+ "shell.execute_reply": "2022-03-25T10:07:36.220748Z"
+ },
+ "trusted": true,
+ "colab": {
+ "base_uri": "https://localhost:8080/",
+ "height": 416
+ },
+ "id": "PDw14IE8QnZI",
+ "outputId": "8e55d091-fe7b-4a0f-c2f4-b9acdba7b7de"
+ },
+ "execution_count": 12,
+ "outputs": [
+ {
+ "output_type": "execute_result",
+ "data": {
+ "text/plain": [
+ " Population Area (sq. mi.) Pop. Density (per sq. mi.) \\\n",
+ "count 2.270000e+02 2.270000e+02 227.000000 \n",
+ "mean 2.874028e+07 5.982270e+05 379.047137 \n",
+ "std 1.178913e+08 1.790282e+06 1660.185825 \n",
+ "min 7.026000e+03 2.000000e+00 0.000000 \n",
+ "25% 4.376240e+05 4.647500e+03 29.150000 \n",
+ "50% 4.786994e+06 8.660000e+04 78.800000 \n",
+ "75% 1.749777e+07 4.418110e+05 190.150000 \n",
+ "max 1.313974e+09 1.707520e+07 16271.500000 \n",
+ "\n",
+ " Coastline (coast/area ratio) Net migration \\\n",
+ "count 227.000000 224.000000 \n",
+ "mean 21.165330 0.038125 \n",
+ "std 72.286863 4.889269 \n",
+ "min 0.000000 -20.990000 \n",
+ "25% 0.100000 -0.927500 \n",
+ "50% 0.730000 0.000000 \n",
+ "75% 10.345000 0.997500 \n",
+ "max 870.660000 23.060000 \n",
+ "\n",
+ " Infant mortality (per 1000 births) GDP ($ per capita) Literacy (%) \\\n",
+ "count 224.000000 226.000000 209.000000 \n",
+ "mean 35.506964 9689.823009 82.838278 \n",
+ "std 35.389899 10049.138513 19.722173 \n",
+ "min 2.290000 500.000000 17.600000 \n",
+ "25% 8.150000 1900.000000 70.600000 \n",
+ "50% 21.000000 5550.000000 92.500000 \n",
+ "75% 55.705000 15700.000000 98.000000 \n",
+ "max 191.190000 55100.000000 100.000000 \n",
+ "\n",
+ " Phones (per 1000) Arable (%) Crops (%) Other (%) Climate \\\n",
+ "count 223.000000 225.000000 225.000000 225.000000 205.000000 \n",
+ "mean 236.061435 13.797111 4.564222 81.638311 2.139024 \n",
+ "std 227.991829 13.040402 8.361470 16.140835 0.699397 \n",
+ "min 0.200000 0.000000 0.000000 33.330000 1.000000 \n",
+ "25% 37.800000 3.220000 0.190000 71.650000 2.000000 \n",
+ "50% 176.200000 10.420000 1.030000 85.700000 2.000000 \n",
+ "75% 389.650000 20.000000 4.440000 95.440000 3.000000 \n",
+ "max 1035.600000 62.110000 50.680000 100.000000 4.000000 \n",
+ "\n",
+ " Birthrate Deathrate Agriculture Industry Service \n",
+ "count 224.000000 223.000000 212.000000 211.000000 212.000000 \n",
+ "mean 22.114732 9.241345 0.150844 0.282711 0.565283 \n",
+ "std 11.176716 4.990026 0.146798 0.138272 0.165841 \n",
+ "min 7.290000 2.290000 0.000000 0.020000 0.062000 \n",
+ "25% 12.672500 5.910000 0.037750 0.193000 0.429250 \n",
+ "50% 18.790000 7.840000 0.099000 0.272000 0.571000 \n",
+ "75% 29.820000 10.605000 0.221000 0.341000 0.678500 \n",
+ "max 50.730000 29.740000 0.769000 0.906000 0.954000 "
+ ],
+ "text/html": [
+ "\n",
+ " \n",
+ "
\n",
+ "
\n",
+ "\n",
+ "
\n",
+ " \n",
+ " \n",
+ " | \n",
+ " Population | \n",
+ " Area (sq. mi.) | \n",
+ " Pop. Density (per sq. mi.) | \n",
+ " Coastline (coast/area ratio) | \n",
+ " Net migration | \n",
+ " Infant mortality (per 1000 births) | \n",
+ " GDP ($ per capita) | \n",
+ " Literacy (%) | \n",
+ " Phones (per 1000) | \n",
+ " Arable (%) | \n",
+ " Crops (%) | \n",
+ " Other (%) | \n",
+ " Climate | \n",
+ " Birthrate | \n",
+ " Deathrate | \n",
+ " Agriculture | \n",
+ " Industry | \n",
+ " Service | \n",
+ "
\n",
+ " \n",
+ " \n",
+ " \n",
+ " count | \n",
+ " 2.270000e+02 | \n",
+ " 2.270000e+02 | \n",
+ " 227.000000 | \n",
+ " 227.000000 | \n",
+ " 224.000000 | \n",
+ " 224.000000 | \n",
+ " 226.000000 | \n",
+ " 209.000000 | \n",
+ " 223.000000 | \n",
+ " 225.000000 | \n",
+ " 225.000000 | \n",
+ " 225.000000 | \n",
+ " 205.000000 | \n",
+ " 224.000000 | \n",
+ " 223.000000 | \n",
+ " 212.000000 | \n",
+ " 211.000000 | \n",
+ " 212.000000 | \n",
+ "
\n",
+ " \n",
+ " mean | \n",
+ " 2.874028e+07 | \n",
+ " 5.982270e+05 | \n",
+ " 379.047137 | \n",
+ " 21.165330 | \n",
+ " 0.038125 | \n",
+ " 35.506964 | \n",
+ " 9689.823009 | \n",
+ " 82.838278 | \n",
+ " 236.061435 | \n",
+ " 13.797111 | \n",
+ " 4.564222 | \n",
+ " 81.638311 | \n",
+ " 2.139024 | \n",
+ " 22.114732 | \n",
+ " 9.241345 | \n",
+ " 0.150844 | \n",
+ " 0.282711 | \n",
+ " 0.565283 | \n",
+ "
\n",
+ " \n",
+ " std | \n",
+ " 1.178913e+08 | \n",
+ " 1.790282e+06 | \n",
+ " 1660.185825 | \n",
+ " 72.286863 | \n",
+ " 4.889269 | \n",
+ " 35.389899 | \n",
+ " 10049.138513 | \n",
+ " 19.722173 | \n",
+ " 227.991829 | \n",
+ " 13.040402 | \n",
+ " 8.361470 | \n",
+ " 16.140835 | \n",
+ " 0.699397 | \n",
+ " 11.176716 | \n",
+ " 4.990026 | \n",
+ " 0.146798 | \n",
+ " 0.138272 | \n",
+ " 0.165841 | \n",
+ "
\n",
+ " \n",
+ " min | \n",
+ " 7.026000e+03 | \n",
+ " 2.000000e+00 | \n",
+ " 0.000000 | \n",
+ " 0.000000 | \n",
+ " -20.990000 | \n",
+ " 2.290000 | \n",
+ " 500.000000 | \n",
+ " 17.600000 | \n",
+ " 0.200000 | \n",
+ " 0.000000 | \n",
+ " 0.000000 | \n",
+ " 33.330000 | \n",
+ " 1.000000 | \n",
+ " 7.290000 | \n",
+ " 2.290000 | \n",
+ " 0.000000 | \n",
+ " 0.020000 | \n",
+ " 0.062000 | \n",
+ "
\n",
+ " \n",
+ " 25% | \n",
+ " 4.376240e+05 | \n",
+ " 4.647500e+03 | \n",
+ " 29.150000 | \n",
+ " 0.100000 | \n",
+ " -0.927500 | \n",
+ " 8.150000 | \n",
+ " 1900.000000 | \n",
+ " 70.600000 | \n",
+ " 37.800000 | \n",
+ " 3.220000 | \n",
+ " 0.190000 | \n",
+ " 71.650000 | \n",
+ " 2.000000 | \n",
+ " 12.672500 | \n",
+ " 5.910000 | \n",
+ " 0.037750 | \n",
+ " 0.193000 | \n",
+ " 0.429250 | \n",
+ "
\n",
+ " \n",
+ " 50% | \n",
+ " 4.786994e+06 | \n",
+ " 8.660000e+04 | \n",
+ " 78.800000 | \n",
+ " 0.730000 | \n",
+ " 0.000000 | \n",
+ " 21.000000 | \n",
+ " 5550.000000 | \n",
+ " 92.500000 | \n",
+ " 176.200000 | \n",
+ " 10.420000 | \n",
+ " 1.030000 | \n",
+ " 85.700000 | \n",
+ " 2.000000 | \n",
+ " 18.790000 | \n",
+ " 7.840000 | \n",
+ " 0.099000 | \n",
+ " 0.272000 | \n",
+ " 0.571000 | \n",
+ "
\n",
+ " \n",
+ " 75% | \n",
+ " 1.749777e+07 | \n",
+ " 4.418110e+05 | \n",
+ " 190.150000 | \n",
+ " 10.345000 | \n",
+ " 0.997500 | \n",
+ " 55.705000 | \n",
+ " 15700.000000 | \n",
+ " 98.000000 | \n",
+ " 389.650000 | \n",
+ " 20.000000 | \n",
+ " 4.440000 | \n",
+ " 95.440000 | \n",
+ " 3.000000 | \n",
+ " 29.820000 | \n",
+ " 10.605000 | \n",
+ " 0.221000 | \n",
+ " 0.341000 | \n",
+ " 0.678500 | \n",
+ "
\n",
+ " \n",
+ " max | \n",
+ " 1.313974e+09 | \n",
+ " 1.707520e+07 | \n",
+ " 16271.500000 | \n",
+ " 870.660000 | \n",
+ " 23.060000 | \n",
+ " 191.190000 | \n",
+ " 55100.000000 | \n",
+ " 100.000000 | \n",
+ " 1035.600000 | \n",
+ " 62.110000 | \n",
+ " 50.680000 | \n",
+ " 100.000000 | \n",
+ " 4.000000 | \n",
+ " 50.730000 | \n",
+ " 29.740000 | \n",
+ " 0.769000 | \n",
+ " 0.906000 | \n",
+ " 0.954000 | \n",
+ "
\n",
+ " \n",
+ "
\n",
+ "
\n",
+ "
\n",
+ " \n",
+ " \n",
+ "\n",
+ " \n",
+ "
\n",
+ "
\n",
+ " "
+ ]
+ },
+ "metadata": {},
+ "execution_count": 12
+ }
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "source": [
+ "## Understanding more about the Dataset\n",
+ "\n",
+ "The dataset does not explain what the values in a few of the columns (Climate, Agriculture, Industry, and Service) represent. We need to inspect them to understand them better."
+ ],
+ "metadata": {
+ "id": "00YKFeZ5QnZJ"
+ }
+ },
+ {
+ "cell_type": "code",
+ "source": [
+ "# First rows of the undocumented columns, shown next to the country and region they belong to.\n",
+ "df[['Country', 'Region', 'Climate', 'Agriculture', 'Industry', 'Service']].head()"
+ ],
+ "metadata": {
+ "execution": {
+ "iopub.status.busy": "2022-03-25T10:07:36.222860Z",
+ "iopub.execute_input": "2022-03-25T10:07:36.223080Z",
+ "iopub.status.idle": "2022-03-25T10:07:36.239411Z",
+ "shell.execute_reply.started": "2022-03-25T10:07:36.223054Z",
+ "shell.execute_reply": "2022-03-25T10:07:36.238749Z"
+ },
+ "trusted": true,
+ "colab": {
+ "base_uri": "https://localhost:8080/",
+ "height": 206
+ },
+ "id": "aunPcXB3QnZJ",
+ "outputId": "d5fe5ae8-17e3-4346-90d3-80bea8a78c36"
+ },
+ "execution_count": 13,
+ "outputs": [
+ {
+ "output_type": "execute_result",
+ "data": {
+ "text/plain": [
+ " Country Region Climate Agriculture \\\n",
+ "0 Afghanistan ASIA (EX. NEAR EAST) 1.0 0.380 \n",
+ "1 Albania EASTERN EUROPE 3.0 0.232 \n",
+ "2 Algeria NORTHERN AFRICA 1.0 0.101 \n",
+ "3 American Samoa OCEANIA 2.0 NaN \n",
+ "4 Andorra WESTERN EUROPE 3.0 NaN \n",
+ "\n",
+ " Industry Service \n",
+ "0 0.240 0.380 \n",
+ "1 0.188 0.579 \n",
+ "2 0.600 0.298 \n",
+ "3 NaN NaN \n",
+ "4 NaN NaN "
+ ],
+ "text/html": [
+ "\n",
+ " \n",
+ "
\n",
+ "
\n",
+ "\n",
+ "
\n",
+ " \n",
+ " \n",
+ " | \n",
+ " Country | \n",
+ " Region | \n",
+ " Climate | \n",
+ " Agriculture | \n",
+ " Industry | \n",
+ " Service | \n",
+ "
\n",
+ " \n",
+ " \n",
+ " \n",
+ " 0 | \n",
+ " Afghanistan | \n",
+ " ASIA (EX. NEAR EAST) | \n",
+ " 1.0 | \n",
+ " 0.380 | \n",
+ " 0.240 | \n",
+ " 0.380 | \n",
+ "
\n",
+ " \n",
+ " 1 | \n",
+ " Albania | \n",
+ " EASTERN EUROPE | \n",
+ " 3.0 | \n",
+ " 0.232 | \n",
+ " 0.188 | \n",
+ " 0.579 | \n",
+ "
\n",
+ " \n",
+ " 2 | \n",
+ " Algeria | \n",
+ " NORTHERN AFRICA | \n",
+ " 1.0 | \n",
+ " 0.101 | \n",
+ " 0.600 | \n",
+ " 0.298 | \n",
+ "
\n",
+ " \n",
+ " 3 | \n",
+ " American Samoa | \n",
+ " OCEANIA | \n",
+ " 2.0 | \n",
+ " NaN | \n",
+ " NaN | \n",
+ " NaN | \n",
+ "
\n",
+ " \n",
+ " 4 | \n",
+ " Andorra | \n",
+ " WESTERN EUROPE | \n",
+ " 3.0 | \n",
+ " NaN | \n",
+ " NaN | \n",
+ " NaN | \n",
+ "
\n",
+ " \n",
+ "
\n",
+ "
\n",
+ "
\n",
+ " \n",
+ " \n",
+ "\n",
+ " \n",
+ "
\n",
+ "
\n",
+ " "
+ ]
+ },
+ "metadata": {},
+ "execution_count": 13
+ }
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "source": [
+ "It looks like the Agriculture, Industry, and Service columns represent the share of a country's economy (GDP) contributed by the respective economic activity, expressed as a fraction. To understand the Climate column, we can look at its distinct values and see which countries share the same value."
+ ],
+ "metadata": {
+ "id": "mduR9JRgQnZJ"
+ }
+ },
+ {
+ "cell_type": "code",
+ "source": [
+ "df['Climate'].unique()"
+ ],
+ "metadata": {
+ "execution": {
+ "iopub.status.busy": "2022-03-25T10:07:36.240940Z",
+ "iopub.execute_input": "2022-03-25T10:07:36.241228Z",
+ "iopub.status.idle": "2022-03-25T10:07:36.251925Z",
+ "shell.execute_reply.started": "2022-03-25T10:07:36.241188Z",
+ "shell.execute_reply": "2022-03-25T10:07:36.251127Z"
+ },
+ "trusted": true,
+ "colab": {
+ "base_uri": "https://localhost:8080/"
+ },
+ "id": "JjHPpJVkQnZK",
+ "outputId": "7f3071b1-f2a3-4c5f-ab47-560d50297e20"
+ },
+ "execution_count": 14,
+ "outputs": [
+ {
+ "output_type": "execute_result",
+ "data": {
+ "text/plain": [
+ "array([1. , 3. , 2. , nan, 4. , 1.5, 2.5])"
+ ]
+ },
+ "metadata": {},
+ "execution_count": 14
+ }
+ ]
+ },
+ {
+ "cell_type": "code",
+ "source": [
+ "# Collect up to five example countries per Climate code so the codes can be interpreted.\n",
+ "columns_shown = ['Country', 'Region', 'Climate']\n",
+ "h = {}\n",
+ "for cat in (1, 2, 3, 4, 1.5, 2.5):\n",
+ "    in_category = df['Climate'] == cat\n",
+ "    h[cat] = df.loc[in_category, columns_shown].head()\n",
+ "\n",
+ "# The dict keeps insertion order, so the samples are stacked as 1, 2, 3, 4, 1.5, 2.5.\n",
+ "pd.concat(list(h.values()))\n"
+ ],
+ "metadata": {
+ "execution": {
+ "iopub.status.busy": "2022-03-25T10:07:36.255846Z",
+ "iopub.execute_input": "2022-03-25T10:07:36.256369Z",
+ "iopub.status.idle": "2022-03-25T10:07:36.290176Z",
+ "shell.execute_reply.started": "2022-03-25T10:07:36.256323Z",
+ "shell.execute_reply": "2022-03-25T10:07:36.289170Z"
+ },
+ "trusted": true,
+ "colab": {
+ "base_uri": "https://localhost:8080/",
+ "height": 927
+ },
+ "id": "hpOmYJToQnZK",
+ "outputId": "3593c1c1-08b1-40f1-b49c-e870b319fea5"
+ },
+ "execution_count": 15,
+ "outputs": [
+ {
+ "output_type": "execute_result",
+ "data": {
+ "text/plain": [
+ " Country Region Climate\n",
+ "0 Afghanistan ASIA (EX. NEAR EAST) 1.0\n",
+ "2 Algeria NORTHERN AFRICA 1.0\n",
+ "11 Australia OCEANIA 1.0\n",
+ "13 Azerbaijan C.W. OF IND. STATES 1.0\n",
+ "15 Bahrain NEAR EAST 1.0\n",
+ "3 American Samoa OCEANIA 2.0\n",
+ "6 Anguilla LATIN AMER. & CARIB 2.0\n",
+ "7 Antigua & Barbuda LATIN AMER. & CARIB 2.0\n",
+ "10 Aruba LATIN AMER. & CARIB 2.0\n",
+ "14 Bahamas, The LATIN AMER. & CARIB 2.0\n",
+ "1 Albania EASTERN EUROPE 3.0\n",
+ "4 Andorra WESTERN EUROPE 3.0\n",
+ "8 Argentina LATIN AMER. & CARIB 3.0\n",
+ "12 Austria WESTERN EUROPE 3.0\n",
+ "19 Belgium WESTERN EUROPE 3.0\n",
+ "9 Armenia C.W. OF IND. STATES 4.0\n",
+ "18 Belarus C.W. OF IND. STATES 4.0\n",
+ "25 Bosnia & Herzegovina EASTERN EUROPE 4.0\n",
+ "69 France WESTERN EUROPE 4.0\n",
+ "106 Kazakhstan C.W. OF IND. STATES 4.0\n",
+ "24 Bolivia LATIN AMER. & CARIB 1.5\n",
+ "35 Cameroon SUB-SAHARAN AFRICA 1.5\n",
+ "42 China ASIA (EX. NEAR EAST) 1.5\n",
+ "63 Eritrea SUB-SAHARAN AFRICA 1.5\n",
+ "107 Kenya SUB-SAHARAN AFRICA 1.5\n",
+ "94 India ASIA (EX. NEAR EAST) 2.5\n",
+ "112 Kyrgyzstan C.W. OF IND. STATES 2.5\n",
+ "194 Swaziland SUB-SAHARAN AFRICA 2.5"
+ ],
+ "text/html": [
+ "\n",
+ " \n",
+ "
\n",
+ "
\n",
+ "\n",
+ "
\n",
+ " \n",
+ " \n",
+ " | \n",
+ " Country | \n",
+ " Region | \n",
+ " Climate | \n",
+ "
\n",
+ " \n",
+ " \n",
+ " \n",
+ " 0 | \n",
+ " Afghanistan | \n",
+ " ASIA (EX. NEAR EAST) | \n",
+ " 1.0 | \n",
+ "
\n",
+ " \n",
+ " 2 | \n",
+ " Algeria | \n",
+ " NORTHERN AFRICA | \n",
+ " 1.0 | \n",
+ "
\n",
+ " \n",
+ " 11 | \n",
+ " Australia | \n",
+ " OCEANIA | \n",
+ " 1.0 | \n",
+ "
\n",
+ " \n",
+ " 13 | \n",
+ " Azerbaijan | \n",
+ " C.W. OF IND. STATES | \n",
+ " 1.0 | \n",
+ "
\n",
+ " \n",
+ " 15 | \n",
+ " Bahrain | \n",
+ " NEAR EAST | \n",
+ " 1.0 | \n",
+ "
\n",
+ " \n",
+ " 3 | \n",
+ " American Samoa | \n",
+ " OCEANIA | \n",
+ " 2.0 | \n",
+ "
\n",
+ " \n",
+ " 6 | \n",
+ " Anguilla | \n",
+ " LATIN AMER. & CARIB | \n",
+ " 2.0 | \n",
+ "
\n",
+ " \n",
+ " 7 | \n",
+ " Antigua & Barbuda | \n",
+ " LATIN AMER. & CARIB | \n",
+ " 2.0 | \n",
+ "
\n",
+ " \n",
+ " 10 | \n",
+ " Aruba | \n",
+ " LATIN AMER. & CARIB | \n",
+ " 2.0 | \n",
+ "
\n",
+ " \n",
+ " 14 | \n",
+ " Bahamas, The | \n",
+ " LATIN AMER. & CARIB | \n",
+ " 2.0 | \n",
+ "
\n",
+ " \n",
+ " 1 | \n",
+ " Albania | \n",
+ " EASTERN EUROPE | \n",
+ " 3.0 | \n",
+ "
\n",
+ " \n",
+ " 4 | \n",
+ " Andorra | \n",
+ " WESTERN EUROPE | \n",
+ " 3.0 | \n",
+ "
\n",
+ " \n",
+ " 8 | \n",
+ " Argentina | \n",
+ " LATIN AMER. & CARIB | \n",
+ " 3.0 | \n",
+ "
\n",
+ " \n",
+ " 12 | \n",
+ " Austria | \n",
+ " WESTERN EUROPE | \n",
+ " 3.0 | \n",
+ "
\n",
+ " \n",
+ " 19 | \n",
+ " Belgium | \n",
+ " WESTERN EUROPE | \n",
+ " 3.0 | \n",
+ "
\n",
+ " \n",
+ " 9 | \n",
+ " Armenia | \n",
+ " C.W. OF IND. STATES | \n",
+ " 4.0 | \n",
+ "
\n",
+ " \n",
+ " 18 | \n",
+ " Belarus | \n",
+ " C.W. OF IND. STATES | \n",
+ " 4.0 | \n",
+ "
\n",
+ " \n",
+ " 25 | \n",
+ " Bosnia & Herzegovina | \n",
+ " EASTERN EUROPE | \n",
+ " 4.0 | \n",
+ "
\n",
+ " \n",
+ " 69 | \n",
+ " France | \n",
+ " WESTERN EUROPE | \n",
+ " 4.0 | \n",
+ "
\n",
+ " \n",
+ " 106 | \n",
+ " Kazakhstan | \n",
+ " C.W. OF IND. STATES | \n",
+ " 4.0 | \n",
+ "
\n",
+ " \n",
+ " 24 | \n",
+ " Bolivia | \n",
+ " LATIN AMER. & CARIB | \n",
+ " 1.5 | \n",
+ "
\n",
+ " \n",
+ " 35 | \n",
+ " Cameroon | \n",
+ " SUB-SAHARAN AFRICA | \n",
+ " 1.5 | \n",
+ "
\n",
+ " \n",
+ " 42 | \n",
+ " China | \n",
+ " ASIA (EX. NEAR EAST) | \n",
+ " 1.5 | \n",
+ "
\n",
+ " \n",
+ " 63 | \n",
+ " Eritrea | \n",
+ " SUB-SAHARAN AFRICA | \n",
+ " 1.5 | \n",
+ "
\n",
+ " \n",
+ " 107 | \n",
+ " Kenya | \n",
+ " SUB-SAHARAN AFRICA | \n",
+ " 1.5 | \n",
+ "
\n",
+ " \n",
+ " 94 | \n",
+ " India | \n",
+ " ASIA (EX. NEAR EAST) | \n",
+ " 2.5 | \n",
+ "
\n",
+ " \n",
+ " 112 | \n",
+ " Kyrgyzstan | \n",
+ " C.W. OF IND. STATES | \n",
+ " 2.5 | \n",
+ "
\n",
+ " \n",
+ " 194 | \n",
+ " Swaziland | \n",
+ " SUB-SAHARAN AFRICA | \n",
+ " 2.5 | \n",
+ "
\n",
+ " \n",
+ "
\n",
+ "
\n",
+ "
\n",
+ " \n",
+ " \n",
+ "\n",
+ " \n",
+ "
\n",
+ "
\n",
+ " "
+ ]
+ },
+ "metadata": {},
+ "execution_count": 15
+ }
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "source": [
+ "A guess for what the categories are pointing to is:\n",
+ "\n",
+ "**1** - Countries with a hot, desert-like climate. \\\n",
+ "**1.5** - Countries that are both hot and tropical. \\\n",
+ "**2** - Countries with a tropical climate.\\\n",
+ "**2.5** - Countries that are both cold and tropical.\\\n",
+ "**3** - Countries with a cold climate.\\\n",
+ "**4** - These countries also seem to have a cold climate. The dataset does not say why they are separated from category 3. "
+ ],
+ "metadata": {
+ "id": "uE-3rWIWQnZK"
+ }
+ },
+ {
+ "cell_type": "markdown",
+ "source": [
+ "## Data Cleaning"
+ ],
+ "metadata": {
+ "id": "KL4w-_-hQnZL"
+ }
+ },
+ {
+ "cell_type": "code",
+ "source": [
+ "## Counting the null values in each column\n",
+ "\n",
+ "# num_missing is a Series indexed by column name holding the number of NaN entries per column.\n",
+ "num_missing = df.isna().sum()\n",
+ "missing_value_df = pd.DataFrame({'Column_name': df.columns, 'num_missing': num_missing})\n",
+ "missing_value_df"
+ ],
+ "metadata": {
+ "execution": {
+ "iopub.status.busy": "2022-03-25T10:07:36.291298Z",
+ "iopub.execute_input": "2022-03-25T10:07:36.291731Z",
+ "iopub.status.idle": "2022-03-25T10:07:36.308650Z",
+ "shell.execute_reply.started": "2022-03-25T10:07:36.291688Z",
+ "shell.execute_reply": "2022-03-25T10:07:36.307891Z"
+ },
+ "trusted": true,
+ "colab": {
+ "base_uri": "https://localhost:8080/",
+ "height": 0
+ },
+ "id": "DFYC9aYwQnZL",
+ "outputId": "2c53b4df-3f84-4b46-dd66-5818599e8a65"
+ },
+ "execution_count": 16,
+ "outputs": [
+ {
+ "output_type": "execute_result",
+ "data": {
+ "text/plain": [
+ " Column_name \\\n",
+ "Country Country \n",
+ "Region Region \n",
+ "Population Population \n",
+ "Area (sq. mi.) Area (sq. mi.) \n",
+ "Pop. Density (per sq. mi.) Pop. Density (per sq. mi.) \n",
+ "Coastline (coast/area ratio) Coastline (coast/area ratio) \n",
+ "Net migration Net migration \n",
+ "Infant mortality (per 1000 births) Infant mortality (per 1000 births) \n",
+ "GDP ($ per capita) GDP ($ per capita) \n",
+ "Literacy (%) Literacy (%) \n",
+ "Phones (per 1000) Phones (per 1000) \n",
+ "Arable (%) Arable (%) \n",
+ "Crops (%) Crops (%) \n",
+ "Other (%) Other (%) \n",
+ "Climate Climate \n",
+ "Birthrate Birthrate \n",
+ "Deathrate Deathrate \n",
+ "Agriculture Agriculture \n",
+ "Industry Industry \n",
+ "Service Service \n",
+ "\n",
+ " num_missing \n",
+ "Country 0 \n",
+ "Region 0 \n",
+ "Population 0 \n",
+ "Area (sq. mi.) 0 \n",
+ "Pop. Density (per sq. mi.) 0 \n",
+ "Coastline (coast/area ratio) 0 \n",
+ "Net migration 3 \n",
+ "Infant mortality (per 1000 births) 3 \n",
+ "GDP ($ per capita) 1 \n",
+ "Literacy (%) 18 \n",
+ "Phones (per 1000) 4 \n",
+ "Arable (%) 2 \n",
+ "Crops (%) 2 \n",
+ "Other (%) 2 \n",
+ "Climate 22 \n",
+ "Birthrate 3 \n",
+ "Deathrate 4 \n",
+ "Agriculture 15 \n",
+ "Industry 16 \n",
+ "Service 15 "
+ ],
+ "text/html": [
+ "\n",
+ " \n",
+ "
\n",
+ "
\n",
+ "\n",
+ "
\n",
+ " \n",
+ " \n",
+ " | \n",
+ " Column_name | \n",
+ " num_missing | \n",
+ "
\n",
+ " \n",
+ " \n",
+ " \n",
+ " Country | \n",
+ " Country | \n",
+ " 0 | \n",
+ "
\n",
+ " \n",
+ " Region | \n",
+ " Region | \n",
+ " 0 | \n",
+ "
\n",
+ " \n",
+ " Population | \n",
+ " Population | \n",
+ " 0 | \n",
+ "
\n",
+ " \n",
+ " Area (sq. mi.) | \n",
+ " Area (sq. mi.) | \n",
+ " 0 | \n",
+ "
\n",
+ " \n",
+ " Pop. Density (per sq. mi.) | \n",
+ " Pop. Density (per sq. mi.) | \n",
+ " 0 | \n",
+ "
\n",
+ " \n",
+ " Coastline (coast/area ratio) | \n",
+ " Coastline (coast/area ratio) | \n",
+ " 0 | \n",
+ "
\n",
+ " \n",
+ " Net migration | \n",
+ " Net migration | \n",
+ " 3 | \n",
+ "
\n",
+ " \n",
+ " Infant mortality (per 1000 births) | \n",
+ " Infant mortality (per 1000 births) | \n",
+ " 3 | \n",
+ "
\n",
+ " \n",
+ " GDP ($ per capita) | \n",
+ " GDP ($ per capita) | \n",
+ " 1 | \n",
+ "
\n",
+ " \n",
+ " Literacy (%) | \n",
+ " Literacy (%) | \n",
+ " 18 | \n",
+ "
\n",
+ " \n",
+ " Phones (per 1000) | \n",
+ " Phones (per 1000) | \n",
+ " 4 | \n",
+ "
\n",
+ " \n",
+ " Arable (%) | \n",
+ " Arable (%) | \n",
+ " 2 | \n",
+ "
\n",
+ " \n",
+ " Crops (%) | \n",
+ " Crops (%) | \n",
+ " 2 | \n",
+ "
\n",
+ " \n",
+ " Other (%) | \n",
+ " Other (%) | \n",
+ " 2 | \n",
+ "
\n",
+ " \n",
+ " Climate | \n",
+ " Climate | \n",
+ " 22 | \n",
+ "
\n",
+ " \n",
+ " Birthrate | \n",
+ " Birthrate | \n",
+ " 3 | \n",
+ "
\n",
+ " \n",
+ " Deathrate | \n",
+ " Deathrate | \n",
+ " 4 | \n",
+ "
\n",
+ " \n",
+ " Agriculture | \n",
+ " Agriculture | \n",
+ " 15 | \n",
+ "
\n",
+ " \n",
+ " Industry | \n",
+ " Industry | \n",
+ " 16 | \n",
+ "
\n",
+ " \n",
+ " Service | \n",
+ " Service | \n",
+ " 15 | \n",
+ "
\n",
+ " \n",
+ "
\n",
+ "
\n",
+ "
\n",
+ " \n",
+ " \n",
+ "\n",
+ " \n",
+ "
\n",
+ "
\n",
+ " "
+ ]
+ },
+ "metadata": {},
+ "execution_count": 16
+ }
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "source": [
+ "Only a small fraction of the data in each column is missing. Viewing it as a heatmap gives a different visual picture of where the gaps are. "
+ ],
+ "metadata": {
+ "id": "EUqB-2Z2QnZL"
+ }
+ },
+ {
+ "cell_type": "code",
+ "source": [
+ "# Heatmap of the null mask: each highlighted cell is a missing value.\n",
+ "sns.set(rc={'figure.figsize': (11, 8)})\n",
+ "sns.heatmap(df.isnull()).set(\n",
+ "    title='Missing Data',\n",
+ "    xlabel='Columns',\n",
+ "    ylabel='Data Points',\n",
+ ")"
+ ],
+ "metadata": {
+ "execution": {
+ "iopub.status.busy": "2022-03-25T10:07:36.309865Z",
+ "iopub.execute_input": "2022-03-25T10:07:36.310223Z",
+ "iopub.status.idle": "2022-03-25T10:07:37.158493Z",
+ "shell.execute_reply.started": "2022-03-25T10:07:36.310193Z",
+ "shell.execute_reply": "2022-03-25T10:07:37.157512Z"
+ },
+ "trusted": true,
+ "colab": {
+ "base_uri": "https://localhost:8080/",
+ "height": 0
+ },
+ "id": "g_b-0lKVQnZM",
+ "outputId": "8f7dcf59-b6d5-4208-e8a7-ae3cd746ca44"
+ },
+ "execution_count": 17,
+ "outputs": [
+ {
+ "output_type": "execute_result",
+ "data": {
+ "text/plain": [
+ "[Text(75.5, 0.5, 'Data Points'),\n",
+ " Text(0.5, 48.5, 'Columns'),\n",
+ " Text(0.5, 1.0, 'Missing Data')]"
+ ]
+ },
+ "metadata": {},
+ "execution_count": 17
+ },
+ {
+ "output_type": "display_data",
+ "data": {
+ "text/plain": [
+ ""
+ ],
+ "image/png": "\n"
+ },
+ "metadata": {}
+ }
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "source": [
+ "The heatmap shows a small number of **NULL** values in some of the columns: **{\"Net Migration\", \"Infant Mortality\", \"GDP\", \"Literacy\", \"Phones\", \"Arable\", \"Crops\", \"Other\", \"Climate\", \"Birthrate\", \"Deathrate\", \"Agriculture\", \"Industry\", \"Service\"}**. The rows containing these values are dealt with later."
+ ],
+ "metadata": {
+ "id": "c_hKZIUZQnZM"
+ }
+ },
+ {
+ "cell_type": "code",
+ "source": [
+ "# Show the rows where 'Net migration' is missing.\n",
+ "df1 = df.loc[df['Net migration'].isna()]\n",
+ "df1"
+ ],
+ "metadata": {
+ "execution": {
+ "iopub.status.busy": "2022-03-25T10:07:37.159805Z",
+ "iopub.execute_input": "2022-03-25T10:07:37.160098Z",
+ "iopub.status.idle": "2022-03-25T10:07:37.186207Z",
+ "shell.execute_reply.started": "2022-03-25T10:07:37.160053Z",
+ "shell.execute_reply": "2022-03-25T10:07:37.185533Z"
+ },
+ "trusted": true,
+ "colab": {
+ "base_uri": "https://localhost:8080/",
+ "height": 0
+ },
+ "id": "m_Tfl06OQnZM",
+ "outputId": "b7e777ed-0a4a-4bae-a919-c3761700a28f"
+ },
+ "execution_count": 18,
+ "outputs": [
+ {
+ "output_type": "execute_result",
+ "data": {
+ "text/plain": [
+ " Country Region Population \\\n",
+ "47 Cook Islands OCEANIA 21388 \n",
+ "221 Wallis and Futuna OCEANIA 16025 \n",
+ "223 Western Sahara NORTHERN AFRICA 273008 \n",
+ "\n",
+ " Area (sq. mi.) Pop. Density (per sq. mi.) Coastline (coast/area ratio) \\\n",
+ "47 240 89.1 50.00 \n",
+ "221 274 58.5 47.08 \n",
+ "223 266000 1.0 0.42 \n",
+ "\n",
+ " Net migration Infant mortality (per 1000 births) GDP ($ per capita) \\\n",
+ "47 NaN NaN 5000.0 \n",
+ "221 NaN NaN 3700.0 \n",
+ "223 NaN NaN NaN \n",
+ "\n",
+ " Literacy (%) Phones (per 1000) Arable (%) Crops (%) Other (%) \\\n",
+ "47 95.0 289.9 17.39 13.04 69.57 \n",
+ "221 50.0 118.6 5.00 25.00 70.00 \n",
+ "223 NaN NaN 0.02 0.00 99.98 \n",
+ "\n",
+ " Climate Birthrate Deathrate Agriculture Industry Service \n",
+ "47 2.0 21.0 NaN 0.151 0.096 0.753 \n",
+ "221 2.0 NaN NaN NaN NaN NaN \n",
+ "223 1.0 NaN NaN NaN NaN 0.400 "
+ ],
+ "text/html": [
+ "\n",
+ " \n",
+ "
\n",
+ "
\n",
+ "\n",
+ "
\n",
+ " \n",
+ " \n",
+ " | \n",
+ " Country | \n",
+ " Region | \n",
+ " Population | \n",
+ " Area (sq. mi.) | \n",
+ " Pop. Density (per sq. mi.) | \n",
+ " Coastline (coast/area ratio) | \n",
+ " Net migration | \n",
+ " Infant mortality (per 1000 births) | \n",
+ " GDP ($ per capita) | \n",
+ " Literacy (%) | \n",
+ " Phones (per 1000) | \n",
+ " Arable (%) | \n",
+ " Crops (%) | \n",
+ " Other (%) | \n",
+ " Climate | \n",
+ " Birthrate | \n",
+ " Deathrate | \n",
+ " Agriculture | \n",
+ " Industry | \n",
+ " Service | \n",
+ "
\n",
+ " \n",
+ " \n",
+ " \n",
+ " 47 | \n",
+ " Cook Islands | \n",
+ " OCEANIA | \n",
+ " 21388 | \n",
+ " 240 | \n",
+ " 89.1 | \n",
+ " 50.00 | \n",
+ " NaN | \n",
+ " NaN | \n",
+ " 5000.0 | \n",
+ " 95.0 | \n",
+ " 289.9 | \n",
+ " 17.39 | \n",
+ " 13.04 | \n",
+ " 69.57 | \n",
+ " 2.0 | \n",
+ " 21.0 | \n",
+ " NaN | \n",
+ " 0.151 | \n",
+ " 0.096 | \n",
+ " 0.753 | \n",
+ "
\n",
+ " \n",
+ " 221 | \n",
+ " Wallis and Futuna | \n",
+ " OCEANIA | \n",
+ " 16025 | \n",
+ " 274 | \n",
+ " 58.5 | \n",
+ " 47.08 | \n",
+ " NaN | \n",
+ " NaN | \n",
+ " 3700.0 | \n",
+ " 50.0 | \n",
+ " 118.6 | \n",
+ " 5.00 | \n",
+ " 25.00 | \n",
+ " 70.00 | \n",
+ " 2.0 | \n",
+ " NaN | \n",
+ " NaN | \n",
+ " NaN | \n",
+ " NaN | \n",
+ " NaN | \n",
+ "
\n",
+ " \n",
+ " 223 | \n",
+ " Western Sahara | \n",
+ " NORTHERN AFRICA | \n",
+ " 273008 | \n",
+ " 266000 | \n",
+ " 1.0 | \n",
+ " 0.42 | \n",
+ " NaN | \n",
+ " NaN | \n",
+ " NaN | \n",
+ " NaN | \n",
+ " NaN | \n",
+ " 0.02 | \n",
+ " 0.00 | \n",
+ " 99.98 | \n",
+ " 1.0 | \n",
+ " NaN | \n",
+ " NaN | \n",
+ " NaN | \n",
+ " NaN | \n",
+ " 0.400 | \n",
+ "
\n",
+ " \n",
+ "
\n",
+ "
\n",
+ "
\n",
+ " \n",
+ " \n",
+ "\n",
+ " \n",
+ "
\n",
+ "
\n",
+ " "
+ ]
+ },
+ "metadata": {},
+ "execution_count": 18
+ }
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "source": [
+ "## Changes suggested for these Rows with NaN values\n",
+ "\n",
+ "| Feature | Number of missing Values | Change |\n",
+ "|:----------|:-------------:|------:|\n",
+ "| Net migration | 3 | Belong to very small nations. Change to 0. |\n",
+ "| Infant mortality (per 1000 births) | 3 | Belong to very small nations. Change to 0. |\n",
+ "| GDP ($ per capita) | 1 | A Google search gives \\$2500. Change to that value. |\n",
+ "|Literacy (\\%)|18| Replace with the mean literacy of each missing value's region. |\n",
+ "|Phones (per 1000)|4| Replace with the mean phones value of each missing value's region. |\n",
+ "|Arable (\\%)|2| Very small islands. Change to 0. |\n",
+ "|Crops (\\%)|2| Very small islands. Change to 0. |\n",
+ "|Other (\\%)|2| Very small islands. Change to 0. |\n",
+ "|Climate|22| Change to 0, which represents an \"unknown\" category. |\n",
+ "|Birthrate|3| Replace with the region's mean rate. |\n",
+ "|Deathrate|4| Replace with the region's mean rate. |\n",
+ "|Agriculture|15| Educated guess based on the values of similar countries. Change to 0.15. |\n",
+ "|Industry|16| Educated guess based on the values of similar countries. Change to 0.05. |\n",
+ "|Service|15| Educated guess based on the values of similar countries. Change to 0.8. |"
+ ],
+ "metadata": {
+ "id": "KIwkWZlkQnZN"
+ }
+ },
+ {
+ "cell_type": "code",
+ "source": [
+ "# Columns whose missing values are replaced with a fixed constant: (column name, fill value).\n",
+ "change1 = [(\"Net migration\", 0), (\"Infant mortality (per 1000 births)\", 0), (\"GDP ($ per capita)\", 2500), (\"Arable (%)\", 0), (\"Crops (%)\", 0),(\"Other (%)\",0),(\"Climate\",0),(\"Agriculture\",0.15), (\"Industry\", 0.05), (\"Service\", 0.8) ]\n",
+ "for col, fill_value in change1:\n",
+ "    # Assign the result back to the column. Calling fillna(inplace=True) on the\n",
+ "    # df[col] selection is chained assignment: it warns on pandas 2.x and does\n",
+ "    # not modify df when Copy-on-Write is enabled.\n",
+ "    df[col] = df[col].fillna(fill_value)\n",
+ "\n",
+ "# Columns whose missing values are replaced with the mean of the row's Region.\n",
+ "change2 = [\"Literacy (%)\", \"Phones (per 1000)\", \"Birthrate\", \"Deathrate\"]\n",
+ "for col in change2:\n",
+ "    # transform('mean') returns a Series aligned to df's index holding each row's\n",
+ "    # regional mean, so fillna picks the matching value for every missing row.\n",
+ "    df[col] = df[col].fillna(df.groupby('Region')[col].transform('mean'))"
+ ],
+ "metadata": {
+ "execution": {
+ "iopub.status.busy": "2022-03-25T10:07:37.187108Z",
+ "iopub.execute_input": "2022-03-25T10:07:37.187834Z",
+ "iopub.status.idle": "2022-03-25T10:07:37.204185Z",
+ "shell.execute_reply.started": "2022-03-25T10:07:37.187799Z",
+ "shell.execute_reply": "2022-03-25T10:07:37.203144Z"
+ },
+ "trusted": true,
+ "id": "g2rkTMIVQnZN"
+ },
+ "execution_count": 19,
+ "outputs": []
+ },
+ {
+ "cell_type": "code",
+ "source": [
+ "print(df.isnull().sum())"
+ ],
+ "metadata": {
+ "execution": {
+ "iopub.status.busy": "2022-03-25T10:07:37.205429Z",
+ "iopub.execute_input": "2022-03-25T10:07:37.205705Z",
+ "iopub.status.idle": "2022-03-25T10:07:37.224130Z",
+ "shell.execute_reply.started": "2022-03-25T10:07:37.205675Z",
+ "shell.execute_reply": "2022-03-25T10:07:37.223285Z"
+ },
+ "trusted": true,
+ "colab": {
+ "base_uri": "https://localhost:8080/"
+ },
+ "id": "ofaOxZUhQnZN",
+ "outputId": "3c20e99a-8bab-4f4d-f4fd-2dd7d275e0dd"
+ },
+ "execution_count": 20,
+ "outputs": [
+ {
+ "output_type": "stream",
+ "name": "stdout",
+ "text": [
+ "Country 0\n",
+ "Region 0\n",
+ "Population 0\n",
+ "Area (sq. mi.) 0\n",
+ "Pop. Density (per sq. mi.) 0\n",
+ "Coastline (coast/area ratio) 0\n",
+ "Net migration 0\n",
+ "Infant mortality (per 1000 births) 0\n",
+ "GDP ($ per capita) 0\n",
+ "Literacy (%) 0\n",
+ "Phones (per 1000) 0\n",
+ "Arable (%) 0\n",
+ "Crops (%) 0\n",
+ "Other (%) 0\n",
+ "Climate 0\n",
+ "Birthrate 0\n",
+ "Deathrate 0\n",
+ "Agriculture 0\n",
+ "Industry 0\n",
+ "Service 0\n",
+ "dtype: int64\n"
+ ]
+ }
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "source": [
+ "# EDA\n",
+ "## Correlation Heatmap"
+ ],
+ "metadata": {
+ "id": "Wt5V1BZjQnZN"
+ }
+ },
+ {
+ "cell_type": "code",
+ "source": [
+ "# Correlate only the numeric columns. df still holds the string columns\n",
+ "# 'Country' and 'Region', and DataFrame.corr() raises on non-numeric data\n",
+ "# in pandas >= 2.0 instead of silently dropping it.\n",
+ "fig, ax = plt.subplots(figsize=(16,16)) \n",
+ "sns.heatmap(df.select_dtypes(include='number').corr(), annot=True, ax=ax, cmap='Spectral').set(\n",
+ "    title = 'Feature Correlation', xlabel = 'Columns', ylabel = 'Columns')\n",
+ "plt.show()"
+ ],
+ "metadata": {
+ "execution": {
+ "iopub.status.busy": "2022-03-25T10:07:37.225370Z",
+ "iopub.execute_input": "2022-03-25T10:07:37.225644Z",
+ "iopub.status.idle": "2022-03-25T10:07:38.955709Z",
+ "shell.execute_reply.started": "2022-03-25T10:07:37.225612Z",
+ "shell.execute_reply": "2022-03-25T10:07:38.954764Z"
+ },
+ "trusted": true,
+ "colab": {
+ "base_uri": "https://localhost:8080/",
+ "height": 1000
+ },
+ "id": "1FllGHxmQnZN",
+ "outputId": "9371de64-d9ac-46d4-9a01-50ca7378920a"
+ },
+ "execution_count": 21,
+ "outputs": [
+ {
+ "output_type": "display_data",
+ "data": {
+ "text/plain": [
+ "