# %% [markdown]
# ## Load the data

# %%
import pandas as pd

url = "https://raw.githubusercontent.com/data-bootcamp-v4/data/main/file1.csv"

# `raw_df` stays exactly as downloaded; every cleaning step below writes to the
# working copy `df`. Deriving cleaned columns from `raw_df` keeps each cell
# re-runnable in any order after this one.
raw_df = pd.read_csv(url)
df = raw_df.copy()

# %% [markdown]
# ## Challenge 1 (i): dimensions of the dataset

# %%
df.shape

# %% [markdown]
# ## Challenge 1 (ii): first rows and column data types

# %%
df.head()

# %%
df.dtypes

# %%
# Same information as `dtypes`, plus non-null counts per column.
df.info()

# %% [markdown]
# `Income`, `Monthly Premium Auto` and `Total Claim Amount` are already floats.
# `Customer Lifetime Value` is stored as text with a `%` character, so it is
# converted to float below.

# %%
# Read from `raw_df`, not `df`: the original cell overwrote the column it was
# reading, so running it a second time called `.str` on a float column and raised.
# `regex=False` makes '%' a literal on every pandas version.
# Only the '%' character is removed; the number itself is not rescaled.
df['Customer Lifetime Value'] = (
    raw_df['Customer Lifetime Value']
    .str.replace('%', '', regex=False)
    .astype(float)
)

# %%
# Confirm the conversion took effect.
df.dtypes

# %% [markdown]
# ## Challenge 1 (iii): unique values per column

# %%
df.nunique()

# %%
# Human-readable label -> column name, for the columns inspected individually.
columns_to_inspect = {
    'states': 'ST',
    'gender': 'GENDER',
    'education': 'Education',
    'number of open complaints': 'Number of Open Complaints',
    'policy type': 'Policy Type',
    'vehicle class': 'Vehicle Class',
}

for label, column in columns_to_inspect.items():
    print(f"Unique values of {label} are {df[column].unique()}")

# %% [markdown]
# ## Challenge 1 (iv): central tendency and spread of the numerical columns
#
# Mean, median, mode, standard deviation and quartiles for the four float columns.

# %%
numeric_columns = [
    'Customer Lifetime Value',
    'Income',
    'Monthly Premium Auto',
    'Total Claim Amount',
]

# %%
# Mean
df[numeric_columns].mean()

# %%
# Median
df[numeric_columns].median()

# %%
# Mode: `DataFrame.mode` returns one row per tied mode; row 0 holds the first
# mode of every column, matching `Series.mode().iloc[0]` column by column.
df[numeric_columns].mode().iloc[0]

# %%
# Standard deviation
df[numeric_columns].std()

# %%
# Quartiles (25th, 50th and 75th percentiles)
df[numeric_columns].quantile([0.25, 0.50, 0.75])

# %%
# All of the above (except the mode) in a single table.
df.describe()

# %% [markdown]
# ## Challenge 1 (v): summary of the categorical columns

# %%
categorical_variables = [
    "Customer",
    "ST",
    "GENDER",
    "Education",
    "Number of Open Complaints",
    "Policy Type",
    "Vehicle Class",
]

# Tally each column on its own. `DataFrame.value_counts()` on the whole list
# counts distinct *rows* (combinations of all seven columns), which is not a
# per-variable frequency table.
for column in categorical_variables:
    print(df[column].value_counts())

# %% [markdown]
# ## Challenge 2, exercise 1: the five least common customer locations

# %%
# `nsmallest` returns the five lowest counts in ascending order.
Less5 = df['ST'].value_counts().nsmallest(5)
Less5

# %% [markdown]
# ## Challenge 2, exercise 2: policy type with the most policies sold

# %%
# `value_counts` sorts descending, so the first entry is the most frequent type.
policy_sold = df['Policy Type'].value_counts().head(1)
print(policy_sold)

# %% [markdown]
# ## Challenge 2, exercise 3: average income, Personal Auto vs Corporate Auto

# %%
# Income summary per policy type (context for the comparison below).
df.groupby('Policy Type')['Income'].describe()

# %%
av_in_per = df.loc[df['Policy Type'] == 'Personal Auto', 'Income'].mean()
av_in_corp = df.loc[df['Policy Type'] == 'Corporate Auto', 'Income'].mean()
print(f'The mean of income from customers with personal auto is {av_in_per}')
print(f'The mean of income from customers with corporate auto is {av_in_corp}')

# Let the data decide which group is reported as higher instead of assuming it
# is always the corporate one.
income_gap = av_in_corp - av_in_per
if income_gap >= 0:
    higher_group, lower_group = 'corporate auto', 'personal auto'
else:
    higher_group, lower_group = 'personal auto', 'corporate auto'
print(
    f'The customers with {higher_group} have a mean income '
    f'${abs(income_gap):,.2f} higher than customers with {lower_group}'
)

# %% [markdown]
# ## Challenge 2, bonus: customers above the 75th percentile of total claim amount

# %%
df['Total Claim Amount'].describe()

# %%
# Strictly greater than the third quartile of the total claim amount.
claim_q3 = df['Total Claim Amount'].quantile(0.75)
seventy_five = df[df['Total Claim Amount'] > claim_q3]
seventy_five.describe()