diff --git a/lab-dw-pandas.ipynb b/lab-dw-pandas.ipynb index fbd46831..8b075574 100644 --- a/lab-dw-pandas.ipynb +++ b/lab-dw-pandas.ipynb @@ -82,12 +82,507 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 62, "id": "dd4e8cd8-a6f6-486c-a5c4-1745b0c035f4", "metadata": {}, - "outputs": [], + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
CustomerSTGENDEREducationCustomer Lifetime ValueIncomeMonthly Premium AutoNumber of Open ComplaintsPolicy TypeVehicle ClassTotal Claim Amount
0RB50392WashingtonNaNMasterNaN0.01000.01/0/00Personal AutoFour-Door Car2.704934
1QZ44356ArizonaFBachelor697953.59%0.094.01/0/00Personal AutoFour-Door Car1131.464935
2AI49188NevadaFBachelor1288743.17%48767.0108.01/0/00Personal AutoTwo-Door Car566.472247
3WW63253CaliforniaMBachelor764586.18%0.0106.01/0/00Corporate AutoSUV529.881344
4GA49547WashingtonMHigh School or Below536307.65%36357.068.01/0/00Personal AutoFour-Door Car17.269323
\n", + "
# %%
# Load the customer data set from the course repository and preview it.
import pandas as pd

csv_url = "https://raw.githubusercontent.com/data-bootcamp-v4/data/main/file1.csv"
df = pd.read_csv(csv_url)
df.head()

# %%
# Dimensions of the raw frame as (rows, columns).
df.shape

# %%
# Column dtypes. Two columns that hold numbers are loaded as object:
#   - Customer Lifetime Value, because of the % symbol
#   - Number of Open Complaints, because of the 1/0/00 format
df.dtypes

# %%
# Distinct values per column. The low-cardinality text columns are the
# categorical ones: ST, GENDER, Education, Policy Type, Vehicle Class.
df.nunique()

# %%
# List the distinct labels of each categorical column to spot
# inconsistent spellings and missing values.
for column in ("ST", "GENDER", "Education", "Policy Type", "Vehicle Class"):
    print(column + ": ", df[column].unique())

# %%
# Range (max / min) of the two numeric columns of interest.
for column in ("Income", "Monthly Premium Auto"):
    print(
        "The maximum value of " + column + " is ",
        df[column].max(),
        "and the minimum value is ",
        df[column].min(),
    )
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
IncomeMonthly Premium AutoTotal Claim Amount
count1071.0000001071.0000001071.000000
mean39295.701214193.234360404.986909
std30469.4270601601.190369293.027260
min0.00000061.0000000.382107
25%14072.00000068.000000202.157702
50%36234.00000083.000000354.729129
75%64631.000000109.500000532.800000
max99960.00000035354.0000002893.239678
\n", + "
# %%
# Summary statistics (count, mean, std, quartiles, extremes) for the
# numeric columns only.
numeric_columns = df.select_dtypes(include="number")
numeric_columns.describe()
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
CustomerSTGENDEREducationCustomer Lifetime ValueNumber of Open ComplaintsPolicy TypeVehicle Class
count1071107195410711068107110711071
unique10718561027636
topRB50392OregonFBachelor445811.34%1/0/00Personal AutoFour-Door Car
freq13204573244830780576
\n", + "
# %%
# Summary of the text (object) columns: non-null count, number of distinct
# values, most frequent value and its frequency.
df.describe(include=object)

# %%
# Five least common customer locations, in ascending order of frequency.
# The column is taken from the already-loaded `df` instead of downloading
# the CSV a second time.
location_series = df["ST"]
location_series_count = location_series.value_counts()
sorted_locations = location_series_count.sort_values()
less_common = sorted_locations.head(5)
less_common

# %%
# Number of policies sold per policy type, followed by the best seller.
# `idxmax()` gives the label of the largest count; `max()` alone only gives
# the count and does not say which policy type it belongs to.
policy_series = df["Policy Type"]
policy_series_count = policy_series.value_counts()
print(policy_series_count)
print("----")
print(policy_series_count.idxmax(), policy_series_count.max())

# %% [markdown]
# The sales team wants to know if customers with Personal Auto have a lower
# income than those with Corporate Auto. How does the average income compare
# between the two policy types?

# %%
# Split the customers by policy type. Rows with a missing Policy Type match
# neither comparison and are left out of both subsets.
personal_auto = df.loc[df["Policy Type"] == "Personal Auto"]
corporate_auto = df.loc[df["Policy Type"] == "Corporate Auto"]

# %%
# Average income per policy type. `Series.mean()` is vectorised and skips
# missing incomes, whereas sum(...) / len(...) would return NaN as soon as a
# single income is missing.
avg_personal = personal_auto["Income"].mean()
avg_corporate = corporate_auto["Income"].mean()
print("The average income of the customer with a personal policy is ", avg_personal)
print("The average income of the customer with a corporate policy is ", avg_corporate)
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
IncomeMonthly Premium AutoTotal Claim Amount
count1071.0000001071.0000001071.000000
mean39295.701214193.234360404.986909
std30469.4270601601.190369293.027260
min0.00000061.0000000.382107
25%14072.00000068.000000202.157702
50%36234.00000083.000000354.729129
75%64631.000000109.500000532.800000
max99960.00000035354.0000002893.239678
\n", + "
# %%
# Overall numeric summary; the 75% row of Total Claim Amount is the
# threshold used in the next cell.
df.describe()

# %%
# Customers whose total claim amount is above the 75th percentile.
# The threshold is computed from the data rather than copied by hand from
# the describe() output, so it stays correct if the data set changes.
claim_threshold = df["Total Claim Amount"].quantile(0.75)
customers_high_claim = df.loc[df["Total Claim Amount"] > claim_threshold]
customers_high_claim["Total Claim Amount"].describe()