From a7baad06428624a7a92050ec55044d150ff6b3b5 Mon Sep 17 00:00:00 2001 From: claragal Date: Sat, 23 Nov 2024 15:45:02 +0100 Subject: [PATCH 1/2] Lab solved --- lab-dw-pandas.ipynb | 392 +++++++++++++++++++++++++++++++++++++++++++- 1 file changed, 385 insertions(+), 7 deletions(-) diff --git a/lab-dw-pandas.ipynb b/lab-dw-pandas.ipynb index fbd468314..39df433f2 100644 --- a/lab-dw-pandas.ipynb +++ b/lab-dw-pandas.ipynb @@ -82,12 +82,369 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 13, "id": "dd4e8cd8-a6f6-486c-a5c4-1745b0c035f4", "metadata": {}, "outputs": [], "source": [ - "# Your code here" + "# Your code here\n", + "import numpy as np\n", + "import pandas as pd\n", + "url = 'https://raw.githubusercontent.com/data-bootcamp-v4/data/main/file1.csv'\n", + "insurance_data = pd.read_csv(url)" + ] + }, + { + "cell_type": "code", + "execution_count": 14, + "id": "215cc012", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "The dataset contains 4008 rows and 11 columns.\n" + ] + } + ], + "source": [ + "rows, columns = insurance_data.shape\n", + "print(f\"The dataset contains {rows} rows and {columns} columns.\")" + ] + }, + { + "cell_type": "code", + "execution_count": 15, + "id": "c1a82b79", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Customer object\n", + "ST object\n", + "GENDER object\n", + "Education object\n", + "Customer Lifetime Value object\n", + "Income float64\n", + "Monthly Premium Auto float64\n", + "Number of Open Complaints object\n", + "Policy Type object\n", + "Vehicle Class object\n", + "Total Claim Amount float64\n", + "dtype: object\n" + ] + } + ], + "source": [ + "print(insurance_data.dtypes)" + ] + }, + { + "cell_type": "code", + "execution_count": 17, + "id": "88ba8bc2", + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
CustomerSTGENDEREducationCustomer Lifetime ValueIncomeMonthly Premium AutoNumber of Open ComplaintsPolicy TypeVehicle ClassTotal Claim Amount
0RB50392WashingtonNaNMasterNaN0.01000.01/0/00Personal AutoFour-Door Car2.704934
1QZ44356ArizonaFBachelor697953.59%0.094.01/0/00Personal AutoFour-Door Car1131.464935
2AI49188NevadaFBachelor1288743.17%48767.0108.01/0/00Personal AutoTwo-Door Car566.472247
3WW63253CaliforniaMBachelor764586.18%0.0106.01/0/00Corporate AutoSUV529.881344
4GA49547WashingtonMHigh School or Below536307.65%36357.068.01/0/00Personal AutoFour-Door Car17.269323
\n", + "
" + ], + "text/plain": [ + " Customer ST GENDER Education Customer Lifetime Value \\\n", + "0 RB50392 Washington NaN Master NaN \n", + "1 QZ44356 Arizona F Bachelor 697953.59% \n", + "2 AI49188 Nevada F Bachelor 1288743.17% \n", + "3 WW63253 California M Bachelor 764586.18% \n", + "4 GA49547 Washington M High School or Below 536307.65% \n", + "\n", + " Income Monthly Premium Auto Number of Open Complaints Policy Type \\\n", + "0 0.0 1000.0 1/0/00 Personal Auto \n", + "1 0.0 94.0 1/0/00 Personal Auto \n", + "2 48767.0 108.0 1/0/00 Personal Auto \n", + "3 0.0 106.0 1/0/00 Corporate Auto \n", + "4 36357.0 68.0 1/0/00 Personal Auto \n", + "\n", + " Vehicle Class Total Claim Amount \n", + "0 Four-Door Car 2.704934 \n", + "1 Four-Door Car 1131.464935 \n", + "2 Two-Door Car 566.472247 \n", + "3 SUV 529.881344 \n", + "4 Four-Door Car 17.269323 " + ] + }, + "execution_count": 17, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "insurance_data.head()" + ] + }, + { + "cell_type": "code", + "execution_count": 18, + "id": "aa69e012", + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
CustomerSTGENDEREducationCustomer Lifetime ValueIncomeMonthly Premium AutoNumber of Open ComplaintsPolicy TypeVehicle ClassTotal Claim Amount
4003NaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaN
4004NaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaN
4005NaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaN
4006NaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaN
4007NaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaN
\n", + "
" + ], + "text/plain": [ + " Customer ST GENDER Education Customer Lifetime Value Income \\\n", + "4003 NaN NaN NaN NaN NaN NaN \n", + "4004 NaN NaN NaN NaN NaN NaN \n", + "4005 NaN NaN NaN NaN NaN NaN \n", + "4006 NaN NaN NaN NaN NaN NaN \n", + "4007 NaN NaN NaN NaN NaN NaN \n", + "\n", + " Monthly Premium Auto Number of Open Complaints Policy Type \\\n", + "4003 NaN NaN NaN \n", + "4004 NaN NaN NaN \n", + "4005 NaN NaN NaN \n", + "4006 NaN NaN NaN \n", + "4007 NaN NaN NaN \n", + "\n", + " Vehicle Class Total Claim Amount \n", + "4003 NaN NaN \n", + "4004 NaN NaN \n", + "4005 NaN NaN \n", + "4006 NaN NaN \n", + "4007 NaN NaN " + ] + }, + "execution_count": 18, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "insurance_data.tail()" + ] + }, + { + "cell_type": "markdown", + "id": "0bc13307", + "metadata": {}, + "source": [ + "The data types of the following variables could be fixed/changed:\n", + "\n", + "\n", + "Customer Lifetime Value - could be changed as a float. However, the % symbol should be removed and we have to convert the values to a decimal form" ] }, { @@ -116,12 +473,33 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 8, "id": "2dca5073-4520-4f42-9390-4b92733284ed", "metadata": {}, - "outputs": [], + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "ST\n", + "AZ 25\n", + "WA 30\n", + "Washington 81\n", + "Nevada 98\n", + "Cali 120\n", + "Name: count, dtype: int64\n" + ] + } + ], "source": [ - "# Your code here" + "# Your code here\n", + "# Count the frequencies of each customer location (State)\n", + "location_counts = insurance_data['ST'].value_counts()\n", + "\n", + "# Get the top 5 less common locations in ascending order\n", + "top_5_less_common_locations = location_counts.nsmallest(5)\n", + "\n", + "print(top_5_less_common_locations)" ] }, { @@ -237,7 +615,7 @@ ], "metadata": { "kernelspec": { - "display_name": "Python 3 (ipykernel)", + "display_name": "Python 3", "language": "python", "name": "python3" }, @@ -251,7 +629,7 @@ "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", - "version": "3.9.13" + "version": "3.12.6" } }, "nbformat": 4, From bea1cb40e3f5189e22382e38415864cace3a57b9 Mon Sep 17 00:00:00 2001 From: claragal Date: Sat, 23 Nov 2024 16:37:17 +0100 Subject: [PATCH 2/2] lab okey --- lab-dw-pandas.ipynb | 195 +++++++++++++++++++++++++++++++++++++------- 1 file changed, 167 insertions(+), 28 deletions(-) diff --git a/lab-dw-pandas.ipynb b/lab-dw-pandas.ipynb index 39df433f2..7f641ce36 100644 --- a/lab-dw-pandas.ipynb +++ b/lab-dw-pandas.ipynb @@ -82,7 +82,7 @@ }, { "cell_type": "code", - "execution_count": 13, + "execution_count": 10, "id": "dd4e8cd8-a6f6-486c-a5c4-1745b0c035f4", "metadata": {}, "outputs": [], @@ -91,12 +91,12 @@ "import numpy as np\n", "import pandas as pd\n", "url = 'https://raw.githubusercontent.com/data-bootcamp-v4/data/main/file1.csv'\n", - "insurance_data = pd.read_csv(url)" + "data = pd.read_csv(url)" ] }, { "cell_type": "code", - "execution_count": 14, + "execution_count": 11, "id": "215cc012", "metadata": {}, "outputs": [ @@ -109,13 +109,13 @@ } ], "source": [ - "rows, columns = insurance_data.shape\n", + "rows, columns = data.shape\n", "print(f\"The dataset contains {rows} rows and {columns} columns.\")" ] }, { "cell_type": "code", - "execution_count": 15, + "execution_count": 12, "id": "c1a82b79", "metadata": {}, "outputs": [ @@ -139,12 +139,12 @@ } ], "source": [ - "print(insurance_data.dtypes)" + "print(data.dtypes)" ] }, { "cell_type": "code", - "execution_count": 17, + "execution_count": 13, "id": "88ba8bc2", "metadata": {}, "outputs": [ @@ -280,18 +280,18 @@ "4 Four-Door Car 17.269323 " ] }, - "execution_count": 17, + "execution_count": 13, "metadata": {}, "output_type": "execute_result" } ], "source": [ - "insurance_data.head()" + "data.head()" ] }, { "cell_type": "code", - "execution_count": 18, + "execution_count": 14, "id": "aa69e012", "metadata": {}, "outputs": [ @@ -427,13 +427,13 @@ "4007 NaN NaN " ] }, - "execution_count": 18, + "execution_count": 14, "metadata": {}, "output_type": "execute_result" } ], "source": [ - "insurance_data.tail()" + "data.tail()" ] }, { @@ -447,6 +447,69 @@ "Customer Lifetime Value - could be changed as a float. However, the % symbol should be removed and we have to convert the values to a decimal form" ] }, + { + "cell_type": "code", + "execution_count": 20, + "id": "89a2a9bb", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Customer 1071\n", + "ST 8\n", + "GENDER 5\n", + "Education 6\n", + "Customer Lifetime Value 1027\n", + "Income 774\n", + "Monthly Premium Auto 132\n", + "Number of Open Complaints 6\n", + "Policy Type 3\n", + "Vehicle Class 6\n", + "Total Claim Amount 761\n", + "dtype: int64\n", + "\n", + "Categorical Columns Unique Values:\n", + "Customer: ['RB50392' 'QZ44356' 'AI49188' ... 'CW49887' 'MY31220' nan]\n", + "ST: ['Washington' 'Arizona' 'Nevada' 'California' 'Oregon' 'Cali' 'AZ' 'WA'\n", + " nan]\n", + "GENDER: [nan 'F' 'M' 'Femal' 'Male' 'female']\n", + "Education: ['Master' 'Bachelor' 'High School or Below' 'College' 'Bachelors' 'Doctor'\n", + " nan]\n", + "Customer Lifetime Value: [nan '697953.59%' '1288743.17%' ... '2031499.76%' '323912.47%'\n", + " '899704.02%']\n", + "Number of Open Complaints: ['1/0/00' '1/2/00' '1/1/00' '1/3/00' '1/5/00' '1/4/00' nan]\n", + "Policy Type: ['Personal Auto' 'Corporate Auto' 'Special Auto' nan]\n", + "Vehicle Class: ['Four-Door Car' 'Two-Door Car' 'SUV' 'Luxury SUV' 'Sports Car'\n", + " 'Luxury Car' nan]\n", + "\n", + "Numerical Columns Range:\n", + "Income: Min = 0.0, Max = 99960.0\n", + "Monthly Premium Auto: Min = 61.0, Max = 35354.0\n", + "Total Claim Amount: Min = 0.382107, Max = 2893.239678\n" + ] + } + ], + "source": [ + "# Count unique values for each column\n", + "print(data.nunique())\n", + "\n", + "# Identify categorical and numerical columns\n", + "categorical_columns = data.select_dtypes(include=['object']).columns\n", + "numerical_columns = data.select_dtypes(include=['number']).columns\n", + "\n", + "# Show unique values for categorical columns\n", + "print(\"\\nCategorical Columns Unique Values:\")\n", + "for column in categorical_columns:\n", + " print(f\"{column}: {data[column].unique()}\")\n", + "\n", + "# Show range of values for numerical columns\n", + "print(\"\\nNumerical Columns Range:\")\n", + "for column in numerical_columns:\n", + " print(f\"{column}: Min = {data[column].min()}, Max = {data[column].max()}\")" + ] + }, { "cell_type": "markdown", "id": "4a703890-63db-4944-b7ab-95a4f8185120", @@ -473,7 +536,7 @@ }, { "cell_type": "code", - "execution_count": 8, + "execution_count": 21, "id": "2dca5073-4520-4f42-9390-4b92733284ed", "metadata": {}, "outputs": [ @@ -492,13 +555,13 @@ } ], "source": [ - "# Your code here\n", - "# Count the frequencies of each customer location (State)\n", - "location_counts = insurance_data['ST'].value_counts()\n", + "# Step 1: Count the occurrences of each location (State) and sort by ascending order\n", + "location_counts = data['ST'].value_counts().sort_values(ascending=True)\n", "\n", - "# Get the top 5 less common locations in ascending order\n", - "top_5_less_common_locations = location_counts.nsmallest(5)\n", + "# Step 2: Retrieve the top 5 less common locations\n", + "top_5_less_common_locations = location_counts.head(5)\n", "\n", + "# Step 3: Display the result\n", "print(top_5_less_common_locations)" ] }, @@ -524,12 +587,38 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 22, "id": "bcfad6c1-9af2-4b0b-9aa9-0dc5c17473c0", "metadata": {}, - "outputs": [], + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Policy Type with Highest Number of Policies Sold:\n", + "Personal Auto\n", + "\n", + "Total Number of Policies Sold for Each Type:\n", + "Policy Type\n", + "Personal Auto 780\n", + "Corporate Auto 234\n", + "Special Auto 57\n", + "Name: count, dtype: int64\n" + ] + } + ], "source": [ - "# Your code here" + "# Step 1: Count the occurrences of each policy type\n", + "policy_counts = data['Policy Type'].value_counts()\n", + "\n", + "# Step 2: Retrieve the policy type with the highest number of policies sold\n", + "policy_with_highest_sales = policy_counts.idxmax()\n", + "\n", + "# Step 3: Display the results\n", + "print(\"Policy Type with Highest Number of Policies Sold:\")\n", + "print(policy_with_highest_sales)\n", + "print(\"\\nTotal Number of Policies Sold for Each Type:\")\n", + "print(policy_counts)" ] }, { @@ -554,12 +643,31 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 23, "id": "0c0563cf-6f8b-463d-a321-651a972f82e5", "metadata": {}, - "outputs": [], + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Average Income for Personal Auto Policy: 38180.69871794872\n", + "Average Income for Corporate Auto Policy: 41390.31196581197\n" + ] + } + ], "source": [ - "# Your code here" + "# Step 1: Filter the dataset into two separate dataframes\n", + "personal_auto_df = data.loc[data['Policy Type'] == 'Personal Auto']\n", + "corporate_auto_df = data.loc[data['Policy Type'] == 'Corporate Auto']\n", + "\n", + "# Step 2: Calculate the average income for each policy type\n", + "average_income_personal_auto = personal_auto_df['Income'].mean()\n", + "average_income_corporate_auto = corporate_auto_df['Income'].mean()\n", + "\n", + "# Step 3: Print the results\n", + "print(f\"Average Income for Personal Auto Policy: {average_income_personal_auto}\")\n", + "print(f\"Average Income for Corporate Auto Policy: {average_income_corporate_auto}\")" ] }, { @@ -604,12 +712,43 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 24, "id": "b731bca6-a760-4860-a27b-a33efa712ce0", "metadata": {}, - "outputs": [], + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Summary Statistics for High Policy Claim Amount Customers:\n", + "count 264.000000\n", + "mean 782.228263\n", + "std 292.751640\n", + "min 537.600000\n", + "25% 606.521741\n", + "50% 679.597985\n", + "75% 851.400000\n", + "max 2893.239678\n", + "Name: Total Claim Amount, dtype: float64\n" + ] + } + ], "source": [ - "# Your code here" + "# Step 1: Calculate the 75th percentile (top 25%)\n", + "top_25_percentile = data['Total Claim Amount'].quantile(0.75)\n", + "\n", + "# Step 2: Create a Boolean mask that checks if Total Claim Amount > 75th percentile\n", + "high_claims_mask = data['Total Claim Amount'] > top_25_percentile\n", + "\n", + "# Step 3: Apply the mask to filter the DataFrame\n", + "high_claims_df = data[high_claims_mask]\n", + "\n", + "# Step 4: Display summary statistics for the high claim amount data\n", + "high_claims_summary = high_claims_df['Total Claim Amount'].describe()\n", + "\n", + "# Step 5: Print the results\n", + "print(\"Summary Statistics for High Policy Claim Amount Customers:\")\n", + "print(high_claims_summary)" ] } ], @@ -629,7 +768,7 @@ "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", - "version": "3.12.6" + "version": "3.12.7" } }, "nbformat": 4,