From c80d55107c0d37aa9e74eb215199970b59a89942 Mon Sep 17 00:00:00 2001 From: cristinarosa97 Date: Fri, 6 Dec 2024 17:38:06 +0100 Subject: [PATCH 1/2] w2 lab4 done --- lab-dw-aggregating.ipynb | 1143 +++++++++++++++++++++++++++++++++----- 1 file changed, 997 insertions(+), 146 deletions(-) diff --git a/lab-dw-aggregating.ipynb b/lab-dw-aggregating.ipynb index fff3ae5..fa9f082 100644 --- a/lab-dw-aggregating.ipynb +++ b/lab-dw-aggregating.ipynb @@ -1,161 +1,1012 @@ { - "cells": [ - { - "cell_type": "markdown", - "id": "31969215-2a90-4d8b-ac36-646a7ae13744", - "metadata": { - "id": "31969215-2a90-4d8b-ac36-646a7ae13744" - }, - "source": [ - "# Lab | Data Aggregation and Filtering" - ] - }, - { - "cell_type": "markdown", - "id": "a8f08a52-bec0-439b-99cc-11d3809d8b5d", - "metadata": { - "id": "a8f08a52-bec0-439b-99cc-11d3809d8b5d" - }, - "source": [ - "In this challenge, we will continue to work with customer data from an insurance company. We will use the dataset called marketing_customer_analysis.csv, which can be found at the following link:\n", - "\n", - "https://raw.githubusercontent.com/data-bootcamp-v4/data/main/marketing_customer_analysis.csv\n", - "\n", - "This dataset contains information such as customer demographics, policy details, vehicle information, and the customer's response to the last marketing campaign. Our goal is to explore and analyze this data by first performing data cleaning, formatting, and structuring." - ] - }, - { - "cell_type": "markdown", - "id": "9c98ddc5-b041-4c94-ada1-4dfee5c98e50", - "metadata": { - "id": "9c98ddc5-b041-4c94-ada1-4dfee5c98e50" - }, - "source": [ - "1. Create a new DataFrame that only includes customers who have a total_claim_amount greater than $1,000 and have a response of \"Yes\" to the last marketing campaign." - ] - }, - { - "cell_type": "markdown", - "id": "b9be383e-5165-436e-80c8-57d4c757c8c3", - "metadata": { - "id": "b9be383e-5165-436e-80c8-57d4c757c8c3" - }, - "source": [ - "2. Using the original Dataframe, analyze the average total_claim_amount by each policy type and gender for customers who have responded \"Yes\" to the last marketing campaign. Write your conclusions." - ] - }, - { - "cell_type": "markdown", - "id": "7050f4ac-53c5-4193-a3c0-8699b87196f0", - "metadata": { - "id": "7050f4ac-53c5-4193-a3c0-8699b87196f0" - }, - "source": [ - "3. Analyze the total number of customers who have policies in each state, and then filter the results to only include states where there are more than 500 customers." - ] - }, - { - "cell_type": "markdown", - "id": "b60a4443-a1a7-4bbf-b78e-9ccdf9895e0d", - "metadata": { - "id": "b60a4443-a1a7-4bbf-b78e-9ccdf9895e0d" - }, - "source": [ - "4. Find the maximum, minimum, and median customer lifetime value by education level and gender. Write your conclusions." - ] - }, + "cells": [ + { + "cell_type": "markdown", + "id": "31969215-2a90-4d8b-ac36-646a7ae13744", + "metadata": { + "id": "31969215-2a90-4d8b-ac36-646a7ae13744" + }, + "source": [ + "# Lab | Data Aggregation and Filtering" + ] + }, + { + "cell_type": "markdown", + "id": "a8f08a52-bec0-439b-99cc-11d3809d8b5d", + "metadata": { + "id": "a8f08a52-bec0-439b-99cc-11d3809d8b5d" + }, + "source": [ + "In this challenge, we will continue to work with customer data from an insurance company. We will use the dataset called marketing_customer_analysis.csv, which can be found at the following link:\n", + "\n", + "https://raw.githubusercontent.com/data-bootcamp-v4/data/main/marketing_customer_analysis.csv\n", + "\n", + "This dataset contains information such as customer demographics, policy details, vehicle information, and the customer's response to the last marketing campaign. Our goal is to explore and analyze this data by first performing data cleaning, formatting, and structuring." + ] + }, + { + "cell_type": "code", + "execution_count": 1, + "id": "ec87b49e-2770-4d52-85ec-ea8a0ccfa132", + "metadata": {}, + "outputs": [ { - "cell_type": "markdown", - "id": "b42999f9-311f-481e-ae63-40a5577072c5", - "metadata": { - "id": "b42999f9-311f-481e-ae63-40a5577072c5" - }, - "source": [ - "## Bonus" + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
Unnamed: 0CustomerStateCustomer Lifetime ValueResponseCoverageEducationEffective To DateEmploymentStatusGender...Number of Open ComplaintsNumber of PoliciesPolicy TypePolicyRenew Offer TypeSales ChannelTotal Claim AmountVehicle ClassVehicle SizeVehicle Type
00DK49336Arizona4809.216960NoBasicCollege2/18/11EmployedM...0.09Corporate AutoCorporate L3Offer3Agent292.800000Four-Door CarMedsizeNaN
11KX64629California2228.525238NoBasicCollege1/18/11UnemployedF...0.01Personal AutoPersonal L3Offer4Call Center744.924331Four-Door CarMedsizeNaN
22LZ68649Washington14947.917300NoBasicBachelor2/10/11EmployedM...0.02Personal AutoPersonal L3Offer3Call Center480.000000SUVMedsizeA
33XL78013Oregon22332.439460YesExtendedCollege1/11/11EmployedM...0.02Corporate AutoCorporate L3Offer2Branch484.013411Four-Door CarMedsizeA
44QA50777Oregon9025.067525NoPremiumBachelor1/17/11Medical LeaveF...NaN7Personal AutoPersonal L2Offer1Branch707.925645Four-Door CarMedsizeNaN
\n", + "

5 rows × 26 columns

\n", + "
" + ], + "text/plain": [ + " Unnamed: 0 Customer State Customer Lifetime Value Response \\\n", + "0 0 DK49336 Arizona 4809.216960 No \n", + "1 1 KX64629 California 2228.525238 No \n", + "2 2 LZ68649 Washington 14947.917300 No \n", + "3 3 XL78013 Oregon 22332.439460 Yes \n", + "4 4 QA50777 Oregon 9025.067525 No \n", + "\n", + " Coverage Education Effective To Date EmploymentStatus Gender ... \\\n", + "0 Basic College 2/18/11 Employed M ... \n", + "1 Basic College 1/18/11 Unemployed F ... \n", + "2 Basic Bachelor 2/10/11 Employed M ... \n", + "3 Extended College 1/11/11 Employed M ... \n", + "4 Premium Bachelor 1/17/11 Medical Leave F ... \n", + "\n", + " Number of Open Complaints Number of Policies Policy Type Policy \\\n", + "0 0.0 9 Corporate Auto Corporate L3 \n", + "1 0.0 1 Personal Auto Personal L3 \n", + "2 0.0 2 Personal Auto Personal L3 \n", + "3 0.0 2 Corporate Auto Corporate L3 \n", + "4 NaN 7 Personal Auto Personal L2 \n", + "\n", + " Renew Offer Type Sales Channel Total Claim Amount Vehicle Class \\\n", + "0 Offer3 Agent 292.800000 Four-Door Car \n", + "1 Offer4 Call Center 744.924331 Four-Door Car \n", + "2 Offer3 Call Center 480.000000 SUV \n", + "3 Offer2 Branch 484.013411 Four-Door Car \n", + "4 Offer1 Branch 707.925645 Four-Door Car \n", + "\n", + " Vehicle Size Vehicle Type \n", + "0 Medsize NaN \n", + "1 Medsize NaN \n", + "2 Medsize A \n", + "3 Medsize A \n", + "4 Medsize NaN \n", + "\n", + "[5 rows x 26 columns]" ] - }, + }, + "execution_count": 1, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "import pandas as pd\n", + "df = pd.read_csv(\"https://raw.githubusercontent.com/data-bootcamp-v4/data/main/marketing_customer_analysis.csv\")\n", + "df.head()" + ] + }, + { + "cell_type": "code", + "execution_count": 4, + "id": "169724f6-9f30-420e-ad89-53eb59cc1f6a", + "metadata": {}, + "outputs": [ { - "cell_type": "markdown", - "id": "81ff02c5-6584-4f21-a358-b918697c6432", - "metadata": { - "id": "81ff02c5-6584-4f21-a358-b918697c6432" - }, - "source": [ - "5. The marketing team wants to analyze the number of policies sold by state and month. Present the data in a table where the months are arranged as columns and the states are arranged as rows." + "data": { + "text/plain": [ + "Index(['Unnamed: 0', 'Customer', 'State', 'Customer Lifetime Value',\n", + " 'Response', 'Coverage', 'Education', 'Effective To Date',\n", + " 'EmploymentStatus', 'Gender', 'Income', 'Location Code',\n", + " 'Marital Status', 'Monthly Premium Auto', 'Months Since Last Claim',\n", + " 'Months Since Policy Inception', 'Number of Open Complaints',\n", + " 'Number of Policies', 'Policy Type', 'Policy', 'Renew Offer Type',\n", + " 'Sales Channel', 'Total Claim Amount', 'Vehicle Class', 'Vehicle Size',\n", + " 'Vehicle Type'],\n", + " dtype='object')" ] - }, + }, + "execution_count": 4, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "df.columns" + ] + }, + { + "cell_type": "markdown", + "id": "9c98ddc5-b041-4c94-ada1-4dfee5c98e50", + "metadata": { + "id": "9c98ddc5-b041-4c94-ada1-4dfee5c98e50" + }, + "source": [ + "1. Create a new DataFrame that only includes customers who have a total_claim_amount greater than $1,000 and have a response of \"Yes\" to the last marketing campaign." + ] + }, + { + "cell_type": "code", + "execution_count": 18, + "id": "a90c9139-e9bb-4738-a0f5-bc39f880b8d4", + "metadata": {}, + "outputs": [ { - "cell_type": "markdown", - "id": "b6aec097-c633-4017-a125-e77a97259cda", - "metadata": { - "id": "b6aec097-c633-4017-a125-e77a97259cda" - }, - "source": [ - "6. Display a new DataFrame that contains the number of policies sold by month, by state, for the top 3 states with the highest number of policies sold.\n", - "\n", - "*Hint:*\n", - "- *To accomplish this, you will first need to group the data by state and month, then count the number of policies sold for each group. Afterwards, you will need to sort the data by the count of policies sold in descending order.*\n", - "- *Next, you will select the top 3 states with the highest number of policies sold.*\n", - "- *Finally, you will create a new DataFrame that contains the number of policies sold by month for each of the top 3 states.*" + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
Unnamed: 0CustomerStateCustomer Lifetime ValueResponseCoverageEducationEffective To DateEmploymentStatusGender...Number of Open ComplaintsNumber of PoliciesPolicy TypePolicyRenew Offer TypeSales ChannelTotal Claim AmountVehicle ClassVehicle SizeVehicle Type
189189OK31456California11009.130490YesPremiumBachelor1/24/11EmployedF...0.01Corporate AutoCorporate L3Offer2Agent1358.400000Luxury CarMedsizeNaN
236236YJ16163Oregon11009.130490YesPremiumBachelor1/24/11EmployedF...0.01Special AutoSpecial L3Offer2Agent1358.400000Luxury CarMedsizeA
419419GW43195Oregon25807.063000YesExtendedCollege2/13/11EmployedF...1.02Personal AutoPersonal L2Offer1Branch1027.200000Luxury CarSmallA
442442IP94270Arizona13736.132500YesPremiumMaster2/13/11DisabledF...0.08Personal AutoPersonal L2Offer1Web1261.319869SUVMedsizeA
587587FJ28407California5619.689084YesPremiumHigh School or Below1/26/11UnemployedM...0.01Personal AutoPersonal L1Offer2Web1027.000029SUVMedsizeA
..................................................................
1035110351FN44127Oregon3508.569533YesExtendedCollege1/5/11Medical LeaveM...1.01Personal AutoPersonal L2Offer2Branch1176.278800Four-Door CarSmallNaN
1037310373XZ64172Oregon10963.957230YesPremiumHigh School or Below2/8/11EmployedM...0.01Corporate AutoCorporate L2Offer1Agent1324.800000Luxury SUVMedsizeNaN
1048710487IX60941Oregon3508.569533YesExtendedCollege1/5/11Medical LeaveM...1.01Personal AutoPersonal L3Offer2Branch1176.278800Four-Door CarSmallNaN
1056510565QO62792Oregon7840.165778YesExtendedCollege1/14/11EmployedM...2.01Personal AutoPersonal L3Offer2Agent1008.000000NaNNaNNaN
1070810708CK39096Oregon5619.689084YesPremiumHigh School or Below1/26/11UnemployedM...0.01Personal AutoPersonal L3Offer2Web1027.000029SUVMedsizeA
\n", + "

67 rows × 26 columns

\n", + "
" + ], + "text/plain": [ + " Unnamed: 0 Customer State Customer Lifetime Value Response \\\n", + "189 189 OK31456 California 11009.130490 Yes \n", + "236 236 YJ16163 Oregon 11009.130490 Yes \n", + "419 419 GW43195 Oregon 25807.063000 Yes \n", + "442 442 IP94270 Arizona 13736.132500 Yes \n", + "587 587 FJ28407 California 5619.689084 Yes \n", + "... ... ... ... ... ... \n", + "10351 10351 FN44127 Oregon 3508.569533 Yes \n", + "10373 10373 XZ64172 Oregon 10963.957230 Yes \n", + "10487 10487 IX60941 Oregon 3508.569533 Yes \n", + "10565 10565 QO62792 Oregon 7840.165778 Yes \n", + "10708 10708 CK39096 Oregon 5619.689084 Yes \n", + "\n", + " Coverage Education Effective To Date EmploymentStatus \\\n", + "189 Premium Bachelor 1/24/11 Employed \n", + "236 Premium Bachelor 1/24/11 Employed \n", + "419 Extended College 2/13/11 Employed \n", + "442 Premium Master 2/13/11 Disabled \n", + "587 Premium High School or Below 1/26/11 Unemployed \n", + "... ... ... ... ... \n", + "10351 Extended College 1/5/11 Medical Leave \n", + "10373 Premium High School or Below 2/8/11 Employed \n", + "10487 Extended College 1/5/11 Medical Leave \n", + "10565 Extended College 1/14/11 Employed \n", + "10708 Premium High School or Below 1/26/11 Unemployed \n", + "\n", + " Gender ... Number of Open Complaints Number of Policies \\\n", + "189 F ... 0.0 1 \n", + "236 F ... 0.0 1 \n", + "419 F ... 1.0 2 \n", + "442 F ... 0.0 8 \n", + "587 M ... 0.0 1 \n", + "... ... ... ... ... \n", + "10351 M ... 1.0 1 \n", + "10373 M ... 0.0 1 \n", + "10487 M ... 1.0 1 \n", + "10565 M ... 2.0 1 \n", + "10708 M ... 0.0 1 \n", + "\n", + " Policy Type Policy Renew Offer Type Sales Channel \\\n", + "189 Corporate Auto Corporate L3 Offer2 Agent \n", + "236 Special Auto Special L3 Offer2 Agent \n", + "419 Personal Auto Personal L2 Offer1 Branch \n", + "442 Personal Auto Personal L2 Offer1 Web \n", + "587 Personal Auto Personal L1 Offer2 Web \n", + "... ... ... ... ... \n", + "10351 Personal Auto Personal L2 Offer2 Branch \n", + "10373 Corporate Auto Corporate L2 Offer1 Agent \n", + "10487 Personal Auto Personal L3 Offer2 Branch \n", + "10565 Personal Auto Personal L3 Offer2 Agent \n", + "10708 Personal Auto Personal L3 Offer2 Web \n", + "\n", + " Total Claim Amount Vehicle Class Vehicle Size Vehicle Type \n", + "189 1358.400000 Luxury Car Medsize NaN \n", + "236 1358.400000 Luxury Car Medsize A \n", + "419 1027.200000 Luxury Car Small A \n", + "442 1261.319869 SUV Medsize A \n", + "587 1027.000029 SUV Medsize A \n", + "... ... ... ... ... \n", + "10351 1176.278800 Four-Door Car Small NaN \n", + "10373 1324.800000 Luxury SUV Medsize NaN \n", + "10487 1176.278800 Four-Door Car Small NaN \n", + "10565 1008.000000 NaN NaN NaN \n", + "10708 1027.000029 SUV Medsize A \n", + "\n", + "[67 rows x 26 columns]" ] - }, + }, + "execution_count": 18, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "new_df = df[(df[\"Response\"] == \"Yes\") & (df[\"Total Claim Amount\"] > 1000)]\n", + "new_df" + ] + }, + { + "cell_type": "markdown", + "id": "b9be383e-5165-436e-80c8-57d4c757c8c3", + "metadata": { + "id": "b9be383e-5165-436e-80c8-57d4c757c8c3" + }, + "source": [ + "2. Using the original Dataframe, analyze the average total_claim_amount by each policy type and gender for customers who have responded \"Yes\" to the last marketing campaign. Write your conclusions." + ] + }, + { + "cell_type": "code", + "execution_count": 26, + "id": "18fbe115-fc82-470a-9eb2-9633d1e64a77", + "metadata": {}, + "outputs": [ { - "cell_type": "markdown", - "id": "ba975b8a-a2cf-4fbf-9f59-ebc381767009", - "metadata": { - "id": "ba975b8a-a2cf-4fbf-9f59-ebc381767009" - }, - "source": [ - "7. The marketing team wants to analyze the effect of different marketing channels on the customer response rate.\n", - "\n", - "Hint: You can use melt to unpivot the data and create a table that shows the customer response rate (those who responded \"Yes\") by marketing channel." + "data": { + "text/plain": [ + "Gender Policy Type \n", + "F Corporate Auto 433.738499\n", + " Personal Auto 452.965929\n", + " Special Auto 453.280164\n", + "M Corporate Auto 408.582459\n", + " Personal Auto 457.010178\n", + " Special Auto 429.527942\n", + "Name: Total Claim Amount, dtype: float64" ] - }, + }, + "execution_count": 26, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "condition = df[\"Response\"] == \"Yes\"\n", + "answer_yes = df[condition].copy()\n", + "answer_yes.groupby([\"Gender\", \"Policy Type\"])[\"Total Claim Amount\"].mean()" + ] + }, + { + "cell_type": "markdown", + "id": "7050f4ac-53c5-4193-a3c0-8699b87196f0", + "metadata": { + "id": "7050f4ac-53c5-4193-a3c0-8699b87196f0" + }, + "source": [ + "3. Analyze the total number of customers who have policies in each state, and then filter the results to only include states where there are more than 500 customers." + ] + }, + { + "cell_type": "code", + "execution_count": 44, + "id": "f99002e8-3178-4855-a7e4-213428d56d8b", + "metadata": {}, + "outputs": [ { - "cell_type": "markdown", - "id": "e4378d94-48fb-4850-a802-b1bc8f427b2d", - "metadata": { - "id": "e4378d94-48fb-4850-a802-b1bc8f427b2d" - }, - "source": [ - "External Resources for Data Filtering: https://towardsdatascience.com/filtering-data-frames-in-pandas-b570b1f834b9" + "data": { + "text/plain": [ + "[State\n", + " California True\n", + " Oregon True\n", + " Arizona True\n", + " Nevada True\n", + " Washington True\n", + " Name: count, dtype: bool]" ] - }, + }, + "execution_count": 44, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "more_than = [df[\"State\"].value_counts() > 500]\n", + "more_than" + ] + }, + { + "cell_type": "markdown", + "id": "b60a4443-a1a7-4bbf-b78e-9ccdf9895e0d", + "metadata": { + "id": "b60a4443-a1a7-4bbf-b78e-9ccdf9895e0d" + }, + "source": [ + "4. Find the maximum, minimum, and median customer lifetime value by education level and gender. Write your conclusions." + ] + }, + { + "cell_type": "code", + "execution_count": 46, + "id": "bea4ec6d-4a76-4905-a385-beabd5f6ae6c", + "metadata": {}, + "outputs": [ { - "cell_type": "code", - "execution_count": null, - "id": "449513f4-0459-46a0-a18d-9398d974c9ad", - "metadata": { - "id": "449513f4-0459-46a0-a18d-9398d974c9ad" - }, - "outputs": [], - "source": [ - "# your code goes here" + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
maxminmedian
GenderEducation
FBachelor73225.956521904.0008525640.505303
College61850.188031898.6836865623.611187
Doctor44856.113972395.5700005332.462694
High School or Below55277.445892144.9215356039.553187
Master51016.067042417.7770325729.855012
MBachelor67907.270501898.0076755548.031892
College61134.683071918.1197006005.847375
Doctor32677.342842267.6040385577.669457
High School or Below83325.381191940.9812216286.731006
Master50568.259122272.3073105579.099207
\n", + "
" + ], + "text/plain": [ + " max min median\n", + "Gender Education \n", + "F Bachelor 73225.95652 1904.000852 5640.505303\n", + " College 61850.18803 1898.683686 5623.611187\n", + " Doctor 44856.11397 2395.570000 5332.462694\n", + " High School or Below 55277.44589 2144.921535 6039.553187\n", + " Master 51016.06704 2417.777032 5729.855012\n", + "M Bachelor 67907.27050 1898.007675 5548.031892\n", + " College 61134.68307 1918.119700 6005.847375\n", + " Doctor 32677.34284 2267.604038 5577.669457\n", + " High School or Below 83325.38119 1940.981221 6286.731006\n", + " Master 50568.25912 2272.307310 5579.099207" ] + }, + "execution_count": 46, + "metadata": {}, + "output_type": "execute_result" } - ], - "metadata": { - "kernelspec": { - "display_name": "Python 3 (ipykernel)", - "language": "python", - "name": "python3" - }, - "language_info": { - "codemirror_mode": { - "name": "ipython", - "version": 3 - }, - "file_extension": ".py", - "mimetype": "text/x-python", - "name": "python", - "nbconvert_exporter": "python", - "pygments_lexer": "ipython3", - "version": "3.9.13" - }, - "colab": { - "provenance": [] - } + ], + "source": [ + "df.groupby([\"Gender\", \"Education\"])[\"Customer Lifetime Value\"].agg([\"max\", \"min\", \"median\"])" + ] + }, + { + "cell_type": "markdown", + "id": "b42999f9-311f-481e-ae63-40a5577072c5", + "metadata": { + "id": "b42999f9-311f-481e-ae63-40a5577072c5" + }, + "source": [ + "## Bonus" + ] + }, + { + "cell_type": "markdown", + "id": "81ff02c5-6584-4f21-a358-b918697c6432", + "metadata": { + "id": "81ff02c5-6584-4f21-a358-b918697c6432" + }, + "source": [ + "5. The marketing team wants to analyze the number of policies sold by state and month. Present the data in a table where the months are arranged as columns and the states are arranged as rows." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "95376705-7d06-4207-ac08-66ded7bcbcc1", + "metadata": {}, + "outputs": [], + "source": [] + }, + { + "cell_type": "markdown", + "id": "b6aec097-c633-4017-a125-e77a97259cda", + "metadata": { + "id": "b6aec097-c633-4017-a125-e77a97259cda" + }, + "source": [ + "6. Display a new DataFrame that contains the number of policies sold by month, by state, for the top 3 states with the highest number of policies sold.\n", + "\n", + "*Hint:*\n", + "- *To accomplish this, you will first need to group the data by state and month, then count the number of policies sold for each group. Afterwards, you will need to sort the data by the count of policies sold in descending order.*\n", + "- *Next, you will select the top 3 states with the highest number of policies sold.*\n", + "- *Finally, you will create a new DataFrame that contains the number of policies sold by month for each of the top 3 states.*" + ] + }, + { + "cell_type": "markdown", + "id": "ba975b8a-a2cf-4fbf-9f59-ebc381767009", + "metadata": { + "id": "ba975b8a-a2cf-4fbf-9f59-ebc381767009" + }, + "source": [ + "7. The marketing team wants to analyze the effect of different marketing channels on the customer response rate.\n", + "\n", + "Hint: You can use melt to unpivot the data and create a table that shows the customer response rate (those who responded \"Yes\") by marketing channel." + ] + }, + { + "cell_type": "markdown", + "id": "e4378d94-48fb-4850-a802-b1bc8f427b2d", + "metadata": { + "id": "e4378d94-48fb-4850-a802-b1bc8f427b2d" + }, + "source": [ + "External Resources for Data Filtering: https://towardsdatascience.com/filtering-data-frames-in-pandas-b570b1f834b9" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "449513f4-0459-46a0-a18d-9398d974c9ad", + "metadata": { + "id": "449513f4-0459-46a0-a18d-9398d974c9ad" + }, + "outputs": [], + "source": [ + "# your code goes here" + ] + } + ], + "metadata": { + "colab": { + "provenance": [] + }, + "kernelspec": { + "display_name": "Python 3 (ipykernel)", + "language": "python", + "name": "python3" }, - "nbformat": 4, - "nbformat_minor": 5 -} \ No newline at end of file + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.12.4" + } + }, + "nbformat": 4, + "nbformat_minor": 5 +} From 49af77faf407824dc1eb3c011e1cc3fd62d69863 Mon Sep 17 00:00:00 2001 From: cristinarosa97 Date: Fri, 13 Dec 2024 17:40:44 +0100 Subject: [PATCH 2/2] w2 lab4 redone --- lab-dw-aggregating.ipynb | 206 ++++++++++++++++++++++++++++++++++----- 1 file changed, 183 insertions(+), 23 deletions(-) diff --git a/lab-dw-aggregating.ipynb b/lab-dw-aggregating.ipynb index fa9f082..293f372 100644 --- a/lab-dw-aggregating.ipynb +++ b/lab-dw-aggregating.ipynb @@ -252,7 +252,7 @@ }, { "cell_type": "code", - "execution_count": 4, + "execution_count": 3, "id": "169724f6-9f30-420e-ad89-53eb59cc1f6a", "metadata": {}, "outputs": [ @@ -270,7 +270,7 @@ " dtype='object')" ] }, - "execution_count": 4, + "execution_count": 3, "metadata": {}, "output_type": "execute_result" } @@ -291,7 +291,7 @@ }, { "cell_type": "code", - "execution_count": 18, + "execution_count": 5, "id": "a90c9139-e9bb-4738-a0f5-bc39f880b8d4", "metadata": {}, "outputs": [ @@ -678,7 +678,7 @@ "[67 rows x 26 columns]" ] }, - "execution_count": 18, + "execution_count": 5, "metadata": {}, "output_type": "execute_result" } @@ -700,7 +700,7 @@ }, { "cell_type": "code", - "execution_count": 26, + "execution_count": 7, "id": "18fbe115-fc82-470a-9eb2-9633d1e64a77", "metadata": {}, "outputs": [ @@ -717,7 +717,7 @@ "Name: Total Claim Amount, dtype: float64" ] }, - "execution_count": 26, + "execution_count": 7, "metadata": {}, "output_type": "execute_result" } @@ -740,30 +740,62 @@ }, { "cell_type": "code", - "execution_count": 44, + "execution_count": 27, + "id": "73167bee-5e5a-4e60-950c-f0135ebeae02", + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "array(['Arizona', 'California', 'Washington', 'Oregon', nan, 'Nevada'],\n", + " dtype=object)" + ] + }, + "execution_count": 27, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "df[\"State\"].unique()" + ] + }, + { + "cell_type": "code", + "execution_count": 33, + "id": "8f1493da-60eb-47fc-8147-49097cb3ddfa", + "metadata": {}, + "outputs": [], + "source": [ + "state_counts = df[\"State\"].value_counts()" + ] + }, + { + "cell_type": "code", + "execution_count": 37, "id": "f99002e8-3178-4855-a7e4-213428d56d8b", "metadata": {}, "outputs": [ { "data": { "text/plain": [ - "[State\n", - " California True\n", - " Oregon True\n", - " Arizona True\n", - " Nevada True\n", - " Washington True\n", - " Name: count, dtype: bool]" + "State\n", + "California 3552\n", + "Oregon 2909\n", + "Arizona 1937\n", + "Nevada 993\n", + "Washington 888\n", + "Name: count, dtype: int64" ] }, - "execution_count": 44, + "execution_count": 37, "metadata": {}, "output_type": "execute_result" } ], "source": [ - "more_than = [df[\"State\"].value_counts() > 500]\n", - "more_than" + "states_with_more_than_500 = state_counts[state_counts > 500]\n", + "states_with_more_than_500" ] }, { @@ -929,11 +961,31 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 45, "id": "95376705-7d06-4207-ac08-66ded7bcbcc1", "metadata": {}, - "outputs": [], - "source": [] + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Month 1 2\n", + "State \n", + "Arizona 3052 2864\n", + "California 5673 4929\n", + "Nevada 1493 1278\n", + "Oregon 4697 3969\n", + "Washington 1358 1225\n" + ] + } + ], + "source": [ + "df['Effective To Date'] = pd.to_datetime(df['Effective To Date'])\n", + "df['Month'] = df['Effective To Date'].dt.month\n", + "policy_counts = df.groupby(['State', 'Month'])['Number of Policies'].sum().reset_index()\n", + "policy_pivot = policy_counts.pivot_table(index='State', columns='Month', values='Number of Policies', aggfunc='sum')\n", + "print(policy_pivot)" + ] }, { "cell_type": "markdown", @@ -950,6 +1002,93 @@ "- *Finally, you will create a new DataFrame that contains the number of policies sold by month for each of the top 3 states.*" ] }, + { + "cell_type": "code", + "execution_count": 69, + "id": "35a38832-f61b-4efd-a31d-549dc0123a3f", + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
Month12
State
Arizona30522864
California56734929
Oregon46973969
\n", + "
" + ], + "text/plain": [ + "Month 1 2\n", + "State \n", + "Arizona 3052 2864\n", + "California 5673 4929\n", + "Oregon 4697 3969" + ] + }, + "execution_count": 69, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "df['Effective To Date'] = pd.to_datetime(df['Effective To Date'], errors='coerce')\n", + "\n", + "df['Month'] = df['Effective To Date'].dt.month\n", + "\n", + "policy_counts_by_state_month = df.groupby(['State', 'Month'])['Number of Policies'].sum().reset_index()\n", + "\n", + "total_policies_per_state = df.groupby('State')['Number of Policies'].sum().sort_values(ascending=False)\n", + "\n", + "top_3_states = total_policies_per_state.head(3).index\n", + "\n", + "top_3_state_monthly_policies = policy_counts_by_state_month[policy_counts_by_state_month['State'].isin(top_3_states)]\n", + "\n", + "top_3_state_monthly_policies_pivot = top_3_state_monthly_policies.pivot_table(index='State', columns='Month', values='Number of Policies', aggfunc='sum')\n", + "\n", + "top_3_state_monthly_policies_pivot\n" + ] + }, { "cell_type": "markdown", "id": "ba975b8a-a2cf-4fbf-9f59-ebc381767009", @@ -974,14 +1113,35 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 73, "id": "449513f4-0459-46a0-a18d-9398d974c9ad", "metadata": { "id": "449513f4-0459-46a0-a18d-9398d974c9ad" }, - "outputs": [], + "outputs": [ + { + "data": { + "text/plain": [ + "Sales Channel\n", + "Agent 0.190746\n", + "Branch 0.113787\n", + "Call Center 0.109786\n", + "Web 0.117141\n", + "Name: Response, dtype: float64" + ] + }, + "execution_count": 73, + "metadata": {}, + "output_type": "execute_result" + } + ], "source": [ - "# your code goes here" + "melted_df = df.melt(id_vars=['Customer'], value_vars=['Sales Channel'], var_name='Channel', value_name='Marketing Channel')\n", + "\n", + "response_df = df[df['Response'] == 'Yes']\n", + "\n", + "by_channel = response_df.groupby('Sales Channel')['Response'].count() / df.groupby('Sales Channel')['Response'].count()\n", + "by_channel" ] } ],