From d0ddc6e9c581ae1c685b8d8b86ac019707605784 Mon Sep 17 00:00:00 2001 From: Gerardo Date: Thu, 14 Nov 2024 00:13:42 +0100 Subject: [PATCH] =?UTF-8?q?Soluci=C3=B3n=20Lab2=2013/11/24?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- __pycache__/functions.cpython-312.pyc | Bin 0 -> 1899 bytes functions.py | 90 +++ lab-dw-aggregating.ipynb | 756 +++++++++++++++++++++----- 3 files changed, 696 insertions(+), 150 deletions(-) create mode 100644 __pycache__/functions.cpython-312.pyc create mode 100644 functions.py diff --git a/__pycache__/functions.cpython-312.pyc b/__pycache__/functions.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..851bdc3a39f124672a0830c574c86c4f15736ad0 GIT binary patch literal 1899 zcmaJ>%TF6e7@vLl1?K$_kfsKDu&G>$(yCIDHi8m0ghv`d5-D-(44$=FciFXO*Ek>} znW|EHs9I1DI7+K<3<9b76M97I#nh-+(@UkEdUK(da;W;vn#CzXoz;Hx?f2eqe)Ihl z3ang%pcs0DQ4DI>9)Y61v3{BnJ0q{f-R;jsx+lW zWG0d1Feh&_F+-S0k`_-Mf^{;NxDvZ-snm*nq*6?Y85=hf*+tc`L893=v-2?>&qqlv zO0bQiIF+Jm3fr1#L?KBusp*hfGg7f+)<|%%^;Tw?`JBvglB7+Li1)_21f+m|mAfn6 zz?0>*<@NSrcVW5g?JtZ~8iIw1W6-07+eZGuo9#5{Q;?oDNVvE`w!&Tbvc0t(A+Yqh ze0|?J&Ef9j=o>L1YA7LoGKp#jaDi7|?12y$N37}64fPx@w{yy*gRZ2$6Oo)9Xu7#B zcRAPhTk)%D8LC?qcV)NcRbI&jFd+)@Xu>(W3;sp`|5=DBIB|0*;+>&9+=&okLOdcc zZ^G2Gi-yHS%VvsYV_S{*DDN!uP}RVTDpM#nQYw>l)u8QM0cKRoFfUazI!=IrZst@P zktp01M0t0ZU|2j20FUyHuug3;L(L^MZfmx##!r*d*Ffii*ZLaBDQNs0GzX8=gQQkU z>xXH3KW&$j+47^MUwU$-hq2dGc&;42x8Hm3=bjt~;p4*CZ(b)u@r`1xc$v$>08H8Ql>}9bsN{51CtC z11mp2Y<^+SEh6w3Fbh!n4Dxjo;@=IpksP@23ePQ=ISspw?o{ER_@6woAK7ORC&9e* z43Vb1{0x162Fn_(BF!~0G0$z76{f)gA7Wign$*xR^Nvz&#WXMr+{e}e3=SI%JTvMN zCKwR*I#ftQK$y&(Xg_Dg(Fb(EVeXI8LCz?%9Fa`Ix=#7UpW~3#35%D;{roB#NYXN6e<{{L~K$RXCa60Tt28Ns+QC`7v zqjM|bOIXjU7Hfv_Xj*a2&=MNfK@hA7lA6IkEnX$S-UWU;XmK1Oz@UbPI0KaqgAJdT z)(apG?5iHs)?4Z9uQc~n+WIQNF3tygDy=<#w))!wg{f)>YHeSCZ=>O`WpKY`aBHgE zGF%udRCJVPKHIT30}q(4>dLwCMX|WGjPj! z<<8A@{$m?C3g?=i_^vIQB&(|*&=D}=^VjMGQk4WjI1(kH=_ue4nvPqh1fj`s`xoUI Bu>t@9 literal 0 HcmV?d00001 diff --git a/functions.py b/functions.py new file mode 100644 index 0000000..960cbde --- /dev/null +++ b/functions.py @@ -0,0 +1,90 @@ +def merge_df(df1,df2): + import pandas as pd + df=df1.merge(df2,how="outer") + return df + +def improve_title_columns(df): + import pandas as pd + df=df.rename(columns={df.columns[n]:df.columns[n].strip().replace(" ","_").lower() for n in range(len(df.columns))}) + #rename column st to state + df=df.rename(columns={"st":"state"}) + df.columns + return df + +def data_standarization(df): + import pandas as pd + + state_values={ #creates a dictionary with the values and corrections + "Oregon":"Oregon", + "California":"California", + "Cali":"California", + "Arizona":"Arizona", + "AZ":"Arizona", + "Washington":"Washington", + "WA":"Washington", + "Nevada":"Nevada", + } + df["State"]=df["State"].replace(state_values) #replace with correct values + """ + gender_values={ #creates a dictionary with the values and corrections + "F":"F", + "Femal":"F", + "female":"F", + "Male":"M"} + + state_values={ #creates a dictionary with the values and corrections + "Oregon":"Oregon", + "California":"California", + "Cali":"California", + "Arizona":"Arizona", + "AZ":"Arizona", + "Washington":"Washington", + "WA":"Washington", + "Nevada":"Nevada", + } + + education_values={ #creates a dictionary with the values and corrections + "Bachelors":"Bachelor" + } + + vehicle_class_values={ #creates a dictionary with the values and corrections + "Sports Car":"Luxury", + "Luxury SUV":"Luxury", + "Luxury Car":"Luxury", + } + + df["gender"]=df["gender"].replace(gender_values) + df["state"]=df["state"].replace(state_values) #replace with correct values + df["state"]=df["state"].replace(state_values) #replace with correct values + df["vehicle_class"]=df["vehicle_class"].replace(vehicle_class_values) #replace with correct values + + #Replace % caracter with none in customer_lifetime_value + df["customer_lifetime_value"]=df["customer_lifetime_value"].str.replace("%","") + """ + #Cleaning NaN and null values} + + #First cleaning + + datos_iniciales=df.shape[0] #valor del total de filas antes de limpieza + df=df.dropna(how="all") + df.fillna(0, inplace=True) + datos_finales=df.shape[0] + ''' + #complains open format manage + list_complains_types=df["number_of_open_complaints"].unique() + list_complains_types=list(list_complains_types) + list_complains=[list_complains_types[n][2].split("/") for n in range(len(list_complains_types))] + dict_complains=dict(zip(list_complains_types,list_complains)) + + df["number_of_open_complaints"]=df["number_of_open_complaints"].replace(dict_complains) + + #changing data type + df["vehicle_class"]=df["vehicle_class"].astype("object") + df["customer_lifetime_value"]=df["customer_lifetime_value"].astype("float64") + df["number_of_open_complaints"]=df["number_of_open_complaints"].astype(int) + ''' + print(f"Data before cleaning: {datos_iniciales}\n Data after cleaning: {datos_finales}") + + + + return df \ No newline at end of file diff --git a/lab-dw-aggregating.ipynb b/lab-dw-aggregating.ipynb index fff3ae5..925fe42 100644 --- a/lab-dw-aggregating.ipynb +++ b/lab-dw-aggregating.ipynb @@ -1,161 +1,617 @@ { - "cells": [ - { - "cell_type": "markdown", - "id": "31969215-2a90-4d8b-ac36-646a7ae13744", - "metadata": { - "id": "31969215-2a90-4d8b-ac36-646a7ae13744" - }, - "source": [ - "# Lab | Data Aggregation and Filtering" - ] - }, - { - "cell_type": "markdown", - "id": "a8f08a52-bec0-439b-99cc-11d3809d8b5d", - "metadata": { - "id": "a8f08a52-bec0-439b-99cc-11d3809d8b5d" - }, - "source": [ - "In this challenge, we will continue to work with customer data from an insurance company. We will use the dataset called marketing_customer_analysis.csv, which can be found at the following link:\n", - "\n", - "https://raw.githubusercontent.com/data-bootcamp-v4/data/main/marketing_customer_analysis.csv\n", - "\n", - "This dataset contains information such as customer demographics, policy details, vehicle information, and the customer's response to the last marketing campaign. Our goal is to explore and analyze this data by first performing data cleaning, formatting, and structuring." - ] - }, - { - "cell_type": "markdown", - "id": "9c98ddc5-b041-4c94-ada1-4dfee5c98e50", - "metadata": { - "id": "9c98ddc5-b041-4c94-ada1-4dfee5c98e50" - }, - "source": [ - "1. Create a new DataFrame that only includes customers who have a total_claim_amount greater than $1,000 and have a response of \"Yes\" to the last marketing campaign." - ] - }, - { - "cell_type": "markdown", - "id": "b9be383e-5165-436e-80c8-57d4c757c8c3", - "metadata": { - "id": "b9be383e-5165-436e-80c8-57d4c757c8c3" - }, - "source": [ - "2. Using the original Dataframe, analyze the average total_claim_amount by each policy type and gender for customers who have responded \"Yes\" to the last marketing campaign. Write your conclusions." - ] - }, - { - "cell_type": "markdown", - "id": "7050f4ac-53c5-4193-a3c0-8699b87196f0", - "metadata": { - "id": "7050f4ac-53c5-4193-a3c0-8699b87196f0" - }, - "source": [ - "3. Analyze the total number of customers who have policies in each state, and then filter the results to only include states where there are more than 500 customers." - ] - }, - { - "cell_type": "markdown", - "id": "b60a4443-a1a7-4bbf-b78e-9ccdf9895e0d", - "metadata": { - "id": "b60a4443-a1a7-4bbf-b78e-9ccdf9895e0d" - }, - "source": [ - "4. Find the maximum, minimum, and median customer lifetime value by education level and gender. Write your conclusions." - ] - }, - { - "cell_type": "markdown", - "id": "b42999f9-311f-481e-ae63-40a5577072c5", - "metadata": { - "id": "b42999f9-311f-481e-ae63-40a5577072c5" - }, - "source": [ - "## Bonus" - ] - }, + "cells": [ + { + "cell_type": "markdown", + "id": "31969215-2a90-4d8b-ac36-646a7ae13744", + "metadata": { + "id": "31969215-2a90-4d8b-ac36-646a7ae13744" + }, + "source": [ + "# Lab | Data Aggregation and Filtering" + ] + }, + { + "cell_type": "markdown", + "id": "a8f08a52-bec0-439b-99cc-11d3809d8b5d", + "metadata": { + "id": "a8f08a52-bec0-439b-99cc-11d3809d8b5d" + }, + "source": [ + "In this challenge, we will continue to work with customer data from an insurance company. We will use the dataset called marketing_customer_analysis.csv, which can be found at the following link:\n", + "\n", + "https://raw.githubusercontent.com/data-bootcamp-v4/data/main/marketing_customer_analysis.csv\n", + "\n", + "This dataset contains information such as customer demographics, policy details, vehicle information, and the customer's response to the last marketing campaign. Our goal is to explore and analyze this data by first performing data cleaning, formatting, and structuring." + ] + }, + { + "cell_type": "code", + "execution_count": 1, + "id": "449513f4-0459-46a0-a18d-9398d974c9ad", + "metadata": { + "id": "449513f4-0459-46a0-a18d-9398d974c9ad" + }, + "outputs": [], + "source": [ + "import pandas as pd\n", + "import functions as f\n", + "url=\"https://raw.githubusercontent.com/data-bootcamp-v4/data/main/marketing_customer_analysis.csv\"\n", + "df=pd.read_csv(url)" + ] + }, + { + "cell_type": "code", + "execution_count": 2, + "id": "3543be10", + "metadata": {}, + "outputs": [ { - "cell_type": "markdown", - "id": "81ff02c5-6584-4f21-a358-b918697c6432", - "metadata": { - "id": "81ff02c5-6584-4f21-a358-b918697c6432" - }, - "source": [ - "5. The marketing team wants to analyze the number of policies sold by state and month. Present the data in a table where the months are arranged as columns and the states are arranged as rows." - ] - }, + "name": "stdout", + "output_type": "stream", + "text": [ + "\n", + "RangeIndex: 10910 entries, 0 to 10909\n", + "Data columns (total 26 columns):\n", + " # Column Non-Null Count Dtype \n", + "--- ------ -------------- ----- \n", + " 0 Unnamed: 0 10910 non-null int64 \n", + " 1 Customer 10910 non-null object \n", + " 2 State 10279 non-null object \n", + " 3 Customer Lifetime Value 10910 non-null float64\n", + " 4 Response 10279 non-null object \n", + " 5 Coverage 10910 non-null object \n", + " 6 Education 10910 non-null object \n", + " 7 Effective To Date 10910 non-null object \n", + " 8 EmploymentStatus 10910 non-null object \n", + " 9 Gender 10910 non-null object \n", + " 10 Income 10910 non-null int64 \n", + " 11 Location Code 10910 non-null object \n", + " 12 Marital Status 10910 non-null object \n", + " 13 Monthly Premium Auto 10910 non-null int64 \n", + " 14 Months Since Last Claim 10277 non-null float64\n", + " 15 Months Since Policy Inception 10910 non-null int64 \n", + " 16 Number of Open Complaints 10277 non-null float64\n", + " 17 Number of Policies 10910 non-null int64 \n", + " 18 Policy Type 10910 non-null object \n", + " 19 Policy 10910 non-null object \n", + " 20 Renew Offer Type 10910 non-null object \n", + " 21 Sales Channel 10910 non-null object \n", + " 22 Total Claim Amount 10910 non-null float64\n", + " 23 Vehicle Class 10288 non-null object \n", + " 24 Vehicle Size 10288 non-null object \n", + " 25 Vehicle Type 5428 non-null object \n", + "dtypes: float64(4), int64(5), object(17)\n", + "memory usage: 2.2+ MB\n" + ] + } + ], + "source": [ + "df.info()" + ] + }, + { + "cell_type": "code", + "execution_count": 3, + "id": "1d531765", + "metadata": {}, + "outputs": [ { - "cell_type": "markdown", - "id": "b6aec097-c633-4017-a125-e77a97259cda", - "metadata": { - "id": "b6aec097-c633-4017-a125-e77a97259cda" - }, - "source": [ - "6. Display a new DataFrame that contains the number of policies sold by month, by state, for the top 3 states with the highest number of policies sold.\n", - "\n", - "*Hint:*\n", - "- *To accomplish this, you will first need to group the data by state and month, then count the number of policies sold for each group. Afterwards, you will need to sort the data by the count of policies sold in descending order.*\n", - "- *Next, you will select the top 3 states with the highest number of policies sold.*\n", - "- *Finally, you will create a new DataFrame that contains the number of policies sold by month for each of the top 3 states.*" - ] - }, + "name": "stdout", + "output_type": "stream", + "text": [ + "Data before cleaning: 10910\n", + " Data after cleaning: 10910\n", + "\n", + "RangeIndex: 10910 entries, 0 to 10909\n", + "Data columns (total 26 columns):\n", + " # Column Non-Null Count Dtype \n", + "--- ------ -------------- ----- \n", + " 0 Unnamed: 0 10910 non-null int64 \n", + " 1 Customer 10910 non-null object \n", + " 2 State 10910 non-null object \n", + " 3 Customer Lifetime Value 10910 non-null float64\n", + " 4 Response 10910 non-null object \n", + " 5 Coverage 10910 non-null object \n", + " 6 Education 10910 non-null object \n", + " 7 Effective To Date 10910 non-null object \n", + " 8 EmploymentStatus 10910 non-null object \n", + " 9 Gender 10910 non-null object \n", + " 10 Income 10910 non-null int64 \n", + " 11 Location Code 10910 non-null object \n", + " 12 Marital Status 10910 non-null object \n", + " 13 Monthly Premium Auto 10910 non-null int64 \n", + " 14 Months Since Last Claim 10910 non-null float64\n", + " 15 Months Since Policy Inception 10910 non-null int64 \n", + " 16 Number of Open Complaints 10910 non-null float64\n", + " 17 Number of Policies 10910 non-null int64 \n", + " 18 Policy Type 10910 non-null object \n", + " 19 Policy 10910 non-null object \n", + " 20 Renew Offer Type 10910 non-null object \n", + " 21 Sales Channel 10910 non-null object \n", + " 22 Total Claim Amount 10910 non-null float64\n", + " 23 Vehicle Class 10910 non-null object \n", + " 24 Vehicle Size 10910 non-null object \n", + " 25 Vehicle Type 10910 non-null object \n", + "dtypes: float64(4), int64(5), object(17)\n", + "memory usage: 2.2+ MB\n" + ] + } + ], + "source": [ + "df=f.data_standarization(df)\n", + "df.info()" + ] + }, + { + "cell_type": "markdown", + "id": "9c98ddc5-b041-4c94-ada1-4dfee5c98e50", + "metadata": { + "id": "9c98ddc5-b041-4c94-ada1-4dfee5c98e50" + }, + "source": [ + "1. Create a new DataFrame that only includes customers who have a total_claim_amount greater than $1,000 and have a response of \"Yes\" to the last marketing campaign." + ] + }, + { + "cell_type": "code", + "execution_count": 4, + "id": "df159dfb", + "metadata": {}, + "outputs": [], + "source": [ + "df1=df[(df[\"Total Claim Amount\"]>1000) & (df[\"Response\"]==\"Yes\")]\n" + ] + }, + { + "cell_type": "markdown", + "id": "b9be383e-5165-436e-80c8-57d4c757c8c3", + "metadata": { + "id": "b9be383e-5165-436e-80c8-57d4c757c8c3" + }, + "source": [ + "2. Using the original Dataframe, analyze the average total_claim_amount by each policy type and gender for customers who have responded \"Yes\" to the last marketing campaign. Write your conclusions." + ] + }, + { + "cell_type": "code", + "execution_count": 20, + "id": "754d9417", + "metadata": {}, + "outputs": [ { - "cell_type": "markdown", - "id": "ba975b8a-a2cf-4fbf-9f59-ebc381767009", - "metadata": { - "id": "ba975b8a-a2cf-4fbf-9f59-ebc381767009" - }, - "source": [ - "7. The marketing team wants to analyze the effect of different marketing channels on the customer response rate.\n", - "\n", - "Hint: You can use melt to unpivot the data and create a table that shows the customer response rate (those who responded \"Yes\") by marketing channel." + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
Total Claim Amount
GenderPolicy Type
FCorporate Auto397.799287
Personal Auto413.239658
Special Auto458.139623
MCorporate Auto462.223565
Personal Auto459.919476
Special Auto420.355202
\n", + "
" + ], + "text/plain": [ + " Total Claim Amount\n", + "Gender Policy Type \n", + "F Corporate Auto 397.799287\n", + " Personal Auto 413.239658\n", + " Special Auto 458.139623\n", + "M Corporate Auto 462.223565\n", + " Personal Auto 459.919476\n", + " Special Auto 420.355202" ] - }, + }, + "execution_count": 20, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "filtered_df = df[df[\"Response\"] == \"Yes\"]#Filter data with response=yes\n", + "df2=df.groupby([\"Gender\",\"Policy Type\"]).agg({\"Total Claim Amount\":\"mean\"})#group by gender and policy type to get total claim mean\n", + "df2" + ] + }, + { + "cell_type": "markdown", + "id": "7050f4ac-53c5-4193-a3c0-8699b87196f0", + "metadata": { + "id": "7050f4ac-53c5-4193-a3c0-8699b87196f0" + }, + "source": [ + "3. Analyze the total number of customers who have policies in each state, and then filter the results to only include states where there are more than 500 customers." + ] + }, + { + "cell_type": "code", + "execution_count": 29, + "id": "1f74d550", + "metadata": {}, + "outputs": [ { - "cell_type": "markdown", - "id": "e4378d94-48fb-4850-a802-b1bc8f427b2d", - "metadata": { - "id": "e4378d94-48fb-4850-a802-b1bc8f427b2d" - }, - "source": [ - "External Resources for Data Filtering: https://towardsdatascience.com/filtering-data-frames-in-pandas-b570b1f834b9" + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
Customer
StatePolicy Type
ArizonaPersonal Auto1469
CaliforniaCorporate Auto835
Personal Auto2594
NevadaPersonal Auto739
OregonCorporate Auto592
Personal Auto2180
WashingtonPersonal Auto682
\n", + "
" + ], + "text/plain": [ + " Customer\n", + "State Policy Type \n", + "Arizona Personal Auto 1469\n", + "California Corporate Auto 835\n", + " Personal Auto 2594\n", + "Nevada Personal Auto 739\n", + "Oregon Corporate Auto 592\n", + " Personal Auto 2180\n", + "Washington Personal Auto 682" ] - }, + }, + "execution_count": 29, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "filtered_df = df[df[\"State\"] != 0]#Filter data to exclude no state value\n", + "df3=filtered_df.groupby([\"State\",\"Policy Type\"]).agg({\"Customer\":\"count\"})\n", + "df3=df3[df3[\"Customer\"]>500]\n", + "df3" + ] + }, + { + "cell_type": "markdown", + "id": "b60a4443-a1a7-4bbf-b78e-9ccdf9895e0d", + "metadata": { + "id": "b60a4443-a1a7-4bbf-b78e-9ccdf9895e0d" + }, + "source": [ + "4. Find the maximum, minimum, and median customer lifetime value by education level and gender. Write your conclusions." + ] + }, + { + "cell_type": "code", + "execution_count": 32, + "id": "25641b4d", + "metadata": {}, + "outputs": [ { - "cell_type": "code", - "execution_count": null, - "id": "449513f4-0459-46a0-a18d-9398d974c9ad", - "metadata": { - "id": "449513f4-0459-46a0-a18d-9398d974c9ad" - }, - "outputs": [], - "source": [ - "# your code goes here" + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
Customer Lifetime Value
maxminmedian
EducationGender
BachelorF73225.961904.005640.51
M67907.271898.015548.03
CollegeF61850.191898.685623.61
M61134.681918.126005.85
DoctorF44856.112395.575332.46
M32677.342267.605577.67
High School or BelowF55277.452144.926039.55
M83325.381940.986286.73
MasterF51016.072417.785729.86
M50568.262272.315579.10
\n", + "
" + ], + "text/plain": [ + " Customer Lifetime Value \n", + " max min median\n", + "Education Gender \n", + "Bachelor F 73225.96 1904.00 5640.51\n", + " M 67907.27 1898.01 5548.03\n", + "College F 61850.19 1898.68 5623.61\n", + " M 61134.68 1918.12 6005.85\n", + "Doctor F 44856.11 2395.57 5332.46\n", + " M 32677.34 2267.60 5577.67\n", + "High School or Below F 55277.45 2144.92 6039.55\n", + " M 83325.38 1940.98 6286.73\n", + "Master F 51016.07 2417.78 5729.86\n", + " M 50568.26 2272.31 5579.10" ] + }, + "execution_count": 32, + "metadata": {}, + "output_type": "execute_result" } - ], - "metadata": { - "kernelspec": { - "display_name": "Python 3 (ipykernel)", - "language": "python", - "name": "python3" - }, - "language_info": { - "codemirror_mode": { - "name": "ipython", - "version": 3 - }, - "file_extension": ".py", - "mimetype": "text/x-python", - "name": "python", - "nbconvert_exporter": "python", - "pygments_lexer": "ipython3", - "version": "3.9.13" - }, - "colab": { - "provenance": [] - } + ], + "source": [ + "df4=df.groupby([\"Education\",\"Gender\"]).agg({\"Customer Lifetime Value\":[\"max\",\"min\",\"median\"]})\n", + "df4.round(2)" + ] + }, + { + "cell_type": "markdown", + "id": "b42999f9-311f-481e-ae63-40a5577072c5", + "metadata": { + "id": "b42999f9-311f-481e-ae63-40a5577072c5" + }, + "source": [ + "## Bonus" + ] + }, + { + "cell_type": "markdown", + "id": "81ff02c5-6584-4f21-a358-b918697c6432", + "metadata": { + "id": "81ff02c5-6584-4f21-a358-b918697c6432" + }, + "source": [ + "5. The marketing team wants to analyze the number of policies sold by state and month. Present the data in a table where the months are arranged as columns and the states are arranged as rows." + ] + }, + { + "cell_type": "markdown", + "id": "b6aec097-c633-4017-a125-e77a97259cda", + "metadata": { + "id": "b6aec097-c633-4017-a125-e77a97259cda" + }, + "source": [ + "6. Display a new DataFrame that contains the number of policies sold by month, by state, for the top 3 states with the highest number of policies sold.\n", + "\n", + "*Hint:*\n", + "- *To accomplish this, you will first need to group the data by state and month, then count the number of policies sold for each group. Afterwards, you will need to sort the data by the count of policies sold in descending order.*\n", + "- *Next, you will select the top 3 states with the highest number of policies sold.*\n", + "- *Finally, you will create a new DataFrame that contains the number of policies sold by month for each of the top 3 states.*" + ] + }, + { + "cell_type": "markdown", + "id": "ba975b8a-a2cf-4fbf-9f59-ebc381767009", + "metadata": { + "id": "ba975b8a-a2cf-4fbf-9f59-ebc381767009" + }, + "source": [ + "7. The marketing team wants to analyze the effect of different marketing channels on the customer response rate.\n", + "\n", + "Hint: You can use melt to unpivot the data and create a table that shows the customer response rate (those who responded \"Yes\") by marketing channel." + ] + }, + { + "cell_type": "markdown", + "id": "e4378d94-48fb-4850-a802-b1bc8f427b2d", + "metadata": { + "id": "e4378d94-48fb-4850-a802-b1bc8f427b2d" + }, + "source": [ + "External Resources for Data Filtering: https://towardsdatascience.com/filtering-data-frames-in-pandas-b570b1f834b9" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "c61f09d1", + "metadata": {}, + "outputs": [], + "source": [] + } + ], + "metadata": { + "colab": { + "provenance": [] + }, + "kernelspec": { + "display_name": "base", + "language": "python", + "name": "python3" }, - "nbformat": 4, - "nbformat_minor": 5 -} \ No newline at end of file + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.12.4" + } + }, + "nbformat": 4, + "nbformat_minor": 5 +}