-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathmarketing_campaign_exploring_and_cleaning.sql
181 lines (165 loc) · 4.55 KB
/
marketing_campaign_exploring_and_cleaning.sql
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
/*
This is a data exploration and cleaning project using PostgreSQL
The data used was the Marketing Campaign from Kaggle (https://www.kaggle.com/datasets/rodsaldanha/arketing-campaign/data)
Acknowledging O. Parr-Rud. Business Analytics Using SAS Enterprise Guide and SAS Enterprise Miner. SAS Institute, 2014.
*/
-- Creat the table for import
CREATE TABLE marketing_campaign (
customer_id INTEGER,
birth_year INTEGER,
education TEXT,
marital_status TEXT,
income INTEGER,
kidhome INTEGER,
teenhome INTEGER,
start_date TIMESTAMP WITHOUT TIME ZONE,
days_last_purchase INTEGER,
spent_wine INTEGER,
spent_fruits INTEGER,
spent_meat INTEGER,
spent_fish INTEGER,
spent_sweets INTEGER,
spent_gold INTEGER,
deal_purchases INTEGER,
web_purchases INTEGER,
catalog_purchases INTEGER,
store_purchases INTEGER,
web_visits INTEGER,
campaign3 INTEGER,
campaign4 INTEGER,
campaign5 INTEGER,
campaign1 INTEGER,
campaign2 INTEGER,
complain INTEGER,
zcost INTEGER,
zrevenue INTEGER,
response INTEGER
);
-- Exploration and Cleaning the data
------------------------------------------------------------------
-- Identify if there is any null values in all colums
SELECT
income
FROM marketing_campaign
WHERE income IS NULL
-- Only the income column has 24 null values
;
-- Delete null values in the income cell
DELETE FROM marketing_campaign
WHERE income IS NULL
-- 24 rows deleted
;
-- birth_year outliers
SELECT
birth_year,
(2014 - birth_year) AS age,
COUNT(*)
FROM marketing_campaign
WHERE (2014 - birth_year) > 100
GROUP BY birth_year, age
/* With this query we have identified that the dataset has people
that was born in the year 1900 and before so we need to exclude
them to have an accurate analysis (3 outliers).
This was done because the dataset was published in the year 2014
By SAS Institute, 2014.
*/;
-- Delete values that are more than 100 years old
DELETE FROM marketing_campaign
WHERE (2014 - birth_year) > 100
-- 3 rows deleted
;
-- Education ef clients
SELECT
education,
COUNT(*)
FROM marketing_campaign
GROUP BY education;
-- Marital Satuts of clients
SELECT
marital_status,
COUNT(*)
FROM marketing_campaign
GROUP BY marital_status
/*
The table has to types of value that does not match the type of value
we could keep only 2 values to standarize the column:
'Widow', 'YOLO', 'Alone', 'Absurd', 'Divorced' to SINGLE
'Together' to MARRIED
*/
;
-- Update te table to standardize the marital_status column
UPDATE marketing_campaign
SET marital_status =
CASE
WHEN marital_status = 'Together' THEN 'Married'
WHEN marital_status IN ('Widow', 'YOLO', 'Alone', 'Absurd', 'Divorced') THEN 'Single'
ELSE marital_status
END;
-- Anual Income average, maximum, minimun and quartiles
SELECT
ROUND(AVG(income)) AS avg_income,
MIN(income) AS min_income,
PERCENTILE_CONT(0.25) WITHIN GROUP (ORDER BY income) AS q1_income,
PERCENTILE_CONT(0.5) WITHIN GROUP (ORDER BY income) AS median_income,
PERCENTILE_CONT(0.75) WITHIN GROUP (ORDER BY income) AS q3_income,
MAX(income) AS max_income
FROM marketing_campaign
WHERE income IS NOT NULL;
-- Identify outliers in the income column
WITH quartiles AS (
SELECT
PERCENTILE_CONT(0.25) WITHIN GROUP (ORDER BY income) AS Q1,
PERCENTILE_CONT(0.75) WITHIN GROUP (ORDER BY income) AS Q3
FROM marketing_campaign
),
iqr_calculation AS (
SELECT
Q1,
Q3,
(Q3 - Q1) AS IQR
FROM quartiles
)
SELECT
mc.*,
CASE
WHEN mc.income < (q.Q1 - 1.5 * q.IQR) THEN 'Lower Outlier'
WHEN mc.income > (q.Q3 + 1.5 * q.IQR) THEN 'Upper Outlier'
ELSE 'Normal'
END AS outlier_status
FROM marketing_campaign mc
CROSS JOIN iqr_calculation q
WHERE
mc.income < (q.Q1 - 1.5 * q.IQR)
OR mc.income > (q.Q3 + 1.5 * q.IQR)
/*
With this query we have identified that we have 8 outliers in the table
that we need to delete from the tbale
*/
-- Date of customers enrolment max and min
SELECT
MIN(DATE(start_date)) AS min_date,
MAX(DATE(start_date)) AS max_date
FROM marketing_campaign
-- From July 30 2012 to June 29 2014
;
-- Remove income outliers
WITH quartiles AS (
SELECT
PERCENTILE_CONT(0.25) WITHIN GROUP (ORDER BY income) AS Q1,
PERCENTILE_CONT(0.75) WITHIN GROUP (ORDER BY income) AS Q3
FROM marketing_campaign
),
iqr_calculation AS (
SELECT
Q1,
Q3,
(Q3 - Q1) AS IQR
FROM
quartiles
)
DELETE FROM marketing_campaign
USING iqr_calculation
WHERE income < (iqr_calculation.Q1 - 1.5 * iqr_calculation.IQR)
OR income > (iqr_calculation.Q3 + 1.5 * iqr_calculation.IQR)
-- 8 values deleted
;