-
Notifications
You must be signed in to change notification settings - Fork 0
/
crime analysis.R
319 lines (250 loc) · 11 KB
/
crime analysis.R
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
library(dplyr)
library(lubridate)
library(skimr)
library(forecast)
library(tseries)
library(tidyr)
# Data tiding and parsing
#Load the csv files and merge them
arrest_data1 <- read.csv("/Users/wangyuan/Desktop/NYPD_Arrest1.csv")
arrest_data2 <- read.csv("/Users/wangyuan/Desktop/NYPD_Arrest2.csv")
arrest_data <- rbind(arrest_data1, arrest_data2)
# Checking the distribution of data types and missing values
str(arrest_data)
skim(arrest_data)
# Drop some unnecessary columns
arrest_data_cleaned <- arrest_data %>% select(-c(PD_CD, PD_DESC, KY_CD, LAW_CODE, JURISDICTION_CODE, ARREST_PRECINCT))
# Convert ARREST_DATE to date format and create a new list of crime months
arrest_data_cleaned$ARREST_DATE <- mdy(arrest_data_cleaned$ARREST_DATE)
arrest_data_cleaned$ARREST_MONTH <- month(arrest_data_cleaned$ARREST_DATE)
# Create a new column to store the month and year information
arrest_data_cleaned$ARREST_YM <- floor_date(arrest_data_cleaned$ARREST_DATE, "month")
# Group according to ARREST_YM and calculate the number of crimes per month
monthly_crime_counts <- arrest_data_cleaned %>%
group_by(ARREST_YM) %>%
summarise(MONTHLY_CRIME_COUNT = n())
arrest_data <- arrest_data_cleaned %>%
left_join(monthly_crime_counts, by = "ARREST_YM")
# For LAW_CAT_CD
table(arrest_data$LAW_CAT_CD)
# Missing data are categorical variables, so they are transformed to "not provided"
arrest_data <- arrest_data %>%
mutate(LAW_CAT_CD = case_when(
LAW_CAT_CD == "F" ~ "felony",
LAW_CAT_CD == "M" ~ "misdemeanor",
LAW_CAT_CD == "V" ~ "violation",
LAW_CAT_CD == 9 | LAW_CAT_CD == "I" ~ "other",
LAW_CAT_CD == "(null)" | LAW_CAT_CD == "" ~ "not provided",
TRUE ~ LAW_CAT_CD
))
# For ARREST_BORO
arrest_data <- arrest_data %>%
mutate(ARREST_BORO = case_when(
ARREST_BORO == "B" ~ "Bronx",
ARREST_BORO == "S" ~ "Staten Island",
ARREST_BORO == "K" ~ "Brooklyn",
ARREST_BORO == "M" ~ "Manhattan",
ARREST_BORO == "Q" ~ "Queens",
TRUE ~ ARREST_BORO
))
table(arrest_data$ARREST_BORO)
# For other variables
arrest_data <- arrest_data %>%
mutate(PERP_SEX = case_when(
PERP_SEX == "F" ~ "female",
PERP_SEX == "M" ~ "male",
TRUE ~ PERP_SEX
),
PERP_RACE = tolower(PERP_RACE),
OFNS_DESC = tolower(OFNS_DESC)
)
cleaned_dataset <- arrest_data %>%
arrange(ARREST_DATE) %>%
filter(year(ARREST_DATE) >= 2018 & year(ARREST_DATE) <= 2023)
# Export the cleaned_dataset
write.csv(cleaned_dataset, "/Users/wangyuan/Desktop/cleaned_dataset.csv", row.names = FALSE)
# Filter the records from 2018 to 2023
str(cleaned_dataset)
cleaned_dataset <- cleaned_dataset %>%
arrange(ARREST_DATE) %>%
filter(year(ARREST_DATE) >= 2018 & year(ARREST_DATE) <= 2023)
write.csv(cleaned_dataset, "/Users/wangyuan/Desktop/cleaned_dataset.csv", row.names = FALSE)
# Research question 1: Is there a correlation between the total number of crimes and the seasons in New York City?
# Create a time series about number of crimes in New York
total_crime_ts <- cleaned_dataset %>%
group_by(ARREST_YM) %>%
summarise(Total_Crimes = n()) %>%
arrange(ARREST_YM) %>%
complete(ARREST_YM = seq.Date(min(ARREST_YM), max(ARREST_YM), by="month")) %>%
replace_na(list(Total_Crimes = 0))
# Convert the data into time series
crime_counts_vector <- total_crime_ts$Total_Crimes # specify the number of crimes
start_year <- year(min(total_crime_ts$ARREST_YM)) # specify the start year
start_month <- month(min(total_crime_ts$ARREST_YM)) # specify the start month
total_crime_ts <- ts(crime_counts_vector,
start = c(start_year, start_month),
frequency = 12) # specify that there are 12 months in one year
# Exploratory data analysis on number of crimes in New York
# Create line charts to observe the pattern
autoplot(total_crime_ts)
ggseasonplot(total_crime_ts)
total_crime_ts%>%
stl(s.window = 'periodic')%>%
autoplot()
# Build model and conduct Ljung-Box to test whether the time series fully captures time-dependent pattern
# AMRIMA Model: white noise
arima_model_total <- auto.arima(total_crime_ts)
arima_model_total # AIC=1257.52 AICc=1258.13 BIC=1266.57
Box.test(arima_model_total$residuals, type = "Ljung-Box") # p-value=0.8517
checkresiduals(arima_model_total) # p-value=0.1499
# ETS AAA Model:need further analysis
ets_aaa_total = ets(total_crime_ts,model = 'AAA')
ets_aaa_total #AIC=1376.670 AICc=1388.004 BIC=1415.374
Box.test(ets_aaa_total$residuals, type = "Ljung-Box") # p-value=0.9002
checkresiduals(ets_aaa_total) # p-value=0.04675
# ETS Auto Model: need further analysis
ets_model_total <- ets(total_crime_ts)
ets_model_total #ANN AIC=1382.483 AICc=1382.836 BIC=1389.313
Box.test(ets_model_total$residuals, type = "Ljung-Box") # p-value=0.4292
checkresiduals(ets_model_total) # p-value=0.03696
library(ggplot2)
# Divide 12 months into 4 seaons
cleaned_dataset <- cleaned_dataset %>%
mutate(Season = case_when(
month(ARREST_DATE) %in% c(12, 1, 2) ~ "Winter",
month(ARREST_DATE) %in% c(3, 4, 5) ~ "Spring",
month(ARREST_DATE) %in% c(6, 7, 8) ~ "Summer",
month(ARREST_DATE) %in% c(9, 10, 11) ~ "Autumn"
))
# Create line chart to observe seasonal changes in the number of crimes
ggplot(cleaned_dataset, aes(x = ARREST_DATE, y = MONTHLY_CRIME_COUNT, group = Season, color = Season)) +
geom_line() +
labs(title = "Seasonal Variation in Crime Rates", x = "Date", y = "Total Crimes") +
theme_minimal()
# Create box chart to compare the number of crimes in four seasons
ggplot(cleaned_dataset, aes(x = Season, y = MONTHLY_CRIME_COUNT, fill = Season)) +
geom_boxplot() +
labs(title = "Crime Count by Season", x = "Season", y = "Total Crimes") +
theme_minimal()
# Summary the crime rates in four seasons
seasonal_summary <- cleaned_dataset %>%
group_by(Season) %>%
summarise(
Average = mean(MONTHLY_CRIME_COUNT),
Median = median(MONTHLY_CRIME_COUNT),
IQR = IQR(MONTHLY_CRIME_COUNT)
)
print(seasonal_summary)
cleaned_dataset_ts <- ts(cleaned_dataset$MONTHLY_CRIME_COUNT, frequency = 12, start = c(year(min(cleaned_dataset$ARREST_DATE)), month(min(cleaned_dataset$ARREST_DATE))))
# Seasonal-Trend decomposition using LOESS
stl_result <- stl(cleaned_dataset_ts, s.window = "periodic")
plot(stl_result)
# Anova Test
anova_result <- aov(MONTHLY_CRIME_COUNT ~ Season, data = cleaned_dataset)
summary(anova_result) # P-value<0.001,significant seasonality at 95% significance level
# Research question 2: Is there a correlation between the number of assault crimes and the seasons in New York City?
# Find the most frwquent crime type in New York:assaults
top_crime_type <- cleaned_dataset %>%
group_by(OFNS_DESC) %>%
summarise(Total = n()) %>%
ungroup() %>%
arrange(desc(Total)) %>%
slice(1) %>%
pull(OFNS_DESC)
crime_counts <- cleaned_dataset %>%
group_by(OFNS_DESC) %>%
summarise(Total = n()) %>%
arrange(desc(Total))
library(ggplot2)
top_ten_crime_counts <- crime_counts %>%
slice(1:10) # Selecting the top 10 crime types
# Plotting
ggplot(top_ten_crime_counts, aes(x = reorder(OFNS_DESC, Total), y = Total, fill = OFNS_DESC == "assault 3 & related offenses")) +
geom_bar(stat = "identity") +
labs(title = "Top 10 Crime Types in NYC",
x = "Offense Description",
y = "Total Incidents") +
scale_fill_manual(values = c("true" = "red", "false" = "grey"),
guide = FALSE) +
theme_minimal() +
theme(axis.text.x = element_text(angle = 45, hjust = 1)) # Rotate x labels for better readability
monthly_topcrime_counts <- cleaned_dataset %>%
filter(OFNS_DESC == top_crime_type) %>%
group_by(ARREST_YM) %>%
summarise(MONTHLY_TOPCRIME_COUNT = n())
cleaned_dataset <- cleaned_dataset %>%
left_join(monthly_topcrime_counts, by = "ARREST_YM")
top_crime_ts <- cleaned_dataset %>%
filter(OFNS_DESC == top_crime_type) %>%
group_by(ARREST_YM) %>%
summarise(Top_Crime_Count = n()) %>%
arrange(ARREST_YM) %>%
complete(ARREST_YM = seq.Date(min(ARREST_YM), max(ARREST_YM), by="month")) %>%
replace_na(list(Top_Crime_Count = 0))
top_crime_type
# Convert the data into time series
crime_counts_vector <- top_crime_ts$Top_Crime_Count # specify the number of crimes
start_year <- year(min(top_crime_ts$ARREST_YM)) # specify the start year
start_month <- month(min(top_crime_ts$ARREST_YM)) # # specify the start month
# Create a time series about number of crimes in New York
top_crime_ts <- ts(crime_counts_vector,
start = c(start_year, start_month),
frequency = 12) # specify that there are 12 months in one year
# Exploratory data analysis on number of crimes in New York
# Create line charts to observe the pattern
autoplot(top_crime_ts)
ggseasonplot(top_crime_ts)
top_crime_ts%>%
stl(s.window = 'periodic')%>%
autoplot()
# Build model and conduct Ljung-Box to test whether the time series fully captures time-dependent pattern
# AMRIMA Model: white noises
arima_model_top_crime <- auto.arima(top_crime_ts)
arima_model_top_crime # AIC=955.99 AICc=956.35 BIC=962.78
Box.test(arima_model_top_crime$residuals, type = "Ljung-Box") # p-value = 0.8833
checkresiduals(arima_model_top_crime) # p-value = 0.1207
# ETS AAA Model:white noise
ets_aaa_top = ets(top_crime_ts,model = 'AAA')
ets_aaa_top # AIC=1070.901 AICc=1082.234 BIC=1109.604
Box.test(ets_aaa_top$residuals, type = "Ljung-Box") # p-value = 0.9639
checkresiduals(ets_aaa_top) # p-value = 0.379
# ETS Auto Model: white noise
ets_model_top_crime <- ets(top_crime_ts)
ets_model_top_crime # AIC=1064.292 AICc=1072.863 BIC=1098.442
Box.test(ets_model_top_crime$residuals, type = "Ljung-Box") # p-value = 0.9547
checkresiduals(ets_model_top_crime) # p-value = 0.2875
library(ggplot2)
# Divide 12 months into 4 seaons
cleaned_dataset <- cleaned_dataset %>%
mutate(Season = case_when(
month(ARREST_DATE) %in% c(12, 1, 2) ~ "Winter",
month(ARREST_DATE) %in% c(3, 4, 5) ~ "Spring",
month(ARREST_DATE) %in% c(6, 7, 8) ~ "Summer",
month(ARREST_DATE) %in% c(9, 10, 11) ~ "Autumn"
))
# Create line chart to observe seasonal changes in the number of assaults
ggplot(cleaned_dataset, aes(x = ARREST_DATE, y = MONTHLY_TOPCRIME_COUNT, group = Season, color = Season)) +
geom_line() +
labs(title = "Seasonal Variation in Crime Rates", x = "Date", y = "Top Crimes") +
theme_minimal()
# Create box chart to compare the number of crimes in four seasons
ggplot(cleaned_dataset, aes(x = Season, y = MONTHLY_TOPCRIME_COUNT, fill = Season)) +
geom_boxplot() +
labs(title = "Crime Count by Season", x = "Season", y = "Top Crimes") +
theme_minimal()
# Summary the crime rates in four seasons
seasonal_summary <- cleaned_dataset %>%
group_by(Season) %>%
summarise(
Average = mean(MONTHLY_TOPCRIME_COUNT),
Median = median(MONTHLY_TOPCRIME_COUNT),
IQR = IQR(MONTHLY_TOPCRIME_COUNT)
)
print(seasonal_summary)
cleaned_dataset_ts <- ts(cleaned_dataset$MONTHLY_TOPCRIME_COUNT, frequency = 12, start = c(year(min(cleaned_dataset$ARREST_DATE)), month(min(cleaned_dataset$ARREST_DATE))))
# Seasonal-Trend decomposition using LOESS
stl_result <- stl(cleaned_dataset_ts, s.window = "periodic")
plot(stl_result)
# Anova Test
anova_result <- aov(MONTHLY_TOPCRIME_COUNT ~ Season, data = cleaned_dataset)
summary(anova_result)