-
Notifications
You must be signed in to change notification settings - Fork 131
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
starting work described in #104 to split feature creation
- Loading branch information
Showing
10 changed files
with
185 additions
and
233 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,78 @@ | ||
|
||
if (interactive()) { | ||
  ##========================================================================== | ||
  ## INITIALIZE (runs only in interactive sessions) | ||
  ##========================================================================== | ||
  ## Clear every object from the workspace, then run garbage collection | ||
  rm(list = ls()) | ||
  gc(reset = TRUE) | ||
  ## Detach any non-standard libraries | ||
  geneorama::detach_nonstandard_packages() | ||
} | ||
## Load the required libraries and source the project functions | ||
geneorama::loadinstall_libraries(c("data.table", "MASS")) | ||
geneorama::sourceDir("CODE/functions/") | ||
## Bind the bare name `shift` to geneorama's shift for the code below | ||
shift <- geneorama::shift | ||
|
||
##============================================================================== | ||
## LOAD CACHED RDS FILES | ||
##============================================================================== | ||
foodInspect <- readRDS(file = "DATA/13_food_inspections.Rds") | ||
|
||
## Drop invalid rows using the project's row filter | ||
foodInspect <- filter_foodInspect(foodInspect) | ||
|
||
## Drop the Violations column from the food inspections; violations are | ||
## captured in the violation matrix data instead | ||
foodInspect$Violations <- NULL | ||
|
||
## Load the violation matrix, which lists violations by category: | ||
## critical, serious, and minor | ||
violation_dat <- readRDS(file = "DATA/21_food_inspection_violation_matrix.Rds") | ||
|
||
##============================================================================== | ||
## CALCULATE FEATURES | ||
##============================================================================== | ||
## Adds derived columns to foodInspect by reference (data.table `:=`): | ||
## Facility_Type_Clean, pass_flag, fail_flag, pastFail, pastCritical, | ||
## pastSerious, pastMinor, timeSinceLast and firstRecord. | ||
|
||
## Facility_Type_Clean: Anything that is not "restaurant" or "grocery" is "other" | ||
## NOTE(review): categorize() is a project helper not shown here; the "other" | ||
## fallback is taken from the comment above - confirm against its definition. | ||
foodInspect[ , Facility_Type_Clean := | ||
categorize(x = Facility_Type, | ||
primary = list(Restaurant = "restaurant", | ||
Grocery_Store = "grocery"), | ||
ignore.case = TRUE)] | ||
## Join in the violation matrix | ||
## merge() is called without all/all.x/all.y, so only inspections whose | ||
## Inspection_ID appears in both tables are kept. | ||
foodInspect <- merge(x = foodInspect, | ||
y = violation_dat, | ||
by = "Inspection_ID") | ||
## Create pass / fail flags | ||
## Each flag is 1 when Results equals the label exactly, otherwise 0. | ||
foodInspect[ , pass_flag := ifelse(Results=="Pass",1, 0)] | ||
foodInspect[ , fail_flag := ifelse(Results=="Fail",1, 0)] | ||
## Set key to ensure that records are treated CHRONOLOGICALLY... | ||
## (setkey sorts by License, then Inspection_Date; the shifts below rely on it) | ||
setkey(foodInspect, License, Inspection_Date) | ||
## Then find previous info by "shifting" the columns (grouped by License) | ||
## NOTE(review): `shift` here is geneorama::shift; presumably shift(x, -1, 0) | ||
## yields the previous row's value within the License group and 0 for the | ||
## first row - confirm against the geneorama documentation. | ||
## NOTE(review): criticalCount, seriousCount and minorCount presumably come | ||
## from violation_dat via the merge above - verify. | ||
foodInspect[ , pastFail := shift(fail_flag, -1, 0), by = License] | ||
foodInspect[ , pastCritical := shift(criticalCount, -1, 0), by = License] | ||
foodInspect[ , pastSerious := shift(seriousCount, -1, 0), by = License] | ||
foodInspect[ , pastMinor := shift(minorCount, -1, 0), by = License] | ||
|
||
## Calculate time since last inspection. | ||
## If the time is NA, this means it's the first inspection; add an indicator | ||
## variable to indicate that it's the first inspection. | ||
## The date difference is divided by 365 (assumes Inspection_Date is a Date, | ||
## so the difference is in days and the result in years - TODO confirm). | ||
foodInspect[i = TRUE , | ||
j = timeSinceLast := as.numeric( | ||
Inspection_Date - shift(Inspection_Date, -1, NA)) / 365, | ||
by = License] | ||
foodInspect[ , firstRecord := 0] | ||
foodInspect[is.na(timeSinceLast), firstRecord := 1] | ||
## First inspections get the value 2, and every other value is capped at 2. | ||
foodInspect[is.na(timeSinceLast), timeSinceLast := 2] | ||
foodInspect[ , timeSinceLast := pmin(timeSinceLast, 2)] | ||
|
||
##============================================================================== | ||
## SAVE RDS | ||
##============================================================================== | ||
## Key the table by Inspection_ID, then write it to disk | ||
setkey(foodInspect, Inspection_ID) | ||
saveRDS(object = foodInspect, file = "DATA/23_food_insp_features.Rds") | ||
|
||
|
||
|
This file was deleted.
Oops, something went wrong.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,91 @@ | ||
|
||
if (interactive()) { | ||
  ##========================================================================== | ||
  ## INITIALIZE (runs only in interactive sessions) | ||
  ##========================================================================== | ||
  ## Clear every object from the workspace, then run garbage collection | ||
  rm(list = ls()) | ||
  gc(reset = TRUE) | ||
  ## Detach any non-standard libraries | ||
  geneorama::detach_nonstandard_packages() | ||
} | ||
## Load the required libraries and source the project functions | ||
geneorama::loadinstall_libraries(c("data.table", "MASS")) | ||
geneorama::sourceDir("CODE/functions/") | ||
## Bind the bare name `shift` to geneorama's shift for the code below | ||
shift <- geneorama::shift | ||
|
||
##============================================================================== | ||
## LOAD CACHED RDS FILES | ||
##============================================================================== | ||
business <- readRDS(file = "DATA/11_bus_license.Rds") | ||
|
||
## Drop invalid / unused rows using the project's business filter | ||
business <- filter_business(business) | ||
|
||
## Food inspection data; its inspection dates are needed for some of the | ||
## feature calculations | ||
foodInspect <- readRDS(file = "DATA/23_food_insp_features.Rds") | ||
|
||
##============================================================================== | ||
## CALCULATE FEATURES BASED ON BUSINESS LICENSE DATA | ||
##============================================================================== | ||
|
||
## Per license number: earliest term start date (minDate) and latest term | ||
## expiration date (maxDate), added by reference in a single grouped pass | ||
business[ , `:=`(minDate = min(LICENSE_TERM_START_DATE), | ||
                 maxDate = max(LICENSE_TERM_EXPIRATION_DATE)), | ||
          by = LICENSE_NUMBER] | ||
|
||
##============================================================================== | ||
## Use only the business data that pertains to food inspections | ||
##============================================================================== | ||
## Create a table of matches between the food inspection and business license | ||
## data, based on where the Inspection_Date falls within the business | ||
## license renewal period | ||
## NOTE(review): find_bus_id_matches() is a project helper not shown here; the | ||
## merges below require its result to carry both "ID" and "Inspection_ID". | ||
id_table_food2business <- find_bus_id_matches(business, foodInspect) | ||
geneorama::NAsummary(id_table_food2business) | ||
|
||
## Add food key to matched business data | ||
## Left join: every row of the match table is kept (all.x = TRUE), and business | ||
## rows without a match are dropped (all.y = FALSE). | ||
bus_matched <- merge(x = id_table_food2business, | ||
y = business, | ||
by = "ID", | ||
all.y = FALSE, | ||
all.x = TRUE) | ||
setkey(bus_matched, Inspection_ID) | ||
|
||
## Add business key to food data | ||
## No all/all.x/all.y given, so only inspections present in the match table | ||
## remain in foodInspect. | ||
foodInspect <- merge(x = id_table_food2business, | ||
y = foodInspect, | ||
by = "Inspection_ID") | ||
setkey(foodInspect, Inspection_ID) | ||
|
||
## Use minDate and Inspection_Date to calculate the age at inspection | ||
## The keyed join on Inspection_ID brings Inspection_Date into bus_matched. | ||
bus_matched <- bus_matched[foodInspect[,Inspection_Date,keyby=Inspection_ID]] | ||
## Difference divided by 365 (assumes both columns are Dates, so the result | ||
## is in years - TODO confirm). | ||
bus_matched[ , ageAtInspection := as.numeric(Inspection_Date - minDate) / 365] | ||
|
||
## Remove Inspection_Date to avoid conflicting names when merging later | ||
bus_matched[ , Inspection_Date := NULL] | ||
|
||
|
||
## CALCULATE AND MERGE IN OTHER CATEGORIES | ||
## NOTE(review): GenerateOtherLicenseInfo() is a project helper not shown here; | ||
## the code below expects a table whose first column is the join key | ||
## (Inspection_ID) and whose remaining columns are category columns - confirm. | ||
OtherCategories <- GenerateOtherLicenseInfo(foodInspect, business, max_cat = 12) | ||
geneorama::NAsummary(OtherCategories) | ||
|
||
## Merge in results as a left join, keeping every row of bus_matched. | ||
## TRUE is spelled out because `T` is an ordinary variable that can be reassigned. | ||
bus_matched <- merge(x = bus_matched, | ||
                     y = OtherCategories, | ||
                     by = "Inspection_ID", | ||
                     all.x = TRUE) | ||
## For every category column (all of OtherCategories except its first column): | ||
## replace NAs with 0, then cap the values at 1. set() updates by reference. | ||
for (j in match(colnames(OtherCategories)[-1], colnames(bus_matched))) { | ||
  set(x = bus_matched, i = which(is.na(bus_matched[[j]])), j = j, value = 0) | ||
  set(x = bus_matched, j = j, value = pmin(bus_matched[[j]], 1)) | ||
} | ||
|
||
## Print the result for inspection | ||
bus_matched | ||
|
||
##============================================================================== | ||
## SAVE RDS | ||
##============================================================================== | ||
## Key bus_matched by Inspection_ID, then write it to disk | ||
setkey(bus_matched, Inspection_ID) | ||
saveRDS(object = bus_matched, file = "DATA/24_bus_features.Rds") | ||
|
Oops, something went wrong.