Skip to content

Commit

Permalink
starting work described in #104 to split feature creation
Browse files Browse the repository at this point in the history
  • Loading branch information
geneorama committed Dec 21, 2017
1 parent b8a9d08 commit 9a10616
Show file tree
Hide file tree
Showing 10 changed files with 185 additions and 233 deletions.
78 changes: 78 additions & 0 deletions CODE/23_food_insp_features.R
Original file line number Diff line number Diff line change
@@ -0,0 +1,78 @@

if (interactive()) {
    ##==========================================================================
    ## INITIALIZE
    ##==========================================================================
    ## Interactive sessions only: clear every object from the workspace and
    ## run garbage collection so the script starts from a clean state
    rm(list = ls())
    gc(reset = TRUE)
    ## Detach any non-standard libraries
    geneorama::detach_nonstandard_packages()
}
## Load the libraries, then source the project functions in CODE/functions/
geneorama::loadinstall_libraries(c("data.table", "MASS"))
geneorama::sourceDir("CODE/functions/")
## Bind the name `shift` to geneorama's implementation
## (data.table also exports a function called `shift`)
shift <- geneorama::shift

##==============================================================================
## LOAD CACHED RDS FILES
##==============================================================================
## Read the cached food inspections and apply the project's row filter to
## remove invalid data
foodInspect <- filter_foodInspect(readRDS("DATA/13_food_inspections.Rds"))

## Drop the Violations column from the food inspections; violations are
## captured in the violation matrix data instead
foodInspect$Violations <- NULL

## Import the violation matrix, which lists violations by category:
## critical, serious, and minor violations
violation_dat <- readRDS(file = "DATA/21_food_inspection_violation_matrix.Rds")

##==============================================================================
## CALCULATE FEATURES
##==============================================================================

## Facility_Type_Clean: anything that is not "restaurant" or "grocery" is
## "other" (matching ignores case)
foodInspect[, Facility_Type_Clean := categorize(
    x = Facility_Type,
    primary = list(Restaurant = "restaurant", Grocery_Store = "grocery"),
    ignore.case = TRUE)]

## Join in the violation matrix. merge() defaults to an inner join, so only
## inspections present in both tables are kept.
foodInspect <- merge(x = foodInspect, y = violation_dat, by = "Inspection_ID")

## Pass / fail flags: 1 when Results equals "Pass" / "Fail", otherwise 0
foodInspect[, pass_flag := ifelse(test = Results == "Pass", yes = 1, no = 0)]
foodInspect[, fail_flag := ifelse(test = Results == "Fail", yes = 1, no = 0)]

## Key by License, then Inspection_Date, so that each license's records are
## treated CHRONOLOGICALLY...
setkeyv(foodInspect, c("License", "Inspection_Date"))
## ...then find the previous inspection's values by "shifting" the columns
## within each License.
## NOTE(review): (-1, 0) is presumably "lag by one record, fill with 0" --
## confirm against geneorama::shift.
foodInspect[, pastFail := shift(fail_flag, -1, 0), by = License]
foodInspect[, pastCritical := shift(criticalCount, -1, 0), by = License]
foodInspect[, pastSerious := shift(seriousCount, -1, 0), by = License]
foodInspect[, pastMinor := shift(minorCount, -1, 0), by = License]

## Calculate the time since the last inspection: the gap between an
## inspection and the shifted Inspection_Date, divided by 365
## (presumably days converted to years -- confirm the class of
## Inspection_Date).
foodInspect[i = TRUE,
            j = timeSinceLast := as.numeric(
                Inspection_Date - shift(Inspection_Date, -1, NA)) / 365,
            by = License]
## If the time is NA, this is the first inspection for the license: record
## that in an indicator variable (1 = first record, 0 = otherwise)...
foodInspect[, firstRecord := as.numeric(is.na(timeSinceLast))]
## ...then replace the NA with 2 and cap every value at 2
foodInspect[is.na(timeSinceLast), timeSinceLast := 2]
foodInspect[, timeSinceLast := pmin(timeSinceLast, 2)]

##==============================================================================
## SAVE RDS
##==============================================================================
## Re-key the table by Inspection_ID, then cache it for the later scripts
setkeyv(foodInspect, "Inspection_ID")
saveRDS(object = foodInspect,
        file = file.path("DATA/23_food_insp_features.Rds"))



227 changes: 0 additions & 227 deletions CODE/23_generate_model_dat.R

This file was deleted.

91 changes: 91 additions & 0 deletions CODE/24_bus_features.R
Original file line number Diff line number Diff line change
@@ -0,0 +1,91 @@

if (interactive()) {
    ##==========================================================================
    ## INITIALIZE
    ##==========================================================================
    ## Interactive sessions only: clear every object from the workspace and
    ## run garbage collection so the script starts from a clean state
    rm(list = ls())
    gc(reset = TRUE)
    ## Detach any non-standard libraries
    geneorama::detach_nonstandard_packages()
}
## Load the libraries, then source the project functions in CODE/functions/
geneorama::loadinstall_libraries(c("data.table", "MASS"))
geneorama::sourceDir("CODE/functions/")
## Bind the name `shift` to geneorama's implementation
## (data.table also exports a function called `shift`)
shift <- geneorama::shift

##==============================================================================
## LOAD CACHED RDS FILES
##==============================================================================
## Read the cached business licenses and apply the project's filter to remove
## invalid / unused data
business <- filter_business(readRDS("DATA/11_bus_license.Rds"))

## The food inspection features are also needed: some of the calculations
## below depend on the inspection date
foodInspect <- readRDS(file = "DATA/23_food_insp_features.Rds")

##==============================================================================
## CALCULATE FEATURES BASED ON BUSINESS LICENSE DATA
##==============================================================================

## For each LICENSE_NUMBER, store the earliest term start date (minDate) and
## the latest term expiration date (maxDate) on every row of that license
business[, `:=`(minDate = min(LICENSE_TERM_START_DATE),
                maxDate = max(LICENSE_TERM_EXPIRATION_DATE)),
         by = LICENSE_NUMBER]

##==============================================================================
## Use only the business data that pertains to food inspections
##==============================================================================
## Create a table of matches between the food inspection and business license
## data, based on where the Inspection_Date falls within the business license
## renewal
id_table_food2business <- find_bus_id_matches(business, foodInspect)
geneorama::NAsummary(id_table_food2business)

## Add the food key to the matched business data: keep every row of the match
## table, and no business rows that lack a match
bus_matched <- merge(x = id_table_food2business, y = business, by = "ID",
                     all.x = TRUE, all.y = FALSE)
setkeyv(bus_matched, "Inspection_ID")

## Add the business key to the food data
foodInspect <- merge(x = id_table_food2business, y = foodInspect,
                     by = "Inspection_ID")
setkeyv(foodInspect, "Inspection_ID")

## Bring Inspection_Date onto bus_matched with a keyed join on Inspection_ID,
## then use minDate and Inspection_Date to calculate the age at inspection:
## (Inspection_Date - minDate) / 365
bus_matched <- bus_matched[foodInspect[, Inspection_Date, keyby = Inspection_ID]]
bus_matched[, ageAtInspection := as.numeric(Inspection_Date - minDate) / 365]

## Remove Inspection_Date again to avoid conflicting names when merging later
bus_matched[, Inspection_Date := NULL]


## CALCULATE AND MERGE IN OTHER CATEGORIES
OtherCategories <- GenerateOtherLicenseInfo(foodInspect, business, max_cat = 12)
geneorama::NAsummary(OtherCategories)

## Merge in results, keeping every row of bus_matched (left join).
## TRUE is spelled out because `T` is an ordinary variable that can be
## reassigned, whereas TRUE is a reserved word.
bus_matched <- merge(x = bus_matched,
                     y = OtherCategories,
                     by = "Inspection_ID",
                     all.x = TRUE)
## Remove NAs in category columns and set max value to 1.
## The category columns are every column of OtherCategories except the merge
## key; the key is excluded by name so the loop does not depend on
## Inspection_ID being the first column.
for (j in match(setdiff(colnames(OtherCategories), "Inspection_ID"),
                colnames(bus_matched))) {
    ## Rows with no value after the left join become 0...
    set(x = bus_matched, i = which(is.na(bus_matched[[j]])), j = j, value = 0)
    ## ...and every value is capped at 1
    set(x = bus_matched, j = j, value = pmin(bus_matched[[j]], 1))
}

## Top-level reference to the table: auto-prints a preview when the script is
## run at top level
bus_matched

##==============================================================================
## SAVE RDS
##==============================================================================
## Key bus_matched by Inspection_ID, then cache it for the later scripts
setkeyv(bus_matched, "Inspection_ID")
saveRDS(object = bus_matched,
        file = file.path("DATA/24_bus_features.Rds"))

Loading

0 comments on commit 9a10616

Please sign in to comment.