-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathvisa_and_models.R
65 lines (52 loc) · 2.26 KB
/
visa_and_models.R
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
library(tidyverse) # Misc tidy data wrapper package.
library(caret) # KNN model
library(ggplot2) # Plotting
# 1 ===== LOAD DATA =====
# generate_env.R reads the sampled source csv, cleans it, and writes the
# result to visa_info.RData. Restore those objects into this session
# (presumably including `visas`, which the rest of this script relies on).
load(file = "visa_info.RData")
# 2 ===== CASE STATUS CLASSIFIER =====
# KNN model to predict categorical Case Status given user input.
# KNN has no iterative training phase (it classifies by comparing against stored examples) and the feature space here is low-dimensional, so it is a reasonable choice.
# Source: http://dataaspirant.com/2017/01/09/knn-implementation-r-using-caret-package/
# Functions
# Add numeric category codes for the categorical visa columns.
#
# Each source column is converted to a factor and the factor's level codes
# are stored, as a numeric vector, in a new *_encoding column. The codes are
# derived from the values present in `df` itself, so they are only comparable
# between data frames that contain the same set of values.
#
# df: data frame with EMPLOYER_NAME, SOC_NAME, JOB_TITLE, FULL_TIME_POSITION
#     and WORKSITE columns.
# Returns `df` with emp_encoding, soc_encoding, job_title_encoding,
# ft_encoding and worksite_encoding added (or overwritten).
encode_numeric <- function(df) {
  # Level index of every value within its own column, as a double.
  level_code <- function(values) as.numeric(as.factor(values))

  df$emp_encoding <- level_code(df$EMPLOYER_NAME)
  df$soc_encoding <- level_code(df$SOC_NAME)
  df$job_title_encoding <- level_code(df$JOB_TITLE)
  df$ft_encoding <- level_code(df$FULL_TIME_POSITION)
  df$worksite_encoding <- level_code(df$WORKSITE)
  df
}
# Setup
# Number of distinct CASE_STATUS values. Not referenced again in this script.
num_cases <- length(unique(visas$CASE_STATUS))
# Fix the RNG seed before the random partition and resampling below so the
# run is repeatable.
set.seed(37)
# Names of columns 3 through 11 of `visas` as loaded, i.e. taken before the
# *_encoding columns are appended. Only used to derive cluster_eval_cols.
visa_colnames <- colnames(visas)[3:11]
# The same names without PREVAILING_WAGE. Not referenced again in this script.
cluster_eval_cols <- setdiff(visa_colnames, "PREVAILING_WAGE")
# Append the five numeric *_encoding columns that the models use as predictors.
visas <- encode_numeric(visas)
# Train-test split
# Row indices of the training set: a 70% sample drawn with CASE_STATUS as the
# outcome; list = FALSE returns the indices as a matrix rather than a list.
in.train.rows <- createDataPartition(visas$CASE_STATUS, p = 0.7, list = FALSE)
# Columns kept for modelling, selected by position.
# NOTE(review): presumably column 2 is CASE_STATUS and 10:16 are lon, lat and
# the five *_encoding columns -- confirm against generate_env.R, since any
# change to the column order of `visas` silently changes the feature set.
in.train.cols <- union(2, 10:16)
v.train <- visas[in.train.rows, in.train.cols]
# Held-out rows. Not used again in this script, and CASE_STATUS is not
# converted to a factor here as it is for v.train.
v.test <- visas[-in.train.rows, in.train.cols]
# The outcome must be a factor for train() to fit a classification model.
v.train$CASE_STATUS <- factor(v.train$CASE_STATUS)
# Build classifier
# Resampling scheme: 10-fold cross-validation, repeated 3 times.
train_control <- trainControl(method = "repeatedcv", number = 10, repeats = 3)
# KNN on every other column of v.train. Predictors are centered and scaled
# first, and tuneLength = 10 asks caret to evaluate 10 candidate values of k.
classifier <- train(CASE_STATUS ~., data = v.train, method = "knn",
trControl = train_control,
preProcess = c("center", "scale"),
tuneLength = 10)
# External Interface.
# Predict the Case Status for user-supplied applications.
#
# user_df: data frame with EMPLOYER_NAME, SOC_NAME, JOB_TITLE,
#          FULL_TIME_POSITION and WORKSITE columns, plus any other
#          predictors the classifier was trained on.
# Returns the result of predict() on `classifier` for the encoded rows.
# Stops if one of the five categorical columns is missing; warns if a value
# does not occur in `visas`.
#
# The classifier was fitted on the *_encoding codes stored in `visas`.
# Re-encoding `user_df` on its own would number categories by the values
# present in `user_df` (a one-row input would always encode to 1), so the
# codes would not line up with the training data. Each value is therefore
# looked up in `visas` and given exactly the code it had during training.
predict_case_status <- function(user_df) {
  encoding_sources <- c(
    emp_encoding = "EMPLOYER_NAME",
    soc_encoding = "SOC_NAME",
    job_title_encoding = "JOB_TITLE",
    ft_encoding = "FULL_TIME_POSITION",
    worksite_encoding = "WORKSITE"
  )

  missing_cols <- setdiff(encoding_sources, names(user_df))
  if (length(missing_cols) > 0) {
    stop(
      "user_df is missing column(s): ", paste(missing_cols, collapse = ", "),
      call. = FALSE
    )
  }

  for (target in names(encoding_sources)) {
    source_col <- encoding_sources[[target]]
    # First training row holding the same category value (NA if none does).
    row_idx <- match(user_df[[source_col]], visas[[source_col]])
    if (anyNA(row_idx)) {
      # No training code exists for an unseen value; it is left as NA and
      # handled by predict() according to its own NA rules.
      warning(
        "Value(s) in ", source_col,
        " not present in the training data; encoded as NA",
        call. = FALSE
      )
    }
    user_df[[target]] <- visas[[target]][row_idx]
  }

  predict(classifier, newdata = user_df)
}
# 3 ===== PREDICTED WAGE REGRESSOR =====
# Linear model of PREVAILING_WAGE on lon, lat and the five *_encoding
# columns, fitted on every row of `visas`.
# NOTE(review): the *_encoding predictors are factor level codes, so they
# enter the model as plain numeric terms.
wage_regressor <- lm(
  formula = PREVAILING_WAGE ~ lon + lat + emp_encoding + soc_encoding +
    job_title_encoding + ft_encoding + worksite_encoding,
  data = visas
)
# External interface
# Predict the prevailing wage for user-supplied applications.
#
# user_df: data frame with lon, lat, EMPLOYER_NAME, SOC_NAME, JOB_TITLE,
#          FULL_TIME_POSITION and WORKSITE columns.
# Returns the result of predict() on `wage_regressor` for the encoded rows.
# Stops if one of the five categorical columns is missing; warns if a value
# does not occur in `visas`.
#
# The regressor was fitted on the *_encoding codes stored in `visas`.
# Re-encoding `user_df` on its own would number categories by the values
# present in `user_df` (a one-row input would always encode to 1), so the
# codes would not line up with the training data. Each value is therefore
# looked up in `visas` and given exactly the code it had during fitting.
predict_wage <- function(user_df) {
  encoding_sources <- c(
    emp_encoding = "EMPLOYER_NAME",
    soc_encoding = "SOC_NAME",
    job_title_encoding = "JOB_TITLE",
    ft_encoding = "FULL_TIME_POSITION",
    worksite_encoding = "WORKSITE"
  )

  missing_cols <- setdiff(encoding_sources, names(user_df))
  if (length(missing_cols) > 0) {
    stop(
      "user_df is missing column(s): ", paste(missing_cols, collapse = ", "),
      call. = FALSE
    )
  }

  for (target in names(encoding_sources)) {
    source_col <- encoding_sources[[target]]
    # First training row holding the same category value (NA if none does).
    row_idx <- match(user_df[[source_col]], visas[[source_col]])
    if (anyNA(row_idx)) {
      # No training code exists for an unseen value; it is left as NA and
      # handled by predict() according to its own NA rules.
      warning(
        "Value(s) in ", source_col,
        " not present in the training data; encoded as NA",
        call. = FALSE
      )
    }
    user_df[[target]] <- visas[[target]][row_idx]
  }

  predict(wage_regressor, newdata = user_df)
}
# Persist the whole workspace (data, fitted models and the predict_*
# functions) so it can be restored elsewhere with load().
save.image("visas_and_models.RData")