-
Notifications
You must be signed in to change notification settings - Fork 0
/
model_creation.Rmd
118 lines (101 loc) · 3.47 KB
/
model_creation.Rmd
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
---
title: "skhillon Models Creation"
output:
html_document:
df_print: paged
---
```{r}
library(tidyverse) # Misc tidy data wrapper package.
library(tools) # Only used for toTitleCase()
library(caret) # KNN model
library(ggplot2) # Plotting
```
Load visas data
```{r}
# Restore the saved objects. load() returns the names of the objects it
# created (invisibly); every later chunk reads `visas`, so fail here with a
# clear message if the file does not provide it.
loaded_objects <- load("visa_info.RData")
if (!"visas" %in% loaded_objects) {
  stop("visa_info.RData does not contain an object named 'visas'.",
       call. = FALSE)
}
```
Convert the SOC Name and Job Title fields to Title Case.
We leave Employers as all-caps because Company names vary greatly.
```{r}
# Normalise the two free-text columns in one pass: lower-case each value
# first, then let toTitleCase() re-capitalise it.
visas[c("SOC_NAME", "JOB_TITLE")] <- lapply(
  visas[c("SOC_NAME", "JOB_TITLE")],
  function(values) toTitleCase(tolower(values))
)
```
## Classification - Case Status
KNN model to predict categorical Case Status given user input.
Since KNNs do not continuously "learn" and dimensionality is low, it is a useful choice.
Therefore, we also don't need to split data into training and testing because we will not be changing the number of clusters.
Source: https://dzone.com/refcardz/machine-learning-predictive?chapter=9
```{r}
# Fix the RNG state so any random draws that follow are reproducible.
set.seed(37)

# Number of distinct CASE_STATUS values.
num_cases <- length(unique(visas$CASE_STATUS))

# Candidate feature columns: positions 3 through 11 of `visas`, with the wage
# column removed.
visa_colnames <- names(visas)[3:11]
cluster_eval_cols <- setdiff(visa_colnames, "PREVAILING_WAGE")

# Show the selected column names and the structure of those columns.
cluster_eval_cols
str(visas[, cluster_eval_cols])
```
It is unclear whether the model expects numeric or factor predictors, so both encodings are provided. (Update: both fail.)
```{r}
# Add numeric *_encoding columns for the categorical visa fields.
#
# df:        data frame with EMPLOYER_NAME, SOC_NAME, JOB_TITLE,
#            FULL_TIME_POSITION, WORKSITE and YEAR columns.
# reference: data frame whose category levels define the integer codes.
#            Defaults to `df` itself, which reproduces the codes obtained
#            from as.numeric(as.factor(column)). Pass the training data when
#            encoding new rows so the same category maps to the same code;
#            values absent from `reference` are coded NA.
#
# Returns `df` with emp_encoding, soc_encoding, job_title_encoding,
# ft_encoding, worksite_encoding and year_encoding added (or overwritten).
encode_numeric <- function(df, reference = df) {
  # Resolve the default now, before `df` gains any new columns.
  force(reference)

  # Integer code of each value of `column`, numbered by the sorted levels
  # found in `reference`.
  encode_column <- function(column) {
    known_levels <- levels(as.factor(reference[[column]]))
    as.numeric(factor(df[[column]], levels = known_levels))
  }

  df$emp_encoding <- encode_column("EMPLOYER_NAME")
  df$soc_encoding <- encode_column("SOC_NAME")
  df$job_title_encoding <- encode_column("JOB_TITLE")
  df$ft_encoding <- encode_column("FULL_TIME_POSITION")
  df$worksite_encoding <- encode_column("WORKSITE")
  # Go through character so a factor YEAR yields the year itself rather than
  # its level index.
  df$year_encoding <- as.numeric(as.character(df$YEAR))
  df
}
# Add factor-typed *_encoding columns for the location, categorical and year
# fields of a visa data frame.
#
# df: data frame with lon, lat, EMPLOYER_NAME, SOC_NAME, JOB_TITLE,
#     FULL_TIME_POSITION, WORKSITE and YEAR columns.
#
# Returns `df` with the eight *_encoding columns added (or overwritten), each
# holding as.factor() of its source column.
encode_factor <- function(df) {
  encoded_names <- c(
    "lon_encoding", "lat_encoding", "emp_encoding", "soc_encoding",
    "job_title_encoding", "ft_encoding", "worksite_encoding", "year_encoding"
  )
  # One factor per target column, listed in the same order as encoded_names.
  df[encoded_names] <- list(
    as.factor(df$lon),
    as.factor(df$lat),
    as.factor(df$EMPLOYER_NAME),
    as.factor(df$SOC_NAME),
    as.factor(df$JOB_TITLE),
    as.factor(df$FULL_TIME_POSITION),
    as.factor(df$WORKSITE),
    as.factor(df$YEAR)
  )
  df
}
# Replace `visas` with the copy returned by encode_numeric(), which carries
# the added *_encoding columns alongside the original ones.
visas <- encode_numeric(visas)
# Preview the first rows, including the new columns.
head(visas)
```
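Train-test split, 70-30, stratified by `CASE_STATUS`: the rows selected by `createDataPartition()` form the training set and the remaining rows form the test set.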
```{r}
# 70/30 split on CASE_STATUS. With list = FALSE, createDataPartition()
# returns the training row indices as a matrix rather than a list.
in.train.rows <- createDataPartition(visas$CASE_STATUS, p = 0.7, list = FALSE)
# Keep column 2 plus columns 10-17, selected by position.
# NOTE(review): presumably CASE_STATUS, lon, lat and the six *_encoding
# columns -- confirm against names(visas); the positions shift if columns are
# added or reordered upstream.
in.train.cols <- union(2, 10:17)
v.train <- visas[in.train.rows, in.train.cols]
v.test <- visas[-in.train.rows, in.train.cols]
# The response is a factor for classification. Give the test set the same
# levels as the training set so held-out labels and predictions are directly
# comparable; a status absent from the training rows becomes NA.
v.train$CASE_STATUS <- factor(v.train$CASE_STATUS)
v.test$CASE_STATUS <- factor(v.test$CASE_STATUS,
                             levels = levels(v.train$CASE_STATUS))
head(v.train)
```
Build classifier.
*Error: Error in sprintf(gettext(fmt, domain = domain), ...) : invalid format '%d'; use format %s for character objects*
```{r}
# Resampling scheme: 10-fold cross-validation, repeated 3 times.
train_control <- trainControl(
  method = "repeatedcv",
  repeats = 3,
  number = 10
)
# Fit KNN with CASE_STATUS as the response and every other column of v.train
# as a predictor. Predictors are centred and scaled, and tuneLength = 10 asks
# caret to try 10 candidate tuning values.
classifier <- train(
  CASE_STATUS ~ .,
  data = v.train,
  method = "knn",
  tuneLength = 10,
  preProcess = c("center", "scale"),
  trControl = train_control
)
classifier
```
```{r}
# Predict CASE_STATUS for new applications with the fitted `classifier`.
#
# user_df: data frame holding the raw columns read by encode_numeric()
#          (EMPLOYER_NAME, SOC_NAME, JOB_TITLE, FULL_TIME_POSITION, WORKSITE,
#          YEAR) plus any other predictors the classifier was trained on.
#
# Returns the result of predict() on the encoded input.
#
# encode_numeric() numbers categories by the levels present in the data frame
# it receives, so a single user row would always be coded 1. The categorical
# codes are therefore recomputed against the levels found in `visas`, the
# data the training codes were derived from.
predict_case_status <- function(user_df) {
  categorical_sources <- c(
    emp_encoding = "EMPLOYER_NAME",
    soc_encoding = "SOC_NAME",
    job_title_encoding = "JOB_TITLE",
    ft_encoding = "FULL_TIME_POSITION",
    worksite_encoding = "WORKSITE"
  )
  user_numeric_df <- encode_numeric(user_df)
  for (encoded_col in names(categorical_sources)) {
    source_col <- categorical_sources[[encoded_col]]
    training_levels <- levels(as.factor(visas[[source_col]]))
    codes <- as.numeric(factor(user_df[[source_col]], levels = training_levels))
    # A non-missing input with no matching training level has no valid code.
    unseen <- is.na(codes) & !is.na(user_df[[source_col]])
    if (any(unseen)) {
      warning("Values of ", source_col,
              " not present in the training data were coded as NA.",
              call. = FALSE)
    }
    user_numeric_df[[encoded_col]] <- codes
  }
  predict(classifier, newdata = user_numeric_df)
}
```
## Regression - Yearly Wage Estimation (TODO)
```{r}
# Linear model of prevailing wage on location, the numeric category codes and
# year, fitted on the full `visas` data.
wage_regressor <- lm(
  formula = PREVAILING_WAGE ~ lon + lat + emp_encoding + soc_encoding +
    job_title_encoding + ft_encoding + worksite_encoding + year_encoding,
  data = visas
)
# Print the fitted call and coefficients.
wage_regressor
```
```{r}
# Predict PREVAILING_WAGE for new applications with the fitted
# `wage_regressor`.
#
# user_df: data frame holding lon, lat and the raw columns read by
#          encode_numeric() (EMPLOYER_NAME, SOC_NAME, JOB_TITLE,
#          FULL_TIME_POSITION, WORKSITE, YEAR).
#
# Returns the result of predict() on the encoded input.
#
# encode_numeric() numbers categories by the levels present in the data frame
# it receives, so a single user row would always be coded 1. The categorical
# codes are therefore recomputed against the levels found in `visas`, the
# data the model was fitted on.
predict_wage <- function(user_df) {
  categorical_sources <- c(
    emp_encoding = "EMPLOYER_NAME",
    soc_encoding = "SOC_NAME",
    job_title_encoding = "JOB_TITLE",
    ft_encoding = "FULL_TIME_POSITION",
    worksite_encoding = "WORKSITE"
  )
  user_numeric_df <- encode_numeric(user_df)
  for (encoded_col in names(categorical_sources)) {
    source_col <- categorical_sources[[encoded_col]]
    training_levels <- levels(as.factor(visas[[source_col]]))
    codes <- as.numeric(factor(user_df[[source_col]], levels = training_levels))
    # A non-missing input with no matching training level has no valid code.
    unseen <- is.na(codes) & !is.na(user_df[[source_col]])
    if (any(unseen)) {
      warning("Values of ", source_col,
              " not present in the training data were coded as NA.",
              call. = FALSE)
    }
    user_numeric_df[[encoded_col]] <- codes
  }
  predict(wage_regressor, newdata = user_numeric_df)
}
```
```{r}
# Print the detailed fit summary for the wage model.
summary(wage_regressor)
```