Skip to content

Commit

Permalink
Dispatch all BISG's to new code
Browse files Browse the repository at this point in the history
  • Loading branch information
solivella committed May 11, 2022
1 parent 60f8fb2 commit 82078c3
Show file tree
Hide file tree
Showing 14 changed files with 390 additions and 420 deletions.
4 changes: 2 additions & 2 deletions R/RcppExports.R
Original file line number Diff line number Diff line change
Expand Up @@ -24,7 +24,7 @@
#' @param verbose Boolean; should informative messages be printed?
#'
#' @keywords internal
sample_me <- function(last_name, first_name, mid_name, geo, N_rg, alpha, pi_s, pi_f, pi_m, pi_nr, which_names, samples, burnin, me_race, race_init, verbose) {
.Call(`_wru_sample_me`, last_name, first_name, mid_name, geo, N_rg, alpha, pi_s, pi_f, pi_m, pi_nr, which_names, samples, burnin, me_race, race_init, verbose)
sample_me <- function(last_name, first_name, mid_name, geo, N_rg, pi_s, pi_f, pi_m, pi_nr, which_names, samples, burnin, race_init, verbose) {
.Call(`_wru_sample_me`, last_name, first_name, mid_name, geo, N_rg, pi_s, pi_f, pi_m, pi_nr, which_names, samples, burnin, race_init, verbose)
}

77 changes: 41 additions & 36 deletions R/merge_names.R
Original file line number Diff line number Diff line change
Expand Up @@ -65,7 +65,7 @@
#' merge_names(voters)
#'
#' @export
merge_names <- function(voter.file, namesToUse, table.surnames = NULL, table.first = NULL, table.middle = NULL, clean.names = TRUE, impute.missing = FALSE, model = "BISG") {
merge_names <- function(voter.file, namesToUse, census.surname, table.surnames = NULL, table.first = NULL, table.middle = NULL, clean.names = TRUE, impute.missing = FALSE, model = "BISG") {

# check the names
if (namesToUse == "surname") {
Expand All @@ -87,31 +87,31 @@ merge_names <- function(voter.file, namesToUse, table.surnames = NULL, table.fir

first_c <- readRDS("wru-data-first_c.rds")
mid_c <- readRDS("wru-data-mid_c.rds")
last_c <- readRDS("wru-data-last_c.rds")
if(census.surname){
last_c <- readRDS("wru-data-census_last_c.rds")
} else {
last_c <- readRDS("wru-data-last_c.rds")
}

p_eth <- c("c_whi", "c_bla", "c_his", "c_asi", "c_oth")
margin_sel <- 2
if (is.null(table.surnames)) {
lastNameDict <- last_c
} else {
lastNameDict <- table.surnames
lastNameDict[, -1] <- apply(table.surnames[, -1], margin_sel, function(x) x / sum(x, na.rm = TRUE))
lastNameDict[is.na(lastNameDict)] <- 0
names(lastNameDict) <- names(last_c)
lastNameDict[is.na(lastNameDict)] <- 0
}
if (is.null(table.first)) {
firstNameDict <- first_c
} else {
firstNameDict <- table.first
firstNameDict[, -1] <- apply(table.first[, -1], margin_sel, function(x) x / sum(x, na.rm = TRUE))
firstNameDict[is.na(firstNameDict)] <- 0
names(firstNameDict) <- names(first_c)
}
if (is.null(table.middle)) {
middleNameDict <- mid_c
} else {
middleNameDict <- table.middle
middleNameDict[, -1] <- apply(table.middle[, -1], margin_sel, function(x) x / sum(x, na.rm = TRUE))
middleNameDict[is.na(middleNameDict)] <- 0
names(middleNameDict) <- names(mid_c)
}
Expand Down Expand Up @@ -158,8 +158,11 @@ merge_names <- function(voter.file, namesToUse, table.surnames = NULL, table.fir
## Clean names (if specified by user)
if (clean.names) {
for (nameType in str_split(namesToUse, ", ")[[1]]) {
df1 <- df[!is.na(df[, paste("p_whi_", nameType, sep = "")]), ] # Matched names
df2 <- df[is.na(df[, paste("p_whi_", nameType, sep = "")]), ] # Unmatched names
if(nameType=="surname"){
nameType <- "last"
}
df1 <- df[!is.na(df[, paste("c_whi_", nameType, sep = "")]), ] # Matched names
df2 <- df[is.na(df[, paste("c_whi_", nameType, sep = "")]), ] # Unmatched names

## Remove All Punctuation and Try Merge Again
if (nrow(df2) > 0) {
Expand All @@ -171,9 +174,9 @@ merge_names <- function(voter.file, namesToUse, table.surnames = NULL, table.fir
)
df2 <- df2[, names(df1)] # reorder the columns

if (sum(!is.na(df2[, paste("p_whi_", nameType, sep = ""), ])) > 0) {
df1 <- rbind(df1, df2[!is.na(df2[, paste("p_whi_", nameType, sep = ""), ]), ])
df2 <- df2[is.na(df2[, paste("p_whi_", nameType, sep = "")]), ]
if (sum(!is.na(df2[, paste("c_whi_", nameType, sep = ""), ])) > 0) {
df1 <- rbind(df1, df2[!is.na(df2[, paste("c_whi_", nameType, sep = ""), ]), ])
df2 <- df2[is.na(df2[, paste("c_whi_", nameType, sep = "")]), ]
}
}

Expand All @@ -186,9 +189,9 @@ merge_names <- function(voter.file, namesToUse, table.surnames = NULL, table.fir
)
df2 <- df2[, names(df1)] # reorder the columns

if (sum(!is.na(df2[, paste("p_whi_", nameType, sep = ""), ])) > 0) {
df1 <- rbind(df1, df2[!is.na(df2[, paste("p_whi_", nameType, sep = ""), ]), ])
df2 <- df2[is.na(df2[, paste("p_whi_", nameType, sep = "")]), ]
if (sum(!is.na(df2[, paste("c_whi_", nameType, sep = ""), ])) > 0) {
df1 <- rbind(df1, df2[!is.na(df2[, paste("c_whi_", nameType, sep = ""), ]), ])
df2 <- df2[is.na(df2[, paste("c_whi_", nameType, sep = "")]), ]
}
}

Expand All @@ -214,9 +217,9 @@ merge_names <- function(voter.file, namesToUse, table.surnames = NULL, table.fir
df2 <- merge(df2[, !grepl(paste("_", nameType, sep = ""), names(df2))], lastNameDict, by.x = "lastname.match", by.y = "last_name", all.x = TRUE)
df2 <- df2[, names(df1)] # reorder the columns

if (sum(!is.na(df2[, paste("p_whi_", nameType, sep = ""), ])) > 0) {
df1 <- rbind(df1, df2[!is.na(df2[, paste("p_whi_", nameType, sep = ""), ]), ])
df2 <- df2[is.na(df2[, paste("p_whi_", nameType, sep = "")]), ]
if (sum(!is.na(df2[, paste("c_whi_", nameType, sep = ""), ])) > 0) {
df1 <- rbind(df1, df2[!is.na(df2[, paste("c_whi_", nameType, sep = ""), ]), ])
df2 <- df2[is.na(df2[, paste("c_whi_", nameType, sep = "")]), ]
}
}

Expand All @@ -237,9 +240,9 @@ merge_names <- function(voter.file, namesToUse, table.surnames = NULL, table.fir
)
df2 <- df2[, c(names(df1), "name1", "name2")] # reorder the columns

if (sum(!is.na(df2[, paste("p_whi_", nameType, sep = ""), ])) > 0) {
df1 <- rbind(df1, df2[!is.na(df2[, paste("p_whi_", nameType, sep = "")]), !(names(df2) %in% c("name1", "name2"))])
df2 <- df2[is.na(df2[, paste("p_whi_", nameType, sep = "")]), ]
if (sum(!is.na(df2[, paste("c_whi_", nameType, sep = ""), ])) > 0) {
df1 <- rbind(df1, df2[!is.na(df2[, paste("c_whi_", nameType, sep = "")]), !(names(df2) %in% c("name1", "name2"))])
df2 <- df2[is.na(df2[, paste("c_whi_", nameType, sep = "")]), ]
}
}

Expand All @@ -252,9 +255,9 @@ merge_names <- function(voter.file, namesToUse, table.surnames = NULL, table.fir
)
df2 <- df2[, c(names(df1), "name1", "name2")] # reorder the columns

if (sum(!is.na(df2[, paste("p_whi_", nameType, sep = ""), ])) > 0) {
df1 <- rbind(df1, df2[!is.na(df2[, paste("p_whi_", nameType, sep = "")]), !(names(df2) %in% c("name1", "name2"))])
df2 <- df2[is.na(df2[, paste("p_whi_", nameType, sep = "")]), ]
if (sum(!is.na(df2[, paste("c_whi_", nameType, sep = ""), ])) > 0) {
df1 <- rbind(df1, df2[!is.na(df2[, paste("c_whi_", nameType, sep = "")]), !(names(df2) %in% c("name1", "name2"))])
df2 <- df2[is.na(df2[, paste("c_whi_", nameType, sep = "")]), ]
}
}

Expand All @@ -270,26 +273,26 @@ merge_names <- function(voter.file, namesToUse, table.surnames = NULL, table.fir

## For unmatched names, just fill with an column mean if impute is true, or with constant if false
# require(dplyr), now included as Import in package
p_miss_last <- mean(is.na(df$c_whi_last))
if (p_miss_last > 0) {
c_miss_last <- mean(is.na(df$c_whi_last))
if (c_miss_last > 0) {
message(paste(paste(sum(is.na(df$c_whi_last)), " (", round(100 * mean(is.na(df$c_whi_last)), 1), "%) individuals' last names were not matched.", sep = "")))
}
if (grepl("first", namesToUse)) {
p_miss_first <- mean(is.na(df$c_whi_first))
if (p_miss_first > 0) {
c_miss_first <- mean(is.na(df$c_whi_first))
if (c_miss_first > 0) {
message(paste(paste(sum(is.na(df$c_whi_first)), " (", round(100 * mean(is.na(df$c_whi_first)), 1), "%) individuals' first names were not matched.", sep = "")))
}
}
if (grepl("middle", namesToUse)) {
p_miss_mid <- mean(is.na(df$c_whi_middle))
if (p_miss_mid > 0) {
c_miss_mid <- mean(is.na(df$c_whi_middle))
if (c_miss_mid > 0) {
message(paste(paste(sum(is.na(df$c_whi_middle)), " (", round(100 * mean(is.na(df$c_whi_middle)), 1), "%) individuals' middle names were not matched.", sep = "")))
}
}

if (impute.missing) {
impute.vec <- colMeans(df[, grep("c_", names(df), value = TRUE)], na.rm = TRUE)
for (i in grep("p_", names(df), value = TRUE)) {
for (i in grep("c_", names(df), value = TRUE)) {
df[, i] <- dplyr::coalesce(df[, i], impute.vec[i])
}
} else {
Expand Down Expand Up @@ -317,23 +320,25 @@ merge_names <- function(voter.file, namesToUse, table.surnames = NULL, table.fir

#' Preflight for name data
#'
#' Checks if namedata is avialable in the current working directory, if not
#' Checks if namedata is available in the current working directory, if not
#' downloads it from github using piggyback.
#'
#' @importFrom piggyback pb_download
wru_data_preflight <- function() {
if (!all(
file.exists("wru-data-first_c.rds"),
file.exists("wru-data-mid_c.rds"),
file.exists("wru-data-last_c.rds")
file.exists("wru-data-last_c.rds"),
file.exists("wru-data-census_last_c.rds")
)
) {
# TODO: Point to a repository that is not private! See inst/scripts/
# prep-piggyback.R for example
piggyback::pb_download("wru-data-first_c.rds", repo = "solivella/wruData")
piggyback::pb_download("wru-data-mid_c.rds", repo = "solivella/wruData")
piggyback::pb_download("wru-data-last_c.rds", repo = "solivella/wruData")
} else {
message("`wru` name data already available in working directory")
}
piggyback::pb_download("wru-data-census_last_c.rds", repo = "solivella/wruData")
} #else {
#message("`wru` name data already available in working directory")
#}
}
14 changes: 8 additions & 6 deletions R/predict_race.R
Original file line number Diff line number Diff line change
Expand Up @@ -136,27 +136,29 @@ predict_race <- function(voter.file, census.surname = TRUE, surname.only = FALSE
surname.year = 2010, census.geo, census.key = NULL, census.data = NA, age = FALSE,
sex = FALSE, year = "2010", party, retry = 3, impute.missing = TRUE,
use_counties = FALSE, model = "BISG", race.init = NULL, name.dictionaries = NULL,
names.to.use = "surname",control = NULL, ...) {
names.to.use = "surname",control = NULL) {

## Check model type
if (!(model %in% c("BISG", "fBISG"))) {
stop(
paste0(
"'model' must be one of 'BISG' (for standard BISG results, or results",
" with all name data without error correction) or 'fBISG' (for the",
" fully Bayesian model that accommodates all name data)."
" fully Bayesian/error correction model that accommodates all name data)."
)
)
}

## Build model calls
cl <- match.call()
arg_list <- as.list(match.call())[-1]
cl <- formals()
cl[names(arg_list)] <- arg_list
if((model == "BISG")){
cl[[1L]] <- quote(wru:::.predict_race_new)
cl <- c(quote(wru:::predict_race_new), cl)
} else {
cl[[1L]] <- quote(.predict_race_me)
cl <- c(quote(wru:::predict_race_me), cl)
}
res <- eval(cl, parent.frame())
res <- eval(as.call(cl), parent.frame())

return(res)
}
Expand Down
Loading

0 comments on commit 82078c3

Please sign in to comment.