Skip to content

Commit

Permalink
Merge branch 'dev' into fix/update-voters-tract
Browse files Browse the repository at this point in the history
  • Loading branch information
1beb authored Nov 28, 2023
2 parents 60dda21 + 05c6147 commit 5a9a1bd
Show file tree
Hide file tree
Showing 30 changed files with 824 additions and 382 deletions.
1 change: 1 addition & 0 deletions .Rbuildignore
Original file line number Diff line number Diff line change
Expand Up @@ -12,4 +12,5 @@ ChangeLog

^cran-comments\.md$
^CRAN-SUBMISSION$
^README\.Rmd$
^data-raw$
56 changes: 31 additions & 25 deletions DESCRIPTION
Original file line number Diff line number Diff line change
@@ -1,24 +1,30 @@
Package: wru
Title: Who are You? Bayesian Prediction of Racial Category Using Surname,
First Name, Middle Name, and Geolocation
Version: 2.0.0
Date: 2023-07-12
Title: Who are You? Bayesian Prediction of Racial Category Using Surname, First Name, Middle Name, and
Geolocation
Authors@R: c(
person("Kabir", "Khanna", email = "[email protected]", role = c("aut")),
person("Brandon", "Bertelsen", email = "[email protected]", role = c("aut","cre")),
person("Santiago", "Olivella", email = "[email protected]", role = c("aut")),
person("Evan", "Rosenman", email = "[email protected]", role = c("aut")),
person("Kosuke", "Imai", email = "[email protected]", role = c("aut"))
person("Kabir", "Khanna", , "[email protected]", role = "aut"),
person("Brandon", "Bertelsen", , "[email protected]", role = c("aut", "cre")),
person("Santiago", "Olivella", , "[email protected]", role = "aut"),
person("Evan", "Rosenman", , "[email protected]", role = "aut"),
person("Kosuke", "Imai", , "[email protected]", role = "aut")
)
Description: Predicts individual race/ethnicity using surname, first name, middle name, geolocation,
and other attributes, such as gender and age. The method utilizes Bayes'
Rule (with optional measurement error correction) to compute the posterior probability of each racial category for any given
individual. The package implements methods described in Imai and Khanna (2016)
"Improving Ecological Inference by Predicting Individual Ethnicity from Voter
Registration Records" Political Analysis <DOI:10.1093/pan/mpw001> and Imai, Olivella, and Rosenman (2022)
"Addressing census data problems in race imputation via fully Bayesian Improved Surname Geocoding and name supplements"
<DOI:10.1126/sciadv.adc9824>. The package also incorporates the data described in Rosenman, Olivella, and Imai (2023)
"Race and ethnicity data for first, middle, and surnames" <DOI:10.1038/s41597-023-02202-2>.
Description: Predicts individual race/ethnicity using surname, first name,
middle name, geolocation, and other attributes, such as gender and
age. The method utilizes Bayes' Rule (with optional measurement error
correction) to compute the posterior probability of each racial
category for any given individual. The package implements methods
described in Imai and Khanna (2016) "Improving Ecological Inference by
Predicting Individual Ethnicity from Voter Registration Records"
Political Analysis <DOI:10.1093/pan/mpw001> and Imai, Olivella, and
Rosenman (2022) "Addressing census data problems in race imputation
via fully Bayesian Improved Surname Geocoding and name supplements"
<DOI:10.1126/sciadv.adc9824>. The package also incorporates the data
described in Rosenman, Olivella, and Imai (2023) "Race and ethnicity
data for first, middle, and surnames"
<DOI:10.1038/s41597-023-02202-2>.
License: GPL (>= 3)
URL: https://github.com/kosukeimai/wru
BugReports: https://github.com/kosukeimai/wru/issues
Depends:
Expand All @@ -28,20 +34,20 @@ Imports:
dplyr,
furrr,
future,
piggyback (>= 0.1.4),
PL94171,
purrr,
Rcpp,
piggyback (>= 0.1.4),
PL94171
Rcpp
Suggests:
testthat (>= 3.0.0),
covr
covr,
testthat (>= 3.0.0)
LinkingTo:
Rcpp,
RcppArmadillo
LazyLoad: yes
Config/testthat/edition: 3
Encoding: UTF-8
LazyData: yes
LazyDataCompression: xz
License: GPL (>= 3)
LazyLoad: yes
Roxygen: list(markdown = TRUE)
RoxygenNote: 7.2.3
Encoding: UTF-8
Config/testthat/edition: 3
1 change: 1 addition & 0 deletions NAMESPACE
Original file line number Diff line number Diff line change
Expand Up @@ -6,6 +6,7 @@ export(predict_race)
import(PL94171)
importFrom(Rcpp,evalCpp)
importFrom(dplyr,coalesce)
importFrom(dplyr,pull)
importFrom(furrr,future_map_dfr)
importFrom(piggyback,pb_download)
importFrom(purrr,map_dfr)
Expand Down
4 changes: 1 addition & 3 deletions R/census_data_preflight.R
Original file line number Diff line number Diff line change
@@ -1,8 +1,6 @@
#' Preflight census data
#'
#' @param census.data See documentation in \code{race_predict}.
#' @param census.geo See documentation in \code{race_predict}.
#' @param year See documentation in \code{race_predict}.
#' @inheritParams predict_race
#' @keywords internal

census_data_preflight <- function(census.data, census.geo, year) {
Expand Down
9 changes: 7 additions & 2 deletions R/get_census_data.R
Original file line number Diff line number Diff line change
Expand Up @@ -4,8 +4,13 @@
#' for specified state(s). Using this function to download Census data in advance
#' can save considerable time when running \code{predict_race} and \code{census_helper}.
#'
#' @param key A required character object containing a valid Census API key,
#' which can be requested \href{https://api.census.gov/data/key_signup.html}{here}.
#' @param key A character string containing a valid U.S. Census API key,
#' which can be requested from the
#' [U.S. Census API key signup page](https://api.census.gov/data/key_signup.html).
#'
#' If [`NULL`], the default, attempts to find a census key stored in an
#' [environment variable][Sys.getenv] named `CENSUS_API_KEY`.
#'
#' @param states which states to extract Census data for, e.g., \code{c("NJ", "NY")}.
#' @param age A \code{TRUE}/\code{FALSE} object indicating whether to condition on
#' age or not. If \code{FALSE} (default), function will return Pr(Geolocation | Race).
Expand Down
39 changes: 23 additions & 16 deletions R/predict_race.R
Original file line number Diff line number Diff line change
Expand Up @@ -42,9 +42,14 @@
#' must have column named \code{place}.
#' Specifying \code{\var{census.geo}} will call \code{census_helper} function
#' to merge Census geographic data at specified level of geography.
#' @param census.key A character object specifying user's Census API
#' key. Required if \code{\var{census.geo}} is specified, because
#' a valid Census API key is required to download Census geographic data.
#'
#' @param census.key A character object specifying the user's Census API key.
#' Required if `census.geo` is specified, because a valid Census API key is
#' required to download Census geographic data.
#'
#' If [`NULL`], the default, attempts to find a census key stored in an
#' [environment variable][Sys.getenv] named `CENSUS_API_KEY`.
#'
#' @param census.data A list indexed by two-letter state abbreviations,
#' which contains pre-saved Census geographic data.
#' Can be generated using \code{get_census_data} function.
Expand Down Expand Up @@ -225,19 +230,21 @@ predict_race <- function(voter.file, census.surname = TRUE, surname.only = FALSE
if(ctrl$verbose){
message("Using `predict_race` to obtain initial race prediction priors with BISG model")
}
race.init <- predict_race_new(voter.file = voter.file,
names.to.use = names.to.use,
year = year,
age = age, sex = sex, # not implemented, default to F
census.geo = census.geo,
census.key = census.key,
name.dictionaries = name.dictionaries,
surname.only=surname.only,
census.data = census.data,
retry = retry,
impute.missing = TRUE,
census.surname = census.surname,
use.counties = use.counties)
race.init <- predict_race(voter.file = voter.file,
names.to.use = names.to.use,
year = year,
age = age, sex = sex, # not implemented, default to F
census.geo = census.geo,
census.key = census.key,
name.dictionaries = name.dictionaries,
surname.only=surname.only,
census.data = census.data,
retry = retry,
impute.missing = TRUE,
census.surname = census.surname,
use.counties = use.counties,
model = "BISG",
control = list(verbose=FALSE))
race.init <- max.col(
race.init[, paste0("pred.", c("whi", "bla", "his", "asi", "oth"))],
ties.method = "random"
Expand Down
28 changes: 8 additions & 20 deletions R/race_prediction_funs.R
Original file line number Diff line number Diff line change
@@ -1,7 +1,7 @@
#' Internal model fitting functions
#'
#' These functions are intended for internal use only. Users should use the
#' \code{race_predict} interface rather any of these functions directly.
#' [predict_race()] interface rather than any of these functions directly.
#'
#' These functions fit different versions of WRU. \code{.predict_race_old} fits
#' the original WRU model, also known as BISG with census-based surname dictionary.
Expand All @@ -13,26 +13,11 @@
#' the augmented surname dictionary, and the first and middle name
#' dictionaries when making predictions.
#'
#' @param voter.file See documentation in \code{race_predict}.
#' @param census.surname See documentation in \code{race_predict}.
#' @param surname.only See documentation in \code{race_predict}.
#' @param surname.year See documentation in \code{race_predict}.
#' @param census.geo See documentation in \code{race_predict}.
#' @param census.key See documentation in \code{race_predict}.
#' @param census.data See documentation in \code{race_predict}.
#' @param age See documentation in \code{race_predict}.
#' @param sex See documentation in \code{race_predict}.
#' @param year See documentation in \code{race_predict}.
#' @param party See documentation in \code{race_predict}.
#' @param retry See documentation in \code{race_predict}.
#' @param impute.missing See documentation in \code{race_predict}.
#' @param names.to.use See documentation in \code{race_predict}.
#' @param race.init See documentation in \code{race_predict}.
#' @param name.dictionaries See documentation in \code{race_predict}.
#' @param ctrl See \code{control} in documentation for \code{race_predict}.
#' @inheritParams predict_race
#' @param ctrl See `control` in documentation for [predict_race()].
#' @param use.counties A logical, defaulting to FALSE. Should census data be filtered by counties available in \var{census.data}?
#'
#' @return See documentation in \code{race_predict}.
#' @inherit predict_race return
#'
#' @name modfuns
NULL
Expand Down Expand Up @@ -261,6 +246,7 @@ NULL
#' New race prediction function, implementing classical BISG with augmented
#' surname dictionary, as well as first and middle name information.
#' @rdname modfuns
#' @keywords internal
predict_race_new <- function(voter.file, names.to.use, year = "2020",age = FALSE, sex = FALSE,
census.geo, census.key = NULL, name.dictionaries, surname.only=FALSE,
census.data = NULL, retry = 0, impute.missing = TRUE, census.surname = FALSE,
Expand Down Expand Up @@ -429,7 +415,9 @@ predict_race_new <- function(voter.file, names.to.use, year = "2020",age = FALSE
#' New race prediction function, implementing fBISG (i.e. measurement
#' error correction, fully Bayesian model) with augmented
#' surname dictionary, as well as first and middle name information.
#' @importFrom dplyr pull
#' @rdname modfuns
#' @keywords internal
predict_race_me <- function(voter.file, names.to.use, year = "2020",age = FALSE, sex = FALSE,
census.geo, census.key, name.dictionaries, surname.only=FALSE,
census.data = NULL, retry = 0, impute.missing = TRUE, census.surname = FALSE,
Expand Down Expand Up @@ -604,7 +592,7 @@ predict_race_me <- function(voter.file, names.to.use, year = "2020",age = FALSE,
surname = last_c,
first = first_c,
middle = mid_c)
kw_names <- toupper(ntab[, 1])
kw_names <- toupper(dplyr::pull(ntab, 1))
proc_names_vf <- .name_preproc(voter.file[[ntype]], c(kw_names))
u_vf_names <- unique(proc_names_vf)
kw_in_vf <- kw_names %in% proc_names_vf
Expand Down
15 changes: 9 additions & 6 deletions R/wru-internal.R
Original file line number Diff line number Diff line change
@@ -1,7 +1,10 @@
.onAttach <-
function(libname, pkgname) {
packageStartupMessage("\nPlease cite as: \n")
packageStartupMessage("Khanna K, Bertelsen B, Olivella S, Rosenman E, Imai K (2022). wru: Who are You?")
packageStartupMessage("Bayesian Prediction of Racial Category Using Surname, First Name, Middle Name, and Geolocation.")
packageStartupMessage("URL: https://CRAN.R-project.org/package=wru \n")
# Package attach hook: prints the citation and a version notice.
#
# Emits one startup message made of the package citation rendered as plain
# text, followed by a note that wru 2.0.0 uses 2020 census data by default and
# a hint on how to replicate analyses from earlier package versions.
#
# Arguments:
#   libname  Passed in by R's attach mechanism; not used here.
#   pkgname  Name of the package being attached; used to look up the citation.
#
# Called for its side effect (the startup message), which users can silence
# with suppressPackageStartupMessages().
.onAttach <- function(libname, pkgname) {
  packageStartupMessage(
    "\n",
    "Please cite as:", "\n\n",
    # citation() is defined in utils, so it is namespace-qualified rather than
    # relying on utils being attached to the search path at attach time.
    # NOTE(review): confirm utils is listed under Imports in DESCRIPTION.
    format(utils::citation(pkgname), style = "text"), "\n\n",
    "Note that wru 2.0.0 uses 2020 census data by default.", "\n",
    'Use the argument `year = "2010"`, to replicate analyses produced with earlier package versions.',
    "\n"
  )
}
Loading

0 comments on commit 5a9a1bd

Please sign in to comment.