diff --git a/ChangeLog b/ChangeLog index a2f8930..604f189 100644 --- a/ChangeLog +++ b/ChangeLog @@ -1,3 +1,5 @@ Date Version Comment 2015-12-09 0.0-1 First version on CRAN 2016-03-04 0.0-2 Minor improvements +2016-12-13 0.1-1 New function to pre-download Census data and other minor improvements +2017-03-03 0.1-2 Updated surname handling, enhanced demographics option, and improved error handling and documentation diff --git a/DESCRIPTION b/DESCRIPTION index b61c199..06b162d 100644 --- a/DESCRIPTION +++ b/DESCRIPTION @@ -1,9 +1,9 @@ Package: wru -Version: 0.1-1 -Date: 2016-12-12 +Version: 0.1-2 +Date: 2017-03-03 Title: Who Are You? Bayesian Prediction of Racial Category Using Surname and Geolocation -Author: Kabir Khanna [aut, cre], Kosuke Imai [aut, cre] +Author: Kabir Khanna [aut, cre], Kosuke Imai [aut, cre], Hubert Jin [ctb] Maintainer: Kabir Khanna Description: This open-source software package enables researchers to predict individual ethnicity using his/her surname, geolocation, and other attributes diff --git a/NAMESPACE b/NAMESPACE index 4dd8931..f9efb29 100644 --- a/NAMESPACE +++ b/NAMESPACE @@ -1,13 +1,11 @@ # Generated by roxygen2: do not edit by hand -export(census.helper.api) -export(census.helper.api.local) -export(census.helper.api.online) -export(censusData) -export(getCensusApi) -export(getCensusApi2) -export(getCensusData) -export(name.clean) -export(race.pred) -export(vecToChunk) +export(census_geo_api) +export(census_helper) +export(get_census_api) +export(get_census_api_2) +export(get_census_data) +export(merge_surnames) +export(predict_race) +export(vec_to_chunk) import(devtools) diff --git a/R/State.FIPS.R b/R/State.FIPS.R new file mode 100644 index 0000000..6ea233c --- /dev/null +++ b/R/State.FIPS.R @@ -0,0 +1,16 @@ +#' State's FIPS. +#' +#' List of States and their FIPS. 
+#' +#' @format A data frame with 55 rows and 2 variables: +#' \describe{ +#' \item{State}{State} +#' \item{FIPS}{FIPS} +#' } +#' +#' @docType data +#' @keywords datasets +#' @name State.FIPS +#' @examples +#' data(State.FIPS) +"State.FIPS" diff --git a/R/census.helper.api.R b/R/census.helper.api.R deleted file mode 100644 index 6e18eae..0000000 --- a/R/census.helper.api.R +++ /dev/null @@ -1,584 +0,0 @@ -#' Census helper function. -#' -#' \code{census.helper.api.online} links user-input dataset with Census data. -#' -#' This function allows users to link their geocoded dataset (e.g., voter file) -#' with U.S. Census 2010 data. The function extracts Census Summary File data -#' at the tract or block level using the 'UScensus2010' package. Census data -#' calculated are Pr(Geolocation | Race) where geolocation is tract or block. -#' -#' @param key A required character object. Must contain user's Census API -#' key, which can be requested \href{http://api.census.gov/data/key_signup.html}{here}. -#' @param voters An object of class \code{data.frame}. Must contain field(s) -#' named \code{\var{county}}, \code{\var{tract}}, and/or \code{\var{block}} -#' specifying geolocation. These should be character variables that match up with -#' U.S. Census categories. County should be three characters (e.g., "031" not "31"), -#' tract should be six characters, and block should be four characters. -#' @param states A character vector specifying which states to extract -#' Census data for, e.g. \code{c("NJ", "NY")}. Default is \code{"all"}, which extracts -#' Census data for all states contained in user-input data. -#' @param geo A character object specifying what aggregation level to use. -#' Use \code{"county"}, \code{"tract"}, or \code{"block"}. Default is \code{"tract"}. -#' Warning: extracting block-level data takes very long. -#' @param demo A \code{TRUE}/\code{FALSE} object indicating whether to condition on -#' demographics (i.e., age and sex) or not. 
If \code{TRUE}, function will return -#' Pr(Geolocation, Age, Sex | Race). If \code{FALSE}, function wil return -#' Pr(Geolocation | Race). Default is \code{FALSE}. -#' @return Output will be an object of class \code{data.frame}. It will -#' consist of the original user-input data with additional columns of -#' Census data. -#' -#' @examples -#' \dontshow{data(voters)} -#' \dontrun{census.helper.api(key = "...", voters = voters, states = "nj", geo = "block")} -#' \dontrun{census.helper.api(key = "...", voters = voters, states = "all", geo = "tract", -#' demo = TRUE)} -#' -#' @references -#' Relies on getCensusApi, getCensusApi2, and vecToChunk functions authored by Nicholas Nagle, -#' available \href{http://rstudio-pubs-static.s3.amazonaws.com/19337_2e7f827190514c569ea136db788ce850.html}{here}. -#' -#' @export -census.helper.api.online <- function(key, voters, states = "all", geo = "tract", demo = FALSE) { - - if (missing(key)) { - stop('Must enter U.S. Census API key, which can be requested at http://api.census.gov/data/key_signup.html.') - } - - states <- toupper(states) - if (states == "ALL") { - states <- toupper(as.character(unique(voters$state))) - } - - df.out <- NULL - - for (s in 1:length(states)) { - - print(paste("State ", s, " of ", length(states), ": ", states[s], sep = "")) - fips.codes <- get("State.FIPS") - state.fips <- fips.codes[fips.codes$State == states[s], "FIPS"] - - if (demo == F) { - num <- ifelse(3:10 != 10, paste("0", 3:10, sep = ""), "10") - vars <- paste("P00500", num, sep = "") - } - - if (demo == T) { - eth.let <- c("I", "B", "H", "D", "E", "F", "C") - num <- as.character(c(c("01", "07", "08", "09"), seq(10, 25), seq(31, 49))) - vars <- NULL - for (e in 1:length(eth.let)) { - vars <- c(vars, paste("P012", eth.let[e], "0", num, sep = "")) - } - } - - if (geo == "county") { - geo.merge <- c("state", "county") - region <- paste("for=county:*&in=state:", state.fips, sep = "") - census <- 
getCensusApi("http://api.census.gov/data/2010/sf1?", - key = key, vars = vars, region = region) - } - - if (geo == "tract") { - - geo.merge <- c("state", "county", "tract") - - region_county <- paste("for=county:*&in=state:", state.fips, sep = "") - county_df <- getCensusApi("http://api.census.gov/data/2010/sf1?", key = key, vars = vars, region = region_county) - county_list <- county_df$county - - census <- NULL - for (c in 1:length(county_list)) { - print(paste("County ", c, " of ", length(county_list), ": ", county_list[c], sep = "")) - region_county <- paste("for=tract:*&in=state:", state.fips, "+county:", county_list[c], sep = "") - census.temp <- getCensusApi("http://api.census.gov/data/2010/sf1?", - key = key, vars = vars, region = region_county) - census <- rbind(census, census.temp) - } - rm(census.temp) - } - - if (geo == "block") { - - geo.merge <- c("state", "county", "tract", "block") - - region_county <- paste("for=county:*&in=state:", state.fips, sep = "") - county_df <- getCensusApi("http://api.census.gov/data/2010/sf1?", key = key, vars = vars, region = region_county) - county_list <- county_df$county - - census <- NULL - - for (c in 1:length(county_list)) { - print(paste("County ", c, " of ", length(county_list), ": ", county_list[c], sep = "")) - - region_tract <- paste("for=tract:*&in=state:", state.fips, "+county:", county_list[c], sep = "") - print(region_tract) - tract_df <- getCensusApi("http://api.census.gov/data/2010/sf1?", - key = key, vars = vars, region = region_tract) - tract_list <- tract_df$tract - - for (t in 1:length(tract_list)) { - print(paste("Tract ", t, " of ", length(tract_list), ": ", tract_list[t], sep = "")) - - region_block <- paste("for=block:*&in=state:", state.fips, "+county:", county_list[c], "+tract:", tract_list[t], sep = "") - census.temp <- getCensusApi("http://api.census.gov/data/2010/sf1?", - key = key, vars = vars, region = region_block) - census <- rbind(census, census.temp) - } - } - rm(census.temp) - - } - - 
if (demo == F) { - - census$state <- states[s] - - ## Calculate Pr(Geolocation | Race) - census$r_whi <- census$P0050003 / sum(census$P0050003) #Pr(Tract|White) - census$r_bla <- census$P0050004 / sum(census$P0050004) #Pr(Tract|Black) - census$r_his <- census$P0050010 / sum(census$P0050010) #Pr(Tract|Latino) - census$r_asi <- (census$P0050006 + census$P0050007) / (sum(census$P0050006) + sum(census$P0050007)) #Pr(Tract | Asian or NH/PI) - census$r_oth <- (census$P0050005 + census$P0050008 + census$P0050009) / (sum(census$P0050005) + sum(census$P0050008) + sum(census$P0050009)) #Pr(Tract | AI/AN, Other, or Mixed) - - drop <- grep("P005", names(census)) - voters.census <- merge(voters[toupper(voters$state) == toupper(states[s]), ], census[, -drop], by = geo.merge, all.x = T) - - } - - if (demo == T) { - - census$state <- states[s] - - ## Calculate Pr(Tract, Sex, Age Category | Race) - eth.cen <- c("whi", "bla", "his", "asi", "oth") - eth.let <- c("I", "B", "H", "D", "F") - sex <- c("mal", "fem") - age.cat <- c(seq(5, 23), seq(5, 23)) - age.cen <- as.character(c(c("07", "08", "09"), seq(10, 25), seq(31, 49))) - - for (i in 1:length(eth.cen)) { - for (k in 1:length(sex)) { - for (j in 1:19) { - if (k == 2) { - j <- j + 19 - } - if (i != 4 & i != 5) { - census[paste("r", sex[k], age.cat[j], eth.cen[i], sep = "_")] <- census[paste("P012", eth.let[i], "0", age.cen[j], sep = "")] / sum(census[paste("P012", eth.let[i], "001", sep = "")]) - } - if (i == 4) { - ## Combine Asian and Native Hawaiian/Pacific Islander - census[paste("r", sex[k], age.cat[j], eth.cen[i], sep = "_")] <- (census[paste("P012D0", age.cen[j], sep = "")] + census[paste("P012E0", age.cen[j], sep = "")]) / sum(census$P012D001 + census$P012E001) - } - if (i == 5) { - ## Combine American India/Alaska Native and Other - census[paste("r", sex[k], age.cat[j], eth.cen[i], sep = "_")] <- (census[paste("P012C0", age.cen[j], sep = "")] + census[paste("P012F0", age.cen[j], sep = "")]) / sum(census$P012C001 + 
census$P012F001) - } - } - } - } - - drop <- grep("P012", names(census)) - voters.census <- merge(voters[toupper(voters$state) == toupper(states[s]), ], census[, -drop], by = geo.merge, all.x = T) - - ## Add Census Age Categories - voters.census$agecat <- NA - voters.census$agecat <- ifelse(voters.census$age >= 18 & voters.census$age <= 19, 5, voters.census$agecat) - voters.census$agecat <- ifelse(voters.census$age == 20, 6, voters.census$agecat) - voters.census$agecat <- ifelse(voters.census$age == 21, 7, voters.census$agecat) - voters.census$agecat <- ifelse(voters.census$age >= 22 & voters.census$age <= 24, 8, voters.census$agecat) - voters.census$agecat <- ifelse(voters.census$age >= 25 & voters.census$age <= 29, 9, voters.census$agecat) - voters.census$agecat <- ifelse(voters.census$age >= 30 & voters.census$age <= 34, 10, voters.census$agecat) - voters.census$agecat <- ifelse(voters.census$age >= 35 & voters.census$age <= 39, 11, voters.census$agecat) - voters.census$agecat <- ifelse(voters.census$age >= 40 & voters.census$age <= 44, 12, voters.census$agecat) - voters.census$agecat <- ifelse(voters.census$age >= 45 & voters.census$age <= 49, 13, voters.census$agecat) - voters.census$agecat <- ifelse(voters.census$age >= 50 & voters.census$age <= 54, 14, voters.census$agecat) - voters.census$agecat <- ifelse(voters.census$age >= 55 & voters.census$age <= 59, 15, voters.census$agecat) - voters.census$agecat <- ifelse(voters.census$age >= 60 & voters.census$age <= 61, 16, voters.census$agecat) - voters.census$agecat <- ifelse(voters.census$age >= 62 & voters.census$age <= 64, 17, voters.census$agecat) - voters.census$agecat <- ifelse(voters.census$age >= 65 & voters.census$age <= 66, 18, voters.census$agecat) - voters.census$agecat <- ifelse(voters.census$age >= 67 & voters.census$age <= 69, 19, voters.census$agecat) - voters.census$agecat <- ifelse(voters.census$age >= 70 & voters.census$age <= 74, 20, voters.census$agecat) - voters.census$agecat <- 
ifelse(voters.census$age >= 75 & voters.census$age <= 79, 21, voters.census$agecat) - voters.census$agecat <- ifelse(voters.census$age >= 80 & voters.census$age <= 84, 22, voters.census$agecat) - voters.census$agecat <- ifelse(voters.census$age >= 85, 23, voters.census$agecat) - - for (i in 1:length(eth.cen)) { - for (j in 5:23) { - voters.census[voters.census$sex == 0 & voters.census$agecat == j, - paste("r", eth.cen[i], sep = "_")] <- voters.census[paste("r_mal", j, eth.cen[i], sep = "_")] - voters.census[voters.census$sex == 1 & voters.census$agecat == j, - paste("r", eth.cen[i], sep = "_")] <- voters.census[paste("r_fem", j, eth.cen[i], sep = "_")] - } - } - - drop <- c(grep("_mal_", names(voters.census)), grep("_fem_", names(voters.census))) - voters.census <- voters.census[, -drop] - } - - df.out <- as.data.frame(rbind(df.out, voters.census[names(voters.census) != "agecat"])) - - } - - return(df.out) -} - - -#' Census helper function. -#' -#' \code{census.helper.api.local} links user-input dataset with Census data. -#' -#' This function allows users to link their geocoded dataset (e.g., voter file) -#' with U.S. Census 2010 data. The function extracts Census Summary File data -#' at the tract or block level using the 'UScensus2010' package. Census data -#' calculated are Pr(Geolocation | Race) where geolocation is tract or block. -#' -#' @param voters An object of class \code{data.frame}. Must contain field(s) -#' named \code{\var{county}}, \code{\var{tract}}, and/or \code{\var{block}} -#' specifying geolocation. These should be character variables that match up with -#' U.S. Census categories. County should be three characters (e.g., "031" not "31"), -#' tract should be six characters, and block should be four characters. -#' @param states A state to use Census data for, e.g. \code{c("NJ", "NY")}. -#' Default is \code{"NA"}. -#' @param geo A character object specifying what aggregation level to use. -#' Use \code{"county"}, \code{"tract"}, or \code{"block"}. 
Default is \code{"tract"}. -#' Warning: extracting block-level data takes very long. -#' @param demo A \code{TRUE}/\code{FALSE} object indicating whether to condition on -#' demographics (i.e., age and sex) or not. If \code{TRUE}, function will return -#' Pr(Geolocation, Age, Sex | Race). If \code{FALSE}, function wil return -#' Pr(Geolocation | Race). Default is \code{FALSE}. -#' @param census.data A optional census object holding census data that is already -#' provided. If missing, function will retrive the census data online. -#' @return Output will be an object of class \code{data.frame}. It will -#' consist of the original user-input data with additional columns of -#' Census data. -#' -#' @examples -#' \dontshow{data(voters)} -#' \dontrun{census.helper.api.local(voters = voters, states = "nj", geo = "block", census.data = x)} -#' \dontrun{census.helper.api.local(voters = voters, states = "all", geo = "tract", demo = TRUE, census.data = x)} -#' -#' @references -#' Relies on getCensusApi, getCensusApi2, and vecToChunk functions authored by Nicholas Nagle, -#' available \href{http://rstudio-pubs-static.s3.amazonaws.com/19337_2e7f827190514c569ea136db788ce850.html}{here}. -#' -#' @export -census.helper.api.local <- function(voters, states = "all", geo = "tract", demo = FALSE, census.data = NA) { - - if (is.na(census.data)) { - stop('Without pre-downloaded census data, please use census.helper.api to access http://api.census.gov/data/key_signup.html.') - } - - states <- toupper(states) - if (states == "ALL") { - states <- toupper(as.character(unique(voters$state))) - } - - df.out <- NULL - - for (s in 1:length(states)) { - - state <- states[s] - - if ((is.null(census.data[[state]])) || (census.data[[state]]$demo != demo)) { - lstate <- paste(names(census.data), collapse=";") - print(paste("The census object includes", lstate, ". 
But the state here is", state)) - stop('Mismatch census data, please provide matching census data or use census.helper.api to access http://api.census.gov/data/key_signup.html.') - } - - if (geo == "county") { - geo.merge <- c("state", "county") - census <- census.data[[state]]$county - } - - if (geo == "tract") { - geo.merge <- c("state", "county", "tract") - census <- census.data[[state]]$tract - } - - if (geo == "block") { - geo.merge <- c("state", "county", "tract", "block") - census <- census.data[[state]]$block - } - - if (demo == F) { - - census$state <- state - - ## Calculate Pr(Geolocation | Race) - census$r_whi <- census$P0050003 / sum(census$P0050003) #Pr(Tract|White) - census$r_bla <- census$P0050004 / sum(census$P0050004) #Pr(Tract|Black) - census$r_his <- census$P0050010 / sum(census$P0050010) #Pr(Tract|Latino) - census$r_asi <- (census$P0050006 + census$P0050007) / (sum(census$P0050006) + sum(census$P0050007)) #Pr(Tract | Asian or NH/PI) - census$r_oth <- (census$P0050005 + census$P0050008 + census$P0050009) / (sum(census$P0050005) + sum(census$P0050008) + sum(census$P0050009)) #Pr(Tract | AI/AN, Other, or Mixed) - - drop <- grep("P005", names(census)) - voters.census <- merge(voters[toupper(voters$state) == toupper(state), ], census[, -drop], by = geo.merge, all.x = T) - - } - - if (demo == T) { - - census$state <- state - - ## Calculate Pr(Tract, Sex, Age Category | Race) - eth.cen <- c("whi", "bla", "his", "asi", "oth") - eth.let <- c("I", "B", "H", "D", "F") - sex <- c("mal", "fem") - age.cat <- c(seq(5, 23), seq(5, 23)) - age.cen <- as.character(c(c("07", "08", "09"), seq(10, 25), seq(31, 49))) - - for (i in 1:length(eth.cen)) { - for (k in 1:length(sex)) { - for (j in 1:19) { - if (k == 2) { - j <- j + 19 - } - if (i != 4 & i != 5) { - census[paste("r", sex[k], age.cat[j], eth.cen[i], sep = "_")] <- census[paste("P012", eth.let[i], "0", age.cen[j], sep = "")] / sum(census[paste("P012", eth.let[i], "001", sep = "")]) - } - if (i == 4) { - ## 
Combine Asian and Native Hawaiian/Pacific Islander - census[paste("r", sex[k], age.cat[j], eth.cen[i], sep = "_")] <- (census[paste("P012D0", age.cen[j], sep = "")] + census[paste("P012E0", age.cen[j], sep = "")]) / sum(census$P012D001 + census$P012E001) - } - if (i == 5) { - ## Combine American India/Alaska Native and Other - census[paste("r", sex[k], age.cat[j], eth.cen[i], sep = "_")] <- (census[paste("P012C0", age.cen[j], sep = "")] + census[paste("P012F0", age.cen[j], sep = "")]) / sum(census$P012C001 + census$P012F001) - } - } - } - } - - drop <- grep("P012", names(census)) - voters.census <- merge(voters[toupper(voters$state) == toupper(state), ], census[, -drop], by = geo.merge, all.x = T) - - ## Add Census Age Categories - voters.census$agecat <- NA - voters.census$agecat <- ifelse(voters.census$age >= 18 & voters.census$age <= 19, 5, voters.census$agecat) - voters.census$agecat <- ifelse(voters.census$age == 20, 6, voters.census$agecat) - voters.census$agecat <- ifelse(voters.census$age == 21, 7, voters.census$agecat) - voters.census$agecat <- ifelse(voters.census$age >= 22 & voters.census$age <= 24, 8, voters.census$agecat) - voters.census$agecat <- ifelse(voters.census$age >= 25 & voters.census$age <= 29, 9, voters.census$agecat) - voters.census$agecat <- ifelse(voters.census$age >= 30 & voters.census$age <= 34, 10, voters.census$agecat) - voters.census$agecat <- ifelse(voters.census$age >= 35 & voters.census$age <= 39, 11, voters.census$agecat) - voters.census$agecat <- ifelse(voters.census$age >= 40 & voters.census$age <= 44, 12, voters.census$agecat) - voters.census$agecat <- ifelse(voters.census$age >= 45 & voters.census$age <= 49, 13, voters.census$agecat) - voters.census$agecat <- ifelse(voters.census$age >= 50 & voters.census$age <= 54, 14, voters.census$agecat) - voters.census$agecat <- ifelse(voters.census$age >= 55 & voters.census$age <= 59, 15, voters.census$agecat) - voters.census$agecat <- ifelse(voters.census$age >= 60 & voters.census$age 
<= 61, 16, voters.census$agecat) - voters.census$agecat <- ifelse(voters.census$age >= 62 & voters.census$age <= 64, 17, voters.census$agecat) - voters.census$agecat <- ifelse(voters.census$age >= 65 & voters.census$age <= 66, 18, voters.census$agecat) - voters.census$agecat <- ifelse(voters.census$age >= 67 & voters.census$age <= 69, 19, voters.census$agecat) - voters.census$agecat <- ifelse(voters.census$age >= 70 & voters.census$age <= 74, 20, voters.census$agecat) - voters.census$agecat <- ifelse(voters.census$age >= 75 & voters.census$age <= 79, 21, voters.census$agecat) - voters.census$agecat <- ifelse(voters.census$age >= 80 & voters.census$age <= 84, 22, voters.census$agecat) - voters.census$agecat <- ifelse(voters.census$age >= 85, 23, voters.census$agecat) - - for (i in 1:length(eth.cen)) { - for (j in 5:23) { - voters.census[voters.census$sex == 0 & voters.census$agecat == j, - paste("r", eth.cen[i], sep = "_")] <- voters.census[paste("r_mal", j, eth.cen[i], sep = "_")] - voters.census[voters.census$sex == 1 & voters.census$agecat == j, - paste("r", eth.cen[i], sep = "_")] <- voters.census[paste("r_fem", j, eth.cen[i], sep = "_")] - } - } - - drop <- c(grep("_mal_", names(voters.census)), grep("_fem_", names(voters.census))) - voters.census <- voters.census[, -drop] - } - - df.out <- as.data.frame(rbind(df.out, voters.census[names(voters.census) != "agecat"])) - - } - - return(df.out) -} - - -#' Census helper function. -#' -#' \code{census.helper.api} links user-input dataset with Census data. -#' -#' This function allows users to link their geocoded dataset (e.g., voter file) -#' with U.S. Census 2010 data. The function extracts Census Summary File data -#' at the tract or block level using the 'UScensus2010' package. Census data -#' calculated are Pr(Geolocation | Race) where geolocation is tract or block. -#' -#' @param key A required character object. 
Must contain user's Census API -#' key, which can be requested \href{http://api.census.gov/data/key_signup.html}{here}. -#' @param voters An object of class \code{data.frame}. Must contain field(s) -#' named \code{\var{county}}, \code{\var{tract}}, and/or \code{\var{block}} -#' specifying geolocation. These should be character variables that match up with -#' U.S. Census categories. County should be three characters (e.g., "031" not "31"), -#' tract should be six characters, and block should be four characters. -#' @param states A character vector specifying which states to extract -#' Census data for, e.g. \code{c("NJ", "NY")}. Default is \code{"all"}, which extracts -#' Census data for all states contained in user-input data. -#' @param geo A character object specifying what aggregation level to use. -#' Use \code{"county"}, \code{"tract"}, or \code{"block"}. Default is \code{"tract"}. -#' Warning: extracting block-level data takes very long. -#' @param demo A \code{TRUE}/\code{FALSE} object indicating whether to condition on -#' demographics (i.e., age and sex) or not. If \code{TRUE}, function will return -#' Pr(Geolocation, Age, Sex | Race). If \code{FALSE}, function wil return -#' Pr(Geolocation | Race). Default is \code{FALSE}. -#' @param census.data A optional census object holding census data that is already -#' provided. If missing, function will retrive the census data online. -#' @return Output will be an object of class \code{data.frame}. It will -#' consist of the original user-input data with additional columns of -#' Census data. 
-#' -#' @examples -#' \dontshow{data(voters)} -#' \dontrun{census.helper.api(key = "...", voters = voters, states = "nj", geo = "block")} -#' \dontrun{census.helper.api(key = "...", voters = voters, states = "all", geo = "tract", -#' demo = TRUE)} -#' -#' @references -#' Relies on getCensusApi, getCensusApi2, and vecToChunk functions authored by Nicholas Nagle, -#' available \href{http://rstudio-pubs-static.s3.amazonaws.com/19337_2e7f827190514c569ea136db788ce850.html}{here}. -#' -#' @export -census.helper.api <- function(key, voters, states = "all", geo = "tract", demo = FALSE, census.data = NA) { - - if (is.na(census.data) || (typeof(census.data) != "list")) { - toDownload = TRUE - } else { - toDownload = FALSE - } - - if (toDownload) { - if (missing(key)) { - stop('Must enter U.S. Census API key, which can be requested at http://api.census.gov/data/key_signup.html.') - } - } - - states <- toupper(states) - if (states == "ALL") { - states <- toupper(as.character(unique(voters$state))) - } - - df.out <- NULL - - for (s in 1:length(states)) { - - state <- toupper(states[s]) - - if (geo == "county") { - geo.merge <- c("state", "county") - if ((toDownload) || (is.null(census.data[[state]])) || (census.data[[state]]$demo != demo)) { - census <- censusData(key, state, geo = "county", demo) - } else { - census <- census.data[[state]]$county - } - } - - if (geo == "tract") { - geo.merge <- c("state", "county", "tract") - if ((toDownload) || (is.null(census.data[[state]])) || (census.data[[state]]$demo != demo)) { - census <- censusData(key, state, geo = "tract", demo) - } else { - census <- census.data[[state]]$tract - } - } - - if (geo == "block") { - geo.merge <- c("state", "county", "tract", "block") - if ((toDownload) || (is.null(census.data[[state]])) || (census.data[[state]]$demo != demo)) { - census <- censusData(key, state, geo = "block", demo) - } else { - census <- census.data[[state]]$block - } - } - - if (demo == F) { - - census$state <- state - - ## Calculate 
Pr(Geolocation | Race) - census$r_whi <- census$P0050003 / sum(census$P0050003) #Pr(Tract|White) - census$r_bla <- census$P0050004 / sum(census$P0050004) #Pr(Tract|Black) - census$r_his <- census$P0050010 / sum(census$P0050010) #Pr(Tract|Latino) - census$r_asi <- (census$P0050006 + census$P0050007) / (sum(census$P0050006) + sum(census$P0050007)) #Pr(Tract | Asian or NH/PI) - census$r_oth <- (census$P0050005 + census$P0050008 + census$P0050009) / (sum(census$P0050005) + sum(census$P0050008) + sum(census$P0050009)) #Pr(Tract | AI/AN, Other, or Mixed) - - drop <- grep("P005", names(census)) - voters.census <- merge(voters[toupper(voters$state) == toupper(states[s]), ], census[, -drop], by = geo.merge, all.x = T) - - } - - if (demo == T) { - - census$state <- state - - ## Calculate Pr(Tract, Sex, Age Category | Race) - eth.cen <- c("whi", "bla", "his", "asi", "oth") - eth.let <- c("I", "B", "H", "D", "F") - sex <- c("mal", "fem") - age.cat <- c(seq(5, 23), seq(5, 23)) - age.cen <- as.character(c(c("07", "08", "09"), seq(10, 25), seq(31, 49))) - - for (i in 1:length(eth.cen)) { - for (k in 1:length(sex)) { - for (j in 1:19) { - if (k == 2) { - j <- j + 19 - } - if (i != 4 & i != 5) { - census[paste("r", sex[k], age.cat[j], eth.cen[i], sep = "_")] <- census[paste("P012", eth.let[i], "0", age.cen[j], sep = "")] / sum(census[paste("P012", eth.let[i], "001", sep = "")]) - } - if (i == 4) { - ## Combine Asian and Native Hawaiian/Pacific Islander - census[paste("r", sex[k], age.cat[j], eth.cen[i], sep = "_")] <- (census[paste("P012D0", age.cen[j], sep = "")] + census[paste("P012E0", age.cen[j], sep = "")]) / sum(census$P012D001 + census$P012E001) - } - if (i == 5) { - ## Combine American India/Alaska Native and Other - census[paste("r", sex[k], age.cat[j], eth.cen[i], sep = "_")] <- (census[paste("P012C0", age.cen[j], sep = "")] + census[paste("P012F0", age.cen[j], sep = "")]) / sum(census$P012C001 + census$P012F001) - } - } - } - } - - drop <- grep("P012", names(census)) - 
voters.census <- merge(voters[toupper(voters$state) == toupper(states[s]), ], census[, -drop], by = geo.merge, all.x = T) - - ## Add Census Age Categories - voters.census$agecat <- NA - voters.census$agecat <- ifelse(voters.census$age >= 18 & voters.census$age <= 19, 5, voters.census$agecat) - voters.census$agecat <- ifelse(voters.census$age == 20, 6, voters.census$agecat) - voters.census$agecat <- ifelse(voters.census$age == 21, 7, voters.census$agecat) - voters.census$agecat <- ifelse(voters.census$age >= 22 & voters.census$age <= 24, 8, voters.census$agecat) - voters.census$agecat <- ifelse(voters.census$age >= 25 & voters.census$age <= 29, 9, voters.census$agecat) - voters.census$agecat <- ifelse(voters.census$age >= 30 & voters.census$age <= 34, 10, voters.census$agecat) - voters.census$agecat <- ifelse(voters.census$age >= 35 & voters.census$age <= 39, 11, voters.census$agecat) - voters.census$agecat <- ifelse(voters.census$age >= 40 & voters.census$age <= 44, 12, voters.census$agecat) - voters.census$agecat <- ifelse(voters.census$age >= 45 & voters.census$age <= 49, 13, voters.census$agecat) - voters.census$agecat <- ifelse(voters.census$age >= 50 & voters.census$age <= 54, 14, voters.census$agecat) - voters.census$agecat <- ifelse(voters.census$age >= 55 & voters.census$age <= 59, 15, voters.census$agecat) - voters.census$agecat <- ifelse(voters.census$age >= 60 & voters.census$age <= 61, 16, voters.census$agecat) - voters.census$agecat <- ifelse(voters.census$age >= 62 & voters.census$age <= 64, 17, voters.census$agecat) - voters.census$agecat <- ifelse(voters.census$age >= 65 & voters.census$age <= 66, 18, voters.census$agecat) - voters.census$agecat <- ifelse(voters.census$age >= 67 & voters.census$age <= 69, 19, voters.census$agecat) - voters.census$agecat <- ifelse(voters.census$age >= 70 & voters.census$age <= 74, 20, voters.census$agecat) - voters.census$agecat <- ifelse(voters.census$age >= 75 & voters.census$age <= 79, 21, voters.census$agecat) - 
voters.census$agecat <- ifelse(voters.census$age >= 80 & voters.census$age <= 84, 22, voters.census$agecat) - voters.census$agecat <- ifelse(voters.census$age >= 85, 23, voters.census$agecat) - - for (i in 1:length(eth.cen)) { - for (j in 5:23) { - voters.census[voters.census$sex == 0 & voters.census$agecat == j, - paste("r", eth.cen[i], sep = "_")] <- voters.census[paste("r_mal", j, eth.cen[i], sep = "_")] - voters.census[voters.census$sex == 1 & voters.census$agecat == j, - paste("r", eth.cen[i], sep = "_")] <- voters.census[paste("r_fem", j, eth.cen[i], sep = "_")] - } - } - - drop <- c(grep("_mal_", names(voters.census)), grep("_fem_", names(voters.census))) - voters.census <- voters.census[, -drop] - } - - df.out <- as.data.frame(rbind(df.out, voters.census[names(voters.census) != "agecat"])) - - } - - return(df.out) -} diff --git a/R/census_geo_api.R b/R/census_geo_api.R new file mode 100644 index 0000000..80cc903 --- /dev/null +++ b/R/census_geo_api.R @@ -0,0 +1,234 @@ +#' Census Data download function. +#' +#' \code{census_geo_api} retrieves U.S. Census geographic data for a given state. +#' +#' This function allows users to download U.S. Census 2010 geographic data, +#' at either the county, tract, or block level, for a particular state. +#' +#' @param key A required character object. Must contain user's Census API +#' key, which can be requested \href{http://api.census.gov/data/key_signup.html}{here}. +#' @param state A required character object specifying which state to extract Census data for, +#' e.g., \code{"NJ"}. +#' @param geo A character object specifying what aggregation level to use. +#' Use \code{"county"}, \code{"tract"}, or \code{"block"}. Default is \code{"tract"}. +#' Warning: extracting block-level data takes very long. +#' @param age A \code{TRUE}/\code{FALSE} object indicating whether to condition on +#' age or not. If \code{FALSE} (default), function will return Pr(Geolocation | Race). 
+#' If \code{TRUE}, function will return Pr(Geolocation, Age | Race). +#' If \code{\var{sex}} is also \code{TRUE}, function will return Pr(Geolocation, Age, Sex | Race). +#' @param sex A \code{TRUE}/\code{FALSE} object indicating whether to condition on +#' sex or not. If \code{FALSE} (default), function will return Pr(Geolocation | Race). +#' If \code{TRUE}, function will return Pr(Geolocation, Sex | Race). +#' If \code{\var{age}} is also \code{TRUE}, function will return Pr(Geolocation, Age, Sex | Race). +#' @return Output will be an object of class \code{list}, indexed by state names. It will +#' consist of the original user-input data with additional columns of Census geographic data. +#' +#' @examples +#' \dontshow{data(voters)} +#' \dontrun{census_geo_api(key = "...", state = "NJ", geo = "block")} +#' \dontrun{census_geo_api(key = "...", state = "FL", geo = "tract", age = TRUE, sex = TRUE)} +#' +#' @references +#' Relies on get_census_api, get_census_api_2, and vec_to_chunk functions authored by Nicholas Nagle, +#' available \href{http://rstudio-pubs-static.s3.amazonaws.com/19337_2e7f827190514c569ea136db788ce850.html}{here}. +#' +#' @export +census_geo_api <- function(key, state, geo = "tract", age = FALSE, sex = FALSE) { + + if (missing(key)) { + stop('Must enter U.S. 
Census API key, which can be requested at http://api.census.gov/data/key_signup.html.') + } + + state <- toupper(state) + + df.out <- NULL + + fips.codes <- get("State.FIPS") + state.fips <- fips.codes[fips.codes$State == state, "FIPS"] + + if (age == F & sex == F) { + num <- ifelse(3:10 != 10, paste("0", 3:10, sep = ""), "10") + vars <- paste("P00500", num, sep = "") + } + + if (age == F & sex == T) { + eth.let <- c("I", "B", "H", "D", "E", "F", "C") + num <- as.character(c("01", "02", "26")) + vars <- NULL + for (e in 1:length(eth.let)) { + vars <- c(vars, paste("P012", eth.let[e], "0", num, sep = "")) + } + } + + if (age == T & sex == F) { + eth.let <- c("I", "B", "H", "D", "E", "F", "C") + num <- as.character(c(c("01", "03", "04", "05", "06", "07", "08", "09"), seq(10, 25), seq(27, 49))) + vars <- NULL + for (e in 1:length(eth.let)) { + vars <- c(vars, paste("P012", eth.let[e], "0", num, sep = "")) + } + } + + if (age == T & sex == T) { + eth.let <- c("I", "B", "H", "D", "E", "F", "C") + num <- as.character(c(c("01", "03", "04", "05", "06", "07", "08", "09"), seq(10, 25), seq(27, 49))) + vars <- NULL + for (e in 1:length(eth.let)) { + vars <- c(vars, paste("P012", eth.let[e], "0", num, sep = "")) + } + } + + if (geo == "county") { + geo.merge <- c("state", "county") + region <- paste("for=county:*&in=state:", state.fips, sep = "") + census <- get_census_api("http://api.census.gov/data/2010/sf1?", + key = key, vars = vars, region = region) + } + + if (geo == "tract") { + + geo.merge <- c("state", "county", "tract") + + region_county <- paste("for=county:*&in=state:", state.fips, sep = "") + county_df <- get_census_api("http://api.census.gov/data/2010/sf1?", key = key, vars = vars, region = region_county) + county_list <- county_df$county + + census <- NULL + for (c in 1:length(county_list)) { + print(paste("County ", c, " of ", length(county_list), ": ", county_list[c], sep = "")) + region_county <- paste("for=tract:*&in=state:", state.fips, "+county:", 
county_list[c], sep = "") + census.temp <- get_census_api("http://api.census.gov/data/2010/sf1?", + key = key, vars = vars, region = region_county) + census <- rbind(census, census.temp) + } + rm(census.temp) + } + + if (geo == "block") { + + geo.merge <- c("state", "county", "tract", "block") + + region_county <- paste("for=county:*&in=state:", state.fips, sep = "") + county_df <- get_census_api("http://api.census.gov/data/2010/sf1?", key = key, vars = vars, region = region_county) + county_list <- county_df$county + + census <- NULL + + for (c in 1:length(county_list)) { + print(paste("County ", c, " of ", length(county_list), ": ", county_list[c], sep = "")) + + region_tract <- paste("for=tract:*&in=state:", state.fips, "+county:", county_list[c], sep = "") + print(region_tract) + tract_df <- get_census_api("http://api.census.gov/data/2010/sf1?", + key = key, vars = vars, region = region_tract) + tract_list <- tract_df$tract + + for (t in 1:length(tract_list)) { + print(paste("Tract ", t, " of ", length(tract_list), ": ", tract_list[t], sep = "")) + + region_block <- paste("for=block:*&in=state:", state.fips, "+county:", county_list[c], "+tract:", tract_list[t], sep = "") + census.temp <- get_census_api("http://api.census.gov/data/2010/sf1?", + key = key, vars = vars, region = region_block) + census <- rbind(census, census.temp) + } + } + + rm(census.temp) + + } + + census$state <- state + + if (age == F & sex == F) { + + ## Calculate Pr(Geolocation | Race) + census$r_whi <- census$P0050003 / sum(census$P0050003) #Pr(Tract|White) + census$r_bla <- census$P0050004 / sum(census$P0050004) #Pr(Tract|Black) + census$r_his <- census$P0050010 / sum(census$P0050010) #Pr(Tract|Latino) + census$r_asi <- (census$P0050006 + census$P0050007) / (sum(census$P0050006) + sum(census$P0050007)) #Pr(Tract | Asian or NH/PI) + census$r_oth <- (census$P0050005 + census$P0050008 + census$P0050009) / (sum(census$P0050005) + sum(census$P0050008) + sum(census$P0050009)) #Pr(Tract | AI/AN, 
Other, or Mixed) + + } + + if (age == F & sex == T) { + + ## Calculate Pr(Geolocation, Sex | Race) + eth.cen <- c("whi", "bla", "his", "asi", "oth") + eth.let <- c("I", "B", "H", "D", "F") + + for (i in 1:length(eth.cen)) { + if (i != 4 & i != 5) { + census[paste("r_mal", eth.cen[i], sep = "_")] <- census[paste("P012", eth.let[i], "002", sep = "")] / sum(census[paste("P012", eth.let[i], "001", sep = "")]) + census[paste("r_fem", eth.cen[i], sep = "_")] <- census[paste("P012", eth.let[i], "026", sep = "")] / sum(census[paste("P012", eth.let[i], "001", sep = "")]) + } + if (i == 4) { + ## Combine Asian and Native Hawaiian/Pacific Islander + census[paste("r_mal", eth.cen[i], sep = "_")] <- (census$P012D002 + census$P012E002) / sum(census$P012D001 + census$P012E001) + census[paste("r_fem", eth.cen[i], sep = "_")] <- (census$P012D026 + census$P012E026) / sum(census$P012D001 + census$P012E001) + } + if (i == 5) { + ## Combine American India/Alaska Native and Other + census[paste("r_mal", eth.cen[i], sep = "_")] <- (census$P012C002 + census$P012F002) / sum(census$P012C001 + census$P012F001) + census[paste("r_fem", eth.cen[i], sep = "_")] <- (census$P012C026 + census$P012F026) / sum(census$P012C001 + census$P012F001) + } + } + } + + if (age == T & sex == F) { + + ## Calculate Pr(Geolocation, Age Category | Race) + eth.cen <- c("whi", "bla", "his", "asi", "oth") + eth.let <- c("I", "B", "H", "D", "F") + age.cat <- c(seq(1, 23), seq(1, 23)) + age.cen <- as.character(c(c("03", "04", "05", "06", "07", "08", "09"), seq(10, 25), seq(27, 49))) + + for (i in 1:length(eth.cen)) { + for (j in 1:23) { + if (i != 4 & i != 5) { + census[paste("r", age.cat[j], eth.cen[i], sep = "_")] <- (census[paste("P012", eth.let[i], "0", age.cen[j], sep = "")] + census[paste("P012", eth.let[i], "0", age.cen[j + 23], sep = "")]) / sum(census[paste("P012", eth.let[i], "001", sep = "")]) + } + if (i == 4) { + ## Combine Asian and Native Hawaiian/Pacific Islander + census[paste("r", age.cat[j], 
eth.cen[i], sep = "_")] <- (census[paste("P012D0", age.cen[j], sep = "")] + census[paste("P012D0", age.cen[j + 23], sep = "")] + census[paste("P012E0", age.cen[j], sep = "")] + census[paste("P012E0", age.cen[j + 23], sep = "")]) / sum(census$P012D001 + census$P012E001) + } + if (i == 5) { + ## Combine American India/Alaska Native and Other + census[paste("r", age.cat[j], eth.cen[i], sep = "_")] <- (census[paste("P012C0", age.cen[j], sep = "")] + census[paste("P012C0", age.cen[j + 23], sep = "")] + census[paste("P012F0", age.cen[j], sep = "")] + census[paste("P012F0", age.cen[j + 23], sep = "")]) / sum(census$P012C001 + census$P012F001) + } + } + } + } + + if (age == T & sex == T) { + + ## Calculate Pr(Geolocation, Sex, Age Category | Race) + eth.cen <- c("whi", "bla", "his", "asi", "oth") + eth.let <- c("I", "B", "H", "D", "F") + sex.let <- c("mal", "fem") + age.cat <- c(seq(1, 23), seq(1, 23)) + age.cen <- as.character(c(c("03", "04", "05", "06", "07", "08", "09"), seq(10, 25), seq(27, 49))) + + for (i in 1:length(eth.cen)) { + for (k in 1:length(sex.let)) { + for (j in 1:23) { + if (k == 2) { + j <- j + 23 + } + if (i != 4 & i != 5) { + census[paste("r", sex.let[k], age.cat[j], eth.cen[i], sep = "_")] <- census[paste("P012", eth.let[i], "0", age.cen[j], sep = "")] / sum(census[paste("P012", eth.let[i], "001", sep = "")]) + } + if (i == 4) { + ## Combine Asian and Native Hawaiian/Pacific Islander + census[paste("r", sex.let[k], age.cat[j], eth.cen[i], sep = "_")] <- (census[paste("P012D0", age.cen[j], sep = "")] + census[paste("P012E0", age.cen[j], sep = "")]) / sum(census$P012D001 + census$P012E001) + } + if (i == 5) { + ## Combine American India/Alaska Native and Other + census[paste("r", sex.let[k], age.cat[j], eth.cen[i], sep = "_")] <- (census[paste("P012C0", age.cen[j], sep = "")] + census[paste("P012F0", age.cen[j], sep = "")]) / sum(census$P012C001 + census$P012F001) + } + } + } + } + } + + return(census) +} diff --git a/R/census_helper.R 
b/R/census_helper.R
new file mode 100644
index 0000000..2e12c20
--- /dev/null
+++ b/R/census_helper.R
@@ -0,0 +1,206 @@
+#' Census helper function.
+#'
+#' \code{census_helper} links user-input dataset with Census geographic data.
+#'
+#' This function allows users to link their geocoded dataset (e.g., voter file)
+#' with U.S. Census 2010 data. Census data are taken from \code{\var{census.data}}
+#' when a matching pre-saved object is supplied, and are otherwise downloaded with
+#' \code{census_geo_api}. Census data calculated are Pr(Geolocation | Race),
+#' optionally also by age and/or sex, where geolocation is county, tract, or block.
+#'
+#' @param key A character object containing user's Census API key,
+#' which can be requested \href{http://api.census.gov/data/key_signup.html}{here}.
+#' Required whenever Census data have to be downloaded.
+#' @param voter.file An object of class \code{data.frame}. Must contain a field named
+#' \code{\var{state}} and field(s) named \code{\var{county}}, \code{\var{tract}}, and/or \code{\var{block}}
+#' specifying geolocation. These should be character variables that match up with
+#' U.S. Census categories. County should be three characters (e.g., "031" not "31"),
+#' tract should be six characters, and block should be four characters.
+#' If \code{\var{age}} is \code{TRUE}, must also contain a numeric field named \code{\var{age}}.
+#' If \code{\var{sex}} is \code{TRUE}, must also contain a field named \code{\var{sex}},
+#' coded 0 for male and 1 for female.
+#' @param states A character vector specifying which states to extract
+#' Census data for, e.g. \code{c("NJ", "NY")}. Default is \code{"all"}, which extracts
+#' Census data for all states contained in user-input data.
+#' @param geo A character object specifying what aggregation level to use.
+#' Use \code{"county"}, \code{"tract"}, or \code{"block"}. Default is \code{"tract"}.
+#' Warning: extracting block-level data takes very long.
+#' @param age A \code{TRUE}/\code{FALSE} object indicating whether to condition on
+#' age or not. If \code{FALSE} (default), function will return Pr(Geolocation | Race).
+#' If \code{TRUE}, function will return Pr(Geolocation, Age | Race).
+#' If \code{\var{sex}} is also \code{TRUE}, function will return Pr(Geolocation, Age, Sex | Race).
+#' @param sex A \code{TRUE}/\code{FALSE} object indicating whether to condition on
+#' sex or not. If \code{FALSE} (default), function will return Pr(Geolocation | Race).
+#' If \code{TRUE}, function will return Pr(Geolocation, Sex | Race).
+#' If \code{\var{age}} is also \code{TRUE}, function will return Pr(Geolocation, Age, Sex | Race).
+#' @param census.data An optional census object of class \code{list} containing
+#' pre-saved Census geographic data. Can be created using \code{get_census_data} function.
+#' If \code{\var{census.data}} is provided, the \code{\var{age}} element must have the same value
+#' as the \code{\var{age}} option specified in this function (i.e., \code{TRUE} in both or
+#' \code{FALSE} in both). Similarly, the \code{\var{sex}} element in the object provided in
+#' \code{\var{census.data}} must have the same value as the \code{\var{sex}} option here.
+#' If \code{\var{census.data}} is missing or does not match, Census geographic data will be obtained via Census API.
+#' @return Output will be an object of class \code{data.frame}. It will
+#' consist of the original user-input data with additional columns of
+#' Census data.
+#'
+#' @examples
+#' \dontshow{data(voters)}
+#' \dontrun{census_helper(key = "...", voter.file = voters, states = "nj", geo = "block")}
+#' \dontrun{census_helper(key = "...", voter.file = voters, states = "all", geo = "tract", age = TRUE, sex = TRUE)}
+#'
+#' @export
+census_helper <- function(key, voter.file, states = "all", geo = "tract", age = FALSE, sex = FALSE, census.data = NA) {
+
+  geo <- match.arg(geo, c("county", "tract", "block"))
+  age <- isTRUE(as.logical(age))
+  sex <- isTRUE(as.logical(sex))
+
+  ## Any non-list value (including the default NA) means no pre-saved data.
+  ## is.list() is used because is.na() on a multi-state list returns one value
+  ## per state, which is not a valid condition for if().
+  toDownload <- !is.list(census.data)
+
+  if (toDownload && missing(key)) {
+    stop('Must enter U.S. Census API key, which can be requested at http://api.census.gov/data/key_signup.html.')
+  }
+
+  ## Only a lone "all" selects every state in the voter file; a longer vector
+  ## is taken as the list of states and must not be compared inside if().
+  states <- toupper(states)
+  if (length(states) == 1 && isTRUE(states == "ALL")) {
+    states <- toupper(as.character(unique(voter.file$state)))
+  }
+
+  geo.merge <- switch(geo,
+                      county = c("state", "county"),
+                      tract = c("state", "county", "tract"),
+                      block = c("state", "county", "tract", "block"))
+
+  required <- c(geo.merge, if (age) "age", if (sex) "sex")
+  absent <- setdiff(required, names(voter.file))
+  if (length(absent) > 0) {
+    stop("voter.file is missing required field(s): ", paste(absent, collapse = ", "))
+  }
+
+  if (age) {
+    ## Add Census Age Categories: 1 is under 5 and 23 is 85 and over.
+    ## age.breaks holds the lower bounds of categories 2 through 23, so
+    ## fractional ages fall in the category of their whole-year age.
+    age.breaks <- c(5, 10, 15, 18, 20, 21, 22, 25, 30, 35, 40, 45,
+                    50, 55, 60, 62, 65, 67, 70, 75, 80, 85)
+    voter.file$agecat <- findInterval(voter.file$age, age.breaks) + 1
+  }
+
+  race.names <- c("whi", "bla", "his", "asi", "oth")
+  ## P012 table suffixes pooled into each race group: asi combines Asian and
+  ## Native Hawaiian/Pacific Islander; oth combines American Indian/Alaska Native and Other.
+  race.letters <- list(whi = "I", bla = "B", his = "H",
+                       asi = c("D", "E"), oth = c("C", "F"))
+  ## NA is a placeholder so that the loops below run exactly once over a
+  ## dimension (sex or age) that is not conditioned on.
+  sex.labels <- if (sex) c("mal", "fem") else NA
+  age.cats <- if (age) 1:23 else NA
+  male.cells <- sprintf("%02d", 3:25)
+  female.cells <- sprintf("%02d", 27:49)
+
+  ## Name of a probability column, e.g., "r_whi", "r_fem_whi", or "r_mal_12_whi".
+  prob_name <- function(eth, k, j) {
+    paste(c("r", if (sex) sex.labels[k], if (age) j, eth), collapse = "_")
+  }
+
+  ## Codes of the P012 cells that make up sex k (1 = male, 2 = female) and age category j.
+  cell_codes <- function(k, j) {
+    if (!age) {
+      return(c("02", "26")[k])
+    }
+    if (!sex) {
+      return(c(male.cells[j], female.cells[j]))
+    }
+    if (k == 1) male.cells[j] else female.cells[j]
+  }
+
+  ## Named list of probability vectors, one per race group and sex/age cell.
+  race_probs <- function(census) {
+    if (!age && !sex) {
+      ## Calculate Pr(Geolocation | Race)
+      return(list(
+        r_whi = census$P0050003 / sum(census$P0050003),
+        r_bla = census$P0050004 / sum(census$P0050004),
+        r_his = census$P0050010 / sum(census$P0050010),
+        r_asi = (census$P0050006 + census$P0050007) / (sum(census$P0050006) + sum(census$P0050007)),
+        r_oth = (census$P0050005 + census$P0050008 + census$P0050009) / (sum(census$P0050005) + sum(census$P0050008) + sum(census$P0050009))
+      ))
+    }
+    pooled <- function(cols) Reduce(`+`, census[cols])
+    probs <- list()
+    for (eth in race.names) {
+      lets <- race.letters[[eth]]
+      denom <- sum(pooled(paste0("P012", lets, "001")))
+      for (k in seq_along(sex.labels)) {
+        for (j in age.cats) {
+          codes <- cell_codes(k, j)
+          cols <- paste0("P012", rep(lets, each = length(codes)), "0", codes)
+          probs[[prob_name(eth, k, j)]] <- pooled(cols) / denom
+        }
+      }
+    }
+    probs
+  }
+
+  keep.vars <- c(names(voter.file)[names(voter.file) != "agecat"],
+                 paste0("r_", race.names))
+  results <- vector("list", length(states))
+
+  for (s in seq_along(states)) {
+
+    print(paste0("State ", s, " of ", length(states), ": ", states[s]))
+    state <- states[s]
+
+    ## Pre-saved data are used only if they exist for this state and geography
+    ## and were built with the same age and sex options; otherwise download.
+    cached <- if (toDownload) NULL else census.data[[state]]
+    if (is.list(cached) && isTRUE(cached$age == age) && isTRUE(cached$sex == sex) &&
+        !is.null(cached[[geo]])) {
+      census <- cached[[geo]]
+    } else {
+      census <- census_geo_api(key, state, geo = geo, age = age, sex = sex)
+    }
+    census$state <- state
+
+    ## Merge only the geographic keys and the probabilities, so that raw
+    ## Census columns cannot clash with fields of the voter file.
+    census.probs <- census[geo.merge]
+    probs <- race_probs(census)
+    census.probs[names(probs)] <- probs
+    voters.census <- merge(voter.file[toupper(voter.file$state) == state, ], census.probs, by = geo.merge, all.x = TRUE)
+
+    ## For each voter, pick the probability matching his/her sex and age
+    ## category. which() drops rows whose sex or age is missing, leaving NA
+    ## for them instead of failing on an NA subscript.
+    for (eth in race.names) {
+      picked <- rep(NA_real_, nrow(voters.census))
+      for (k in seq_along(sex.labels)) {
+        for (j in age.cats) {
+          hit <- rep(TRUE, nrow(voters.census))
+          if (sex) {
+            hit <- hit & voters.census$sex == k - 1
+          }
+          if (age) {
+            hit <- hit & voters.census$agecat == j
+          }
+          rows <- which(hit)
+          picked[rows] <- voters.census[rows, prob_name(eth, k, j)]
+        }
+      }
+      voters.census[[paste0("r_", eth)]] <- picked
+    }
+
+    results[[s]] <- voters.census[keep.vars]
+  }
+
+  if (length(results) == 0) {
+    return(NULL)
+  }
+  return(as.data.frame(do.call(rbind, results)))
+}
diff --git
a/R/getCensusData.R b/R/getCensusData.R deleted file mode 100644 index 795d009..0000000 --- a/R/getCensusData.R +++ /dev/null @@ -1,251 +0,0 @@ -#' Census Data download function. -#' -#' \code{censusData} retrieve Census data. -#' -#' This function allows users to download (e.g., voter file) the U.S. Census 2010 data, -#' at either county level, tract level or block level. -#' -#' @param key A required character object. Must contain user's Census API -#' key, which can be requested \href{http://api.census.gov/data/key_signup.html}{here}. -#' @param state to extract Census data for, e.g. \code{"NJ"}. -#' @param geo A character object specifying what aggregation level to use. -#' Use \code{"county"}, \code{"tract"}, or \code{"block"}. Default is \code{"tract"}. -#' Warning: extracting block-level data takes very long. -#' @param demo A \code{TRUE}/\code{FALSE} object indicating whether to condition on -#' demographics (i.e., age and sex) or not. If \code{TRUE}, function will return -#' Pr(Geolocation, Age, Sex | Race). If \code{FALSE}, function wil return -#' Pr(Geolocation | Race). Default is \code{FALSE}. -#' @return Output will be an object of class \code{list}, indexed by state names. It will -#' consist of the original user-input data with additional columns of -#' Census data. -#' -#' @examples -#' \dontshow{data(voters)} -#' \dontrun{censusData(key = "...", states = c("NJ", "DE"), geo = "block")} -#' \dontrun{censusData(key = "...", states = "FL", geo = "tract", -#' demo = TRUE)} -#' -#' @references -#' Relies on getCensusApi, getCensusApi2, and vecToChunk functions authored by Nicholas Nagle, -#' available \href{http://rstudio-pubs-static.s3.amazonaws.com/19337_2e7f827190514c569ea136db788ce850.html}{here}. -#' -#' @export -censusData <- function(key, state, geo = "tract", demo = FALSE) { - - if (missing(key)) { - stop('Must enter U.S. 
Census API key, which can be requested at http://api.census.gov/data/key_signup.html.') - } - - state <- toupper(state) - - df.out <- NULL - - { - print(paste("State ", state)) - fips.codes <- get("State.FIPS") - state.fips <- fips.codes[fips.codes$State == state, "FIPS"] - - if (demo == F) { - num <- ifelse(3:10 != 10, paste("0", 3:10, sep = ""), "10") - vars <- paste("P00500", num, sep = "") - } - - if (demo == T) { - eth.let <- c("I", "B", "H", "D", "E", "F", "C") - num <- as.character(c(c("01", "07", "08", "09"), seq(10, 25), seq(31, 49))) - vars <- NULL - for (e in 1:length(eth.let)) { - vars <- c(vars, paste("P012", eth.let[e], "0", num, sep = "")) - } - } - - if (geo == "county") { - geo.merge <- c("state", "county") - region <- paste("for=county:*&in=state:", state.fips, sep = "") - census <- getCensusApi("http://api.census.gov/data/2010/sf1?", - key = key, vars = vars, region = region) - } - - if (geo == "tract") { - - geo.merge <- c("state", "county", "tract") - - region_county <- paste("for=county:*&in=state:", state.fips, sep = "") - county_df <- getCensusApi("http://api.census.gov/data/2010/sf1?", key = key, vars = vars, region = region_county) - county_list <- county_df$county - - census <- NULL - for (c in 1:length(county_list)) { - print(paste("County ", c, " of ", length(county_list), ": ", county_list[c], sep = "")) - region_county <- paste("for=tract:*&in=state:", state.fips, "+county:", county_list[c], sep = "") - census.temp <- getCensusApi("http://api.census.gov/data/2010/sf1?", - key = key, vars = vars, region = region_county) - census <- rbind(census, census.temp) - } - rm(census.temp) - } - - if (geo == "block") { - - geo.merge <- c("state", "county", "tract", "block") - - region_county <- paste("for=county:*&in=state:", state.fips, sep = "") - county_df <- getCensusApi("http://api.census.gov/data/2010/sf1?", key = key, vars = vars, region = region_county) - county_list <- county_df$county - - census <- NULL - - for (c in 
1:length(county_list)) { - print(paste("County ", c, " of ", length(county_list), ": ", county_list[c], sep = "")) - - region_tract <- paste("for=tract:*&in=state:", state.fips, "+county:", county_list[c], sep = "") - print(region_tract) - tract_df <- getCensusApi("http://api.census.gov/data/2010/sf1?", - key = key, vars = vars, region = region_tract) - tract_list <- tract_df$tract - - for (t in 1:length(tract_list)) { - print(paste("Tract ", t, " of ", length(tract_list), ": ", tract_list[t], sep = "")) - - region_block <- paste("for=block:*&in=state:", state.fips, "+county:", county_list[c], "+tract:", tract_list[t], sep = "") - census.temp <- getCensusApi("http://api.census.gov/data/2010/sf1?", - key = key, vars = vars, region = region_block) - census <- rbind(census, census.temp) - } - } - rm(census.temp) - - } - - if (demo == F) { - - census$state <- state - - ## Calculate Pr(Geolocation | Race) - census$r_whi <- census$P0050003 / sum(census$P0050003) #Pr(Tract|White) - census$r_bla <- census$P0050004 / sum(census$P0050004) #Pr(Tract|Black) - census$r_his <- census$P0050010 / sum(census$P0050010) #Pr(Tract|Latino) - census$r_asi <- (census$P0050006 + census$P0050007) / (sum(census$P0050006) + sum(census$P0050007)) #Pr(Tract | Asian or NH/PI) - census$r_oth <- (census$P0050005 + census$P0050008 + census$P0050009) / (sum(census$P0050005) + sum(census$P0050008) + sum(census$P0050009)) #Pr(Tract | AI/AN, Other, or Mixed) - - # drop <- grep("P005", names(census)) - # voters.census <- merge(voters[toupper(voters$state) == toupper(state), ], census[, -drop], by = geo.merge, all.x = T) - - } - - if (demo == T) { - - census$state <- state - - ## Calculate Pr(Tract, Sex, Age Category | Race) - eth.cen <- c("whi", "bla", "his", "asi", "oth") - eth.let <- c("I", "B", "H", "D", "F") - sex <- c("mal", "fem") - age.cat <- c(seq(5, 23), seq(5, 23)) - age.cen <- as.character(c(c("07", "08", "09"), seq(10, 25), seq(31, 49))) - - for (i in 1:length(eth.cen)) { - for (k in 
1:length(sex)) { - for (j in 1:19) { - if (k == 2) { - j <- j + 19 - } - if (i != 4 & i != 5) { - census[paste("r", sex[k], age.cat[j], eth.cen[i], sep = "_")] <- census[paste("P012", eth.let[i], "0", age.cen[j], sep = "")] / sum(census[paste("P012", eth.let[i], "001", sep = "")]) - } - if (i == 4) { - ## Combine Asian and Native Hawaiian/Pacific Islander - census[paste("r", sex[k], age.cat[j], eth.cen[i], sep = "_")] <- (census[paste("P012D0", age.cen[j], sep = "")] + census[paste("P012E0", age.cen[j], sep = "")]) / sum(census$P012D001 + census$P012E001) - } - if (i == 5) { - ## Combine American India/Alaska Native and Other - census[paste("r", sex[k], age.cat[j], eth.cen[i], sep = "_")] <- (census[paste("P012C0", age.cen[j], sep = "")] + census[paste("P012F0", age.cen[j], sep = "")]) / sum(census$P012C001 + census$P012F001) - } - } - } - } - - # drop <- grep("P012", names(census)) - # voters.census <- merge(voters[toupper(voters$state) == toupper(state), ], census[, -drop], by = geo.merge, all.x = T) - - ## Add Census Age Categories - # voters.census$agecat <- NA - # voters.census$agecat <- ifelse(voters.census$age >= 18 & voters.census$age <= 19, 5, voters.census$agecat) - # voters.census$agecat <- ifelse(voters.census$age == 20, 6, voters.census$agecat) - # voters.census$agecat <- ifelse(voters.census$age == 21, 7, voters.census$agecat) - # voters.census$agecat <- ifelse(voters.census$age >= 22 & voters.census$age <= 24, 8, voters.census$agecat) - # voters.census$agecat <- ifelse(voters.census$age >= 25 & voters.census$age <= 29, 9, voters.census$agecat) - # voters.census$agecat <- ifelse(voters.census$age >= 30 & voters.census$age <= 34, 10, voters.census$agecat) - # voters.census$agecat <- ifelse(voters.census$age >= 35 & voters.census$age <= 39, 11, voters.census$agecat) - # voters.census$agecat <- ifelse(voters.census$age >= 40 & voters.census$age <= 44, 12, voters.census$agecat) - # voters.census$agecat <- ifelse(voters.census$age >= 45 & voters.census$age 
<= 49, 13, voters.census$agecat) - # voters.census$agecat <- ifelse(voters.census$age >= 50 & voters.census$age <= 54, 14, voters.census$agecat) - # voters.census$agecat <- ifelse(voters.census$age >= 55 & voters.census$age <= 59, 15, voters.census$agecat) - # voters.census$agecat <- ifelse(voters.census$age >= 60 & voters.census$age <= 61, 16, voters.census$agecat) - # voters.census$agecat <- ifelse(voters.census$age >= 62 & voters.census$age <= 64, 17, voters.census$agecat) - # voters.census$agecat <- ifelse(voters.census$age >= 65 & voters.census$age <= 66, 18, voters.census$agecat) - # voters.census$agecat <- ifelse(voters.census$age >= 67 & voters.census$age <= 69, 19, voters.census$agecat) - # voters.census$agecat <- ifelse(voters.census$age >= 70 & voters.census$age <= 74, 20, voters.census$agecat) - # voters.census$agecat <- ifelse(voters.census$age >= 75 & voters.census$age <= 79, 21, voters.census$agecat) - # voters.census$agecat <- ifelse(voters.census$age >= 80 & voters.census$age <= 84, 22, voters.census$agecat) - # voters.census$agecat <- ifelse(voters.census$age >= 85, 23, voters.census$agecat) - - # for (i in 1:length(eth.cen)) { - # for (j in 5:23) { - # voters.census[voters.census$sex == 0 & voters.census$agecat == j, - # paste("r", eth.cen[i], sep = "_")] <- voters.census[paste("r_mal", j, eth.cen[i], sep = "_")] - # voters.census[voters.census$sex == 1 & voters.census$agecat == j, - # paste("r", eth.cen[i], sep = "_")] <- voters.census[paste("r_fem", j, eth.cen[i], sep = "_")] - # } - # } - - # drop <- c(grep("_mal_", names(voters.census)), grep("_fem_", names(voters.census))) - # voters.census <- voters.census[, -drop] - } - - # df.out <- as.data.frame(rbind(df.out, voters.census[names(voters.census) != "agecat"])) - - } - - # return(df.out) - return(census) -} - -#' Title return -#' Multilevel Census Data download function. -#' -#' \code{getCensusData} returns a Census data obj for a state. -#' -#' @param key A required character object. 
Must contain user's Census API -#' key, which can be requested \href{http://api.census.gov/data/key_signup.html}{here}. -#' @param states which states to extract -#' Census data for, e.g. \code{c("NJ", "NY")}. -#' @param demo A \code{TRUE}/\code{FALSE} object indicating whether to condition on -#' demographics (i.e., age and sex) or not. If \code{TRUE}, function will return -#' Pr(Geolocation, Age, Sex | Race). If \code{FALSE}, function wil return -#' Pr(Geolocation | Race). Default is \code{FALSE}. -#' -#' @return Output will be an census object of class which is a list consist of \code{state}, -#' \code{demo}, \code{county level census}, \code{tract level census} and \code{block level census}. -#' Have the census data available could make \code{census.helper.api} runs more efficient. -#' -#' @export -#' -#' @examples \dontrun{getCensusData(key = "...", states = c("NJ", "DE"), demo = TRUE)} -getCensusData <- function(key, states, demo = FALSE) { - - if (missing(key)) { - stop('Must enter U.S. Census API key, which can be requested at http://api.census.gov/data/key_signup.html.') - } - - states <- toupper(states) - - CensusObjs <- NULL - for (s in states) { - county = censusData(key, s, geo = "county", demo) - tract = censusData(key, s, geo = "tract", demo) - block = censusData(key, s, geo = "block", demo) - CensusObjs[[s]] <- list(state = s, demo = demo, county = county, tract = tract, block = block) - } - return(CensusObjs) -} diff --git a/R/getCensusApi.R b/R/get_census_api.R similarity index 81% rename from R/getCensusApi.R rename to R/get_census_api.R index 5d9d6b0..63ac9d8 100644 --- a/R/getCensusApi.R +++ b/R/get_census_api.R @@ -1,6 +1,6 @@ #' Census API function. #' -#' \code{getCensusApi} obtains U.S. Census data via the public API. +#' \code{get_census_api} obtains U.S. Census data via the public API. #' #' This function obtains U.S. Census data via the public API. User #' can specify the variables and region(s) for which to obtain data. 
@@ -20,7 +20,7 @@ #' If unsuccessful, function prints the URL query that caused the error. #' #' @examples -#' \dontrun{getCensusApi(data_url = "http://api.census.gov/data/2010/sf1?", key = "...", +#' \dontrun{get_census_api(data_url = "http://api.census.gov/data/2010/sf1?", key = "...", #' vars = c("P0050003","P0050004","P0050005", "P0050006"), region = "for=county:*&in=state:34")} #' #' @references @@ -28,15 +28,15 @@ #' \href{http://rstudio-pubs-static.s3.amazonaws.com/19337_2e7f827190514c569ea136db788ce850.html}{here}. #' #' @export -getCensusApi <- function(data_url, key, vars, region) { +get_census_api <- function(data_url, key, vars, region) { if(length(vars) > 50){ - vars <- vecToChunk(vars) # Split variables into a list + vars <- vec_to_chunk(vars) # Split variables into a list get <- lapply(vars, function(x) paste(x, sep='', collapse=",")) - data <- lapply(vars, function(x) getCensusApi2(data_url,key, x, region)) + data <- lapply(vars, function(x) get_census_api_2(data_url,key, x, region)) } else { get <- paste(vars, sep='', collapse=',') - data <- list(getCensusApi2(data_url, key, get, region)) + data <- list(get_census_api_2(data_url, key, get, region)) } ## Format output. If there were no errors, than paste the data together. If there is an error, just return the unformatted list. @@ -52,7 +52,7 @@ getCensusApi <- function(data_url, key, vars, region) { } else{ - print('Unable to create single data.frame in getCensusApi') + print('Unable to create single data.frame in get_census_api') return(data) } } diff --git a/R/getCensusApi2.R b/R/get_census_api_2.R similarity index 88% rename from R/getCensusApi2.R rename to R/get_census_api_2.R index 3e602fc..2f765ac 100644 --- a/R/getCensusApi2.R +++ b/R/get_census_api_2.R @@ -1,9 +1,9 @@ #' Census API URL assembler. #' -#' \code{getCensusApi2} assembles URL components for \code{getCensusApi}. +#' \code{get_census_api_2} assembles URL components for \code{get_census_api}. 
#' #' This function assembles the URL components and sends the request to the Census server. -#' It is used by the \code{getCensusApi} function. The user should not need to call this +#' It is used by the \code{get_census_api} function. The user should not need to call this #' function directly. #' #' @param data_url URL root of the API, including the question mark, @@ -21,7 +21,7 @@ #' If unsuccessful, function prints the URL query that was constructed. #' #' @examples -#' \dontrun{getCensusApi2(data_url = "http://api.census.gov/data/2010/sf1?", key = "...", +#' \dontrun{get_census_api_2(data_url = "http://api.census.gov/data/2010/sf1?", key = "...", #' get = c("P0050003","P0050004","P0050005", "P0050006"), region = "for=county:*&in=state:34")} #' #' @references @@ -29,7 +29,7 @@ #' \href{http://rstudio-pubs-static.s3.amazonaws.com/19337_2e7f827190514c569ea136db788ce850.html}{here}. #' #' @export -getCensusApi2 <- function(data_url, key, get, region){ +get_census_api_2 <- function(data_url, key, get, region){ if(length(get)>1) { get <- paste(get, collapse=',', sep='') } diff --git a/R/get_census_data.R b/R/get_census_data.R new file mode 100644 index 0000000..8a3c59a --- /dev/null +++ b/R/get_census_data.R @@ -0,0 +1,41 @@ +#' Multilevel Census data download function. +#' +#' \code{get_census_data} returns county-, tract-, and block-level Census data +#' for specified state(s). Using this function to download Census data in advance +#' can save considerable time when running \code{predict_race} and \code{census_helper}. +#' +#' @param key A required character object containing a valid Census API key, +#' which can be requested \href{http://api.census.gov/data/key_signup.html}{here}. +#' @param states which states to extract Census data for, e.g., \code{c("NJ", "NY")}. +#' @param age A \code{TRUE}/\code{FALSE} object indicating whether to condition on +#' age or not. If \code{FALSE} (default), function will return Pr(Geolocation | Race). 
+#' If \code{TRUE}, function will return Pr(Geolocation, Age | Race). +#' If \code{\var{sex}} is also \code{TRUE}, function will return Pr(Geolocation, Age, Sex | Race). +#' @param sex A \code{TRUE}/\code{FALSE} object indicating whether to condition on +#' sex or not. If \code{FALSE} (default), function will return Pr(Geolocation | Race). +#' If \code{TRUE}, function will return Pr(Geolocation, Sex | Race). +#' If \code{\var{age}} is also \code{TRUE}, function will return Pr(Geolocation, Age, Sex | Race). +#' @return Output will be an object of class \code{list} indexed by state. +#' Output will contain the following elements: \code{state}, \code{age}, \code{sex}, +#' \code{county}, \code{tract} and \code{block}. +#' +#' @export +#' +#' @examples \dontrun{get_census_data(key = "...", states = c("NJ", "NY"), age = TRUE, sex = FALSE)} +get_census_data <- function(key, states, age = FALSE, sex = FALSE) { + + if (missing(key)) { + stop('Must enter valid Census API key, which can be requested at http://api.census.gov/data/key_signup.html.') + } + + states <- toupper(states) + + CensusObj <- NULL + for (s in states) { + county = census_geo_api(key, s, geo = "county", age, sex) + tract = census_geo_api(key, s, geo = "tract", age, sex) + block = census_geo_api(key, s, geo = "block", age, sex) + CensusObj[[s]] <- list(state = s, age = age, sex = sex, county = county, tract = tract, block = block) + } + return(CensusObj) +} diff --git a/R/merge_surnames.R b/R/merge_surnames.R new file mode 100644 index 0000000..537353e --- /dev/null +++ b/R/merge_surnames.R @@ -0,0 +1,158 @@ +#' Surname probability merging function. +#' +#' \code{merge_surnames} merges surnames in user-input dataset with corresponding +#' race/ethnicity probabilities from U.S. Census Surname List and Spanish Surname List. +#' +#' This function allows users to match surnames in their dataset with the U.S. 
+#' Census Surname List (from 2000 or 2010) and Spanish Surname List to obtain +#' Pr(Race | Surname) for each of the five major racial groups. +#' +#' By default, the function matches surnames to the Census list as follows +#' (each step only applies to surnames not matched in previous steps): +#' 1) Search raw surnames in Census surname list; +#' 2) Remove any punctuation and search again; +#' 3) Remove any spaces and search again; +#' 4) Remove suffixes (e.g., Jr) and search again; +#' 5) Split double-barreled surnames into two parts and search first part of name; +#' 6) Split double-barreled surnames into two parts and search second part of name; +#' 7) For any remaining names, impute probabilities using distribution +#' for all names not appearing on Census list. +#' +#' Note: Any name appearing only on the Spanish Surname List is assigned a +#' probability of 1 for Hispanics/Latinos and 0 for all other racial groups. +#' +#' @param voter.file An object of class \code{data.frame}. Must contain a field +#' named 'surname' containing list of surnames to be merged with Census lists. +#' @param surname.year An object of class \code{numeric} indicating which year +#' Census Surname List is from. Accepted values are \code{2010} and \code{2000}. +#' Default is \code{2010}. +#' @param clean.surname A \code{TRUE}/\code{FALSE} object. If \code{TRUE}, +#' \code{clean.surname} function will be run to clean raw surnames in +#' \code{\var{voter.file}} before matching them with Census lists, +#' in order to increase the chance of finding a match. +#' See \code{clean.surname} documentation for details. +#' Default is \code{TRUE}. +#' @param impute.missing A \code{TRUE}/\code{FALSE} object. If \code{TRUE}, +#' race/ethnicity probabilities will be imputed for unmatched names using +#' race/ethnicity distribution for all other names (i.e., not on Census List). +#' Default is \code{TRUE}. +#' @return Output will be an object of class \code{data.frame}. 
It will +#' consist of the original user-input data with additional columns that +#' specify the part of the name matched with Census data (\code{\var{surname.match}}), +#' and the probabilities Pr(Race | Surname) for each racial group +#' (\code{\var{p_whi}} for White, \code{\var{p_bla}} for Black, +#' \code{\var{p_his}} for Hispanic/Latino, +#' \code{\var{p_asi}} for Asian and Pacific Islander, and +#' \code{\var{p_oth}} for Other/Mixed). +#' +#' @import devtools +#' +#' @examples +#' data(voters) +#' merge_surnames(voters) +#' +#' @export +merge_surnames <- function(voter.file, surname.year = 2010, clean.surname = T, impute.missing = T) { + + if ("surname" %in% names(voter.file) == F) { + stop('Data does not contain surname field.') + } + + ## Census Surname List + if (surname.year == 2000) { + surnames <- surnames2000 + } + surnames$surname <- as.character(surnames$surname) + + p_eth <- c("p_whi", "p_bla", "p_his", "p_asi", "p_oth") + + ## Convert Surnames in Voter File to Upper Case + df <- voter.file + df$caseid <- 1:nrow(df) + df$surname.match <- df$surname.upper <- toupper(as.character(df$surname)) + + ## Merge Surnames with Census List (No Cleaning Yet) + df <- merge(df[names(df) %in% p_eth == F], surnames[c("surname", p_eth)], by.x = "surname.match", by.y = "surname", all.x = TRUE) + df[df$surname.upper %in% surnames$surname == F, ]$surname.match <- "" + + df1 <- df[df$surname.upper %in% surnames$surname, ] #Matched surnames + df2 <- df[df$surname.upper %in% surnames$surname == F, ] #Unmatched surnames + + ## Clean Surnames (if Specified by User) + if (clean.surname) { + + ## Remove All Punctuation and Try Merge Again + df2$surname.match <- gsub("[^[:alnum:] ]", "", df2$surname.upper) + df2 <- merge(df2[names(df2) %in% p_eth == F], surnames[c("surname", p_eth)], by.x = "surname.match", by.y = "surname", all.x = TRUE) + if (nrow(df2[df2$surname.match %in% surnames$surname, ]) > 0) { + df1 <- rbind(df1, df2[df2$surname.match %in% surnames$surname, ]) + df2 
<- df2[df2$surname.match %in% surnames$surname == F, ] + if (nrow(df2[df2$surname.match %in% surnames$surname, ]) > 0) {df2$surname.match <- ""} + } + + ## Remove All Spaces and Try Merge Again + df2$surname.match <- gsub(" ", "", df2$surname.match) + df2 <- merge(df2[names(df2) %in% p_eth == F], surnames[c("surname", p_eth)], by.x = "surname.match", by.y = "surname", all.x = TRUE) + if (nrow(df2[df2$surname.match %in% surnames$surname, ]) > 0) { + df1 <- rbind(df1, df2[df2$surname.match %in% surnames$surname, ]) + df2 <- df2[df2$surname.match %in% surnames$surname == F, ] + if (nrow(df2[df2$surname.match %in% surnames$surname, ]) > 0) {df2$surname.match <- ""} + } + + ## Remove Jr/Sr/III Suffixes + suffix <- c("JUNIOR", "SENIOR", "THIRD", "III", "JR", " II", " J R", " S R", " IV") + for (i in 1:length(suffix)) { + df2$surname.match <- ifelse(substr(df2$surname.match, nchar(df2$surname.match) - (nchar(suffix)[i] - 1), nchar(df2$surname.match)) == suffix[i], + substr(df2$surname.match, 1, nchar(df2$surname.match) - nchar(suffix)[i]), + df2$surname.match) + } + df2$surname.match <- ifelse(nchar(df2$surname.match) >= 7, + ifelse(substr(df2$surname.match, nchar(df2$surname.match) - 1, nchar(df2$surname.match)) == "SR", + substr(df2$surname.match, 1, nchar(df2$surname.match) - 2), + df2$surname.match), + df2$surname.match) #Remove "SR" only if name has at least 7 characters + df2 <- merge(df2[names(df2) %in% p_eth == F], surnames[c("surname", p_eth)], by.x = "surname.match", by.y = "surname", all.x = TRUE) + if (nrow(df2[df2$surname.match %in% surnames$surname, ]) > 0) { + df1 <- rbind(df1, df2[df2$surname.match %in% surnames$surname, ]) + df2 <- df2[df2$surname.match %in% surnames$surname == F, ] + if (nrow(df2[df2$surname.match %in% surnames$surname, ]) > 0) {df2$surname.match <- ""} + } + + ## Names with Hyphens or Spaces, e.g. 
Double-Barreled Names + df2$surname2 <- df2$surname1 <- NA + df2$surname1[grep("-", df2$surname.upper)] <- sapply(strsplit(grep("-", df2$surname.upper, value = T), "-"), "[", 1) + df2$surname2[grep("-", df2$surname.upper)] <- sapply(strsplit(grep("-", df2$surname.upper, value = T), "-"), "[", 2) + df2$surname1[grep(" ", df2$surname.upper)] <- sapply(strsplit(grep(" ", df2$surname.upper, value = T), " "), "[", 1) + df2$surname2[grep(" ", df2$surname.upper)] <- sapply(strsplit(grep(" ", df2$surname.upper, value = T), " "), "[", 2) + + ## Use first half of name to merge in priors + df2$surname.match <- as.character(df2$surname1) + df2 <- merge(df2[names(df2) %in% c(p_eth) == F], surnames[c("surname", p_eth)], by.x = "surname.match", by.y = "surname", all.x = TRUE)[names(df2)] + if (nrow(df2[df2$surname.match %in% surnames$surname, ]) > 0) { + df1 <- rbind(df1, df2[df2$surname.match %in% surnames$surname, names(df2) %in% names(df1)]) + df2 <- df2[df2$surname.match %in% surnames$surname == F, ] + if (nrow(df2[df2$surname.match %in% surnames$surname, ]) > 0) {df2$surname.match <- ""} + } + + ## Use second half of name to merge in priors for rest + df2$surname.match <- as.character(df2$surname2) + df2 <- merge(df2[names(df2) %in% c(p_eth, "surname1", "surname2") == F], surnames[c("surname", p_eth)], by.x = "surname.match", by.y = "surname", all.x = TRUE)[names(df2) %in% c("surname1", "surname2") == F] + if (nrow(df2[df2$surname.match %in% surnames$surname, ]) > 0) { + df1 <- rbind(df1, df2[df2$surname.match %in% surnames$surname, names(df2) %in% names(df1)]) + df2 <- df2[df2$surname.match %in% surnames$surname == F, ] + if (nrow(df2[df2$surname.match %in% surnames$surname, ]) > 0) {df2$surname.match <- ""} + } + } + + ## Impute priors for names not on Census lists + if (impute.missing) { + if (nrow(df2) > 0) { + df2$surname.match <- "" + df2$p_whi <- .6665; df2$p_bla <- .0853; df2$p_his <- .1367; df2$p_asi <- .0797; df2$p_oth <- .0318 + warning(paste("Probabilities were 
imputed for", nrow(df2), ifelse(nrow(df2) == 1, "surname", "surnames"), "that could not be matched to Census list.")) + } + } else warning(paste(nrow(df2), ifelse(nrow(df2) == 1, "surname was", "surnames were"), "not matched.")) + + df <- rbind(df1, df2) + return(df[order(df$caseid), c(names(voter.file), "surname.match", p_eth)]) +} diff --git a/R/name.clean.R b/R/name.clean.R deleted file mode 100644 index 9289dfc..0000000 --- a/R/name.clean.R +++ /dev/null @@ -1,94 +0,0 @@ -#' Name cleaning and matching function. -#' -#' \code{name.clean} cleans surnames in user-input dataset and merges in racial -#' distributions from the Census Surname List and Census Spanish Surname List. -#' -#' This function allows users to match surnames in their dataset with the U.S. -#' Census 2000 Surname List to obtain Pr(Race | Surname) for each of the -#' five major racial groups. The function matches user-input surnames with -#' Census surnames as follows (each step only applies to surnames not matched -#' in previous steps): -#' 1) match raw surnames with Census data; -#' 2) remove any spaces and search again; -#' 3) split apart double-barreled surnames into two names and match on first; -#' 4) split apart double-barreled surnames into two names and match on second; -#' 5) for any remaining names, impute probabilities from overall U.S. population. -#' Note: Any name appearing only on the Spanish Surname List is assigned a -#' probability of 1 for Hispanics/Latinos and 0 for all other racial groups. -#' -#' @param voters An object of class \code{data.frame}. Must contain a field -#' named 'surname'. -#' @return Output will be an object of class \code{data.frame}. 
It will -#' consist of the original user-input data with additional columns that -#' specify the part of the name matched with Census data (\code{\var{surname.match}}), -#' and the probabilities Pr(Race | Surname) for each racial group -#' (\code{\var{p_whi}} for Whites, \code{\var{p_bla}} for Blacks, -#' \code{\var{p_his}} for Hispanics/Latinos, \code{\var{p_asi}} for Asians, and -#' \code{\var{p_oth}} for Others). -#' -#' @import devtools -#' -#' @examples -#' data(voters) -#' name.clean(voters) -#' -#' @export -name.clean <- function(voters) { - - if ("surname" %in% names(voters) == F) { - stop('Data does not contain surname field.') - } - names.all$surname <- as.character(names.all$surname) - - p_eth <- c("p_whi", "p_bla", "p_his", "p_asi", "p_oth") - - ## Convert Surnames to Upper Case - df1 <- voters - df1$surname.upper <- df1$surname.match <- toupper(as.character(df1$surname)) - - ## Merge Surname Priors (No Cleaning Yet) - df2 <- merge(df1[names(df1) %in% p_eth == F], names.all[c("surname", p_eth)], by.x = "surname.match", by.y = "surname", all.x = TRUE) - - ## Remove Spaces and Merge Again - df2$surname.match <- gsub(" ","", df2$surname.upper) - df3 <- merge(df2[names(df2) %in% p_eth == F], names.all[c("surname", p_eth)], by.x = "surname.match", by.y = "surname", all.x = TRUE) - - ## Names with Hyphens or Spaces, e.g. 
Double-Barreled Names - df3$nomatch <- 0 - if (nrow(df3[df3$surname.upper %in% names.all$surname == F, ]) > 0) { - df3[df3$surname.upper %in% names.all$surname == F, ]$nomatch <- 1 - } - df3$surname1 <- NA - df3$surname2 <- NA - df3[df3$nomatch == 1, ]$surname1[grep("-", df3[df3$nomatch == 1, ]$surname.upper)] <- sapply(strsplit(grep("-", df3$surname.upper, value = T), "-"), "[", 1) - df3[df3$nomatch == 1, ]$surname2[grep("-", df3[df3$nomatch == 1, ]$surname.upper)] <- sapply(strsplit(grep("-", df3$surname.upper, value = T), "-"), "[", 2) - df3[df3$nomatch == 1, ]$surname1[grep(" ", df3[df3$nomatch == 1, ]$surname.upper)] <- sapply(strsplit(grep(" ", df3$surname.upper, value = T), " "), "[", 1) - df3[df3$nomatch == 1, ]$surname2[grep(" ", df3[df3$nomatch == 1, ]$surname.upper)] <- sapply(strsplit(grep(" ", df3$surname.upper, value = T), " "), "[", 2) - - ## Use first half of name to merge in priors - df3[df3$nomatch == 1, ]$surname.match <- as.character(df3[df3$nomatch == 1 , ]$surname1) - df3[df3$nomatch == 1, ] <- merge(df3[df3$nomatch == 1, names(df3) %in% p_eth == F], names.all[c("surname", p_eth)], by.x = "surname.match", by.y = "surname", all.x = TRUE)[names(df3)] - df3$nomatch <- 0 - if (nrow(df3[df3$surname.match %in% names.all$surname == F, ]) > 0) { - df3[df3$surname.match %in% names.all$surname == F, ]$nomatch <- 1 - } - - ## Use second half of name to merge in priors for rest - df3[df3$nomatch == 1, ]$surname.match <- as.character(df3[df3$nomatch == 1 , ]$surname2) - df3[df3$nomatch == 1, ] <- merge(df3[df3$nomatch == 1, names(df3) %in% p_eth == F], names.all[c("surname", p_eth)], by.x = "surname.match", by.y = "surname", all.x = TRUE)[names(df3)] - df3$nomatch <- 0 - if (nrow(df3[df3$surname.match %in% names.all$surname == F, ]) > 0) { - df3[df3$surname.match %in% names.all$surname == F, ]$nomatch <- 1 - } - - ## Impute priors for names not on Census 2000 surname list or Spanish surname list - if (nrow(df3[df3$nomatch == 1, ]) > 0) { - df3[df3$nomatch 
== 1, ]$p_whi <- .621 #.705 - df3[df3$nomatch == 1, ]$p_bla <- .132 #.113 - df3[df3$nomatch == 1, ]$p_his <- .174 #.111 - df3[df3$nomatch == 1, ]$p_asi <- .054 #.070 - df3[df3$nomatch == 1, ]$p_oth <- .019 #(neg) - } - - return(df3[c(names(voters), "surname.match", p_eth)]) -} diff --git a/R/pid.R b/R/pid.R new file mode 100644 index 0000000..39d6e10 --- /dev/null +++ b/R/pid.R @@ -0,0 +1,21 @@ +#' Party and race +#' +#' A table for probability of party by race. +#' +#' @format A data frame with 3 rows and 7 variables: +#' \describe{ +#' \item{party}{Dem/Rep/Ind} +#' \item{PID}{0/1/2} +#' \item{r_pid_whi}{Pr(PID | White)} +#' \item{r_pid_bla}{Pr(PID | Black)} +#' \item{r_pid_his}{Pr(PID | Hispanic/Latino)} +#' \item{r_pid_asi}{Pr(PID | Asian/Pacific Islander)} +#' \item{r_pid_oth}{Pr(PID | Other)} +#' } +#' +#' @docType data +#' @keywords datasets +#' @name pid +#' @examples +#' data(pid) +"pid" diff --git a/R/predict_race.R b/R/predict_race.R new file mode 100644 index 0000000..a4b7921 --- /dev/null +++ b/R/predict_race.R @@ -0,0 +1,246 @@ +#' Race prediction function. +#' +#' \code{predict_race} makes probabilistic estimates of individual-level race/ethnicity. +#' +#' This function implements the Bayesian race prediction methods outlined in +#' Imai and Khanna (2015). The function produces probabilistic estimates of +#' individual-level race/ethnicity, based on surname, geolocation, and party. +#' @param voter.file An object of class \code{data.frame}. +#' Must contain a row for each individual being predicted, +#' as well as a field named \code{\var{surname}} containing each individual's surname. +#' If using geolocation in predictions, \code{\var{voter.file}} must contain a field named +#' \code{\var{state}}, which contains the two-character abbreviation for each individual's +#' state of residence (e.g., \code{"nj"} for New Jersey).
+#' If using Census geographic data in race/ethnicity predictions, +#' \code{\var{voter.file}} must also contain at least one of the following fields: +#' \code{\var{county}}, \code{\var{tract}}, and/or \code{\var{block}}. +#' These fields should contain character strings matching U.S. Census categories. +#' County is three characters (e.g., \code{"031"} not \code{"31"}), +#' tract is six characters, and block is four characters. +#' See below for other optional fields. +#' @param census.surname A \code{TRUE}/\code{FALSE} object. If \code{TRUE}, +#' function will call \code{merge_surnames} to merge in Pr(Race | Surname) +#' from U.S. Census Surname List (2000 or 2010) and Spanish Surname List. +#' If \code{FALSE}, \code{voter.file} object must contain additional fields specifying +#' Pr(Race | Surname), named as follows: \code{\var{p_whi}} for Whites, +#' \code{\var{p_bla}} for Blacks, \code{\var{p_his}} for Hispanics/Latinos, +#' \code{\var{p_asi}} for Asians, and \code{\var{p_oth}} for Other. +#' Default is \code{TRUE}. +#' @param surname.only A \code{TRUE}/\code{FALSE} object. If \code{TRUE}, race predictions will +#' only use surname data and calculate Pr(Race | Surname). Default is \code{FALSE}. +#' @param surname.year A number to specify the year of the census surname statistics. +#' These surname statistics are stored in the data, and will be automatically loaded. +#' The default value is \code{2010}, which means the surname statistics from the +#' 2010 census will be used. Currently, the other available choice is \code{2000}. +#' @param census.geo An optional character vector specifying what level of +#' geography to use to merge in U.S. Census 2010 geographic data. Currently +#' \code{"county"}, \code{"tract"}, or \code{"block"} are supported. +#' Note: sufficient information must be in user-defined \code{\var{voter.file}} object. +#' If \code{\var{census.geo} = "county"}, then \code{\var{voter.file}} +#' must have column named \code{county}.
+#' If \code{\var{census.geo} = "tract"}, then \code{\var{voter.file}} +#' must have columns named \code{county} and \code{tract}. +#' And if \code{\var{census.geo} = "block"}, then \code{\var{voter.file}} +#' must have columns named \code{county}, \code{tract}, and \code{block}. +#' Specifying \code{\var{census.geo}} will call \code{census_helper} function +#' to merge Census geographic data at specified level of geography. +#' @param census.key A character object specifying user's Census API +#' key. Required if \code{\var{census.geo}} is specified, unless \code{\var{census.data}} covers all states in \code{\var{voter.file}}, because +#' a valid Census API key is required to download Census geographic data. +#' @param census.data A list indexed by two-letter state abbreviations, +#' which contains pre-saved Census geographic data. +#' Can be generated using \code{get_census_data} function. +#' @param age An optional \code{TRUE}/\code{FALSE} object specifying whether to +#' condition race predictions on age (in addition to surname and geolocation). +#' Default is \code{FALSE}. Must be same as \code{\var{age}} in \code{\var{census.data}} object. +#' May only be set to \code{TRUE} if \code{census.geo} option is specified. +#' If \code{TRUE}, \code{\var{voter.file}} should include a numerical variable \code{\var{age}}. +#' @param sex An optional \code{TRUE}/\code{FALSE} object specifying whether to +#' condition race predictions on sex (in addition to surname and geolocation). +#' Default is \code{FALSE}. Must be same as \code{\var{sex}} in \code{\var{census.data}} object. +#' May only be set to \code{TRUE} if \code{census.geo} option is specified. +#' If \code{TRUE}, \code{\var{voter.file}} should include a numerical variable \code{\var{sex}}, +#' where \code{\var{sex}} is coded as 0 for males and 1 for females. +#' @param party An optional character object specifying party registration field +#' in \code{\var{voter.file}}, e.g., \code{\var{party} = "PartyReg"}.
+#' If specified, race/ethnicity predictions will be conditioned +#' on individual's party registration (in addition to geolocation). +#' Whatever the name of the party registration field in \code{\var{voter.file}}, +#' it should be coded as 1 for Democrat, 2 for Republican, and 0 for Other. +#' @return Output will be an object of class \code{data.frame}. It will +#' consist of the original user-input data with additional columns with +#' predicted probabilities for each of the five major racial categories: +#' \code{\var{pred.whi}} for White, +#' \code{\var{pred.bla}} for Black, +#' \code{\var{pred.his}} for Hispanic/Latino, +#' \code{\var{pred.asi}} for Asian/Pacific Islander, and +#' \code{\var{pred.oth}} for Other/Mixed. +#' +#' @examples +#' data(voters) +#' predict_race(voters, surname.only = TRUE) +#' predict_race(voter.file = voters, surname.only = TRUE) +#' \dontrun{predict_race(voter.file = voters, census.geo = "tract", census.key = "...", age = TRUE)} +#' \dontrun{predict_race(voter.file = voters, census.geo = "tract", census.key = "...", party = "PID")} +#' \dontrun{CensusObj <- get_census_data("...", states = c("NY", "DC", "NJ")); +#' predict_race(voter.file = voters, census.geo = "tract", census.data = CensusObj, party = "PID")} +#' \dontrun{CensusObj2 <- get_census_data("...", states = c("NY", "DC", "NJ"), age = TRUE, sex = TRUE); +#' predict_race(voter.file = voters, census.geo = "tract", census.data = CensusObj2, party = "PID", age = TRUE, sex = TRUE)} +#' @export + +## Race Prediction Function +predict_race <- function(voter.file, + census.surname = TRUE, surname.only = FALSE, surname.year = 2010, + census.geo, census.key, census.data = NA, age = FALSE, sex = FALSE, party) { + + if (!missing(census.geo) && (census.geo == "precinct")) { + # geo <- "precinct" + stop('Error: census_helper function does not currently support merging precinct-level data.') + } + + vars.orig <- names(voter.file) + + if (surname.only == T) { + print("Proceeding with
surname-only predictions...") + if (!("surname" %in% names(voter.file))) { + stop("Voter data frame needs to have a column named surname") + } + } else { + if (missing(census.geo) || is.null(census.geo) || is.na(census.geo) || census.geo %in% c("county", "tract", "block") == F) { + stop("census.geo must be either 'county', 'tract', or 'block'") + } else { + print(paste("Proceeding with Census geographic data at", census.geo, "level...")) + } + if (missing(census.data) || is.null(census.data) || is.na(census.data)) { + if (missing(census.key) || is.null(census.key) || is.na(census.key)) { + stop("Please provide a valid Census API key using census.key option.") + } else { + print("Downloading Census geographic data using provided API key...") + } + } else { + if (!("state" %in% names(voter.file))) { + stop("voter.file object needs to have a column named state.") + } + if (sum(toupper(unique(as.character(voter.file$state))) %in% toupper(names(census.data)) == FALSE) > 0) { + print("census.data object does not include all states in voter.file object.") + if (missing(census.key) || is.null(census.key) || is.na(census.key)) { + stop("Please provide either a valid Census API key or valid census.data object that covers all states in voter.file object.") + } else { + print("Downloading Census geographic data for states not included in census.data object...") + } + } else { + print("Using Census geographic data from provided census.data object...") + } + } + } + + eth <- c("whi", "bla", "his", "asi", "oth") + + ## Merge in Pr(Race | Surname) if necessary + if (census.surname) { + if (surname.year == 2010) { + voter.file <- merge_surnames(voter.file) + } else { + if (surname.year == 2000) { + voter.file <- merge_surnames(voter.file, surname.year = surname.year) + } else { + stop(paste(surname.year, "is not a valid surname.year. 
It should be either 2000 or 2010 (default).")) + } + } + } else { + # Check if voter.file has the nessary data + for (k in 1:length(eth)) { + if (paste("p", eth[k], sep = "_") %in% names(voter.file) == F) { + stop(paste("voter.file object needs to have columns named ", paste(paste("p", eth, sep = "_"), collapse = " and "), ".", sep = "")) + } + } + } + + ## Surname-Only Predictions + if (surname.only) { + for (k in 1:length(eth)) { + voter.file[paste("pred", eth[k], sep = ".")] <- voter.file[paste("p", eth[k], sep = "_")] / apply(voter.file[paste("p", eth, sep = "_")], 1, sum) + } + pred <- paste("pred", eth, sep = ".") + return(voter.file[c(vars.orig, pred)]) + } + + ## Merge in Pr(Party | Race) if necessary + if (missing(party) == F) { + voter.file$PID <- voter.file[, party] + voter.file <- merge(voter.file, get("pid")[names(get("pid")) %in% "party" == F], by = "PID", all.x = T) + } + + if (census.geo == "block") { + if (!("tract" %in% names(voter.file)) || !("county" %in% names(voter.file)) || !("block" %in% names(voter.file))) { + stop("voter.file object needs to have columns named block, tract, and county.") + } + voter.file <- census_helper(key = census.key, + voter.file = voter.file, + states = "all", + geo = "block", + age = age, + sex = sex, + census.data = census.data) + } + + if (census.geo == "precinct") { + geo <- "precinct" + stop('Error: census_helper function does not currently support precinct-level data.') + } + + if (census.geo == "tract") { + if (!("tract" %in% names(voter.file)) || !("county" %in% names(voter.file))) { + stop("voter.file object needs to have columns named tract and county.") + } + voter.file <- census_helper(key = census.key, + voter.file = voter.file, + states = "all", + geo = "tract", + age = age, + sex = sex, + census.data = census.data) + } + + if (census.geo == "county") { + if (!("county" %in% names(voter.file))) { + stop("voter.file object needs to have a column named county.") + } + voter.file <- census_helper(key = 
census.key, + voter.file = voter.file, + states = "all", + geo = "county", + age = age, + sex = sex, + census.data = census.data) + } + + ## Pr(Race | Surname, Geolocation) + if (missing(party)) { + for (k in 1:length(eth)) { + voter.file[paste("u", eth[k], sep = "_")] <- voter.file[paste("p", eth[k], sep = "_")] * voter.file[paste("r", eth[k], sep = "_")] + } + voter.file$u_tot <- apply(voter.file[paste("u", eth, sep = "_")], 1, sum, na.rm = T) + for (k in 1:length(eth)) { + voter.file[paste("q", eth[k], sep = "_")] <- voter.file[paste("u", eth[k], sep = "_")] / voter.file$u_tot + } + } + + ## Pr(Race | Surname, Geolocation, Party) + if (missing(party) == F) { + for (k in 1:length(eth)) { + voter.file[paste("u", eth[k], sep = "_")] <- voter.file[paste("p", eth[k], sep = "_")] * voter.file[paste("r", eth[k], sep = "_")] * voter.file[paste("r_pid", eth[k], sep = "_")] + } + voter.file$u_tot <- apply(voter.file[paste("u", eth, sep = "_")], 1, sum, na.rm = T) + for (k in 1:length(eth)) { + voter.file[paste("q", eth[k], sep = "_")] <- voter.file[paste("u", eth[k], sep = "_")] / voter.file$u_tot + } + } + + for (k in 1:length(eth)) { + voter.file[paste("pred", eth[k], sep = ".")] <- voter.file[paste("q", eth[k], sep = "_")] + } + pred <- paste("pred", eth, sep = ".") + + return(voter.file[c(vars.orig, pred)]) +} diff --git a/R/race.pred.R b/R/race.pred.R deleted file mode 100644 index ce7590e..0000000 --- a/R/race.pred.R +++ /dev/null @@ -1,226 +0,0 @@ -#' Race prediction function. -#' -#' \code{race.pred} makes probabilistic estimates of individual-level race/ethnicity. -#' -#' This function implements the Bayesian race prediction methods outlined in -#' Imai and Khanna (2015). The function produces probabilistic estimates of -#' individual-level race/ethnicity, based on surname, geolocation, and party. -#' @param voters An object of class \code{data.frame}. 
Must contain a row for each individual being -#' predicted, as well as a field named \code{\var{surname}} containing each individual's surname. -#' If using geolocation in predictions, \code{\var{voters}} must contain a field named \code{\var{state}}, -#' which contains the two-character abbreviation for each individual's state of residence (e.g., "nj" for New Jersey). -#' If using geolocation, \code{\var{voters}} must also contain at least one of the following fields: -#' \code{\var{county}}, \code{\var{tract}}, and/or \code{\var{block}}. -#' These fields should contain character strings matching U.S. Census categories. -#' County is three characters (e.g., "031" not "31"), tract is six characters, and block is four characters. -#' See below for other optional fields. -#' @param races A character vector specifying which racial groups to generate -#' predicted probabilities for. Can include any subset of the default vector, -#' which is \code{c("white", "black", "latino", "asian", "other")}. -#' @param name.clean A \code{TRUE}/\code{FALSE} object. If \code{TRUE}, function will call -#' \code{name.clean} to merge in data from U.S. Census 2000 Surname List -#' and Spanish Surname List. If \code{FALSE}, voters object must contain additional fields -#' specifying Pr(Race | Surname), named as follows: -#' \code{\var{p_whi}} for Whites, \code{\var{p_bla}} for Blacks, -#' \code{\var{p_his}} for Hispanics/Latinos, \code{\var{p_asi}} for Asians, -#' and/or \code{\var{p_oth}} for Other. Default is \code{TRUE}. -#' @param census An optional character vector specifying what level of -#' geography to use to merge in U.S. Census 2010 data. Can be one of -#' \code{"county"}, \code{"tract"}, or \code{"block"}. -#' Function calls \code{census.helper.api} to merge in Census data at specified level. 
-#' If left unspecified, \code{voters} must contain additional fields -#' specifying Pr(Geolocation | Race), including any of the following: -#' \code{\var{r_whi}}, \code{\var{r_bla}}, \code{\var{r_his}}, -#' \code{\var{r_asi}}, and/or \code{\var{r_oth}}. -#' @param census.key A character object specifying user's Census API -#' key. Required if \code{census} is specified, because -#' \code{census.helper} function requires a Census API key to operate. -#' @param demo An optional \code{TRUE}/\code{FALSE} object specifying whether to -#' condition race predictions on individual age and sex (in addition to geolocation). -#' Default is \code{FALSE}. -#' May only be set to \code{TRUE} if \code{census} option is specified. -#' If \code{TRUE}, \code{voters} should include numerical variables -#' \code{\var{age}} and \code{\var{sex}}, where \code{\var{sex}} coded as 0 for -#' males and 1 for females. -#' @param party An optional character object specifying party registration field in \code{\var{voters}}, -#' e.g., \code{\var{party} = "PartyReg"}. If specified, race/ethnicity predictions will be conditioned -#' on individual's party registration (in addition to geolocation). -#' Whatever the name of the party registration field in \code{\var{voters}}, -#' it should be coded as 1 for Democrat, 2 for Republican, and 0 for Other. -#' @param census.data A census data object, a list indexed by state names, -#' which contains census data on demo, county, tract and block. -#' @param surname.only A \code{TRUE}/\code{FALSE} object. If \code{TRUE}, race predictions will -#' only use surname data and calculate Pr(Race | Surnname). Default is \code{FALSE}. -#' @return Output will be an object of class \code{data.frame}. It will -#' consist of the original user-input data with additional columns that -#' contain predicted probabilities for each race in \code{races}. 
-#' -#' @examples -#' data(voters) -#' race.pred(voters = voters, races = c("asian"), surname.only = TRUE) -#' \dontrun{race.pred(voters = voters, races = c("white", "black", "latino"), -#' census = "tract", census.key = "...", demo = TRUE)} -#' \dontrun{race.pred(voters = voters, races = c("white", "black", "latino", "asian", "other"), -#' census = "tract", census.key = "...", party = "PID")} -#' \dontrun{race.pred(voters = voters, races = c("white", "black", "latino", "asian", "other"), -#' census = "tract", census.data = censusObjs, party = "PID")} -#' @export - -## Race Prediction Function -race.pred <- function(voters, races = c("white", "black", "latino", "asian", "other"), - name.clean = TRUE, surname.only = FALSE, - census = "", census.key = "", demo = FALSE, party, census.data = NA) { - - vars.orig <- names(voters) - - ## Subset user-specified races (maximum of five) - eth <- c("whi", "bla", "his", "asi", "oth")[c("white", "black", "latino", "asian", "other") %in% races] - - if (census == "" & demo == TRUE) { - stop('Cannot set demo to TRUE without specifying census option.') - } - - ## Merge in Pr(Race | Surname) if necessary - if (name.clean == TRUE) { - voters <- name.clean(voters) - } - - ## Surname-Only Predictions - if (surname.only == TRUE) { - for (k in 1:length(eth)) { - voters[paste("pred", eth[k], sep = ".")] <- voters[paste("p", eth[k], sep = "_")] - } - pred <- paste("pred", eth, sep = ".") - return(voters[c(vars.orig, pred)]) - } - - ## Merge in Pr(Party | Race) if necessary - if (missing(party) == F) { - voters$PID <- voters[, party] - voters <- merge(voters, get("pid")[names(get("pid")) %in% "party" == F], by = "PID", all.x = T) - } - - if (census == "block") { - oldw <- getOption("warn") - options(warn = -1) - warning("Extracting U.S. 
Census 2010 block-level data -- may take a long time!") - - # - # if (is.na(census.data)) { - # voters <- census.helper.api.online(key = census.key, - # voters = voters, - # states = "all", - # geo = "block", - # demo = demo) - # } else { - # voters <- census.helper.api.local(voters = voters, - # state = toupper(unique(voters$state)[1]), - # geo = "block", - # demo = demo, - # census.data = census.data) - # } - # - - voters <- census.helper.api(key = census.key, - voters = voters, - states = "all", - geo = "block", - demo = demo, - census.data = census.data) - options(warn = oldw) - } - - if (census == "precinct") { - geo <- "precinct" - stop('Error: census.helper function does not currently support merging precinct-level data.') - } - - if (census == "tract") { - oldw <- getOption("warn") - options(warn = -1) - warning("Extracting U.S. Census 2010 tract-level data -- may take a long time!") - - # - # if (is.na(census.data)) { - # voters <- census.helper.api.online(key = census.key, - # voters = voters, - # states = "all", - # geo = "tract", - # demo = demo) - # } else { - # voters <- census.helper.api.local(voters = voters, - # state = toupper(unique(voters$state)[1]), - # geo = "tract", - # demo = demo, - # census.data = census.data) - # } - # - - voters <- census.helper.api(key = census.key, - voters = voters, - states = "all", - geo = "tract", - demo = demo, - census.data = census.data) - options(warn = oldw) - } - - if (census == "county") { - oldw <- getOption("warn") - options(warn = -1) - warning("Extracting U.S. 
Census 2010 county-level data -- may take a long time!") - - # - # if (is.na(census.data)) { - # voters <- census.helper.api.online(key = census.key, - # voters = voters, - # states = "all", - # geo = "county", - # demo = demo) - # } else { - # voters <- census.helper.api.local(voters = voters, - # state = toupper(unique(voters$state)[1]), - # geo = "county", - # demo = demo, - # census.data = census.data) - # } - # - - voters <- census.helper.api(key = census.key, - voters = voters, - states = "all", - geo = "county", - demo = demo, - census.data = census.data) - options(warn = oldw) - } - - ## Pr(Race | Surname, Geolocation) - if (missing(party)) { - for (k in 1:length(eth)) { - voters[paste("u", eth[k], sep = "_")] <- voters[paste("p", eth[k], sep = "_")] * voters[paste("r", eth[k], sep = "_")] - } - voters$u_tot <- apply(voters[paste("u", eth, sep = "_")], 1, sum, na.rm = T) - for (k in 1:length(eth)) { - voters[paste("q", eth[k], sep = "_")] <- voters[paste("u", eth[k], sep = "_")] / voters$u_tot - } - } - - ## Pr(Race | Surname, Geolocation, Party) - if (missing(party) == F) { - for (k in 1:length(eth)) { - voters[paste("u", eth[k], sep = "_")] <- voters[paste("p", eth[k], sep = "_")] * voters[paste("r", eth[k], sep = "_")] * voters[paste("r_pid", eth[k], sep = "_")] - } - voters$u_tot <- apply(voters[paste("u", eth, sep = "_")], 1, sum, na.rm = T) - for (k in 1:length(eth)) { - voters[paste("q", eth[k], sep = "_")] <- voters[paste("u", eth[k], sep = "_")] / voters$u_tot - } - } - - for (k in 1:length(eth)) { - voters[paste("pred", eth[k], sep = ".")] <- voters[paste("q", eth[k], sep = "_")] - } - pred <- paste("pred", eth, sep = ".") - - return(voters[c(vars.orig, pred)]) -} diff --git a/R/surnames2000.R b/R/surnames2000.R new file mode 100644 index 0000000..80293ef --- /dev/null +++ b/R/surnames2000.R @@ -0,0 +1,20 @@ +#' Census Surname List (2000). +#' +#' Census Surname List from 2000 with race/ethnicity probabilities by surname. 
+#' +#' @format A data frame with 157,728 rows and 6 variables: +#' \describe{ +#' \item{surname}{Surname} +#' \item{p_whi}{Pr(White | Surname)} +#' \item{p_bla}{Pr(Black | Surname)} +#' \item{p_his}{Pr(Hispanic/Latino | Surname)} +#' \item{p_asi}{Pr(Asian/Pacific Islander | Surname)} +#' \item{p_oth}{Pr(Other | Surname)} +#' } +#' +#' @docType data +#' @keywords datasets +#' @name surnames2000 +#' @examples +#' data(surnames2000) +"surnames2000" diff --git a/R/surnames2010.R b/R/surnames2010.R new file mode 100644 index 0000000..6ee0b66 --- /dev/null +++ b/R/surnames2010.R @@ -0,0 +1,20 @@ +#' Census Surname List (2010). +#' +#' Census Surname List from 2010 with race/ethnicity probabilities by surname. +#' +#' @format A data frame with 167,613 rows and 6 variables: +#' \describe{ +#' \item{surname}{Surname} +#' \item{p_whi}{Pr(White | Surname)} +#' \item{p_bla}{Pr(Black | Surname)} +#' \item{p_his}{Pr(Hispanic/Latino | Surname)} +#' \item{p_asi}{Pr(Asian/Pacific Islander | Surname)} +#' \item{p_oth}{Pr(Other | Surname)} +#' } +#' +#' @docType data +#' @keywords datasets +#' @name surnames +#' @examples +#' data(surnames) +"surnames" diff --git a/R/sysdata.rda b/R/sysdata.rda new file mode 100644 index 0000000..c784baf Binary files /dev/null and b/R/sysdata.rda differ diff --git a/R/vecToChunk.R b/R/vec_to_chunk.R similarity index 76% rename from R/vecToChunk.R rename to R/vec_to_chunk.R index bfe46b4..26e9f36 100644 --- a/R/vecToChunk.R +++ b/R/vec_to_chunk.R @@ -1,6 +1,6 @@ #' Variable vector into chunks. #' -#' \code{vecToChunk} takes a list of variables and collects them into 50-variable chunks. +#' \code{vec_to_chunk} takes a list of variables and collects them into 50-variable chunks. #' #' This function takes a list of variable names and collects them into chunks with no more than #' 50 variables each. This helps to get around requests with more than 50 variables,because the @@ -11,14 +11,14 @@ #' @return Object of class \code{list}. 
#' #' @examples -#' vecToChunk(x = c(paste("P012F0", seq(10:49), sep = ""), paste("P012I0", seq(10, 49), sep = ""))) +#' vec_to_chunk(x = c(paste("P012F0", seq(10, 49), sep = ""), paste("P012I0", seq(10, 49), sep = ""))) #' #' @references #' Based on code authored by Nicholas Nagle, which is available #' \href{http://rstudio-pubs-static.s3.amazonaws.com/19337_2e7f827190514c569ea136db788ce850.html}{here}. #' #' @export -vecToChunk <- function(x){ +vec_to_chunk <- function(x){ s <- seq_along(x) x1 <- split(x, ceiling(s/50)) return(x1) diff --git a/R/voters.R b/R/voters.R new file mode 100644 index 0000000..9b2f9b7 --- /dev/null +++ b/R/voters.R @@ -0,0 +1,25 @@ +#' Example voter file. +#' +#' An example dataset containing voter file information. +#' +#' @format A data frame with 10 rows and 12 variables: +#' \describe{ +#' \item{VoterID}{Voter identifier (numeric)} +#' \item{surname}{Surname} +#' \item{state}{State of residence} +#' \item{CD}{Congressional district} +#' \item{county}{Census county (three-digit code)} +#' \item{tract}{Census tract (six-digit code)} +#' \item{block}{Census block (four-digit code)} +#' \item{precinct}{Voting precinct} +#' \item{age}{Age in years} +#' \item{sex}{0=male, 1=female} +#' \item{party}{Party registration (character)} +#' \item{PID}{Party registration (numeric)} +#' } +#' @docType data +#' @keywords datasets +#' @name voters +#' @examples +#' data(voters) +"voters" diff --git a/README.md b/README.md index e0b7a8c..9e053d3 100644 --- a/README.md +++ b/README.md @@ -1,36 +1,95 @@ # wru: Who Are You? Bayesian Prediction of Racial Category Using Surname and Geolocation [![Build Status](https://travis-ci.org/kosukeimai/wru.svg?branch=master)](https://travis-ci.org/kosukeimai/wru) [![CRAN_Status_Badge](http://www.r-pkg.org/badges/version/wru)](https://cran.r-project.org/package=wru) -The R package implements the methods proposed in Imai, Kosuke and Kabir Khanna. (2016). 
``[Improving Ecological Inference by Predicting Individual Ethnicity from Voter Registration Record.](http://imai.princeton.edu/research/race.html)'' Political Analysis, Vol. 24, No. 2 (Spring), pp. 263-272. doi: 10.1093/pan/mpw001 +This R package implements the methods proposed in Imai, K. and Khanna, K. (2016). "[Improving Ecological Inference by Predicting Individual Ethnicity from Voter Registration Record.](http://imai.princeton.edu/research/race.html)" Political Analysis, Vol. 24, No. 2 (Spring), pp. 263-272. doi: 10.1093/pan/mpw001. ### Using wru -To start using the package, get Census Data API Key from [http://api.census.gov/data/key_signup.html](http://api.census.gov/data/key_signup.html) +Here is a simple example that predicts the race/ethnicity of voters based only on their surnames. +```r +library(wru) +data(voters) +predict_race(voter.file = voters, surname.only = T) +``` -Once you have the key, you can dive right in. The package downloads relevant data on demand. +The above produces the following output, where the last five columns are probabilistic race/ethnicity predictions (e.g., 'pred.his' is the probability of being Hispanic/Latino): +```r +# "Proceeding with surname-only predictions ..." 
+# VoterID surname state CD county tract block precinct age sex party PID pred.whi pred.bla pred.his pred.asi pred.oth +# 1 Khanna NJ 12 021 004000 3001 6 29 0 Ind 0 0.0676 0.00430000 0.00820000 0.86680000 0.05310 +# 2 Imai NJ 12 021 004501 1025 40 0 Dem 1 0.0812 0.00240000 0.06890000 0.73750000 0.11000 +# 3 Velasco NY 12 061 004800 6001 33 0 Rep 2 0.0594 0.00260000 0.82270000 0.10510000 0.01020 +# 4 Fifield NJ 12 021 004501 1025 27 0 Dem 1 0.9355 0.00220000 0.02850000 0.00780000 0.02590 +# 5 Zhou NJ 12 021 004501 1025 28 1 Dem 1 0.0098 0.00180000 0.00065000 0.98200000 0.00575 +# 6 Ratkovic NJ 12 021 004000 1025 35 0 Ind 0 0.9187 0.01083333 0.01083333 0.01083333 0.04880 +# 7 Johnson NY 9 061 015100 4000 25 0 Dem 1 0.5897 0.34630000 0.02360000 0.00540000 0.03500 +# 8 Lopez NJ 12 021 004501 1025 33 0 Rep 2 0.0486 0.00570000 0.92920000 0.01020000 0.00630 +# 9 Wantchekon NJ 12 021 004501 1025 50 0 Rep 2 0.6665 0.08530000 0.13670000 0.07970000 0.03180 +# 10 Morse DC 0 001 001301 3005 29 1 Rep 2 0.9054 0.04310000 0.02060000 0.00720000 0.02370 +``` -For instance, to get the race of someone with name last name Smith, write in where `---' should be replaced with the Census Data API Key you obtained: +In order to predict race/ethnicity based on surnames AND geolocation, first request a U.S. Census API key [here](http://api.census.gov/data/key_signup.html). Once you have an API key, you can use the package to download relevant Census geographic data on demand and condition race/ethnicity predictions on geolocation (county, tract, or block). +The following example predicts the race/ethnicity of voters based on their surnames, Census tract of residence (census.geo = "tract"), and party registration (party = "PID"). Note that a valid API key must be provided in the input parameter 'census.key' in order for the function to download the relevant tract-level data. 
```r -race.pred(voters = data.frame(surname="Smith"), races = c("white", "black", "latino"), census = "tract", - census.key = "---", demo = TRUE, surname.only=TRUE) +library(wru) +data(voters) +predict_race(voter.file = voters, census.geo = "tract", census.key = "", party = "PID") ``` -which returns the predicted probabilities for each racial category: +The above returns the following output. +``` +# VoterID surname state CD county tract block precinct age sex party PID pred.whi pred.bla pred.his pred.asi pred.oth +# 1 Khanna NJ 12 021 004000 3001 6 29 0 Ind 0 0.081856291 0.0021396565 0.0110451405 0.828313291 0.076645621 +# 6 Ratkovic NJ 12 021 004000 1025 35 0 Ind 0 0.916936771 0.0044432219 0.0120276229 0.008532929 0.058059455 +# 4 Fifield NJ 12 021 004501 1025 27 0 Dem 1 0.895620643 0.0022078678 0.0139457411 0.023345853 0.064879895 +# 5 Zhou NJ 12 021 004501 1025 28 1 Dem 1 0.003164229 0.0006092345 0.0001072684 0.991261466 0.004857802 +# 2 Imai NJ 12 021 004501 1025 40 0 Dem 1 0.029936354 0.0009275220 0.0129831039 0.850040743 0.106112277 +# 8 Lopez NJ 12 021 004501 1025 33 0 Rep 2 0.231046860 0.0016485574 0.6813780115 0.053180270 0.032746301 +# 9 Wantchekon NJ 12 021 004501 1025 50 0 Rep 2 0.817841573 0.0063677130 0.0258733496 0.107254103 0.042663261 +# 3 Velasco NY 12 061 004800 6001 33 0 Rep 2 0.223924118 0.0002913000 0.4451163607 0.313431417 0.017236805 +# 7 Johnson NY 9 061 015100 4000 25 0 Dem 1 0.241417483 0.6900686166 0.0293556870 0.011105140 0.028053073 +# 10 Morse DC 0 001 001301 3005 29 1 Rep 2 0.983300770 0.0006116706 0.0034070782 0.004823439 0.007857042 +``` +It is also possible to pre-download Census geographic data, which can save time when running predict_race(). The example dataset 'voters' includes people in DC, NJ, and NY. The following example subsets voters in DC and NJ, and then uses get_census_data() to download Census geographic data in these two states (input parameter 'key' requires valid API key). 
Census data is assigned to an object named census.dc.nj. The predict_race() statement predicts the race/ethnicity of voters in DC and NJ using the pre-saved Census data (census.data = census.dc.nj). This example conditions race/ethnicity predictions on voters' surnames, block of residence (census.geo = "block"), age (age = TRUE), and party registration (party = "PID"). + +Please note that the input parameters 'age' and 'sex' must have the same values in get_census_data() and predict_race(), i.e., TRUE in both or FALSE in both. In this case, predictions are conditioned on age but not sex, so age = TRUE and sex = FALSE in both the get_census_data() and predict_race() statements. +```r +library(wru) +data(voters) +voters.dc.nj <- voters[c(-3, -7), ] # remove two NY cases from dataset +census.dc.nj <- get_census_data(key = "", state = c("DC", "NJ"), age = TRUE, sex = FALSE) # create Census data object covering DC and NJ +predict_race(voter.file = voters.dc.nj, census.geo = "block", census.data = census.dc.nj, age = TRUE, sex = FALSE, party = "PID") ``` -# surname pred.whi pred.bla pred.his -# 1 Smith 0.7335 0.2222 0.0156 + +The last two lines above are equivalent to the following: +```r +predict_race(voter.file = voters.dc.nj, census.geo = "block", census.key = "", age = TRUE, sex = FALSE, party = "PID") ``` -It is also possible to pre-download and save the census data for the intended states. For example, suppose the voters data involves people in DE, FL, NJ: +Using pre-downloaded Census data may be useful for the following reasons: +* You can save a lot of time in future runs of predict_race() if the relevant Census data has already been saved; +* The machines used to run predict_race() may not have internet access; +* You can obtain timely snapshots of Census geographic data that match your voter file. + +Downloading data using get_census_data() may take a long time, especially at the block level or in large states. 
The example below uses the census_geo_api() function to download county-level and tract-level data in DC and NJ, while avoiding downloading block-level data. Note that this function has the input parameter 'state' that requires a two-letter state abbreviation to proceed. ```r -censusObj <- getCensusData("...", state = c("FL", "NJ", "DE"), demo = TRUE) -## where ... is the census key to access the census website -race.pred(voters = data.frame(surname="Smith"), races = c("white", "black", "latino"), census = "tract", demo = TRUE, surname.only=TRUE, census.data = censusObj) +censusObj2 = list() + +county.dc <- census_geo_api(key = "", state = "DC", geo = "county") +tract.dc <- census_geo_api(key = "", state = "DC", geo = "tract") +censusObj2[["DC"]] <- list(state = "DC", demo = TRUE, county = county.dc, tract = tract.dc) + +tract.nj <- census_geo_api(key = "", state = "NJ", geo = "tract") +county.nj <- census_geo_api(key = "", state = "NJ", geo = "county") +censusObj2[["NJ"]] <- list(state = "NJ", demo = TRUE, county = county.nj, tract = tract.nj) ``` -This is useful for the following reasons: -(1) the machine runs race.pred may not have internet access. -(2) No redundent download of the census data for each state. -(3) Timely snapshots of the census data that match the voter data. +After saving the data in censusObj2 above, we can condition race/ethnicity predictions on different combinations of input variables, without having to re-download the relevant Census data. 
+```r +predict_race(voter.file = voters.dc.nj, census.geo = "county", census.data = censusObj2) # Pr(Race | Surname, County) +predict_race(voter.file = voters.dc.nj, census.geo = "tract", census.data = censusObj2) # Pr(Race | Surname, Tract) +predict_race(voter.file = voters.dc.nj, census.geo = "county", census.data = censusObj2, party = "PID") # Pr(Race | Surname, County, Party) +predict_race(voter.file = voters.dc.nj, census.geo = "tract", census.data = censusObj2, party = "PID") # Pr(Race | Surname, Tract, Party) +``` diff --git a/data/State.FIPS.RData b/data/State.FIPS.RData deleted file mode 100644 index d374442..0000000 Binary files a/data/State.FIPS.RData and /dev/null differ diff --git a/data/names.all.RData b/data/names.all.RData deleted file mode 100644 index 604bf8c..0000000 Binary files a/data/names.all.RData and /dev/null differ diff --git a/data/pid.RData b/data/pid.RData deleted file mode 100644 index 3f03d46..0000000 Binary files a/data/pid.RData and /dev/null differ diff --git a/data/surnames2000.RData b/data/surnames2000.RData new file mode 100644 index 0000000..ece232a Binary files /dev/null and b/data/surnames2000.RData differ diff --git a/data/surnames2010.RData b/data/surnames2010.RData new file mode 100644 index 0000000..1d2d98e Binary files /dev/null and b/data/surnames2010.RData differ diff --git a/man/State.FIPS.Rd b/man/State.FIPS.Rd index c00b21f..c90bd2c 100644 --- a/man/State.FIPS.Rd +++ b/man/State.FIPS.Rd @@ -1,9 +1,22 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/State.FIPS.R +\docType{data} \name{State.FIPS} \alias{State.FIPS} -\title{Dataset containing FIPS codes for each state.} +\title{State's FIPS.} +\format{A data frame with 55 rows and 2 variables: +\describe{ + \item{State}{State} + \item{FIPS}{FIPS} + #' }} +\usage{ +State.FIPS +} \description{ -This dataset contains the two-digit FIPS code for all 50 U.S. states, territories, and the District of Columbia. +List of States and their FIPS. 
} -\format{ -A data frame with 55 rows and 2 variables (State and FIPS). +\examples{ +data(State.FIPS) } +\keyword{datasets} + diff --git a/man/census.helper.api.Rd b/man/census.helper.api.Rd deleted file mode 100644 index 654c2a5..0000000 --- a/man/census.helper.api.Rd +++ /dev/null @@ -1,61 +0,0 @@ -% Generated by roxygen2: do not edit by hand -% Please edit documentation in R/census.helper.api.R -\name{census.helper.api} -\alias{census.helper.api} -\title{Census helper function.} -\usage{ -census.helper.api(key, voters, states = "all", geo = "tract", - demo = FALSE, census.data = NA) -} -\arguments{ -\item{key}{A required character object. Must contain user's Census API -key, which can be requested \href{http://api.census.gov/data/key_signup.html}{here}.} - -\item{voters}{An object of class \code{data.frame}. Must contain field(s) -named \code{\var{county}}, \code{\var{tract}}, and/or \code{\var{block}} -specifying geolocation. These should be character variables that match up with -U.S. Census categories. County should be three characters (e.g., "031" not "31"), -tract should be six characters, and block should be four characters.} - -\item{states}{A character vector specifying which states to extract -Census data for, e.g. \code{c("NJ", "NY")}. Default is \code{"all"}, which extracts -Census data for all states contained in user-input data.} - -\item{geo}{A character object specifying what aggregation level to use. -Use \code{"county"}, \code{"tract"}, or \code{"block"}. Default is \code{"tract"}. -Warning: extracting block-level data takes very long.} - -\item{demo}{A \code{TRUE}/\code{FALSE} object indicating whether to condition on -demographics (i.e., age and sex) or not. If \code{TRUE}, function will return -Pr(Geolocation, Age, Sex | Race). If \code{FALSE}, function wil return -Pr(Geolocation | Race). Default is \code{FALSE}.} - -\item{census.data}{A optional census object holding census data that is already -provided. 
If missing, function will retrive the census data online.} -} -\value{ -Output will be an object of class \code{data.frame}. It will - consist of the original user-input data with additional columns of - Census data. -} -\description{ -\code{census.helper.api} links user-input dataset with Census data. -} -\details{ -This function allows users to link their geocoded dataset (e.g., voter file) -with U.S. Census 2010 data. The function extracts Census Summary File data -at the tract or block level using the 'UScensus2010' package. Census data -calculated are Pr(Geolocation | Race) where geolocation is tract or block. -} -\examples{ -\dontshow{data(voters)} -\dontrun{census.helper.api(key = "...", voters = voters, states = "nj", geo = "block")} -\dontrun{census.helper.api(key = "...", voters = voters, states = "all", geo = "tract", -demo = TRUE)} - -} -\references{ -Relies on getCensusApi, getCensusApi2, and vecToChunk functions authored by Nicholas Nagle, -available \href{http://rstudio-pubs-static.s3.amazonaws.com/19337_2e7f827190514c569ea136db788ce850.html}{here}. -} - diff --git a/man/census.helper.api.local.Rd b/man/census.helper.api.local.Rd deleted file mode 100644 index 393cb2c..0000000 --- a/man/census.helper.api.local.Rd +++ /dev/null @@ -1,57 +0,0 @@ -% Generated by roxygen2: do not edit by hand -% Please edit documentation in R/census.helper.api.R -\name{census.helper.api.local} -\alias{census.helper.api.local} -\title{Census helper function.} -\usage{ -census.helper.api.local(voters, states = "all", geo = "tract", - demo = FALSE, census.data = NA) -} -\arguments{ -\item{voters}{An object of class \code{data.frame}. Must contain field(s) -named \code{\var{county}}, \code{\var{tract}}, and/or \code{\var{block}} -specifying geolocation. These should be character variables that match up with -U.S. Census categories. 
County should be three characters (e.g., "031" not "31"), -tract should be six characters, and block should be four characters.} - -\item{states}{A state to use Census data for, e.g. \code{c("NJ", "NY")}. -Default is \code{"NA"}.} - -\item{geo}{A character object specifying what aggregation level to use. -Use \code{"county"}, \code{"tract"}, or \code{"block"}. Default is \code{"tract"}. -Warning: extracting block-level data takes very long.} - -\item{demo}{A \code{TRUE}/\code{FALSE} object indicating whether to condition on -demographics (i.e., age and sex) or not. If \code{TRUE}, function will return -Pr(Geolocation, Age, Sex | Race). If \code{FALSE}, function wil return -Pr(Geolocation | Race). Default is \code{FALSE}.} - -\item{census.data}{A optional census object holding census data that is already -provided. If missing, function will retrive the census data online.} -} -\value{ -Output will be an object of class \code{data.frame}. It will - consist of the original user-input data with additional columns of - Census data. -} -\description{ -\code{census.helper.api.local} links user-input dataset with Census data. -} -\details{ -This function allows users to link their geocoded dataset (e.g., voter file) -with U.S. Census 2010 data. The function extracts Census Summary File data -at the tract or block level using the 'UScensus2010' package. Census data -calculated are Pr(Geolocation | Race) where geolocation is tract or block. -} -\examples{ -\dontshow{data(voters)} -\dontrun{census.helper.api.local(voters = voters, states = "nj", geo = "block", census.data = x)} -\dontrun{census.helper.api.local(voters = voters, states = "all", geo = "tract", demo = TRUE, -census.data = x)} - -} -\references{ -Relies on getCensusApi, getCensusApi2, and vecToChunk functions authored by Nicholas Nagle, -available \href{http://rstudio-pubs-static.s3.amazonaws.com/19337_2e7f827190514c569ea136db788ce850.html}{here}. 
-} - diff --git a/man/census.helper.api.online.Rd b/man/census.helper.api.online.Rd deleted file mode 100644 index c1e26c9..0000000 --- a/man/census.helper.api.online.Rd +++ /dev/null @@ -1,58 +0,0 @@ -% Generated by roxygen2: do not edit by hand -% Please edit documentation in R/census.helper.api.R -\name{census.helper.api.online} -\alias{census.helper.api.online} -\title{Census helper function.} -\usage{ -census.helper.api.online(key, voters, states = "all", geo = "tract", - demo = FALSE) -} -\arguments{ -\item{key}{A required character object. Must contain user's Census API -key, which can be requested \href{http://api.census.gov/data/key_signup.html}{here}.} - -\item{voters}{An object of class \code{data.frame}. Must contain field(s) -named \code{\var{county}}, \code{\var{tract}}, and/or \code{\var{block}} -specifying geolocation. These should be character variables that match up with -U.S. Census categories. County should be three characters (e.g., "031" not "31"), -tract should be six characters, and block should be four characters.} - -\item{states}{A character vector specifying which states to extract -Census data for, e.g. \code{c("NJ", "NY")}. Default is \code{"all"}, which extracts -Census data for all states contained in user-input data.} - -\item{geo}{A character object specifying what aggregation level to use. -Use \code{"county"}, \code{"tract"}, or \code{"block"}. Default is \code{"tract"}. -Warning: extracting block-level data takes very long.} - -\item{demo}{A \code{TRUE}/\code{FALSE} object indicating whether to condition on -demographics (i.e., age and sex) or not. If \code{TRUE}, function will return -Pr(Geolocation, Age, Sex | Race). If \code{FALSE}, function wil return -Pr(Geolocation | Race). Default is \code{FALSE}.} -} -\value{ -Output will be an object of class \code{data.frame}. It will - consist of the original user-input data with additional columns of - Census data. 
-} -\description{ -\code{census.helper.api.online} links user-input dataset with Census data. -} -\details{ -This function allows users to link their geocoded dataset (e.g., voter file) -with U.S. Census 2010 data. The function extracts Census Summary File data -at the tract or block level using the 'UScensus2010' package. Census data -calculated are Pr(Geolocation | Race) where geolocation is tract or block. -} -\examples{ -\dontshow{data(voters)} -\dontrun{census.helper.api(key = "...", voters = voters, states = "nj", geo = "block")} -\dontrun{census.helper.api(key = "...", voters = voters, states = "all", geo = "tract", -demo = TRUE)} - -} -\references{ -Relies on getCensusApi, getCensusApi2, and vecToChunk functions authored by Nicholas Nagle, -available \href{http://rstudio-pubs-static.s3.amazonaws.com/19337_2e7f827190514c569ea136db788ce850.html}{here}. -} - diff --git a/man/censusData.Rd b/man/censusData.Rd deleted file mode 100644 index 9401da7..0000000 --- a/man/censusData.Rd +++ /dev/null @@ -1,47 +0,0 @@ -% Generated by roxygen2: do not edit by hand -% Please edit documentation in R/getCensusData.R -\name{censusData} -\alias{censusData} -\title{Census Data download function.} -\usage{ -censusData(key, state, geo = "tract", demo = FALSE) -} -\arguments{ -\item{key}{A required character object. Must contain user's Census API -key, which can be requested \href{http://api.census.gov/data/key_signup.html}{here}.} - -\item{state}{to extract Census data for, e.g. \code{"NJ"}.} - -\item{geo}{A character object specifying what aggregation level to use. -Use \code{"county"}, \code{"tract"}, or \code{"block"}. Default is \code{"tract"}. -Warning: extracting block-level data takes very long.} - -\item{demo}{A \code{TRUE}/\code{FALSE} object indicating whether to condition on -demographics (i.e., age and sex) or not. If \code{TRUE}, function will return -Pr(Geolocation, Age, Sex | Race). If \code{FALSE}, function wil return -Pr(Geolocation | Race). 
Default is \code{FALSE}.} -} -\value{ -Output will be an object of class \code{list}, indexed by state names. It will - consist of the original user-input data with additional columns of - Census data. -} -\description{ -\code{censusData} retrieve Census data. -} -\details{ -This function allows users to download (e.g., voter file) the U.S. Census 2010 data, -at either county level, tract level or block level. -} -\examples{ -\dontshow{data(voters)} -\dontrun{censusData(key = "...", states = c("NJ", "DE"), geo = "block")} -\dontrun{censusData(key = "...", states = "FL", geo = "tract", -demo = TRUE)} - -} -\references{ -Relies on getCensusApi, getCensusApi2, and vecToChunk functions authored by Nicholas Nagle, -available \href{http://rstudio-pubs-static.s3.amazonaws.com/19337_2e7f827190514c569ea136db788ce850.html}{here}. -} - diff --git a/man/census_geo_api.Rd b/man/census_geo_api.Rd new file mode 100644 index 0000000..55898fe --- /dev/null +++ b/man/census_geo_api.Rd @@ -0,0 +1,51 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/census_geo_api.R +\name{census_geo_api} +\alias{census_geo_api} +\title{Census Data download function.} +\usage{ +census_geo_api(key, state, geo = "tract", age = FALSE, sex = FALSE) +} +\arguments{ +\item{key}{A required character object. Must contain user's Census API +key, which can be requested \href{http://api.census.gov/data/key_signup.html}{here}.} + +\item{state}{A required character object specifying which state to extract Census data for, +e.g., \code{"NJ"}.} + +\item{geo}{A character object specifying what aggregation level to use. +Use \code{"county"}, \code{"tract"}, or \code{"block"}. Default is \code{"tract"}. +Warning: extracting block-level data takes very long.} + +\item{age}{A \code{TRUE}/\code{FALSE} object indicating whether to condition on +age or not. If \code{FALSE} (default), function will return Pr(Geolocation | Race). 
+If \code{TRUE}, function will return Pr(Geolocation, Age | Race). +If \code{\var{sex}} is also \code{TRUE}, function will return Pr(Geolocation, Age, Sex | Race).} + +\item{sex}{A \code{TRUE}/\code{FALSE} object indicating whether to condition on +sex or not. If \code{FALSE} (default), function will return Pr(Geolocation | Race). +If \code{TRUE}, function will return Pr(Geolocation, Sex | Race). +If \code{\var{age}} is also \code{TRUE}, function will return Pr(Geolocation, Age, Sex | Race).} +} +\value{ +Output will be an object of class \code{list}, indexed by state names. It will + consist of the original user-input data with additional columns of Census geographic data. +} +\description{ +\code{census_geo_api} retrieves U.S. Census geographic data for a given state. +} +\details{ +This function allows users to download U.S. Census 2010 geographic data, +at either the county, tract, or block level, for a particular state. +} +\examples{ +\dontshow{data(voters)} +\dontrun{census_geo_api(key = "...", state = "NJ", geo = "block")} +\dontrun{census_geo_api(key = "...", state = "FL", geo = "tract", age = TRUE, sex = TRUE)} + +} +\references{ +Relies on get_census_api, get_census_api_2, and vec_to_chunk functions authored by Nicholas Nagle, +available \href{http://rstudio-pubs-static.s3.amazonaws.com/19337_2e7f827190514c569ea136db788ce850.html}{here}. +} + diff --git a/man/census_helper.Rd b/man/census_helper.Rd new file mode 100644 index 0000000..ce0483a --- /dev/null +++ b/man/census_helper.Rd @@ -0,0 +1,66 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/census_helper.R +\name{census_helper} +\alias{census_helper} +\title{Census helper function.} +\usage{ +census_helper(key, voter.file, states = "all", geo = "tract", age = FALSE, + sex = FALSE, census.data = NA) +} +\arguments{ +\item{key}{A required character object.
Must contain user's Census API +key, which can be requested \href{http://api.census.gov/data/key_signup.html}{here}.} + +\item{voter.file}{An object of class \code{data.frame}. Must contain field(s) +named \code{\var{county}}, \code{\var{tract}}, and/or \code{\var{block}} +specifying geolocation. These should be character variables that match up with +U.S. Census categories. County should be three characters (e.g., "031" not "31"), +tract should be six characters, and block should be four characters.} + +\item{states}{A character vector specifying which states to extract +Census data for, e.g. \code{c("NJ", "NY")}. Default is \code{"all"}, which extracts +Census data for all states contained in user-input data.} + +\item{geo}{A character object specifying what aggregation level to use. +Use \code{"county"}, \code{"tract"}, or \code{"block"}. Default is \code{"tract"}. +Warning: extracting block-level data takes very long.} + +\item{age}{A \code{TRUE}/\code{FALSE} object indicating whether to condition on +age or not. If \code{FALSE} (default), function will return Pr(Geolocation | Race). +If \code{TRUE}, function will return Pr(Geolocation, Age | Race). +If \code{\var{sex}} is also \code{TRUE}, function will return Pr(Geolocation, Age, Sex | Race).} + +\item{sex}{A \code{TRUE}/\code{FALSE} object indicating whether to condition on +sex or not. If \code{FALSE} (default), function will return Pr(Geolocation | Race). +If \code{TRUE}, function will return Pr(Geolocation, Sex | Race). +If \code{\var{age}} is also \code{TRUE}, function will return Pr(Geolocation, Age, Sex | Race).} + +\item{census.data}{An optional census object of class \code{list} containing +pre-saved Census geographic data. Can be created using \code{get_census_data} function. +If \code{\var{census.data}} is provided, the \code{\var{age}} element must have the same value +as the \code{\var{age}} option specified in this function (i.e., \code{TRUE} in both or +\code{FALSE} in both).
Similarly, the \code{\var{sex}} element in the object provided in +\code{\var{census.data}} must have the same value as the \code{\var{sex}} option here. +If \code{\var{census.data}} is missing, Census geographic data will be obtained via Census API.} +} +\value{ +Output will be an object of class \code{data.frame}. It will + consist of the original user-input data with additional columns of + Census data. +} +\description{ +\code{census_helper} links user-input dataset with Census geographic data. +} +\details{ +This function allows users to link their geocoded dataset (e.g., voter file) +with U.S. Census 2010 data. The function extracts Census Summary File data +at the county, tract, or block level using the 'UScensus2010' package. Census data +calculated are Pr(Geolocation | Race) where geolocation is county, tract, or block. +} +\examples{ +\dontshow{data(voters)} +\dontrun{census_helper(key = "...", voter.file = voters, states = "nj", geo = "block")} +\dontrun{census_helper(key = "...", voter.file = voters, states = "all", geo = "tract", age = TRUE, sex = TRUE)} + +} + diff --git a/man/getCensusData.Rd b/man/getCensusData.Rd deleted file mode 100644 index 72eabf3..0000000 --- a/man/getCensusData.Rd +++ /dev/null @@ -1,33 +0,0 @@ -% Generated by roxygen2: do not edit by hand -% Please edit documentation in R/getCensusData.R -\name{getCensusData} -\alias{getCensusData} -\title{Title return -Multilevel Census Data download function.} -\usage{ -getCensusData(key, states, demo = FALSE) -} -\arguments{ -\item{key}{A required character object. Must contain user's Census API -key, which can be requested \href{http://api.census.gov/data/key_signup.html}{here}.} - -\item{states}{which states to extract -Census data for, e.g. \code{c("NJ", "NY")}.} - -\item{demo}{A \code{TRUE}/\code{FALSE} object indicating whether to condition on -demographics (i.e., age and sex) or not. If \code{TRUE}, function will return -Pr(Geolocation, Age, Sex | Race). 
If \code{FALSE}, function wil return -Pr(Geolocation | Race). Default is \code{FALSE}.} -} -\value{ -Output will be an census object of class which is a list consist of \code{state}, -\code{demo}, \code{county level census}, \code{tract level census} and \code{block level census}. -Have the census data available could make \code{census.helper.api} runs more efficient. -} -\description{ -\code{getCensusData} returns a Census data obj for a state. -} -\examples{ -\dontrun{getCensusData(key = "...", states = c("NJ", "DE"), demo = TRUE)} -} - diff --git a/man/getCensusApi.Rd b/man/get_census_api.Rd similarity index 82% rename from man/getCensusApi.Rd rename to man/get_census_api.Rd index cc9b2c2..3ddec0d 100644 --- a/man/getCensusApi.Rd +++ b/man/get_census_api.Rd @@ -1,10 +1,10 @@ % Generated by roxygen2: do not edit by hand -% Please edit documentation in R/getCensusApi.R -\name{getCensusApi} -\alias{getCensusApi} +% Please edit documentation in R/get_census_api.R +\name{get_census_api} +\alias{get_census_api} \title{Census API function.} \usage{ -getCensusApi(data_url, key, vars, region) +get_census_api(data_url, key, vars, region) } \arguments{ \item{data_url}{URL root of the API, including the question mark, @@ -27,14 +27,14 @@ If successful, output will be an object of class \code{data.frame}. If unsuccessful, function prints the URL query that caused the error. } \description{ -\code{getCensusApi} obtains U.S. Census data via the public API. +\code{get_census_api} obtains U.S. Census data via the public API. } \details{ This function obtains U.S. Census data via the public API. User can specify the variables and region(s) for which to obtain data. 
} \examples{ -\dontrun{getCensusApi(data_url = "http://api.census.gov/data/2010/sf1?", key = "...", +\dontrun{get_census_api(data_url = "http://api.census.gov/data/2010/sf1?", key = "...", vars = c("P0050003","P0050004","P0050005", "P0050006"), region = "for=county:*&in=state:34")} } diff --git a/man/getCensusApi2.Rd b/man/get_census_api_2.Rd similarity index 77% rename from man/getCensusApi2.Rd rename to man/get_census_api_2.Rd index 4cba47a..fffc34e 100644 --- a/man/getCensusApi2.Rd +++ b/man/get_census_api_2.Rd @@ -1,10 +1,10 @@ % Generated by roxygen2: do not edit by hand -% Please edit documentation in R/getCensusApi2.R -\name{getCensusApi2} -\alias{getCensusApi2} +% Please edit documentation in R/get_census_api_2.R +\name{get_census_api_2} +\alias{get_census_api_2} \title{Census API URL assembler.} \usage{ -getCensusApi2(data_url, key, get, region) +get_census_api_2(data_url, key, get, region) } \arguments{ \item{data_url}{URL root of the API, including the question mark, @@ -27,15 +27,15 @@ If successful, output will be an object of class \code{data.frame}. If unsuccessful, function prints the URL query that was constructed. } \description{ -\code{getCensusApi2} assembles URL components for \code{getCensusApi}. +\code{get_census_api_2} assembles URL components for \code{get_census_api}. } \details{ This function assembles the URL components and sends the request to the Census server. -It is used by the \code{getCensusApi} function. The user should not need to call this +It is used by the \code{get_census_api} function. The user should not need to call this function directly. 
} \examples{ -\dontrun{getCensusApi2(data_url = "http://api.census.gov/data/2010/sf1?", key = "...", +\dontrun{get_census_api_2(data_url = "http://api.census.gov/data/2010/sf1?", key = "...", get = c("P0050003","P0050004","P0050005", "P0050006"), region = "for=county:*&in=state:34")} } diff --git a/man/get_census_data.Rd b/man/get_census_data.Rd new file mode 100644 index 0000000..393e5bf --- /dev/null +++ b/man/get_census_data.Rd @@ -0,0 +1,38 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/get_census_data.R +\name{get_census_data} +\alias{get_census_data} +\title{Multilevel Census data download function.} +\usage{ +get_census_data(key, states, age = FALSE, sex = FALSE) +} +\arguments{ +\item{key}{A required character object containing a valid Census API key, +which can be requested \href{http://api.census.gov/data/key_signup.html}{here}.} + +\item{states}{which states to extract Census data for, e.g., \code{c("NJ", "NY")}.} + +\item{age}{A \code{TRUE}/\code{FALSE} object indicating whether to condition on +age or not. If \code{FALSE} (default), function will return Pr(Geolocation | Race). +If \code{TRUE}, function will return Pr(Geolocation, Age | Race). +If \code{\var{sex}} is also \code{TRUE}, function will return Pr(Geolocation, Age, Sex | Race).} + +\item{sex}{A \code{TRUE}/\code{FALSE} object indicating whether to condition on +sex or not. If \code{FALSE} (default), function will return Pr(Geolocation | Race). +If \code{TRUE}, function will return Pr(Geolocation, Sex | Race). +If \code{\var{age}} is also \code{TRUE}, function will return Pr(Geolocation, Age, Sex | Race).} +} +\value{ +Output will be an object of class \code{list} indexed by state. +Output will contain the following elements: \code{state}, \code{age}, \code{sex}, +\code{county}, \code{tract} and \code{block}. +} +\description{ +\code{get_census_data} returns county-, tract-, and block-level Census data +for specified state(s). 
Using this function to download Census data in advance +can save considerable time when running \code{predict_race} and \code{census_helper}. +} +\examples{ +\dontrun{get_census_data(key = "...", states = c("NJ", "NY"), age = TRUE, sex = FALSE)} +} + diff --git a/man/merge_surnames.Rd b/man/merge_surnames.Rd new file mode 100644 index 0000000..3c65edc --- /dev/null +++ b/man/merge_surnames.Rd @@ -0,0 +1,68 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/merge_surnames.R +\name{merge_surnames} +\alias{merge_surnames} +\title{Surname probability merging function.} +\usage{ +merge_surnames(voter.file, surname.year = 2010, clean.surname = T, + impute.missing = T) +} +\arguments{ +\item{voter.file}{An object of class \code{data.frame}. Must contain a field +named 'surname' containing list of surnames to be merged with Census lists.} + +\item{surname.year}{An object of class \code{numeric} indicating which year +Census Surname List is from. Accepted values are \code{2010} and \code{2000}. +Default is \code{2010}.} + +\item{clean.surname}{A \code{TRUE}/\code{FALSE} object. If \code{TRUE}, +\code{clean.surname} function will be run to clean raw surnames in +\code{\var{voter.file}} before matching them with Census lists, +in order to increase the chance of finding a match. +See \code{clean.surname} documentation for details. +Default is \code{TRUE}.} + +\item{impute.missing}{A \code{TRUE}/\code{FALSE} object. If \code{TRUE}, +race/ethnicity probabilities will be imputed for unmatched names using +race/ethnicity distribution for all other names (i.e., not on Census List). +Default is \code{TRUE}.} +} +\value{ +Output will be an object of class \code{data.frame}. 
It will + consist of the original user-input data with additional columns that + specify the part of the name matched with Census data (\code{\var{surname.match}}), + and the probabilities Pr(Race | Surname) for each racial group + (\code{\var{p_whi}} for White, \code{\var{p_bla}} for Black, + \code{\var{p_his}} for Hispanic/Latino, + \code{\var{p_asi}} for Asian and Pacific Islander, and + \code{\var{p_oth}} for Other/Mixed). +} +\description{ +\code{merge_surnames} merges surnames in user-input dataset with corresponding + race/ethnicity probabilities from U.S. Census Surname List and Spanish Surname List. +} +\details{ +This function allows users to match surnames in their dataset with the U.S. + Census Surname List (from 2000 or 2010) and Spanish Surname List to obtain + Pr(Race | Surname) for each of the five major racial groups. + + By default, the function matches surnames to the Census list as follows + (each step only applies to surnames not matched in previous steps): + 1) Search raw surnames in Census surname list; + 2) Remove any punctuation and search again; + 3) Remove any spaces and search again; + 4) Remove suffixes (e.g., Jr) and search again; + 5) Split double-barreled surnames into two parts and search first part of name; + 6) Split double-barreled surnames into two parts and search second part of name; + 7) For any remaining names, impute probabilities using distribution + for all names not appearing on Census list. + + Note: Any name appearing only on the Spanish Surname List is assigned a + probability of 1 for Hispanics/Latinos and 0 for all other racial groups. 
+} +\examples{ +data(voters) +merge_surnames(voters) + +} + diff --git a/man/name.clean.Rd b/man/name.clean.Rd deleted file mode 100644 index 1143d17..0000000 --- a/man/name.clean.Rd +++ /dev/null @@ -1,45 +0,0 @@ -% Generated by roxygen2: do not edit by hand -% Please edit documentation in R/name.clean.R -\name{name.clean} -\alias{name.clean} -\title{Name cleaning and matching function.} -\usage{ -name.clean(voters) -} -\arguments{ -\item{voters}{An object of class \code{data.frame}. Must contain a field -named 'surname'.} -} -\value{ -Output will be an object of class \code{data.frame}. It will - consist of the original user-input data with additional columns that - specify the part of the name matched with Census data (\code{\var{surname.match}}), - and the probabilities Pr(Race | Surname) for each racial group - (\code{\var{p_whi}} for Whites, \code{\var{p_bla}} for Blacks, - \code{\var{p_his}} for Hispanics/Latinos, \code{\var{p_asi}} for Asians, and - \code{\var{p_oth}} for Others). -} -\description{ -\code{name.clean} cleans surnames in user-input dataset and merges in racial - distributions from the Census Surname List and Census Spanish Surname List. -} -\details{ -This function allows users to match surnames in their dataset with the U.S. - Census 2000 Surname List to obtain Pr(Race | Surname) for each of the - five major racial groups. The function matches user-input surnames with - Census surnames as follows (each step only applies to surnames not matched - in previous steps): - 1) match raw surnames with Census data; - 2) remove any spaces and search again; - 3) split apart double-barreled surnames into two names and match on first; - 4) split apart double-barreled surnames into two names and match on second; - 5) for any remaining names, impute probabilities from overall U.S. population. - Note: Any name appearing only on the Spanish Surname List is assigned a - probability of 1 for Hispanics/Latinos and 0 for all other racial groups. 
-} -\examples{ -data(voters) -name.clean(voters) - -} - diff --git a/man/pid.Rd b/man/pid.Rd index 449a121..4b2d2a3 100644 --- a/man/pid.Rd +++ b/man/pid.Rd @@ -1,18 +1,27 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/pid.R +\docType{data} \name{pid} \alias{pid} -\title{Dataset containing distribution of party registration by race.} +\title{Party and race} +\format{A data frame with 3 rows and 7 variables: +\describe{ + \item{party}{Dem/Rep/Ind} + \item{PID}{0/1/2} + \item{r_pid_whi}{Pr(PID | White)} + \item{r_pid_bla}{Pr(PID | Black)} + \item{r_pid_his}{Pr(PID | Hispanic/Latino)} + \item{r_pid_asi}{Pr(PID | Asian/Pacific Islander)} + \item{r_pid_oth}{Pr(PID | Other)} + #' }} +\usage{ +pid +} \description{ -A dataset containing the distribution of party registration for each major racial group in the U.S., as measured by Gallup in February 2013 (http://www.gallup.com/poll/160373/democrats-racially-diverse-republicans-mostly-white.aspx). Variables are as follows: - \itemize{ - \item party (i.e., "Dem", "Rep", or "Oth") - \item PID (i.e., Democrat = 1, Republican = 2, Other = 0) - \item r_pid_whi (i.e., Pr(Party | White) - \item r_pid_bla (i.e., Pr(Party | Black) - \item r_pid_his (i.e., Pr(Party | Hispanic/Latino) - \item r_pid_asi (i.e., Pr(Party | Asian) - \item r_pid_oth (i.e., Pr(Party | Other) - } +A table for probability of party by race. } -\format{ -A data frame with 3 rows and 7 variables. 
+\examples{ +data(pid) } +\keyword{datasets} + diff --git a/man/predict_race.Rd b/man/predict_race.Rd new file mode 100644 index 0000000..f666158 --- /dev/null +++ b/man/predict_race.Rd @@ -0,0 +1,113 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/predict_race.R +\name{predict_race} +\alias{predict_race} +\title{Race prediction function.} +\usage{ +predict_race(voter.file, census.surname = TRUE, surname.only = FALSE, + surname.year = 2010, census.geo, census.key, census.data = NA, + age = FALSE, sex = FALSE, party) +} +\arguments{ +\item{voter.file}{An object of class \code{data.frame}. +Must contain a row for each individual being predicted, +as well as a field named \code{\var{surname}} containing each individual's surname. +If using geolocation in predictions, \code{\var{voter.file}} must contain a field named +\code{\var{state}}, which contains the two-character abbreviation for each individual's +state of residence (e.g., \code{"nj"} for New Jersey). +If using Census geographic data in race/ethnicity predictions, +\code{\var{voter.file}} must also contain at least one of the following fields: +\code{\var{county}}, \code{\var{tract}}, and/or \code{\var{block}}. +These fields should contain character strings matching U.S. Census categories. +County is three characters (e.g., \code{"031"} not \code{"31"}), +tract is six characters, and block is four characters. +See below for other optional fields.} + +\item{census.surname}{A \code{TRUE}/\code{FALSE} object. If \code{TRUE}, +function will call \code{merge_surnames} to merge in Pr(Race | Surname) +from U.S. Census Surname List (2000 or 2010) and Spanish Surname List. +If \code{FALSE}, \code{voter.file} object must contain additional fields specifying +Pr(Race | Surname), named as follows: \code{\var{p_whi}} for Whites, +\code{\var{p_bla}} for Blacks, \code{\var{p_his}} for Hispanics/Latinos, +\code{\var{p_asi}} for Asians, and/or \code{\var{p_oth}} for Other. 
+Default is \code{TRUE}.} + +\item{surname.only}{A \code{TRUE}/\code{FALSE} object. If \code{TRUE}, race predictions will +only use surname data and calculate Pr(Race | Surname). Default is \code{FALSE}.} + +\item{surname.year}{A number to specify the year of the census surname statistics. +These surname statistics are stored in the data, and will be automatically loaded. +The default value is \code{2010}, which means the surname statistics from the +2010 census will be used. Currently, the other available choice is \code{2000}.} + +\item{census.geo}{An optional character vector specifying what level of +geography to use to merge in U.S. Census 2010 geographic data. Currently +\code{"county"}, \code{"tract"}, or \code{"block"} are supported. +Note: sufficient information must be in user-defined \code{\var{voter.file}} object. +If \code{\var{census.geo} = "county"}, then \code{\var{voter.file}} +must have column named \code{county}. +If \code{\var{census.geo} = "tract"}, then \code{\var{voter.file}} +must have columns named \code{county} and \code{tract}. +And if \code{\var{census.geo} = "block"}, then \code{\var{voter.file}} +must have columns named \code{county}, \code{tract}, and \code{block}. +Specifying \code{\var{census.geo}} will call \code{census_helper} function +to merge Census geographic data at specified level of geography.} + +\item{census.key}{A character object specifying user's Census API +key. Required if \code{\var{census.geo}} is specified, because +a valid Census API key is required to download Census geographic data.} + +\item{census.data}{A list indexed by two-letter state abbreviations, +which contains pre-saved Census geographic data. +Can be generated using \code{get_census_data} function.} + +\item{age}{An optional \code{TRUE}/\code{FALSE} object specifying whether to +condition race predictions on age (in addition to surname and geolocation). +Default is \code{FALSE}. Must be same as \code{\var{age}} in \code{\var{census.data}} object.
+May only be set to \code{TRUE} if \code{census.geo} option is specified. +If \code{TRUE}, \code{\var{voter.file}} should include a numerical variable \code{\var{age}}.} + +\item{sex}{optional \code{TRUE}/\code{FALSE} object specifying whether to +condition race predictions on sex (in addition to surname and geolocation). +Default is \code{FALSE}. Must be same as \code{\var{sex}} in \code{\var{census.data}} object. +May only be set to \code{TRUE} if \code{census.geo} option is specified. +If \code{TRUE}, \code{\var{voter.file}} should include a numerical variable \code{\var{sex}}, +where \code{\var{sex}} is coded as 0 for males and 1 for females.} + +\item{party}{An optional character object specifying party registration field +in \code{\var{voter.file}}, e.g., \code{\var{party} = "PartyReg"}. +If specified, race/ethnicity predictions will be conditioned +on individual's party registration (in addition to geolocation). +Whatever the name of the party registration field in \code{\var{voter.file}}, +it should be coded as 1 for Democrat, 2 for Republican, and 0 for Other.} +} +\value{ +Output will be an object of class \code{data.frame}. It will + consist of the original user-input data with additional columns with + predicted probabilities for each of the five major racial categories: + \code{\var{pred.whi}} for White, + \code{\var{pred.bla}} for Black, + \code{\var{pred.his}} for Hispanic/Latino, + \code{\var{pred.asi}} for Asian/Pacific Islander, and + \code{\var{pred.oth}} for Other/Mixed. +} +\description{ +\code{predict_race} makes probabilistic estimates of individual-level race/ethnicity. +} +\details{ +This function implements the Bayesian race prediction methods outlined in +Imai and Khanna (2015). The function produces probabilistic estimates of +individual-level race/ethnicity, based on surname, geolocation, and party. 
+} +\examples{ +data(voters) +predict_race(voters, surname.only = TRUE) +predict_race(voter.file = voters, surname.only = TRUE) +\dontrun{predict_race(voter.file = voters, census.geo = "tract", census.key = "...", age = TRUE)} +\dontrun{predict_race(voter.file = voters, census.geo = "tract", census.key = "...", party = "PID")} +\dontrun{CensusObj <- get_census_data("...", states = c("NY", "DC", "NJ")); +predict_race(voter.file = voters, census.geo = "tract", census.data = CensusObj, party = "PID")} +\dontrun{CensusObj2 <- get_census_data("...", states = c("NY", "DC", "NJ"), age = TRUE, sex = TRUE); +predict_race(voter.file = voters, census.geo = "tract", census.data = CensusObj2, party = "PID", age = TRUE, sex = TRUE)} +} + diff --git a/man/race.pred.Rd b/man/race.pred.Rd deleted file mode 100644 index cacf506..0000000 --- a/man/race.pred.Rd +++ /dev/null @@ -1,90 +0,0 @@ -% Generated by roxygen2: do not edit by hand -% Please edit documentation in R/race.pred.R -\name{race.pred} -\alias{race.pred} -\title{Race prediction function.} -\usage{ -race.pred(voters, races = c("white", "black", "latino", "asian", "other"), - name.clean = TRUE, surname.only = FALSE, census = "", census.key = "", - demo = FALSE, party, census.data = NA) -} -\arguments{ -\item{voters}{An object of class \code{data.frame}. Must contain a row for each individual being -predicted, as well as a field named \code{\var{surname}} containing each individual's surname. -If using geolocation in predictions, \code{\var{voters}} must contain a field named \code{\var{state}}, -which contains the two-character abbreviation for each individual's state of residence (e.g., "nj" for New Jersey). -If using geolocation, \code{\var{voters}} must also contain at least one of the following fields: -\code{\var{county}}, \code{\var{tract}}, and/or \code{\var{block}}. -These fields should contain character strings matching U.S. Census categories.
-County is three characters (e.g., "031" not "31"), tract is six characters, and block is four characters. -See below for other optional fields.} - -\item{races}{A character vector specifying which racial groups to generate -predicted probabilities for. Can include any subset of the default vector, -which is \code{c("white", "black", "latino", "asian", "other")}.} - -\item{name.clean}{A \code{TRUE}/\code{FALSE} object. If \code{TRUE}, function will call -\code{name.clean} to merge in data from U.S. Census 2000 Surname List -and Spanish Surname List. If \code{FALSE}, voters object must contain additional fields -specifying Pr(Race | Surname), named as follows: -\code{\var{p_whi}} for Whites, \code{\var{p_bla}} for Blacks, -\code{\var{p_his}} for Hispanics/Latinos, \code{\var{p_asi}} for Asians, -and/or \code{\var{p_oth}} for Other. Default is \code{TRUE}.} - -\item{surname.only}{A \code{TRUE}/\code{FALSE} object. If \code{TRUE}, race predictions will -only use surname data and calculate Pr(Race | Surnname). Default is \code{FALSE}.} - -\item{census}{An optional character vector specifying what level of -geography to use to merge in U.S. Census 2010 data. Can be one of -\code{"county"}, \code{"tract"}, or \code{"block"}. -Function calls \code{census.helper.api} to merge in Census data at specified level. -If left unspecified, \code{voters} must contain additional fields -specifying Pr(Geolocation | Race), including any of the following: -\code{\var{r_whi}}, \code{\var{r_bla}}, \code{\var{r_his}}, -\code{\var{r_asi}}, and/or \code{\var{r_oth}}.} - -\item{census.key}{A character object specifying user's Census API -key. Required if \code{census} is specified, because -\code{census.helper} function requires a Census API key to operate.} - -\item{demo}{An optional \code{TRUE}/\code{FALSE} object specifying whether to -condition race predictions on individual age and sex (in addition to geolocation). -Default is \code{FALSE}. 
-May only be set to \code{TRUE} if \code{census} option is specified. -If \code{TRUE}, \code{voters} should include numerical variables -\code{\var{age}} and \code{\var{sex}}, where \code{\var{sex}} coded as 0 for -males and 1 for females.} - -\item{party}{An optional character object specifying party registration field in \code{\var{voters}}, -e.g., \code{\var{party} = "PartyReg"}. If specified, race/ethnicity predictions will be conditioned -on individual's party registration (in addition to geolocation). -Whatever the name of the party registration field in \code{\var{voters}}, -it should be coded as 1 for Democrat, 2 for Republican, and 0 for Other.} - -\item{census.data}{A census data object, a list indexed by state names, -which contains census data on demo, county, tract and block.} -} -\value{ -Output will be an object of class \code{data.frame}. It will - consist of the original user-input data with additional columns that - contain predicted probabilities for each race in \code{races}. -} -\description{ -\code{race.pred} makes probabilistic estimates of individual-level race/ethnicity. -} -\details{ -This function implements the Bayesian race prediction methods outlined in -Imai and Khanna (2015). The function produces probabilistic estimates of -individual-level race/ethnicity, based on surname, geolocation, and party. 
-} -\examples{ -data(voters) -race.pred(voters = voters, races = c("asian"), surname.only = TRUE) -\dontrun{race.pred(voters = voters, races = c("white", "black", "latino"), -census = "tract", census.key = "...", demo = TRUE)} -\dontrun{race.pred(voters = voters, races = c("white", "black", "latino", "asian", "other"), -census = "tract", census.key = "...", party = "PID")} -\dontrun{race.pred(voters = voters, races = c("white", "black", "latino", "asian", "other"), -census = "tract", census.data = censusObjs, party = "PID")} -} - diff --git a/man/surnames.Rd b/man/surnames.Rd new file mode 100644 index 0000000..660e119 --- /dev/null +++ b/man/surnames.Rd @@ -0,0 +1,26 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/surnames2010.R +\docType{data} +\name{surnames} +\alias{surnames} +\title{Census Surname List (2010).} +\format{A data frame with 167,613 rows and 6 variables: +\describe{ + \item{surname}{Surname} + \item{p_whi}{Pr(White | Surname)} + \item{p_bla}{Pr(Black | Surname)} + \item{p_his}{Pr(Hispanic/Latino | Surname)} + \item{p_asi}{Pr(Asian/Pacific Islander | Surname)} + \item{p_oth}{Pr(Other | Surname)} + #' }} +\usage{ +surnames +} +\description{ +Census Surname List from 2010 with race/ethnicity probabilities by surname. 
+} +\examples{ +data(surnames) +} +\keyword{datasets} + diff --git a/man/surnames2000.Rd b/man/surnames2000.Rd new file mode 100644 index 0000000..2cd9624 --- /dev/null +++ b/man/surnames2000.Rd @@ -0,0 +1,26 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/surnames2000.R +\docType{data} +\name{surnames2000} +\alias{surnames2000} +\title{Census Surname List (2000).} +\format{A data frame with 157,728 rows and 6 variables: +\describe{ + \item{surname}{Surname} + \item{p_whi}{Pr(White | Surname)} + \item{p_bla}{Pr(Black | Surname)} + \item{p_his}{Pr(Hispanic/Latino | Surname)} + \item{p_asi}{Pr(Asian/Pacific Islander | Surname)} + \item{p_oth}{Pr(Other | Surname)} +}} +\usage{ +surnames2000 +} +\description{ +Census Surname List from 2000 with race/ethnicity probabilities by surname. +} +\examples{ +data(surnames2000) +} +\keyword{datasets} + diff --git a/man/vecToChunk.Rd b/man/vec_to_chunk.Rd similarity index 70% rename from man/vecToChunk.Rd rename to man/vec_to_chunk.Rd index 27795a3..32b2b7a 100644 --- a/man/vecToChunk.Rd +++ b/man/vec_to_chunk.Rd @@ -1,10 +1,10 @@ % Generated by roxygen2: do not edit by hand -% Please edit documentation in R/vecToChunk.R -\name{vecToChunk} -\alias{vecToChunk} +% Please edit documentation in R/vec_to_chunk.R +\name{vec_to_chunk} +\alias{vec_to_chunk} \title{Variable vector into chunks.} \usage{ -vecToChunk(x) +vec_to_chunk(x) } \arguments{ \item{x}{Character vector of variable names.} @@ -13,7 +13,7 @@ vecToChunk(x) Object of class \code{list}. } \description{ -\code{vecToChunk} takes a list of variables and collects them into 50-variable chunks. +\code{vec_to_chunk} takes a list of variables and collects them into 50-variable chunks. } \details{ This function takes a list of variable names and collects them into chunks with no more than @@ -22,7 +22,7 @@ API only allows queries of 50 variables at a time. The user should not need to call this function directly. 
} \examples{ -vecToChunk(x = c(paste("P012F0", seq(10:49), sep = ""), paste("P012I0", seq(10, 49), sep = ""))) +vec_to_chunk(x = c(paste("P012F0", seq(10, 49), sep = ""), paste("P012I0", seq(10, 49), sep = ""))) } \references{ diff --git a/man/voters.Rd b/man/voters.Rd index 55af944..7f8f536 100644 --- a/man/voters.Rd +++ b/man/voters.Rd @@ -1,22 +1,32 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/voters.R +\docType{data} \name{voters} \alias{voters} -\title{Dataset containing example voter file.} +\title{Example voter file.} +\format{A data frame with 10 rows and 12 variables: +\describe{ + \item{VoterID}{Voter identifier (numeric)} + \item{surname}{Surname} + \item{state}{State of residence} + \item{CD}{Congressional district} + \item{county}{Census county (three-digit code)} + \item{tract}{Census tract (six-digit code)} + \item{block}{Census block (four-digit code)} + \item{precinct}{Voting precinct} + \item{age}{Age in years} + \item{sex}{0=male, 1=female} + \item{party}{Party registration (character)} + \item{PID}{Party registration (numeric)} +}} +\usage{ +voters +} \description{ -A dataset containing an example voter file with 10 voters. Variables are as follows: - \itemize{ - \item VoterID - \item surname - \item state - \item CD (i.e., Congressional District) - \item county - \item tract - \item block - \item precinct - \item age - \item female - \item party - } +An example dataset containing voter file information. } -\format{ -A data frame with 10 rows and 11 variables. +\examples{ +data(voters) } +\keyword{datasets} + diff --git a/wru.Rproj b/wru.Rproj index ad83b6f..eaa6b81 100644 --- a/wru.Rproj +++ b/wru.Rproj @@ -15,4 +15,4 @@ LaTeX: pdfLaTeX BuildType: Package PackageUseDevtools: Yes PackageInstallArgs: --no-multiarch --with-keep.source -PackageRoxygenize: rd,namespace +PackageRoxygenize: rd,collate,namespace