-
Notifications
You must be signed in to change notification settings - Fork 31
/
format_legacy_data.R
75 lines (66 loc) · 2.86 KB
/
format_legacy_data.R
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
#' Legacy data formatting function.
#'
#' \code{format_legacy_data} formats legacy data from the U.S. census to allow
#' for Bayesian name geocoding.
#'
#' This function allows users to construct datasets for analysis using the census legacy data format.
#' These data are available for the 2020 census at
#' https://www2.census.gov/programs-surveys/decennial/2020/data/01-Redistricting_File--PL_94-171/.
#' This function returns data structured analogously to data from the Census API, which is not yet
#' available for the 2020 Census as of September 2021.
#'
#' @param legacyFilePath A character vector giving the location of a legacy census data folder,
#' sourced from https://www2.census.gov/programs-surveys/decennial/2020/data/01-Redistricting_File--PL_94-171/.
#' These file names should end in ".pl".
#' @param state The two letter state postal code.
#' @param outFile Optional character vector determining whether the formatted RData object should be saved. The
#' filepath should end in ".RData".
#'
#' @import PL94171
#'
#' @examples
#' \dontrun{
#' gaCensusData <- format_legacy_data(PL94171::pl_url('GA', 2020))
#' predict_race_new(ga.voter.file, namesToUse = 'last, first, mid', census.geo = 'block',
#' census.data = gaCensusData)
#'}
#'
#' @export
format_legacy_data <- function(legacyFilePath, state, outFile = NULL) {
# aggregation levels to convert (county, tract, block group, and block)
summaryLevels <- c('050', '140', '150', '750')
# read in the data
pl <- pl_read(legacyFilePath)
pl <- pl_select_standard(pl)
# iterate through the levels
censusData.2020 <- lapply(summaryLevels, FUN = function(level) {
levelData <- pl[pl$summary_level == level,]
# construct the base data frame
df <- data.frame(state = toupper(state),
county = levelData$county,
P005003 = levelData$pop_white,
P005004 = levelData$pop_black,
P005010 = levelData$pop_hisp,
P005006 = levelData$pop_asian,
P005007 = levelData$pop_nhpi,
P005005 = levelData$pop_aian,
P005008 = levelData$pop_other,
P005009 = levelData$pop_two)
# add geographic levels
if(level != '050') {
df$tract <- substr(levelData$GEOID, nchar(levelData$GEOID) - 5, nchar(levelData$GEOID))
if(level != '140') {
df$blockGroup <- substr(levelData$GEOID, nchar(levelData$GEOID), nchar(levelData$GEOID))
if(level != '150')
df$block <- substr(levelData$GEOID, nchar(levelData$GEOID)-2, nchar(levelData$GEOID))
}
}
df
})
# format and optionally save the file
names(censusData.2020) <- c('county', 'tract', 'blockGroup', 'block')
if(!is.null(outFile))
save(censusData.2020, file = outFile)
# return the object
return(censusData.2020)
}