diff --git a/articles/chapter-10.html b/articles/chapter-10.html index 16e5828..dec7e55 100644 --- a/articles/chapter-10.html +++ b/articles/chapter-10.html @@ -104,55 +104,250 @@
This chapter is under construction.
-
library(alda)
library(dplyr)
-#>
-#> Attaching package: 'dplyr'
-#> The following objects are masked from 'package:stats':
-#>
-#> filter, lag
-#> The following objects are masked from 'package:base':
-#>
-#> intersect, setdiff, setequal, union
-library(tidyr)
+library(tidyr)
+library(purrr)
library(ggplot2)
+library(patchwork)
library(survival)
library(broom)
Table 10.1, page 327:
+In Section 10.1 Singer and Willett (2003) introduce the life +table—the primary tool for summarizing the sample +distribution of event occurrence—using a subset of data from Singer +(1993), who measured how many years 3941 newly hired special educators +in Michigan stayed in teaching between 1972 and 1978. Teachers were +followed for up to 13 years or until they stopped teaching in the +state.
+For this example we return to the teachers
data set
+introduced in Chapter 9, a person-level data frame with 3941 rows and 3
+columns:
id: Teacher ID.
years: The number of years between a teacher’s dates of hire and departure from the Michigan public schools.
censor: Censoring status.
+teachers
+#> # A tibble: 3,941 × 3
+#> id years censor
+#> <fct> <dbl> <dbl>
+#> 1 1 1 0
+#> 2 2 2 0
+#> 3 3 1 0
+#> 4 4 1 0
+#> 5 5 12 1
+#> 6 6 1 0
+#> 7 7 12 1
+#> 8 8 1 0
+#> 9 9 2 0
+#> 10 10 2 0
+#> # ℹ 3,931 more rows
As Singer and Willett (2003) discuss, a life table tracks the event +histories of a sample of individuals over a series of contiguous +intervals—from the beginning of time through the end of data +collection—by including information on the number of individuals +who were at risk of experiencing the target event, who experienced it, and who were censored during each interval.
+We can either construct a life table “by hand” by first converting +the person-level data set to a person-period data set, then +cross-tabulating time period and +event-indicator variables; or by using a prepackaged +routine. Because we will be constructing a life table “by hand” in +Section 10.5, here we demonstrate the prepackaged routine approach.
+Conceptually, the life table is simply the tabular form of a +survival function (see Section 10.2); thus, an easy way +to construct a life table is to first fit a survival function to the +person-level data set, then use the summary of the fit as a starting +point to construct the remainder of the table.
+We can fit a survival function using the survfit()
+function from the survival package. The model formula
+for the survfit()
function takes the form
+response ~ terms
, where the response must be a “survival
+object” created by the Surv()
function. For right-censored
+data, the survival object can be created by supplying two unnamed
+arguments to the Surv()
function corresponding to
+time
and event
variables, in that order. Note
+that we can recode a censor
variable into an
+event
variable by reversing its values. For 0-1 coded data,
+we can write the event status as event = 1 - censor
.
-# A life table is the tabular form of a survival curve, so begin by fitting a
-# Kaplan-Meir curve to the data.
-teachers_fit <- survfit(Surv(years, 1 - censor) ~ 1, data = teachers)
+teachers_fit <- survfit(Surv(years, 1 - censor) ~ 1, data = teachers)
-table_10.1 <- teachers_fit |>
- # Add a starting time (time 0) for the table.
+summary(teachers_fit)
+#> Call: survfit(formula = Surv(years, 1 - censor) ~ 1, data = teachers)
+#>
+#> time n.risk n.event survival std.err lower 95% CI upper 95% CI
+#> 1 3941 456 0.884 0.00510 0.874 0.894
+#> 2 3485 384 0.787 0.00652 0.774 0.800
+#> 3 3101 359 0.696 0.00733 0.682 0.710
+#> 4 2742 295 0.621 0.00773 0.606 0.636
+#> 5 2447 218 0.566 0.00790 0.550 0.581
+#> 6 2229 184 0.519 0.00796 0.504 0.535
+#> 7 2045 123 0.488 0.00796 0.472 0.504
+#> 8 1642 79 0.464 0.00800 0.449 0.480
+#> 9 1256 53 0.445 0.00811 0.429 0.461
+#> 10 948 35 0.428 0.00827 0.412 0.445
+#> 11 648 16 0.418 0.00848 0.401 0.435
+#> 12 391 5 0.412 0.00870 0.396 0.430
Next, we’ll collect the summary information from the
+survfit
object into a tibble using the tidy()
+function from the broom package. For now we will
+exclude any statistical summaries from the life table, focusing
+exclusively on columns related to the event histories of the
+teachers
data. Note also that the summary information from
+the survfit
object starts at the time of the first event,
+not the “beginning of time”. We can add a “beginning of time” to the
+survfit
object using the survfit0()
function
+from the survival package, which (by default) adds a starting time of 0
+to the life table.
+teachers_lifetable <- teachers_fit |>
survfit0() |>
tidy() |>
- # The summary of the fit gives most of what we want, but to match Table 10.1
- # we need to do a little more wrangling.
- select(-c(std.error:conf.low)) |>
+ select(-c(estimate:conf.low)) |>
+ mutate(interval = paste0("[", time, ", ", time + 1, ")"), .after = time) |>
+ rename(year = time)
+
+teachers_lifetable
+#> # A tibble: 13 × 5
+#> year interval n.risk n.event n.censor
+#> <dbl> <chr> <dbl> <dbl> <dbl>
+#> 1 0 [0, 1) 3941 0 0
+#> 2 1 [1, 2) 3941 456 0
+#> 3 2 [2, 3) 3485 384 0
+#> 4 3 [3, 4) 3101 359 0
+#> 5 4 [4, 5) 2742 295 0
+#> 6 5 [5, 6) 2447 218 0
+#> 7 6 [6, 7) 2229 184 0
+#> 8 7 [7, 8) 2045 123 280
+#> 9 8 [8, 9) 1642 79 307
+#> 10 9 [9, 10) 1256 53 255
+#> 11 10 [10, 11) 948 35 265
+#> 12 11 [11, 12) 648 16 241
+#> 13 12 [12, 13) 391 5 386
As Singer and Willett (2003) discuss, we interpret the columns of the life table as follows:
year: Defines the time of each interval using ordinal numbers.
interval: Defines precisely which event times appear in each interval using the interval notation, [start, end), where each interval includes the starting time and excludes the ending time.
n.risk: Defines the risk set during each interval; that is, the number of (remaining) individuals who are eligible to experience the target event during an interval.
n.event: Defines the number of individuals who experienced the target event during an interval.
n.censor: Defines the number of individuals who were censored during an interval.
Importantly, notice that once an individual experiences the target event or is censored during an interval, they drop out of the risk set in all future intervals; thus, the risk set is inherently irreversible.
+In Section 10.2 Singer and Willett (2003) introduce three statistics +for summarizing the event history information of the life table, which +can be estimated directly from the table:
+Hazard is the fundamental quantity used to +assess the risk of event occurrence in each discrete time period. The +discrete-time hazard function is the conditional +probability that the \(i\)th +individual will experience the target event in the \(j\)th interval, given that they did not +experience it in any prior interval:
+\[ h(t_{ij}) = \Pr[T_i = j \mid T_i \geq j], \]
+whose maximum likelihood estimates are given by the proportion of +each interval’s risk set that experiences the event during that +interval:
+\[ +\hat h(t_j) = \frac{n \text{ events}_j}{n \text{ at risk}_j}. +\]
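For example, applying this formula to the first interval of the teachers data gives \(\hat h(t_1) = 456 / 3941 \approx .116\), the value reported in the first row of Table 10.1 above.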
+The survival function is the cumulative +probability that the \(i\)th +individual will survive past the \(j\)th interval, that is, will not experience the target event in the \(j\)th interval or any earlier interval:
+\[ S(t_{ij}) = \Pr[T_i > j], \]
+whose maximum likelihood estimates are given by the cumulative +product of the complement of the estimated hazard probabilities across +the current and all previous intervals:
+\[ \hat S(t_j) = [1 - \hat h(t_j)] \times [1 - \hat h(t_{j-1})] \times [1 - \hat h(t_{j-2})] \times \dots \times [1 - \hat h(t_1)]. \]
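For example, \(\hat S(t_2) = (1 - .116)(1 - .110) = .884 \times .890 \approx .787\), matching the year 2 row of Table 10.1 above.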
+The median lifetime is a measure of central +tendency identifying the point in time by which we estimate that half of +the sample has experienced the target event and half has not, given +by:
+\[ \text{Estimated median lifetime} = m + \Bigg[ \frac{\hat S(t_m) - .5}{\hat S(t_m) - \hat S(t_{m + 1})} \Bigg] \big( (m + 1) - m \big), \]
+where \(m\) is the time interval +immediately before the median lifetime, \(\hat +S(t_m)\) is the value of the survivor function in the \(m\)th interval, and \(\hat S(t_{m + 1})\) is the value of the +survivor function in the next interval.
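As a quick check of this equation (a sketch using the rounded survival estimates from Table 10.1, where the year 6 and year 7 estimates bracket .5):

# Year 6 is the last interval whose estimated survival probability is above .5
# (.519); by year 7 it has dropped below .5 (.488), so m = 6.
6 + ((.519 - .5) / (.519 - .488)) * (7 - 6)
# Approximately 6.61, in agreement with the estimated median lifetime computed
# below.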
+First, the discrete-time hazard function and the survival function.
+Note the use of if-else statements to provide preset values for the
+“beginning of time”, which by definition will always be NA
+for the discrete-time hazard function and 1
for the
+survival function.
+teachers_lifetable <- teachers_lifetable |>
mutate(
- interval = paste0("[", time, ", ", time + 1, ")"),
- haz.estimate = n.event / n.risk
- ) |>
- rename(year = time, surv.estimate = estimate) |>
- relocate(
- year, interval, n.risk, n.event, n.censor, haz.estimate, surv.estimate
+ haz.estimate = if_else(year != 0, n.event / n.risk, NA),
+ surv.estimate = if_else(year != 0, 1 - haz.estimate, 1),
+ surv.estimate = cumprod(surv.estimate)
)
-table_10.1
+# Table 10.1, page 327:
+teachers_lifetable
#> # A tibble: 13 × 7
#> year interval n.risk n.event n.censor haz.estimate surv.estimate
#> <dbl> <chr> <dbl> <dbl> <dbl> <dbl> <dbl>
-#> 1 0 [0, 1) 3941 0 0 0 1
+#> 1 0 [0, 1) 3941 0 0 NA 1
#> 2 1 [1, 2) 3941 456 0 0.116 0.884
#> 3 2 [2, 3) 3485 384 0 0.110 0.787
#> 4 3 [3, 4) 3101 359 0 0.116 0.696
@@ -165,154 +360,469 @@ 10.1 The Life Table#> 11 10 [10, 11) 948 35 265 0.0369 0.428
#> 12 11 [11, 12) 648 16 241 0.0247 0.418
#> 13 12 [12, 13) 391 5 386 0.0128 0.412
Figure 10.1, page 333:
-
-ggplot(table_10.1, aes(x = year, y = haz.estimate)) +
- geom_line() +
- scale_x_continuous(breaks = 0:13, limits = c(1, 13)) +
- scale_y_continuous(breaks = c(0, .05, .1, .15), limits = c(0, .15)) +
- coord_cartesian(xlim = c(0, 13))
-#> Warning: Removed 1 row containing missing values or values outside the scale range
-#> (`geom_line()`).
-
-# First interpolate median lifetime
-median_lifetime <- table_10.1 |>
- # Get the row indices for the first survival estimate immediately below and
- # immediately above 0.5. This will only work correctly if the values are in
- # descending order, otherwise min() and max() must be swapped. By default, the
- # survival estimates are in descending order, however, I've added the
- # redundant step of ensuring they are here for demonstration purposes.
- arrange(desc(surv.estimate)) |>
- slice(min(which(surv.estimate <= .5)), max(which(surv.estimate >= .5))) |>
- select(year, surv.estimate) |>
- # Linearly interpolate between the two values of the survival estimates that
- # bracket .5 following Miller's (1981) equation.
+Then the median lifetime. Here we use the slice()
+function from the dplyr package to select the time
+intervals immediately before and after the median lifetime, then do a
+bit of wrangling to make applying the median lifetime equation easier
+and clearer.
+
+teachers_median_lifetime <- teachers_lifetable |>
+ slice(max(which(surv.estimate >= .5)), min(which(surv.estimate <= .5))) |>
+ mutate(m = c("before", "after")) |>
+ select(m, year, surv = surv.estimate) |>
+ pivot_wider(names_from = m, values_from = c(year, surv)) |>
summarise(
- year =
- min(year) +
- ((max(surv.estimate) - .5) /
- (max(surv.estimate) - min(surv.estimate))) *
- ((min(year) + 1) - min(year)),
- surv.estimate = .5
+ surv.estimate = .5,
+ year = year_before
+ + ((surv_before - .5) / (surv_before - surv_after))
+ * (year_after - year_before)
)
+
+teachers_median_lifetime
+#> # A tibble: 1 × 2
+#> surv.estimate year
+#> <dbl> <dbl>
+#> 1 0.5 6.61
+A valuable way of examining these statistics is to plot their
+trajectories over time.
+
+teachers_haz <- ggplot(teachers_lifetable, aes(x = year, y = haz.estimate)) +
+ geom_line() +
+ scale_x_continuous(breaks = 0:13) +
+ coord_cartesian(xlim = c(0, 13), ylim = c(0, .15))
-ggplot(table_10.1, aes(x = year, y = surv.estimate)) +
+teachers_surv <- ggplot(teachers_lifetable, aes(x = year, y = surv.estimate)) +
geom_line() +
geom_segment(
- aes(xend = year, y = 0, yend = .5), data = median_lifetime, linetype = 2
+ aes(xend = year, y = 0, yend = .5),
+ data = teachers_median_lifetime,
+ linetype = 2
) +
geom_segment(
- aes(xend = 0, yend = .5), data = median_lifetime, linetype = 2
+ aes(xend = 0, yend = .5),
+ data = teachers_median_lifetime,
+ linetype = 2
) +
scale_x_continuous(breaks = 0:13) +
- scale_y_continuous(breaks = c(0, .5, 1), limits = c(0, 1)) +
- coord_cartesian(xlim = c(0, 13))
-
+ scale_y_continuous(breaks = c(0, .5, 1)) +
+ coord_cartesian(xlim = c(0, 13))
+
+# Figure 10.1, page 333:
+teachers_haz + teachers_surv + plot_layout(ncol = 1, axes = "collect")
When examining plots like these, Singer and Willett (2003) recommend +looking for patterns in and between the trajectories to answer questions +like:
+Figure 10.2, page 340:
-
-relapse_fit <- survfit(Surv(weeks, 1 - censor) ~ 1, data = cocaine_relapse_1)
-relapse_tidy <- tidy(relapse_fit)
-relapse_summary <- glance(relapse_fit)
In Section 10.3 Singer and Willett (2003) examine and describe the +estimated discrete-time hazard functions, survivor functions, and median +lifetimes from four studies that differ by their type of target event, +metric for clocking time, and underlying profile of risk:
+cocaine_relapse_1
: A person-level data frame with
+104 rows and 4 columns containing a subset of data from Hall, Havassy,
+and Wasserman (1990), who measured the number of weeks until relapse to
+cocaine use in a sample of 104 former addicts released from an
+in-patient treatment program. In-patients were followed for up to 12
+weeks or until they used cocaine for 7 consecutive days.
+cocaine_relapse_1
+#> # A tibble: 104 × 4
+#> id weeks censor needle
+#> <fct> <dbl> <dbl> <dbl>
+#> 1 501 2 0 1
+#> 2 502 12 1 0
+#> 3 503 1 0 1
+#> 4 505 9 0 1
+#> 5 507 3 0 1
+#> 6 508 2 0 1
+#> 7 509 12 1 0
+#> 8 510 12 1 1
+#> 9 511 1 0 1
+#> 10 512 2 0 1
+#> # ℹ 94 more rows
first_sex
: A person-level data frame with 180 rows
+and 5 columns containing a subset of data from Capaldi, Crosby, and
+Stoolmiller (1996), who measured the grade year of first sexual
+intercourse in a sample of 180 at-risk heterosexual adolescent males.
+Adolescent males were followed from Grade 7 up to Grade 12 or until they
+reported having had sexual intercourse for the first time.
+first_sex
+#> # A tibble: 180 × 5
+#> id grade censor parental_transition parental_antisociality
+#> <fct> <dbl> <dbl> <dbl> <dbl>
+#> 1 1 9 0 0 1.98
+#> 2 2 12 1 1 -0.545
+#> 3 3 12 1 0 -1.40
+#> 4 5 12 0 1 0.974
+#> 5 6 11 0 0 -0.636
+#> 6 7 9 0 1 -0.243
+#> 7 9 12 1 0 -0.869
+#> 8 10 11 0 0 0.454
+#> 9 11 12 1 1 0.802
+#> 10 12 11 0 1 -0.746
+#> # ℹ 170 more rows
suicide_ideation
: A person-level data frame with 391
+rows and 4 columns containing a subset of data from Bolger and
+colleagues (1989) measuring age of first suicide ideation in a sample of
+391 undergraduate students aged 16 to 22. Age of first suicide ideation
+was measured with a two-item survey asking respondents “Have you ever
+thought of committing suicide?” and if so, “At what age did the thought
+first occur to you?”
+suicide_ideation
+#> # A tibble: 391 × 4
+#> id age censor age_now
+#> <fct> <dbl> <dbl> <dbl>
+#> 1 1 16 0 18
+#> 2 2 10 0 19
+#> 3 3 16 0 19
+#> 4 4 20 0 22
+#> 5 6 15 0 22
+#> 6 7 10 0 19
+#> 7 8 22 1 22
+#> 8 9 22 1 22
+#> 9 10 15 0 20
+#> 10 11 10 0 19
+#> # ℹ 381 more rows
congresswomen
: A person-level data frame with 168
+rows and 5 columns containing data measuring how long all 168 women who
+were elected to the U.S. House of Representatives between 1919 and 1996
+remained in office. Representatives were followed for up to eight terms
+or until 1998.
+congresswomen
+#> # A tibble: 168 × 5
+#> id name terms censor democrat
+#> <fct> <chr> <dbl> <dbl> <dbl>
+#> 1 1 Abzug, Bella 3 0 1
+#> 2 2 Andrews, Elizabeth 1 0 1
+#> 3 3 Ashbrook, Jean 1 0 0
+#> 4 4 Baker, Irene 1 0 0
+#> 5 5 Bentley, Helen 5 0 0
+#> 6 6 Blitch, Iris 4 0 1
+#> 7 7 Boggs, Corinne 8 1 1
+#> 8 8 Boland, Veronica Grace 1 0 1
+#> 9 9 Bolton, Frances 8 1 0
+#> 10 10 Bosone, Reva 2 0 1
+#> # ℹ 158 more rows
We can plot the discrete-time hazard functions, survivor functions,
+and median lifetimes from each of these four studies in a single call
+using the pmap()
function from the purrr
+package.
+study_plots <- pmap(
+ list(
+ list("cocaine_relapse_1", "first_sex", "suicide_ideation", "congresswomen"),
+ list(cocaine_relapse_1, first_sex, suicide_ideation, congresswomen),
+ list("weeks", "grade", "age", "terms"),
+ list(0, 6, 5, 0)
+ ),
+ \(.title, .study, .time, .beginning) {
+
+ # Get life table statistics.
+ study_fit <- survfit(Surv(.study[[.time]], 1 - censor) ~ 1, data = .study)
+
+ study_lifetable <- study_fit |>
+ survfit0(start.time = .beginning) |>
+ tidy() |>
+ rename(surv.estimate = estimate) |>
+ mutate(haz.estimate = if_else(time != .beginning, n.event / n.risk, NA))
+
+ study_median_lifetime <- study_lifetable |>
+ slice(max(which(surv.estimate >= .5)), min(which(surv.estimate <= .5))) |>
+ mutate(m = c("before", "after")) |>
+ select(m, time, surv = surv.estimate) |>
+ pivot_wider(names_from = m, values_from = c(time, surv)) |>
+ summarise(
+ surv.estimate = .5,
+ time = time_before
+ + ((surv_before - .5) / (surv_before - surv_after))
+ * (time_after - time_before)
+ )
+
+ # Plot discrete-time hazard and survival functions.
+ study_haz <- ggplot(study_lifetable, aes(x = time, y = haz.estimate)) +
+ geom_line() +
+ xlab(.time)
+
+ study_surv <- ggplot(study_lifetable, aes(x = time, y = surv.estimate)) +
+ geom_line() +
+ geom_segment(
+ aes(xend = time, y = 0, yend = .5),
+ data = study_median_lifetime,
+ linetype = 2
+ ) +
+ geom_segment(
+ aes(xend = .beginning, yend = .5),
+ data = study_median_lifetime,
+ linetype = 2
+ ) +
+ xlab(.time)
+
+ wrap_elements(panel = (study_haz | study_surv)) + ggtitle(.title)
+ }
+)
+
+# Figure 10.2, page 340:
+wrap_plots(study_plots, ncol = 1)
Focusing on the overall shape of the discrete-time hazard functions, +and contextualizing their shape against their respective survival +functions, Singer and Willett (2003) make the following observations:
+cocaine_relapse_1
: The discrete-time hazard function
+follows a monotonically decreasing pattern—peaking
+immediately after the “beginning of time” and decreasing
+thereafter—which is common when studying target events related to
+recurrence and relapse. This is reflected in the survival function,
+which drops rapidly in early time periods, then more slowly over time as
+the hazard decreases, indicating that the prevalence of relapse to
+cocaine use was greatest shortly after leaving treatment.
first_sex
: The discrete-time hazard function follows
+a monotonically increasing pattern—starting low
+immediately after the “beginning of time” and increasing
+thereafter—which is common when studying target events that are
+ultimately inevitable or near universal. This is reflected in the
+survival function, which drops slowly in early time periods, then more
+rapidly over time as the hazard increases, indicating that the
+prevalence of first sexual intercourse in those still at risk
+progressively increased over time.
suicide_ideation
: The discrete-time hazard function
+follows a nonmonotonic pattern with multiple
+distinctive peaks and troughs, which generally arise in studies of long
+duration due to the data collection period being of sufficient length to
+capture reversals in otherwise seemingly monotonic trends. This is
+reflected in the survival function, which has multiple periods of slow
+and rapid decline, indicating that prevalence of suicide ideation was
+low during childhood, peaked during adolescence, and then declined to near
+early-childhood levels in late adolescence for those still at
+risk.
congresswomen
: The discrete-time hazard function
+follows a U-shaped pattern—with periods of high risk
+immediately after the “beginning of time” and again at the end of
+time—which is common when studying target events that have different
+causes near the beginning and end of time. This is reflected in the
+survival function, which drops rapidly in early and late time periods,
+but more slowly in the middle, indicating that prevalence of leaving
+office was greatest shortly after their first election, and again after
+having served for a long period of time for those still at
+risk.
Table 10.2, page 349:
-
-summary(teachers_fit)
-#> Call: survfit(formula = Surv(years, 1 - censor) ~ 1, data = teachers)
-#>
-#> time n.risk n.event survival std.err lower 95% CI upper 95% CI
-#> 1 3941 456 0.884 0.00510 0.874 0.894
-#> 2 3485 384 0.787 0.00652 0.774 0.800
-#> 3 3101 359 0.696 0.00733 0.682 0.710
-#> 4 2742 295 0.621 0.00773 0.606 0.636
-#> 5 2447 218 0.566 0.00790 0.550 0.581
-#> 6 2229 184 0.519 0.00796 0.504 0.535
-#> 7 2045 123 0.488 0.00796 0.472 0.504
-#> 8 1642 79 0.464 0.00800 0.449 0.480
-#> 9 1256 53 0.445 0.00811 0.429 0.461
-#> 10 948 35 0.428 0.00827 0.412 0.445
-#> 11 648 16 0.418 0.00848 0.401 0.435
-#> 12 391 5 0.412 0.00870 0.396 0.430
-
-teachers_fit |>
- tidy() |>
+In Section 10.4 Singer and Willett (2003) return to the
+teachers
data to discuss standard errors for the estimated
+discrete-time hazard probabilities and survival probabilities, which can
+also be estimated directly from the life table:
+
+-
+
Because the estimated discrete-time hazard probability is simply
+a sample proportion, its standard error in the \(j\)th time period can be estimated using
+the usual formula for estimating the standard error of a proportion:
+\[
+se \big(\hat h(t_j) \big) =
+ \sqrt{\frac{\hat h(t_j) \big(1 - \hat h(t_j) \big)}{n \text{ at
+risk}_j}}.
+\]
+
+-
+
For risk sets greater than size 20, the standard error of the
+survival probability in the \(j\)th
+time period can be estimated using Greenwood’s approximation:
+\[
+se \big(\hat S(t_j) \big) =
+ \hat S(t_j) \sqrt{
+ \frac{\hat h(t_1)}{n \text{ at risk}_1 \big(1 - \hat h(t_1) \big)}
+ + \frac{\hat h(t_2)}{n \text{ at risk}_2 \big(1 - \hat h(t_2) \big)}
+ + \cdots
+ + \frac{\hat h(t_j)}{n \text{ at risk}_j \big(1 - \hat h(t_j) \big)}
+ }.
+\]
+
+
+We estimate these standard errors here using the
+teachers_lifetable
from Section 10.2.
+
+# Table 10.2, page 349:
+teachers_lifetable |>
+ filter(year != 0) |>
mutate(
- # The tidy() method for survfit objects returns the standard error for the
- # cumulative hazard instead of the survival probability. Multiplying the
- # survival estimate with the cumulative hazard's standard error will return
- # the standard error for the survival probability. Note that it is unlikely
- # the tidy() method will ever change to return the the standard error for
- # the survival probability instead. See:
- # - https://github.com/tidymodels/broom/pull/1162
- # Other transformations of the survival probability can be found here:
- # - https://stat.ethz.ch/pipermail/r-help/2014-June/376247.html
- surv.std.error = estimate * std.error,
- haz.estimate = n.event / n.risk,
haz.std.error = sqrt(haz.estimate * (1 - haz.estimate) / n.risk),
- sqrt = (std.error)^2 / (estimate)^2
+ surv.std.error = surv.estimate * sqrt(
+ cumsum(haz.estimate / (n.risk * (1 - haz.estimate)))
+ )
) |>
- select(
- year = time,
- n.risk,
- haz.estimate,
- haz.std.error,
- surv.estimate = estimate,
- sqrt,
- surv.std.error
- )
-#> # A tibble: 12 × 7
-#> year n.risk haz.estimate haz.std.error surv.estimate sqrt surv.std.error
-#> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
-#> 1 1 3941 0.116 0.00510 0.884 4.25e-5 0.00510
-#> 2 2 3485 0.110 0.00530 0.787 1.11e-4 0.00652
-#> 3 3 3101 0.116 0.00575 0.696 2.29e-4 0.00733
-#> 4 4 2742 0.108 0.00592 0.621 4.02e-4 0.00773
-#> 5 5 2447 0.0891 0.00576 0.566 6.09e-4 0.00790
-#> 6 6 2229 0.0825 0.00583 0.519 8.74e-4 0.00796
-#> 7 7 2045 0.0601 0.00526 0.488 1.12e-3 0.00796
-#> 8 8 1642 0.0481 0.00528 0.464 1.38e-3 0.00800
-#> 9 9 1256 0.0422 0.00567 0.445 1.68e-3 0.00811
-#> 10 10 948 0.0369 0.00612 0.428 2.03e-3 0.00827
-#> 11 11 648 0.0247 0.00610 0.418 2.36e-3 0.00848
-#> 12 12 391 0.0128 0.00568 0.412 2.62e-3 0.00870
+ select(year, n.risk, starts_with("haz"), starts_with("surv"))
+#> # A tibble: 12 × 6
+#> year n.risk haz.estimate haz.std.error surv.estimate surv.std.error
+#> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
+#> 1 1 3941 0.116 0.00510 0.884 0.00510
+#> 2 2 3485 0.110 0.00530 0.787 0.00652
+#> 3 3 3101 0.116 0.00575 0.696 0.00733
+#> 4 4 2742 0.108 0.00592 0.621 0.00773
+#> 5 5 2447 0.0891 0.00576 0.566 0.00790
+#> 6 6 2229 0.0825 0.00583 0.519 0.00796
+#> 7 7 2045 0.0601 0.00526 0.488 0.00796
+#> 8 8 1642 0.0481 0.00528 0.464 0.00800
+#> 9 9 1256 0.0422 0.00567 0.445 0.00811
+#> 10 10 948 0.0369 0.00612 0.428 0.00827
+#> 11 11 648 0.0247 0.00610 0.418 0.00848
+#> 12 12 391 0.0128 0.00568 0.412 0.00870
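As a quick arithmetic check of these formulas (a sketch using the rounded year 1 values from Table 10.1):

# Standard error of the year 1 hazard estimate.
sqrt(.116 * (1 - .116) / 3941)
# Greenwood's approximation for the year 1 survival estimate.
.884 * sqrt(.116 / (3941 * (1 - .116)))
# Both are approximately .0051, matching the first row of Table 10.2 above.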
Figure 10.4, page 353:
-
-filter(teachers, id %in% c(20, 126, 129))
+In Section 10.5 Singer and Willett (2003) introduce the
+person-period format for event occurrence data,
+demonstrating how it can be used to construct the life table “by hand”
+using the person-level teachers
data
+set.
+
+The person-level data set
+
+In the person-level format for event occurrence data, each person has
+only one row of data with columns for their event time and censorship
+status, and (optionally) a participant identifier variable or any other
+variables of interest. This is demonstrated in the teachers
+data set, a person-level data frame with 3941 rows and 3 columns:
+
+- id: Teacher ID.
+- years: The number of years between a teacher’s dates of hire and departure from the Michigan public schools.
+- censor: Censoring status.
+
+
+teachers
+#> # A tibble: 3,941 × 3
+#> id years censor
+#> <fct> <dbl> <dbl>
+#> 1 1 1 0
+#> 2 2 2 0
+#> 3 3 1 0
+#> 4 4 1 0
+#> 5 5 12 1
+#> 6 6 1 0
+#> 7 7 12 1
+#> 8 8 1 0
+#> 9 9 2 0
+#> 10 10 2 0
+#> # ℹ 3,931 more rows
+Note that unlike when modelling change, the person-level data set
+does not contain a separate column for each time period; thus, as we will
+demonstrate below, a new strategy is needed to convert a person-level
+data set into a person-period data set. Additionally, unlike
+when modelling change, the person-level data set is often useful for
+analyzing event occurrence—as we have demonstrated through several
+examples in the current and previous chapter.
+
+
+The person-period data set
+
+In the person-period format for event occurrence data, each person
+has one row of data for each time period when they were
+at risk, with a participant identifier variable for
+each person, and an event-indicator variable for each
+time period.
+We can use the reframe()
function from the
+dplyr package to convert a person-level data set into a
+person-period data set. The reframe()
function works
+similarly to dplyr’s summarise()
function, except that it
+can return an arbitrary number of rows per group. We take advantage of
+this property to add rows for each time period when individuals were at
+risk, then use the information stored in these new rows and the
+person-level data set to identify whether an event occurred in each
+individual’s last period, given their censorship status.
+
+teachers_pp <- teachers |>
+ group_by(id) |>
+ reframe(
+ year = 1:years,
+ event = if_else(year == years & censor == 0, true = 1, false = 0)
+ )
+
+teachers_pp
+#> # A tibble: 24,875 × 3
+#> id year event
+#> <fct> <int> <dbl>
+#> 1 1 1 1
+#> 2 2 1 0
+#> 3 2 2 1
+#> 4 3 1 1
+#> 5 4 1 1
+#> 6 5 1 0
+#> 7 5 2 0
+#> 8 5 3 0
+#> 9 5 4 0
+#> 10 5 5 0
+#> # ℹ 24,865 more rows
+Following similar logic, we can use the summarise()
+function from the dplyr package to convert a person-period data set to
+a person-level data set.
+
+teachers_pl <- teachers_pp |>
+ group_by(id) |>
+ summarise(
+ years = max(year),
+ censor = if_else(all(event == 0), true = 1, false = 0)
+ )
+
+teachers_pl
+#> # A tibble: 3,941 × 3
+#> id years censor
+#> <fct> <int> <dbl>
+#> 1 1 1 0
+#> 2 2 2 0
+#> 3 3 1 0
+#> 4 4 1 0
+#> 5 5 12 1
+#> 6 6 1 0
+#> 7 7 12 1
+#> 8 8 1 0
+#> 9 9 2 0
+#> 10 10 2 0
+#> # ℹ 3,931 more rows
+The difference between the person-level and person-period formats is
+best seen by examining the data from a subset of individuals with
+different event times and censoring statuses.
+
+# Figure 10.4, page 353:
+filter(teachers_pl, id %in% c(20, 126, 129))
#> # A tibble: 3 × 3
#> id years censor
-#> <fct> <dbl> <dbl>
+#> <fct> <int> <dbl>
#> 1 20 3 0
#> 2 126 12 0
#> 3 129 12 1
-
+
-teachers_pp <- teachers |>
- reframe(
- year = 1:max(years),
- event = if_else(year == years & censor == 0, 1, 0),
- .by = id
- )
-
teachers_pp |>
filter(id %in% c(20, 126, 129)) |>
print(n = 27)
@@ -346,32 +856,45 @@ 10.5 A Sim
#> 25 129 10 0
#> 26 129 11 0
#> 27 129 12 0
-Table 10.3, page 355:
-
-teachers_pp |>
+
+
+Using the person-period data set to construct the life table
+
+The life table can be constructed using the person-period data set
+through cross-tabulation of the time period and event-indicator
+variables. This can be accomplished using a standard
+df |> group_by(...) |> summarise(...)
statement with
+the dplyr package, where we count the number of individuals who were at
+risk, who experienced the target event, and who were censored for each
+time period. After this, statistics for summarizing the event history
+information of the life table can be estimated using the methods
+demonstrated in Section 10.2.
+
+# Table 10.3, page 355:
+teachers_pp |>
group_by(year) |>
- count(event) |>
- pivot_wider(names_from = event, names_prefix = "event_", values_from = n) |>
- mutate(
- total = event_0 + event_1,
- p.event_1 = event_1 / total
+ summarise(
+ n.risk = n(),
+ n.event = sum(event == 1),
+ n.censor = sum(event == 0),
+ haz.estimate = n.event / n.risk
)
#> # A tibble: 12 × 5
-#> # Groups: year [12]
-#> year event_0 event_1 total p.event_1
-#> <int> <int> <int> <int> <dbl>
-#> 1 1 3485 456 3941 0.116
-#> 2 2 3101 384 3485 0.110
-#> 3 3 2742 359 3101 0.116
-#> 4 4 2447 295 2742 0.108
-#> 5 5 2229 218 2447 0.0891
-#> 6 6 2045 184 2229 0.0825
-#> 7 7 1922 123 2045 0.0601
-#> 8 8 1563 79 1642 0.0481
-#> 9 9 1203 53 1256 0.0422
-#> 10 10 913 35 948 0.0369
-#> 11 11 632 16 648 0.0247
-#> 12 12 386 5 391 0.0128
+#> year n.risk n.event n.censor haz.estimate
+#> <int> <int> <int> <int> <dbl>
+#> 1 1 3941 456 3485 0.116
+#> 2 2 3485 384 3101 0.110
+#> 3 3 3101 359 2742 0.116
+#> 4 4 2742 295 2447 0.108
+#> 5 5 2447 218 2229 0.0891
+#> 6 6 2229 184 2045 0.0825
+#> 7 7 2045 123 1922 0.0601
+#> 8 8 1642 79 1563 0.0481
+#> 9 9 1256 53 1203 0.0422
+#> 10 10 948 35 913 0.0369
+#> 11 11 648 16 632 0.0247
+#> 12 12 391 5 386 0.0128
+
diff --git a/articles/chapter-10_files/figure-html/unnamed-chunk-12-1.png b/articles/chapter-10_files/figure-html/unnamed-chunk-12-1.png
new file mode 100644
index 0000000..81e6ae8
Binary files /dev/null and b/articles/chapter-10_files/figure-html/unnamed-chunk-12-1.png differ
diff --git a/articles/chapter-10_files/figure-html/unnamed-chunk-7-1.png b/articles/chapter-10_files/figure-html/unnamed-chunk-7-1.png
new file mode 100644
index 0000000..8f0354a
Binary files /dev/null and b/articles/chapter-10_files/figure-html/unnamed-chunk-7-1.png differ
diff --git a/articles/chapter-5.html b/articles/chapter-5.html
index 7bc2774..c372b06 100644
--- a/articles/chapter-5.html
+++ b/articles/chapter-5.html
@@ -1578,8 +1578,8 @@ 5
Fixing rates of change, where the model is
simplified by removing the varying slope change.
-
For this example we a subset of the dropout_wages
data
-purposefully constructed to be severely unbalanced.
+For this example we use a subset of the dropout_wages
+data purposefully constructed to be severely unbalanced.
dropout_wages_subset
#> # A tibble: 257 × 5
diff --git a/articles/chapter-9.html b/articles/chapter-9.html
index d75cbae..0d81e99 100644
--- a/articles/chapter-9.html
+++ b/articles/chapter-9.html
@@ -175,18 +175,18 @@ 9.1 Sh
suicide_ideation
#> # A tibble: 391 × 4
-#> id time censor age
-#> <fct> <dbl> <dbl> <dbl>
-#> 1 1 16 0 18
-#> 2 2 10 0 19
-#> 3 3 16 0 19
-#> 4 4 20 0 22
-#> 5 6 15 0 22
-#> 6 7 10 0 19
-#> 7 8 22 1 22
-#> 8 9 22 1 22
-#> 9 10 15 0 20
-#> 10 11 10 0 19
+#> id age censor age_now
+#> <fct> <dbl> <dbl> <dbl>
+#> 1 1 16 0 18
+#> 2 2 10 0 19
+#> 3 3 16 0 19
+#> 4 4 20 0 22
+#> 5 6 15 0 22
+#> 6 7 10 0 19
+#> 7 8 22 1 22
+#> 8 9 22 1 22
+#> 9 10 15 0 20
+#> 10 11 10 0 19
#> # ℹ 381 more rows
diff --git a/pkgdown.yml b/pkgdown.yml
index 4b0f9d9..691b700 100644
--- a/pkgdown.yml
+++ b/pkgdown.yml
@@ -17,7 +17,7 @@ articles:
chapter-8: chapter-8.html
chapter-9: chapter-9.html
longitudinal-data-organization: longitudinal-data-organization.html
-last_built: 2024-05-23T20:12Z
+last_built: 2024-05-27T04:16Z
urls:
reference: https://mccarthy-m-g.github.io/alda/reference
article: https://mccarthy-m-g.github.io/alda/articles
diff --git a/reference/cocaine_relapse_1.html b/reference/cocaine_relapse_1.html
index 2a5dee3..2034b09 100644
--- a/reference/cocaine_relapse_1.html
+++ b/reference/cocaine_relapse_1.html
@@ -1,8 +1,8 @@
-Weeks to cocaine relapse after treatment — cocaine_relapse_1 • alda Weeks to cocaine relapse after treatment — cocaine_relapse_1 • alda Age of first sexual intercourse — first_sex • alda Age of first sexual intercourse — first_sex • alda Survival analysis
- A subset of data from Capaldi, Crosby, and Stoolmiller's (1996) measuring the
+
A subset of data from Capaldi, Crosby, and Stoolmiller (1996) measuring the
grade year of first sexual intercourse in a sample of 180 at-risk
heterosexual adolescent males. Adolescent males were followed from Grade 7 up
to Grade 12 or until they reported having had sexual intercourse for the
diff --git a/reference/suicide_ideation.html b/reference/suicide_ideation.html
index 8e3a92e..15ade4c 100644
--- a/reference/suicide_ideation.html
+++ b/reference/suicide_ideation.html
@@ -103,13 +103,13 @@
Format<
id
Participant ID.
-time
+age
Reported age of first suicide ideation.
censor
Censoring status.
-age
+age_now
Participant age at the time of the survey.
diff --git a/search.json b/search.json
index 0742832..b89392d 100644
--- a/search.json
+++ b/search.json
@@ -1 +1 @@
-[{"path":[]},{"path":"https://mccarthy-m-g.github.io/alda/LICENSE.html","id":null,"dir":"","previous_headings":"","what":"CC0 1.0 Universal","title":"CC0 1.0 Universal","text":"CREATIVE COMMONS CORPORATION LAW FIRM PROVIDE LEGAL SERVICES. DISTRIBUTION DOCUMENT CREATE ATTORNEY-CLIENT RELATIONSHIP. CREATIVE COMMONS PROVIDES INFORMATION “-” BASIS. CREATIVE COMMONS MAKES WARRANTIES REGARDING USE DOCUMENT INFORMATION WORKS PROVIDED HEREUNDER, DISCLAIMS LIABILITY DAMAGES RESULTING USE DOCUMENT INFORMATION WORKS PROVIDED HEREUNDER.","code":""},{"path":"https://mccarthy-m-g.github.io/alda/LICENSE.html","id":"statement-of-purpose","dir":"","previous_headings":"","what":"Statement of Purpose","title":"CC0 1.0 Universal","text":"laws jurisdictions throughout world automatically confer exclusive Copyright Related Rights (defined ) upon creator subsequent owner(s) (, “owner”) original work authorship /database (, “Work”). Certain owners wish permanently relinquish rights Work purpose contributing commons creative, cultural scientific works (“Commons”) public can reliably without fear later claims infringement build upon, modify, incorporate works, reuse redistribute freely possible form whatsoever purposes, including without limitation commercial purposes. owners may contribute Commons promote ideal free culture production creative, cultural scientific works, gain reputation greater distribution Work part use efforts others. /purposes motivations, without expectation additional consideration compensation, person associating CC0 Work (“Affirmer”), extent owner Copyright Related Rights Work, voluntarily elects apply CC0 Work publicly distribute Work terms, knowledge Copyright Related Rights Work meaning intended legal effect CC0 rights. Copyright Related Rights. Work made available CC0 may protected copyright related neighboring rights (“Copyright Related Rights”). Copyright Related Rights include, limited , following: right reproduce, adapt, distribute, perform, display, communicate, translate Work; moral rights retained original author(s) /performer(s); publicity privacy rights pertaining person’s image likeness depicted Work; rights protecting unfair competition regards Work, subject limitations paragraph 4(), ; rights protecting extraction, dissemination, use reuse data Work; database rights (arising Directive 96/9/EC European Parliament Council 11 March 1996 legal protection databases, national implementation thereof, including amended successor version directive); similar, equivalent corresponding rights throughout world based applicable law treaty, national implementations thereof. Waiver. greatest extent permitted , contravention , applicable law, Affirmer hereby overtly, fully, permanently, irrevocably unconditionally waives, abandons, surrenders Affirmer’s Copyright Related Rights associated claims causes action, whether now known unknown (including existing well future claims causes action), Work () territories worldwide, (ii) maximum duration provided applicable law treaty (including future time extensions), (iii) current future medium number copies, (iv) purpose whatsoever, including without limitation commercial, advertising promotional purposes (“Waiver”). Affirmer makes Waiver benefit member public large detriment Affirmer’s heirs successors, fully intending Waiver shall subject revocation, rescission, cancellation, termination, legal equitable action disrupt quiet enjoyment Work public contemplated Affirmer’s express Statement Purpose. Public License Fallback. 
part Waiver reason judged legally invalid ineffective applicable law, Waiver shall preserved maximum extent permitted taking account Affirmer’s express Statement Purpose. addition, extent Waiver judged Affirmer hereby grants affected person royalty-free, non transferable, non sublicensable, non exclusive, irrevocable unconditional license exercise Affirmer’s Copyright Related Rights Work () territories worldwide, (ii) maximum duration provided applicable law treaty (including future time extensions), (iii) current future medium number copies, (iv) purpose whatsoever, including without limitation commercial, advertising promotional purposes (“License”). License shall deemed effective date CC0 applied Affirmer Work. part License reason judged legally invalid ineffective applicable law, partial invalidity ineffectiveness shall invalidate remainder License, case Affirmer hereby affirms () exercise remaining Copyright Related Rights Work (ii) assert associated claims causes action respect Work, either case contrary Affirmer’s express Statement Purpose. Limitations Disclaimers. trademark patent rights held Affirmer waived, abandoned, surrendered, licensed otherwise affected document. Affirmer offers Work -makes representations warranties kind concerning Work, express, implied, statutory otherwise, including without limitation warranties title, merchantability, fitness particular purpose, non infringement, absence latent defects, accuracy, present absence errors, whether discoverable, greatest extent permissible applicable law. Affirmer disclaims responsibility clearing rights persons may apply Work use thereof, including without limitation person’s Copyright Related Rights Work. , Affirmer disclaims responsibility obtaining necessary consents, permissions rights required use Work. Affirmer understands acknowledges Creative Commons party document duty obligation respect CC0 use Work.","code":""},{"path":"https://mccarthy-m-g.github.io/alda/articles/chapter-10.html","id":"the-life-table","dir":"Articles","previous_headings":"","what":"10.1 The Life Table","title":"Chapter 10: Describing discrete-time event occurrence data","text":"Table 10.1, page 327:","code":"# A life table is the tabular form of a survival curve, so begin by fitting a # Kaplan-Meir curve to the data. teachers_fit <- survfit(Surv(years, 1 - censor) ~ 1, data = teachers) table_10.1 <- teachers_fit |> # Add a starting time (time 0) for the table. survfit0() |> tidy() |> # The summary of the fit gives most of what we want, but to match Table 10.1 # we need to do a little more wrangling. 
select(-c(std.error:conf.low)) |> mutate( interval = paste0(\"[\", time, \", \", time + 1, \")\"), haz.estimate = n.event / n.risk ) |> rename(year = time, surv.estimate = estimate) |> relocate( year, interval, n.risk, n.event, n.censor, haz.estimate, surv.estimate ) table_10.1 #> # A tibble: 13 × 7 #> year interval n.risk n.event n.censor haz.estimate surv.estimate #> #> 1 0 [0, 1) 3941 0 0 0 1 #> 2 1 [1, 2) 3941 456 0 0.116 0.884 #> 3 2 [2, 3) 3485 384 0 0.110 0.787 #> 4 3 [3, 4) 3101 359 0 0.116 0.696 #> 5 4 [4, 5) 2742 295 0 0.108 0.621 #> 6 5 [5, 6) 2447 218 0 0.0891 0.566 #> 7 6 [6, 7) 2229 184 0 0.0825 0.519 #> 8 7 [7, 8) 2045 123 280 0.0601 0.488 #> 9 8 [8, 9) 1642 79 307 0.0481 0.464 #> 10 9 [9, 10) 1256 53 255 0.0422 0.445 #> 11 10 [10, 11) 948 35 265 0.0369 0.428 #> 12 11 [11, 12) 648 16 241 0.0247 0.418 #> 13 12 [12, 13) 391 5 386 0.0128 0.412"},{"path":"https://mccarthy-m-g.github.io/alda/articles/chapter-10.html","id":"a-framework-for-characterizing-the-distribution-of-discrete-time-event-occurrence-data","dir":"Articles","previous_headings":"","what":"10.2 A Framework for Characterizing the Distribution of Discrete-Time Event Occurrence Data","title":"Chapter 10: Describing discrete-time event occurrence data","text":"Figure 10.1, page 333:","code":"ggplot(table_10.1, aes(x = year, y = haz.estimate)) + geom_line() + scale_x_continuous(breaks = 0:13, limits = c(1, 13)) + scale_y_continuous(breaks = c(0, .05, .1, .15), limits = c(0, .15)) + coord_cartesian(xlim = c(0, 13)) #> Warning: Removed 1 row containing missing values or values outside the scale range #> (`geom_line()`). # First interpolate median lifetime median_lifetime <- table_10.1 |> # Get the row indices for the first survival estimate immediately below and # immediately above 0.5. This will only work correctly if the values are in # descending order, otherwise min() and max() must be swapped. By default, the # survival estimates are in descending order, however, I've added the # redundant step of ensuring they are here for demonstration purposes. arrange(desc(surv.estimate)) |> slice(min(which(surv.estimate <= .5)), max(which(surv.estimate >= .5))) |> select(year, surv.estimate) |> # Linearly interpolate between the two values of the survival estimates that # bracket .5 following Miller's (1981) equation. 
summarise( year = min(year) + ((max(surv.estimate) - .5) / (max(surv.estimate) - min(surv.estimate))) * ((min(year) + 1) - min(year)), surv.estimate = .5 ) ggplot(table_10.1, aes(x = year, y = surv.estimate)) + geom_line() + geom_segment( aes(xend = year, y = 0, yend = .5), data = median_lifetime, linetype = 2 ) + geom_segment( aes(xend = 0, yend = .5), data = median_lifetime, linetype = 2 ) + scale_x_continuous(breaks = 0:13) + scale_y_continuous(breaks = c(0, .5, 1), limits = c(0, 1)) + coord_cartesian(xlim = c(0, 13))"},{"path":"https://mccarthy-m-g.github.io/alda/articles/chapter-10.html","id":"developing-intuition-about-hazard-functions-survivor-functions-and-median-lifetimes","dir":"Articles","previous_headings":"","what":"10.3 Developing Intuition About Hazard Functions, Survivor Functions, and Median Lifetimes","title":"Chapter 10: Describing discrete-time event occurrence data","text":"Figure 10.2, page 340:","code":"relapse_fit <- survfit(Surv(weeks, 1 - censor) ~ 1, data = cocaine_relapse_1) relapse_tidy <- tidy(relapse_fit) relapse_summary <- glance(relapse_fit)"},{"path":"https://mccarthy-m-g.github.io/alda/articles/chapter-10.html","id":"quantifying-the-effects-of-sampling-variation","dir":"Articles","previous_headings":"","what":"10.4 Quantifying the Effects of Sampling Variation","title":"Chapter 10: Describing discrete-time event occurrence data","text":"Table 10.2, page 349:","code":"summary(teachers_fit) #> Call: survfit(formula = Surv(years, 1 - censor) ~ 1, data = teachers) #> #> time n.risk n.event survival std.err lower 95% CI upper 95% CI #> 1 3941 456 0.884 0.00510 0.874 0.894 #> 2 3485 384 0.787 0.00652 0.774 0.800 #> 3 3101 359 0.696 0.00733 0.682 0.710 #> 4 2742 295 0.621 0.00773 0.606 0.636 #> 5 2447 218 0.566 0.00790 0.550 0.581 #> 6 2229 184 0.519 0.00796 0.504 0.535 #> 7 2045 123 0.488 0.00796 0.472 0.504 #> 8 1642 79 0.464 0.00800 0.449 0.480 #> 9 1256 53 0.445 0.00811 0.429 0.461 #> 10 948 35 0.428 0.00827 0.412 0.445 #> 11 648 16 0.418 0.00848 0.401 0.435 #> 12 391 5 0.412 0.00870 0.396 0.430 teachers_fit |> tidy() |> mutate( # The tidy() method for survfit objects returns the standard error for the # cumulative hazard instead of the survival probability. Multiplying the # survival estimate with the cumulative hazard's standard error will return # the standard error for the survival probability. Note that it is unlikely # the tidy() method will ever change to return the the standard error for # the survival probability instead. 
See: # - https://github.com/tidymodels/broom/pull/1162 # Other transformations of the survival probability can be found here: # - https://stat.ethz.ch/pipermail/r-help/2014-June/376247.html surv.std.error = estimate * std.error, haz.estimate = n.event / n.risk, haz.std.error = sqrt(haz.estimate * (1 - haz.estimate) / n.risk), sqrt = (std.error)^2 / (estimate)^2 ) |> select( year = time, n.risk, haz.estimate, haz.std.error, surv.estimate = estimate, sqrt, surv.std.error ) #> # A tibble: 12 × 7 #> year n.risk haz.estimate haz.std.error surv.estimate sqrt surv.std.error #> #> 1 1 3941 0.116 0.00510 0.884 4.25e-5 0.00510 #> 2 2 3485 0.110 0.00530 0.787 1.11e-4 0.00652 #> 3 3 3101 0.116 0.00575 0.696 2.29e-4 0.00733 #> 4 4 2742 0.108 0.00592 0.621 4.02e-4 0.00773 #> 5 5 2447 0.0891 0.00576 0.566 6.09e-4 0.00790 #> 6 6 2229 0.0825 0.00583 0.519 8.74e-4 0.00796 #> 7 7 2045 0.0601 0.00526 0.488 1.12e-3 0.00796 #> 8 8 1642 0.0481 0.00528 0.464 1.38e-3 0.00800 #> 9 9 1256 0.0422 0.00567 0.445 1.68e-3 0.00811 #> 10 10 948 0.0369 0.00612 0.428 2.03e-3 0.00827 #> 11 11 648 0.0247 0.00610 0.418 2.36e-3 0.00848 #> 12 12 391 0.0128 0.00568 0.412 2.62e-3 0.00870"},{"path":"https://mccarthy-m-g.github.io/alda/articles/chapter-10.html","id":"a-simple-and-useful-strategy-for-constructing-the-life-table","dir":"Articles","previous_headings":"","what":"10.5 A Simple and Useful Strategy for Constructing the Life Table","title":"Chapter 10: Describing discrete-time event occurrence data","text":"Figure 10.4, page 353: Table 10.3, page 355:","code":"filter(teachers, id %in% c(20, 126, 129)) #> # A tibble: 3 × 3 #> id years censor #> #> 1 20 3 0 #> 2 126 12 0 #> 3 129 12 1 teachers_pp <- teachers |> reframe( year = 1:max(years), event = if_else(year == years & censor == 0, 1, 0), .by = id ) teachers_pp |> filter(id %in% c(20, 126, 129)) |> print(n = 27) #> # A tibble: 27 × 3 #> id year event #> #> 1 20 1 0 #> 2 20 2 0 #> 3 20 3 1 #> 4 126 1 0 #> 5 126 2 0 #> 6 126 3 0 #> 7 126 4 0 #> 8 126 5 0 #> 9 126 6 0 #> 10 126 7 0 #> 11 126 8 0 #> 12 126 9 0 #> 13 126 10 0 #> 14 126 11 0 #> 15 126 12 1 #> 16 129 1 0 #> 17 129 2 0 #> 18 129 3 0 #> 19 129 4 0 #> 20 129 5 0 #> 21 129 6 0 #> 22 129 7 0 #> 23 129 8 0 #> 24 129 9 0 #> 25 129 10 0 #> 26 129 11 0 #> 27 129 12 0 teachers_pp |> group_by(year) |> count(event) |> pivot_wider(names_from = event, names_prefix = \"event_\", values_from = n) |> mutate( total = event_0 + event_1, p.event_1 = event_1 / total ) #> # A tibble: 12 × 5 #> # Groups: year [12] #> year event_0 event_1 total p.event_1 #> #> 1 1 3485 456 3941 0.116 #> 2 2 3101 384 3485 0.110 #> 3 3 2742 359 3101 0.116 #> 4 4 2447 295 2742 0.108 #> 5 5 2229 218 2447 0.0891 #> 6 6 2045 184 2229 0.0825 #> 7 7 1922 123 2045 0.0601 #> 8 8 1563 79 1642 0.0481 #> 9 9 1203 53 1256 0.0422 #> 10 10 913 35 948 0.0369 #> 11 11 632 16 648 0.0247 #> 12 12 386 5 391 0.0128"},{"path":"https://mccarthy-m-g.github.io/alda/articles/chapter-11.html","id":"toward-a-statistical-model-for-discretetime-hazard","dir":"Articles","previous_headings":"","what":"11.1 Toward a Statistical Model for DiscreteTime Hazard","title":"Chapter 11: Fitting basic discrete-time hazard models","text":"Several examples chapter rely following: Figure 11.1, page 359: Table 11.1, page 360: Figure 11.2, page 363: Figure 11.3, page 366:","code":"first_sex_fit <- survfit(Surv(grade, 1 - censor) ~ 1, data = first_sex) first_sex_pt <- c(0, 1) |> map_dfr( \\(.x) { first_sex_fit_subset <- update( first_sex_fit, subset = (parental_transition == .x) ) first_sex_fit_subset 
|> survfit0(start.time = 6) |> tidy() |> rename(survival_probability = estimate) |> mutate( hazard_probability = n.event / n.risk, odds = hazard_probability / (1 - hazard_probability), log_odds = log(odds) ) |> select(-starts_with(\"conf\"), -std.error) |> rename(grade = time) |> pivot_longer( cols = c(survival_probability, hazard_probability, odds, log_odds), values_to = \"estimate\" ) |> # The figure doesn't include data for grade 6 in the hazard function. filter( !(name %in% c(\"hazard_probability\", \"odds\", \"log_odds\") & grade == 6) ) }, .id = \"parental_transition\" ) first_sex_pt |> filter(name %in% c(\"survival_probability\", \"hazard_probability\")) |> ggplot(aes(x = grade, y = estimate, colour = parental_transition)) + geom_hline( aes(yintercept = .5), data = tibble(name = \"survival_probability\"), alpha = .25, linetype = 2 ) + geom_line() + scale_x_continuous(breaks = 6:12) + coord_cartesian(xlim = c(6, 12)) + facet_wrap(vars(name), ncol = 1, scales = \"free_y\") + ggh4x::facetted_pos_scales( y = list( name == \"hazard_probability\" ~ scale_y_continuous(limits = c(0, .5)), name == \"survival_probability\" ~ scale_y_continuous(breaks = c(0, .5, 1), limits = c(0, 1)) ) ) # First two sections of the table first_sex_pt |> filter(grade != 6, !(name %in% c(\"odds\", \"log_odds\"))) |> pivot_wider(names_from = name, values_from = estimate) |> select(everything(), -n.censor, hazard_probability, survival_probability) #> # A tibble: 12 × 6 #> parental_transition grade n.risk n.event survival_probability #> #> 1 1 7 72 2 0.972 #> 2 1 8 70 2 0.944 #> 3 1 9 68 8 0.833 #> 4 1 10 60 8 0.722 #> 5 1 11 52 10 0.583 #> 6 1 12 42 8 0.472 #> 7 2 7 108 13 0.880 #> 8 2 8 95 5 0.833 #> 9 2 9 90 16 0.685 #> 10 2 10 74 21 0.491 #> 11 2 11 53 15 0.352 #> 12 2 12 38 18 0.185 #> # ℹ 1 more variable: hazard_probability # Last section first_sex_fit |> tidy() |> rename(survival_probability = estimate) |> mutate( hazard_probability = n.event / n.risk, .before = survival_probability ) |> select(-starts_with(\"conf\"), -std.error, -n.censor) |> rename(grade = time) #> # A tibble: 6 × 5 #> grade n.risk n.event hazard_probability survival_probability #> #> 1 7 180 15 0.0833 0.917 #> 2 8 165 7 0.0424 0.878 #> 3 9 158 24 0.152 0.744 #> 4 10 134 29 0.216 0.583 #> 5 11 105 25 0.238 0.444 #> 6 12 80 26 0.325 0.3 first_sex_pt |> filter(name %in% c(\"hazard_probability\", \"odds\", \"log_odds\")) |> mutate( name = factor(name, levels = c(\"hazard_probability\", \"odds\", \"log_odds\")) ) |> ggplot(aes(x = grade, y = estimate, colour = parental_transition)) + geom_line() + scale_x_continuous(breaks = 6:12) + coord_cartesian(xlim = c(6, 12)) + facet_wrap(vars(name), ncol = 1, scales = \"free_y\") + ggh4x::facetted_pos_scales( y = list( name %in% c(\"hazard_probability\", \"odds\") ~ scale_y_continuous(limits = c(0, 1)), name == \"log_odds\" ~ scale_y_continuous(limits = c(-4, 0)) ) ) # Transform to person-period format. first_sex_pp <- first_sex |> rename(grades = grade) |> reframe( grade = 7:max(grades), event = if_else(grade == grades & censor == 0, 1, 0), parental_transition, parental_antisociality, .by = id ) # Fit models for each panel. first_sex_fit_11.3a <- glm( event ~ parental_transition, family = \"binomial\", data = first_sex_pp ) first_sex_fit_11.3b <- update(first_sex_fit_11.3a, . ~ . + grade) first_sex_fit_11.3c <- update(first_sex_fit_11.3a, . ~ . 
+ factor(grade)) # Plot: map_df( list(a = first_sex_fit_11.3a, b = first_sex_fit_11.3b, c = first_sex_fit_11.3c), \\(.x) augment(.x, newdata = first_sex_pp), .id = \"model\" ) |> ggplot(aes(x = grade, y = .fitted, colour = factor(parental_transition))) + geom_line() + geom_point( aes(y = estimate), data = first_sex_pt |> mutate(parental_transition = as.numeric(parental_transition) - 1) |> filter(name == \"log_odds\") ) + coord_cartesian(ylim = c(-4, 0)) + facet_wrap(vars(model), ncol = 1, labeller = label_both) + labs( y = \"logit(hazard)\", colour = \"parental_transition\" )"},{"path":"https://mccarthy-m-g.github.io/alda/articles/chapter-11.html","id":"a-formal-representation-of-the-population-discrete-time-hazard-model","dir":"Articles","previous_headings":"","what":"11.2 A Formal Representation of the Population Discrete-Time Hazard Model","title":"Chapter 11: Fitting basic discrete-time hazard models","text":"Figure 11.4, page 374:","code":"# Panel A: first_sex_fit_11.3c |> augment(newdata = first_sex_pp) |> ggplot(aes(x = grade, y = .fitted, colour = factor(parental_transition))) + geom_line() + coord_cartesian(ylim = c(-4, 0)) # Panel B: first_sex_fit_11.4b <- update( first_sex_fit_11.3c, . ~ . + parental_transition * factor(grade) ) first_sex_fit_11.4b |> augment(newdata = first_sex_pp) |> ggplot(aes(x = grade, y = exp(.fitted), colour = factor(parental_transition))) + geom_line() + coord_cartesian(ylim = c(0, 1)) # Panel C: first_sex_fit_11.4b |> augment(newdata = first_sex_pp, type.predict = \"response\") |> ggplot(aes(x = grade, y = .fitted, colour = factor(parental_transition))) + geom_line()"},{"path":"https://mccarthy-m-g.github.io/alda/articles/chapter-11.html","id":"fitting-a-discrete-time-hazard-model-to-data","dir":"Articles","previous_headings":"","what":"11.3 Fitting a Discrete-Time Hazard Model to Data","title":"Chapter 11: Fitting basic discrete-time hazard models","text":"Figure 11.5: Table 11.3, page 386:","code":"model_A <- glm( event ~ factor(grade) - 1, family = \"binomial\", data = first_sex_pp ) model_B <- update(model_A, . ~ . + parental_transition) model_C <- update(model_A, . ~ . + parental_antisociality) model_D <- update(model_B, . ~ . + parental_antisociality) anova(model_B) #> Analysis of Deviance Table #> #> Model: binomial, link: logit #> #> Response: event #> #> Terms added sequentially (first to last) #> #> #> Df Deviance Resid. Df Resid. Dev Pr(>Chi) #> NULL 822 1139.53 #> factor(grade) 6 487.58 816 651.96 < 2.2e-16 *** #> parental_transition 1 17.29 815 634.66 3.203e-05 *** #> --- #> Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1 anova(model_C) #> Analysis of Deviance Table #> #> Model: binomial, link: logit #> #> Response: event #> #> Terms added sequentially (first to last) #> #> #> Df Deviance Resid. Df Resid. Dev Pr(>Chi) #> NULL 822 1139.53 #> factor(grade) 6 487.58 816 651.96 < 2.2e-16 *** #> parental_antisociality 1 14.79 815 637.17 0.0001204 *** #> --- #> Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1 # Deviance tests are sequential so the order of terms matters. To test # parental_transition and parental_antisociality, the model needs to be # fit twice, once with each as the last term. anova(update(model_C, . ~ . + parental_transition)) #> Analysis of Deviance Table #> #> Model: binomial, link: logit #> #> Response: event #> #> Terms added sequentially (first to last) #> #> #> Df Deviance Resid. Df Resid. 
Dev Pr(>Chi) #> NULL 822 1139.53 #> factor(grade) 6 487.58 816 651.96 < 2.2e-16 *** #> parental_antisociality 1 14.79 815 637.17 0.0001204 *** #> parental_transition 1 8.02 814 629.15 0.0046222 ** #> --- #> Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1 anova(model_D) #> Analysis of Deviance Table #> #> Model: binomial, link: logit #> #> Response: event #> #> Terms added sequentially (first to last) #> #> #> Df Deviance Resid. Df Resid. Dev Pr(>Chi) #> NULL 822 1139.53 #> factor(grade) 6 487.58 816 651.96 < 2.2e-16 *** #> parental_transition 1 17.29 815 634.66 3.203e-05 *** #> parental_antisociality 1 5.51 814 629.15 0.01886 * #> --- #> Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1"},{"path":"https://mccarthy-m-g.github.io/alda/articles/chapter-11.html","id":"interpreting-parameter-estimates","dir":"Articles","previous_headings":"","what":"11.4 Interpreting Parameter Estimates","title":"Chapter 11: Fitting basic discrete-time hazard models","text":"Table 11.4, page 388:","code":"model_A |> tidy() |> select(term, estimate) |> mutate( odds = exp(estimate), hazard = 1 / (1 + exp(-estimate)) ) #> # A tibble: 6 × 4 #> term estimate odds hazard #> #> 1 factor(grade)7 -2.40 0.0909 0.0833 #> 2 factor(grade)8 -3.12 0.0443 0.0424 #> 3 factor(grade)9 -1.72 0.179 0.152 #> 4 factor(grade)10 -1.29 0.276 0.216 #> 5 factor(grade)11 -1.16 0.313 0.238 #> 6 factor(grade)12 -0.731 0.481 0.325"},{"path":"https://mccarthy-m-g.github.io/alda/articles/chapter-11.html","id":"displaying-fitted-hazard-and-survivor-functions","dir":"Articles","previous_headings":"","what":"11.5 Displaying Fitted Hazard and Survivor Functions","title":"Chapter 11: Fitting basic discrete-time hazard models","text":"Table 11.5, page 392: Figure 11.6, page 393: Figure 11.7, page 395:","code":"model_B_tidy <- model_B |> augment( newdata = expand_grid(grade = 7:12, parental_transition = 0:1) ) |> mutate( hazard = 1 / (1 + exp(-.fitted)), survival = cumprod(1 - hazard), .by = parental_transition ) model_B_tidy #> # A tibble: 12 × 5 #> grade parental_transition .fitted hazard survival #> #> 1 7 0 -2.99 0.0477 0.952 #> 2 7 1 -2.12 0.107 0.893 #> 3 8 0 -3.70 0.0241 0.929 #> 4 8 1 -2.83 0.0559 0.843 #> 5 9 0 -2.28 0.0927 0.843 #> 6 9 1 -1.41 0.197 0.677 #> 7 10 0 -1.82 0.139 0.726 #> 8 10 1 -0.949 0.279 0.488 #> 9 11 0 -1.65 0.161 0.609 #> 10 11 1 -0.781 0.314 0.335 #> 11 12 0 -1.18 0.235 0.466 #> 12 12 1 -0.305 0.424 0.193 # FIXME: should use survfit0() for the survival panel so time starts at 6. 
model_B_tidy |> pivot_longer(cols = .fitted:survival) |> ggplot(aes(x = grade, y = value, colour = factor(parental_transition))) + geom_line() + facet_wrap(vars(name), ncol = 1, scales = \"free_y\") prototypical_males <- tibble( id = rep(1:6, times = length(7:12)), expand_grid( grade = 7:12, parental_transition = c(0, 1), parental_antisociality = -1:1 ) ) prototypical_first_sex <- tibble( log_odds = predict( model_D, prototypical_males ), hazard = 1 / (1 + exp(-log_odds)) ) grade_six <- tibble( id = 1:6, grade = 6, expand_grid( parental_transition = c(0, 1), parental_antisociality = -1:1 ), log_odds = NA, hazard = NA, survival = 1 ) prototypical_males |> bind_cols(prototypical_first_sex) |> mutate(survival = cumprod(1 - hazard), .by = id) |> add_row(grade_six) |> pivot_longer(cols = c(hazard, survival)) |> ggplot(aes(x = grade, y = value, group = id)) + geom_line( aes( colour = factor(parental_antisociality), linetype = factor(parental_transition) ) ) + scale_colour_grey(start = 0, end = 0.75) + facet_wrap( vars(name), ncol = 1, scales = \"free_y\" ) #> Warning: Removed 6 rows containing missing values or values outside the scale range #> (`geom_line()`)."},{"path":"https://mccarthy-m-g.github.io/alda/articles/chapter-12.html","id":"alternative-specifications-for-the-main-effect-of-time","dir":"Articles","previous_headings":"","what":"12.1 Alternative Specifications for the “Main Effect of TIME”","title":"Chapter 12: Extending the discrete-time hazard model","text":"Table 12.2, page 413: Figure 12.1, page 414:","code":"# Convert to person-period format tenure_pp <- tenure |> reframe( year = 1:max(years), event = if_else(year == years & censor == 0, 1, 0), .by = id ) |> mutate( temp_year = year, temp_dummy = 1 ) |> pivot_wider( names_from = temp_year, names_prefix = \"year_\", values_from = temp_dummy, values_fill = 0 ) # Fit models tenure_fit_general <- glm( event ~ factor(year), family = \"binomial\", data = tenure_pp ) tenure_fit_constant <- glm( event ~ 1, family = \"binomial\", data = tenure_pp ) tenure_fit_linear <- update(tenure_fit_constant, . ~ year) tenure_fit_quadratic <- update(tenure_fit_linear, . ~ . + I(year^2)) tenure_fit_cubic <- update(tenure_fit_quadratic, . ~ . + I(year^3)) tenure_fit_order_4 <- update(tenure_fit_cubic, . ~ . + I(year^4)) tenure_fit_order_5 <- update(tenure_fit_order_4, . ~ . + I(year^5)) # Compare anova( tenure_fit_constant, tenure_fit_linear, tenure_fit_quadratic, tenure_fit_cubic, tenure_fit_order_4, tenure_fit_order_5 ) #> Analysis of Deviance Table #> #> Model 1: event ~ 1 #> Model 2: event ~ year #> Model 3: event ~ year + I(year^2) #> Model 4: event ~ year + I(year^2) + I(year^3) #> Model 5: event ~ year + I(year^2) + I(year^3) + I(year^4) #> Model 6: event ~ year + I(year^2) + I(year^3) + I(year^4) + I(year^5) #> Resid. Df Resid. Dev Df Deviance Pr(>Chi) #> 1 1473 1037.57 #> 2 1472 867.46 1 170.103 < 2.2e-16 *** #> 3 1471 836.30 1 31.158 2.379e-08 *** #> 4 1470 833.17 1 3.132 0.07679 . #> 5 1469 832.74 1 0.430 0.51208 #> 6 1468 832.73 1 0.011 0.91831 #> --- #> Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 
0.1 ' ' 1 tenure_fit_trajectories <- map_df( list( constant = tenure_fit_constant, linear = tenure_fit_linear, quadratic = tenure_fit_quadratic, cubic = tenure_fit_cubic, general = tenure_fit_general ), \\(.x) { augment(.x, newdata = tibble(year = 1:9)) }, .id = \"model\" ) tenure_fit_trajectories |> mutate( model = factor( model, levels = c(\"constant\", \"linear\", \"quadratic\", \"cubic\", \"general\") ), hazard = if_else( model %in% c(\"quadratic\", \"general\"), 1 / (1 + exp(-.fitted)), NA ), survival = if_else( model %in% c(\"quadratic\", \"general\"), cumprod(1 - hazard), NA ), .by = model ) |> rename(logit_hazard = .fitted) |> pivot_longer(cols = logit_hazard:survival, names_to = \"estimate\") |> mutate(estimate = factor( estimate, levels = c(\"logit_hazard\", \"hazard\", \"survival\")) ) |> ggplot(aes(x = year, y = value, colour = model)) + geom_line() + scale_color_brewer(type = \"qual\", palette = \"Dark2\") + scale_x_continuous(breaks = 1:9) + facet_wrap(vars(estimate), scales = \"free_y\", labeller = label_both) #> Warning: Removed 54 rows containing missing values or values outside the scale range #> (`geom_line()`)."},{"path":"https://mccarthy-m-g.github.io/alda/articles/chapter-12.html","id":"using-the-complementary-log-log-link-to-specify-a-discrete-time-hazard-model","dir":"Articles","previous_headings":"","what":"12.2 Using the Complementary Log-Log Link to Specify a Discrete-Time Hazard Model","title":"Chapter 12: Extending the discrete-time hazard model","text":"Figure 12.2: Figure 12.3, page 423: Table 12.3, page 424:","code":"first_sex_pp <- first_sex |> rename(grades = grade) |> reframe( grade = 7:max(grades), event = if_else(grade == grades & censor == 0, 1, 0), parental_transition, parental_antisociality, .by = id ) # The nested map_() is used here so we can get an ID column for both the # link function and the subset. 
map_dfr( list(logit = \"logit\", cloglog = \"cloglog\"), \\(.x) { map_dfr( list(`0` = 0, `1` = 1), \\(.y) { first_sex_fit <- glm( event ~ factor(grade), family = binomial(link = .x), data = first_sex_pp, subset = c(parental_transition == .y) ) augment(first_sex_fit, newdata = tibble(grade = 7:12)) }, .id = \"parental_transition\" ) }, .id = \"link\" ) |> ggplot( aes(x = grade, y = .fitted, colour = parental_transition, linetype = link) ) + geom_line() map_dfr( list(cloglog = \"cloglog\", logit = \"logit\"), \\(.x) { first_sex_fit <- glm( event ~ -1 + factor(grade) + parental_transition, family = binomial(link = .x), data = first_sex_pp ) first_sex_fit |> tidy() |> select(term, estimate) |> mutate( base_hazard = case_when( .x == \"logit\" & term != \"parental_transition\" ~ 1 / (1 + exp(-estimate)), .x == \"cloglog\" & term != \"parental_transition\" ~ 1 - exp(-exp(estimate)) ) ) }, .id = \"link\" ) |> pivot_wider(names_from = link, values_from = c(estimate, base_hazard)) #> # A tibble: 7 × 5 #> term estimate_cloglog estimate_logit base_hazard_cloglog base_hazard_logit #> #> 1 factor(… -2.97 -2.99 0.0498 0.0477 #> 2 factor(… -3.66 -3.70 0.0254 0.0241 #> 3 factor(… -2.32 -2.28 0.0940 0.0927 #> 4 factor(… -1.90 -1.82 0.139 0.139 #> 5 factor(… -1.76 -1.65 0.158 0.161 #> 6 factor(… -1.34 -1.18 0.230 0.235 #> 7 parenta… 0.785 0.874 NA NA"},{"path":"https://mccarthy-m-g.github.io/alda/articles/chapter-12.html","id":"time-varying-predictors","dir":"Articles","previous_headings":"","what":"12.3 Time-Varying Predictors","title":"Chapter 12: Extending the discrete-time hazard model","text":"Figure 12.4, page 432: Figure 12.5, page 437:","code":"first_depression_fit <- glm( depressive_episode ~ poly(I(period - 18), 3, raw = TRUE) + parental_divorce, family = binomial(link = \"logit\"), data = first_depression_1 ) # When a predictor enters the model as part of a matrix of covariates, such as # with stats::poly(), it is represented in augment() as a matrix column. A simple # workaround to get the predictor on its original scale as a vector is to pass # the original data to augment(). first_depression_predictions <- first_depression_fit |> augment(data = first_depression_1) |> mutate(hazard = 1 / (1 + exp(-.fitted))) # Proportions of the risk set at each age who experienced an initial depressive # episode at that age, as function of their parental divorce status at that age. first_depression_proportions <- first_depression_1 |> group_by(period, parental_divorce) |> summarise( total = n(), event = sum(depressive_episode), proportion = event / total, proportion = if_else(proportion == 0, NA, proportion), logit = log(proportion / (1 - proportion)) ) #> `summarise()` has grouped output by 'period'. You can override using the #> `.groups` argument. # Top plot ggplot(mapping = aes(x = period, colour = factor(parental_divorce))) + geom_line( aes(y = hazard), data = first_depression_predictions ) + geom_point( aes(y = proportion), data = first_depression_proportions ) + scale_x_continuous(breaks = seq(0, 40, by = 5), limits = c(0, 40)) + scale_y_continuous(limits = c(0, 0.06)) #> Warning: Removed 14 rows containing missing values or values outside the scale range #> (`geom_point()`). 
# Bottom plot ggplot(mapping = aes(x = period, colour = factor(parental_divorce))) + geom_line( aes(y = .fitted), data = first_depression_predictions ) + geom_point( aes(y = logit), data = first_depression_proportions ) + scale_x_continuous(breaks = seq(0, 40, by = 5), limits = c(0, 40)) + scale_y_continuous(breaks = seq(-8, -2, by = 1), limits = c(-8, -2)) #> Warning: Removed 14 rows containing missing values or values outside the scale range #> (`geom_point()`). first_depression_fit_2 <- update(first_depression_fit, . ~ . + female) first_depression_fit_2 |> augment( newdata = expand_grid( period = 4:39, parental_divorce = c(0, 1), female = c(0, 1) ) ) |> mutate( female = factor(female), parental_divorce = factor(parental_divorce), hazard = 1 / (1 + exp(-.fitted)), survival = cumprod(1 - hazard), .by = c(female, parental_divorce) ) |> pivot_longer(cols = c(hazard, survival), names_to = \"estimate\") |> ggplot(aes(x = period, y = value, linetype = female, colour = parental_divorce)) + geom_line() + facet_wrap(vars(estimate), ncol = 1, scales = \"free_y\") + scale_x_continuous(breaks = seq(0, 40, by = 5), limits = c(0, 40)) + ggh4x::facetted_pos_scales( y = list( estimate == \"hazard\" ~ scale_y_continuous(limits = c(0, .04)), estimate == \"survival\" ~ scale_y_continuous(limits = c(0, 1)) ) )"},{"path":"https://mccarthy-m-g.github.io/alda/articles/chapter-12.html","id":"the-linear-additivity-assumption-uncovering-violations-and-simple-solutions","dir":"Articles","previous_headings":"","what":"12.4 The Linear Additivity Assumption: Uncovering Violations and Simple Solutions","title":"Chapter 12: Extending the discrete-time hazard model","text":"Figure 12.6, page 445: Table 12.4, page 449:","code":"# Raw first_arrest |> group_by(period, abused, black) |> summarise( total = n(), event = sum(event), proportion = event / total, proportion = if_else(proportion == 0, NA, proportion), logit = log(proportion / (1 - proportion)) ) |> ungroup() |> mutate(across(c(abused, black), factor)) |> na.omit() |> ggplot(aes(x = period, y = logit, colour = abused, group = abused)) + geom_line() + scale_x_continuous(breaks = 7:19, limits = c(7, 19)) + scale_y_continuous(limits = c(-7, -2)) + facet_wrap(vars(black), labeller = label_both) #> `summarise()` has grouped output by 'period', 'abused'. You can override using #> the `.groups` argument. # Model first_arrest_fit <- glm( event ~ factor(period) + abused + black + abused:black, family = binomial(link = \"logit\"), data = first_arrest ) first_arrest_fit |> augment( newdata = expand_grid(period = 8:18, abused = c(0, 1), black = c(0, 1)) ) |> ggplot( aes( x = period, y = .fitted, colour = factor(abused), linetype = factor(black) ) ) + geom_line() + scale_x_continuous(breaks = 7:19, limits = c(7, 19)) + scale_y_continuous(limits = c(-8, -2)) model_A <- update(first_depression_fit_2, . ~ . + siblings) model_B <- update( first_depression_fit_2, . ~ . + between(siblings, 1, 2) + between(siblings, 3, 4) + between(siblings, 5, 6) + between(siblings, 7, 8) + between(siblings, 9, Inf) ) model_C <- update(first_depression_fit_2, . ~ . 
+ bigfamily) tidy(model_A) #> # A tibble: 7 × 5 #> term estimate std.error statistic p.value #> #> 1 (Intercept) -4.36 0.122 -35.8 2.23e-281 #> 2 poly(I(period - 18), 3, raw = TRUE)1 0.0611 0.0117 5.24 1.64e- 7 #> 3 poly(I(period - 18), 3, raw = TRUE)2 -0.00731 0.00122 -5.97 2.34e- 9 #> 4 poly(I(period - 18), 3, raw = TRUE)3 0.000182 0.0000790 2.30 2.14e- 2 #> 5 parental_divorce 0.373 0.162 2.29 2.18e- 2 #> 6 female 0.559 0.109 5.10 3.34e- 7 #> 7 siblings -0.0814 0.0223 -3.66 2.57e- 4 tidy(model_B) #> # A tibble: 11 × 5 #> term estimate std.error statistic p.value #> #> 1 (Intercept) -4.50 0.207 -21.8 4.22e-105 #> 2 poly(I(period - 18), 3, raw = TRUE)1 0.0615 0.0117 5.27 1.37e- 7 #> 3 poly(I(period - 18), 3, raw = TRUE)2 -0.00729 0.00122 -5.96 2.56e- 9 #> 4 poly(I(period - 18), 3, raw = TRUE)3 0.000181 0.0000790 2.30 2.17e- 2 #> 5 parental_divorce 0.373 0.162 2.29 2.18e- 2 #> 6 female 0.560 0.110 5.11 3.24e- 7 #> 7 between(siblings, 1, 2)TRUE 0.0209 0.198 0.106 9.16e- 1 #> 8 between(siblings, 3, 4)TRUE 0.0108 0.210 0.0512 9.59e- 1 #> 9 between(siblings, 5, 6)TRUE -0.494 0.255 -1.94 5.22e- 2 #> 10 between(siblings, 7, 8)TRUE -0.775 0.344 -2.26 2.41e- 2 #> 11 between(siblings, 9, Inf)TRUE -0.658 0.344 -1.91 5.56e- 2 tidy(model_C) #> # A tibble: 7 × 5 #> term estimate std.error statistic p.value #> #> 1 (Intercept) -4.48 0.109 -41.2 0 #> 2 poly(I(period - 18), 3, raw = TRUE)1 0.0614 0.0117 5.27 1.40e-7 #> 3 poly(I(period - 18), 3, raw = TRUE)2 -0.00729 0.00122 -5.96 2.54e-9 #> 4 poly(I(period - 18), 3, raw = TRUE)3 0.000182 0.0000790 2.30 2.15e-2 #> 5 parental_divorce 0.371 0.162 2.29 2.22e-2 #> 6 female 0.558 0.109 5.10 3.44e-7 #> 7 bigfamily -0.611 0.145 -4.22 2.39e-5"},{"path":"https://mccarthy-m-g.github.io/alda/articles/chapter-12.html","id":"the-proportionality-assumption-uncovering-violations-and-simple-solutions","dir":"Articles","previous_headings":"","what":"12.5 The proportionality assumption: Uncovering violations and simple solutions","title":"Chapter 12: Extending the discrete-time hazard model","text":"Figure 12.8, page 458: Table 12.5, page 459:","code":"# Raw math_dropout |> group_by(term, woman) |> summarise( total = n(), event = sum(event), proportion = event / total, proportion = if_else(proportion == 0, NA, proportion), logit = log(proportion / (1 - proportion)) ) |> ungroup() |> mutate(across(c(woman), factor)) |> na.omit() |> ggplot(aes(x = term, y = logit, colour = woman)) + geom_line() #> `summarise()` has grouped output by 'term'. You can override using the #> `.groups` argument. # Models model_A <- glm( event ~ -1 + factor(term) + woman, family = binomial(link = \"logit\"), data = math_dropout ) model_B <- glm( event ~ -1 + factor(term) + factor(term):woman, family = binomial(link = \"logit\"), data = math_dropout ) model_C <- update(model_A, . ~ . 
+ woman:I(term - 1)) map_df( list(model_A = model_A, model_B = model_B, model_C = model_C), \\(.x) { .x |> augment(newdata = expand_grid(term = 1:5, woman = c(0, 1))) |> mutate(hazard = 1 / (1 + exp(-.fitted))) }, .id = \"model\" ) |> ggplot(aes(x = term, y = hazard, colour = factor(woman))) + geom_line() + facet_wrap(vars(model)) tidy(model_A) #> # A tibble: 6 × 5 #> term estimate std.error statistic p.value #> #> 1 factor(term)1 -2.13 0.0567 -37.6 0 #> 2 factor(term)2 -0.942 0.0479 -19.7 3.14e- 86 #> 3 factor(term)3 -1.45 0.0634 -22.8 1.66e-115 #> 4 factor(term)4 -0.618 0.0757 -8.16 3.42e- 16 #> 5 factor(term)5 -0.772 0.143 -5.40 6.54e- 8 #> 6 woman 0.379 0.0501 7.55 4.33e- 14 tidy(model_B) #> # A tibble: 10 × 5 #> term estimate std.error statistic p.value #> #> 1 factor(term)1 -2.01 0.0715 -28.1 1.40e-173 #> 2 factor(term)2 -0.964 0.0585 -16.5 5.98e- 61 #> 3 factor(term)3 -1.48 0.0847 -17.5 1.45e- 68 #> 4 factor(term)4 -0.710 0.101 -7.05 1.81e- 12 #> 5 factor(term)5 -0.869 0.191 -4.56 5.23e- 6 #> 6 factor(term)1:woman 0.157 0.0978 1.60 1.09e- 1 #> 7 factor(term)2:woman 0.419 0.0792 5.28 1.27e- 7 #> 8 factor(term)3:woman 0.441 0.116 3.81 1.42e- 4 #> 9 factor(term)4:woman 0.571 0.145 3.95 7.86e- 5 #> 10 factor(term)5:woman 0.601 0.286 2.10 3.55e- 2 tidy(model_C) #> # A tibble: 7 × 5 #> term estimate std.error statistic p.value #> #> 1 factor(term)1 -2.05 0.0646 -31.6 7.80e-220 #> 2 factor(term)2 -0.926 0.0482 -19.2 3.96e- 82 #> 3 factor(term)3 -1.50 0.0665 -22.5 3.54e-112 #> 4 factor(term)4 -0.718 0.0861 -8.34 7.34e- 17 #> 5 factor(term)5 -0.917 0.156 -5.89 3.94e- 9 #> 6 woman 0.227 0.0774 2.94 3.31e- 3 #> 7 woman:I(term - 1) 0.120 0.0470 2.55 1.08e- 2"},{"path":[]},{"path":"https://mccarthy-m-g.github.io/alda/articles/chapter-12.html","id":"residual-analysis","dir":"Articles","previous_headings":"","what":"12.7 Residual Analysis","title":"Chapter 12: Extending the discrete-time hazard model","text":"Table 12.6, page 465: Figure 12.8, page 467:","code":"first_sex_fit <- glm( event ~ -1 + factor(grade) + parental_transition + parental_antisociality, family = binomial(link = \"logit\"), data = first_sex_pp ) first_sex_fit |> augment(data = first_sex_pp, type.residuals = \"deviance\") |> select(id:parental_antisociality, .resid) |> filter(id %in% c(22, 112, 166, 89, 102, 87, 67, 212)) |> pivot_wider( id_cols = id, names_from = grade, names_prefix = \"grade_\", values_from = .resid ) #> # A tibble: 8 × 7 #> id grade_7 grade_8 grade_9 grade_10 grade_11 grade_12 #> #> 1 22 -0.412 -0.294 -0.584 -0.718 -0.775 1.41 #> 2 67 -0.618 -0.448 -0.856 -1.03 -1.10 1.04 #> 3 87 1.82 NA NA NA NA NA #> 4 89 -0.325 -0.231 -0.464 -0.575 1.86 NA #> 5 102 -0.491 2.37 NA NA NA NA #> 6 112 -0.411 -0.294 -0.583 -0.717 -0.774 -0.956 #> 7 166 -0.661 -0.481 -0.911 -1.09 1.19 NA #> 8 212 -0.286 -0.203 -0.410 -0.509 -0.552 -0.696 first_sex_fit |> augment(data = first_sex_pp, type.residuals = \"deviance\") |> ggplot(aes(x = id, y = .resid)) + geom_point() + geom_hline(yintercept = 0) first_sex_fit |> augment(data = first_sex_pp, type.residuals = \"deviance\") |> group_by(id) |> summarise(ss.deviance = sum(.resid^2)) |> ggplot(aes(x = id, y = ss.deviance)) + geom_point()"},{"path":[]},{"path":"https://mccarthy-m-g.github.io/alda/articles/chapter-13.html","id":"grouped-methods-for-estimating-continuous-time-survivor-and-hazard-functions","dir":"Articles","previous_headings":"","what":"13.2 Grouped Methods for Estimating Continuous-Time Survivor and Hazard Functions","title":"Chapter 13: Describing continuous-time event 
occurrence data","text":"Table 13.2, page 477: Figure 13.1, page 479:","code":"# Adding discrete-time intervals honking_discrete <- honking |> mutate( event = if_else(censor == 0, 1, 0), time_interval = cut(seconds, breaks = c(1:8, 18), right = FALSE), time_start = str_extract(time_interval, \"[[:digit:]]+(?=,)\"), time_end = str_extract(time_interval, \"(?<=,)[[:digit:]]+\"), across(c(time_start, time_end), as.numeric) ) # Grouped life table honking_grouped <- honking_discrete |> group_by(time_interval, time_start, time_end) |> summarise( total = n(), n_event = sum(event), n_censor = sum(censor), # All grouping needs to be dropped in order to calculate the number at risk # correctly. .groups = \"drop\" ) |> mutate(n_risk = sum(total) - lag(cumsum(total), default = 0)) # The conditional probability can be estimated using the same discrete-time methods # from the previous chapter, using the grouped data. honking_grouped_fit <- glm( cbind(n_event, n_risk - n_event) ~ 0 + time_interval, family = binomial(link = \"logit\"), data = honking_grouped ) honking_grouped_fit |> # .fitted is the conditional probability broom::augment(newdata = honking_grouped, type.predict = \"response\") |> mutate( survival = cumprod(1 - .fitted), hazard = .fitted / (time_end - time_start) ) #> # A tibble: 8 × 10 #> time_interval time_start time_end total n_event n_censor n_risk .fitted #> #> 1 [1,2) 1 2 6 5 1 57 0.0877 #> 2 [2,3) 2 3 17 14 3 51 0.275 #> 3 [3,4) 3 4 11 9 2 34 0.265 #> 4 [4,5) 4 5 10 6 4 23 0.261 #> 5 [5,6) 5 6 4 2 2 13 0.154 #> 6 [6,7) 6 7 4 2 2 9 0.222 #> 7 [7,8) 7 8 1 1 0 5 0.2 #> 8 [8,18) 8 18 4 3 1 4 0.75 #> # ℹ 2 more variables: survival , hazard # Estimates by hand honking_discrete_fit <- honking_grouped |> mutate( conditional_probability = n_event / n_risk, discrete.s = cumprod(1 - conditional_probability), discrete.h = conditional_probability / (time_end - time_start), # The actuarial method redefines the number of individuals to be at risk of # event occurrence for both the survival and hazard functions, and thus has # different conditional probabilities from the discrete method. 
n_risk.s = n_risk - (n_censor / 2), conditional_probability.s = n_event / n_risk.s, actuarial.s = cumprod(1 - conditional_probability.s), n_risk.h = n_risk.s - (n_event / 2), conditional_probability.h = n_event / n_risk.h, actuarial.h = conditional_probability.h / (time_end - time_start) ) |> select( -c(conditional_probability.s, conditional_probability.h, n_risk.s, n_risk.h) ) |> add_row(time_end = 0:1, discrete.s = 1, actuarial.s = 1) honking_discrete_fit #> # A tibble: 10 × 12 #> time_interval time_start time_end total n_event n_censor n_risk #> #> 1 [1,2) 1 2 6 5 1 57 #> 2 [2,3) 2 3 17 14 3 51 #> 3 [3,4) 3 4 11 9 2 34 #> 4 [4,5) 4 5 10 6 4 23 #> 5 [5,6) 5 6 4 2 2 13 #> 6 [6,7) 6 7 4 2 2 9 #> 7 [7,8) 7 8 1 1 0 5 #> 8 [8,18) 8 18 4 3 1 4 #> 9 NA NA 0 NA NA NA NA #> 10 NA NA 1 NA NA NA NA #> # ℹ 5 more variables: conditional_probability , discrete.s , #> # discrete.h , actuarial.s , actuarial.h honking_discrete_fit |> pivot_longer( cols = c(discrete.h, discrete.s, actuarial.h, actuarial.s), names_to = \"estimate\" ) |> mutate( estimate = factor( estimate, levels = c(\"discrete.s\", \"actuarial.s\", \"discrete.h\", \"actuarial.h\") ) ) |> ggplot(aes(x = time_end, y = value)) + geom_line(data = \\(x) filter(x, str_detect(estimate, \"discrete\"))) + geom_step( data = \\(x) filter(x, str_detect(estimate, \"actuarial\")), direction = \"vh\" ) + scale_x_continuous(limits = c(0, 20)) + facet_wrap(vars(estimate), scales = \"free_y\") + ggh4x::facetted_pos_scales( y = list( str_detect(estimate, \"s$\") ~ scale_y_continuous(limits = c(0, 1)), str_detect(estimate, \"h$\") ~ scale_y_continuous(limits = c(0, .35), breaks = seq(0, .35, by = .05)) ) )"},{"path":"https://mccarthy-m-g.github.io/alda/articles/chapter-13.html","id":"the-kaplan-meier-method-of-estimating-the-continuous-time-survivor-function","dir":"Articles","previous_headings":"","what":"13.3 The Kaplan-Meier Method of Estimating the Continuous-Time Survivor Function","title":"Chapter 13: Describing continuous-time event occurrence data","text":"Table 13.3, page 484: Figure 13.2, page 485:","code":"honking_continuous_fit <- survfit(Surv(seconds, 1 - censor) ~ 1, data = honking) honking_continuous_fit_tidy <- honking_continuous_fit |> survfit0() |> tidy() |> select(-starts_with(\"conf\")) |> mutate( # tidy() returns the standard error for the cumulative hazard, so we need to # transform it into the standard error for the survival. std.error = estimate * std.error, conditional_probability = n.event / n.risk, time_interval = 1:n(), time_end = lead(time, default = Inf), width = time_end - time, hazard = conditional_probability / width ) |> relocate( time_interval, time_start = time, time_end, n.risk:n.censor, conditional_probability, survival = estimate ) honking_continuous_fit_tidy #> # A tibble: 57 × 11 #> time_interval time_start time_end n.risk n.event n.censor #> #> 1 1 0 1.41 57 0 0 #> 2 2 1.41 1.51 57 1 1 #> 3 3 1.51 1.67 55 1 0 #> 4 4 1.67 1.68 54 1 0 #> 5 5 1.68 1.86 53 1 0 #> 6 6 1.86 2.12 52 1 0 #> 7 7 2.12 2.19 51 1 0 #> 8 8 2.19 2.36 50 1 0 #> 9 9 2.36 2.48 49 0 1 #> 10 10 2.48 2.5 48 1 0 #> # ℹ 47 more rows #> # ℹ 5 more variables: conditional_probability , survival , #> # std.error , width , hazard honking_continuous_fit_tidy |> add_row(time_end = 0:1, survival = 1) |> # The largest event time was censored, so we extend the last step out to that # largest censored value rather than going to infinity. 
mutate(time_end = if_else(time_end == Inf, time_start, time_end)) |> ggplot() + geom_step( aes(x = time_end, y = survival, linetype = \"1: Kaplan Meier\"), direction = \"vh\" ) + geom_line( aes(x = time_end, y = discrete.s, linetype = \"2: Discrete-time\"), data = honking_discrete_fit ) + geom_step( aes(x = time_end, y = actuarial.s, linetype = \"3: Actuarial\"), data = honking_discrete_fit, direction = \"vh\" ) + scale_x_continuous(limits = c(0, 20)) + labs(x = \"time\")"},{"path":"https://mccarthy-m-g.github.io/alda/articles/chapter-13.html","id":"the-cumulative-hazard-function","dir":"Articles","previous_headings":"","what":"13.4 The Cumulative Hazard Function","title":"Chapter 13: Describing continuous-time event occurrence data","text":"Figure 13.4, page 493:","code":"honking_continuous_fit_tidy |> mutate(time_end = if_else(time_end == Inf, time_start, time_end)) |> ggplot(aes(x = time_end)) + geom_step( aes(y = -log(survival), linetype = \"Negative log\"), direction = \"vh\" ) + geom_step( aes(y = cumsum(hazard * width), linetype = \"Nelson-Aalen\"), direction = \"vh\" ) #> Warning: Removed 1 row containing missing values or values outside the scale range #> (`geom_step()`)."},{"path":"https://mccarthy-m-g.github.io/alda/articles/chapter-13.html","id":"kernel-smoothed-estimates-of-the-hazard-function","dir":"Articles","previous_headings":"","what":"13.5 Kernel-Smoothed Estimates of the Hazard Function","title":"Chapter 13: Describing continuous-time event occurrence data","text":"Figure 13.5, page 496:","code":"kernel_smoothed_hazards <- map_df( set_names(1:3), \\(bandwidth) { # muhaz() estimates the hazard function from right-censored data using # kernel-based methods, using the vector of survival and event times. kernel_smoothed_hazard <- muhaz( honking$seconds, 1 - honking$censor, # Narrow the temporal region the smoothed function describes, given the # bandwidth and the minimum and maximum observed event times. min.time = min(honking$seconds[honking$censor == 0]) + bandwidth, max.time = max(honking$seconds[honking$censor == 0]) - bandwidth, bw.grid = bandwidth, bw.method = \"global\", b.cor = \"none\", kern = \"epanechnikov\" ) tidy(kernel_smoothed_hazard) }, .id = \"bandwidth\" ) #> Warning in muhaz(honking$seconds, 1 - honking$censor, min.time = min(honking$seconds[honking$censor == : minimum time > minimum Survival Time #> Warning in muhaz(honking$seconds, 1 - honking$censor, min.time = min(honking$seconds[honking$censor == : minimum time > minimum Survival Time #> Warning in muhaz(honking$seconds, 1 - honking$censor, min.time = min(honking$seconds[honking$censor == : minimum time > minimum Survival Time ggplot(kernel_smoothed_hazards, aes(x = time, y = estimate)) + geom_line() + scale_x_continuous(limits = c(0, 20)) + facet_wrap(vars(bandwidth), ncol = 1, labeller = label_both)"},{"path":"https://mccarthy-m-g.github.io/alda/articles/chapter-13.html","id":"developing-an-intuition-about-continuous-time-survivor-cumulative-hazard-and-kernel-smoothed-hazard-functions","dir":"Articles","previous_headings":"","what":"13.6 Developing an Intuition about Continuous-Time Survivor, Cumulative Hazard, and Kernel-Smoothed Hazard Functions","title":"Chapter 13: Describing continuous-time event occurrence data","text":"Figure 13.6, page 499:","code":"# TODO: Check that models are correct, then tidy up code. 
# Fit survival models alcohol_relapse_fit <- survfit( Surv(weeks, 1 - censor) ~ 1, data = alcohol_relapse ) judges_fit <- survfit( Surv(tenure, dead) ~ 1, data = judges ) first_depression_fit <- survfit( Surv(age, 1 - censor) ~ 1, data = first_depression_2 ) health_workers_fit <- survfit( Surv(weeks, 1 - censor) ~ 1, data = health_workers ) # Tidy survival models survival_models <- list( alcohol_relapse = alcohol_relapse_fit, judges = judges_fit, first_depression = first_depression_fit, health_workers = health_workers_fit ) survival_models_tidy <- map( survival_models, \\(.x) { .x |> survfit0() |> tidy() |> mutate(cumulative_hazard = -log(estimate)) |> select(time, survival = estimate, cumulative_hazard) |> pivot_longer( cols = c(survival, cumulative_hazard), names_to = \"statistic\", values_to = \"estimate\" ) } ) # Estimate and tidy smoothed hazards kernel_smoothed_hazards_tidy <- pmap( list( list( alcohol_relapse = alcohol_relapse$weeks, judges = judges$tenure, first_depression = first_depression_2$age, health_workers = health_workers$weeks ), list( 1 - alcohol_relapse$censor, judges$dead, 1 - first_depression_2$censor, 1 - health_workers$censor ), list(12, 5, 7, 7) ), \\(survival_time, event, bandwidth) { kernel_smoothed_hazard <- muhaz( survival_time, event, min.time = min(survival_time[1 - event == 0]) + bandwidth, max.time = max(survival_time[1 - event == 0]) - bandwidth, bw.grid = bandwidth, bw.method = \"global\", b.cor = \"none\", kern = \"epanechnikov\" ) kernel_smoothed_hazard |> tidy() |> mutate(statistic = \"hazard\") } ) #> Warning in muhaz(survival_time, event, min.time = min(survival_time[1 - : minimum time > minimum Survival Time #> Warning in muhaz(survival_time, event, min.time = min(survival_time[1 - : minimum time > minimum Survival Time #> Warning in muhaz(survival_time, event, min.time = min(survival_time[1 - : minimum time > minimum Survival Time #> Warning in muhaz(survival_time, event, min.time = min(survival_time[1 - : minimum time > minimum Survival Time # Combine estimates estimates_tidy <- map2( survival_models_tidy, kernel_smoothed_hazards_tidy, \\(.x, .y) { bind_rows(.x, .y) |> mutate(statistic = factor( statistic, levels = c(\"survival\", \"cumulative_hazard\", \"hazard\")) ) } ) plots <- map2( estimates_tidy, names(estimates_tidy), \\(.x, .y) { ggplot(.x, aes(x = time, y = estimate)) + geom_step(data = \\(.x) filter(.x, statistic != \"hazard\")) + geom_line(data = \\(.x) filter(.x, statistic == \"hazard\")) + facet_wrap(vars(statistic), ncol = 1, scales = \"free_y\") + labs(title = .y) } ) patchwork::wrap_plots(plots, ncol = 4)"},{"path":"https://mccarthy-m-g.github.io/alda/articles/chapter-14.html","id":"toward-a-statistical-model-for-continuous-time-hazard","dir":"Articles","previous_headings":"","what":"14.1 Toward a Statistical Model for Continuous-Time Hazard","title":"Chapter 14: Fitting the Cox regression model","text":"Figure 14.1, page 505: Figure 14.2, page 508:","code":"# Fit survival models rearrest_fit <- survfit( Surv(months, abs(censor - 1)) ~ 1, data = rearrest ) person_crime_0_fit <- update(rearrest_fit, subset = person_crime == 0) person_crime_1_fit <- update(rearrest_fit, subset = person_crime == 1) # Tidy survival models survival_models <- list( person_crime_0 = person_crime_0_fit, person_crime_1 = person_crime_1_fit ) survival_models_tidy <- map( survival_models, \\(.x) { .x |> survfit0() |> tidy() |> mutate(cumulative_hazard = -log(estimate)) |> select(time, survival = estimate, cumulative_hazard) |> pivot_longer( cols = c(survival, 
cumulative_hazard), names_to = \"statistic\", values_to = \"estimate\" ) } ) # Estimate and tidy smoothed hazards kernel_smoothed_hazards_tidy <- map2( list( person_crime_0 = filter(rearrest, person_crime == 0)$months, person_crime_1 = filter(rearrest, person_crime == 1)$months ), list( abs(filter(rearrest, person_crime == 0)$censor - 1), abs(filter(rearrest, person_crime == 1)$censor - 1) ), \\(survival_time, event) { kernel_smoothed_hazard <- muhaz( survival_time, event, min.time = min(survival_time[1 - event == 0]) + 8, max.time = max(survival_time[1 - event == 0]) - 8, bw.grid = 8, bw.method = \"global\", b.cor = \"none\", kern = \"epanechnikov\" ) kernel_smoothed_hazard |> tidy() |> mutate(statistic = \"hazard\") } ) #> Warning in muhaz(survival_time, event, min.time = min(survival_time[1 - : minimum time > minimum Survival Time #> Warning in muhaz(survival_time, event, min.time = min(survival_time[1 - : minimum time > minimum Survival Time # Combine estimates estimates_tidy <- map2( survival_models_tidy, kernel_smoothed_hazards_tidy, \\(.x, .y) { bind_rows(.x, .y) |> mutate(statistic = factor( statistic, levels = c(\"survival\", \"cumulative_hazard\", \"hazard\")) ) } ) |> list_rbind(names_to = \"person_crime\") # Plot ggplot(estimates_tidy, aes(x = time, y = estimate, linetype = person_crime)) + geom_step(data = \\(.x) filter(.x, statistic != \"hazard\")) + geom_line(data = \\(.x) filter(.x, statistic == \"hazard\")) + facet_wrap(vars(statistic), ncol = 1, scales = \"free_y\") # Top plot log_cumulative_hazards <- estimates_tidy |> filter(statistic == \"cumulative_hazard\") |> mutate(estimate = log(estimate)) |> filter(!is.infinite(estimate)) ggplot( log_cumulative_hazards, aes(x = time, y = estimate, linetype = person_crime) ) + geom_hline(yintercept = 0) + geom_step() + coord_cartesian(xlim = c(0, 30), ylim = c(-6, 1)) # Middle and bottom plots ---- rearrest_fit_2 <- coxph( Surv(months, abs(censor - 1)) ~ person_crime, data = rearrest, method = \"efron\" ) rearrest_fit_2_curves <- map_df( list(person_crime_0 = 0, person_crime_1 = 1), \\(.x) { rearrest_fit_2 |> survfit( newdata = data.frame(person_crime = .x), type = \"kaplan-meier\" ) |> tidy() |> mutate( cumulative_hazard = -log(estimate), log_cumulative_hazard = log(cumulative_hazard) ) }, .id = \"person_crime\" ) # Middle plot rearrest_fit_2 |> augment(data = rearrest, type.predict = \"survival\") |> mutate( cumulative_hazard = -log(.fitted), log_cumulative_hazard = log(cumulative_hazard) ) |> ggplot(mapping = aes(x = months, y = log_cumulative_hazard)) + geom_step(aes(x = time, linetype = person_crime), data = rearrest_fit_2_curves)+ geom_point( aes(shape = person_crime, x = time, y = estimate), data = log_cumulative_hazards ) + scale_y_continuous(breaks = -6:1) + coord_cartesian(xlim = c(0, 30), ylim = c(-6, 1)) # Bottom plot rearrest_fit_2 |> augment(data = rearrest, type.predict = \"survival\") |> mutate( cumulative_hazard = -log(.fitted), log_cumulative_hazard = log(cumulative_hazard) ) |> ggplot(mapping = aes(x = months, y = cumulative_hazard)) + geom_step(aes(x = time, linetype = person_crime), data = rearrest_fit_2_curves) + geom_point( aes(shape = person_crime, x = time, y = estimate), data = filter(estimates_tidy, statistic == \"cumulative_hazard\") ) + coord_cartesian(xlim = c(0, 30))"},{"path":[]},{"path":"https://mccarthy-m-g.github.io/alda/articles/chapter-14.html","id":"interpreting-the-results-of-fitting-the-cox-regression-model-to-data","dir":"Articles","previous_headings":"","what":"14.3 Interpreting the 
Results of Fitting the Cox Regression Model to Data","title":"Chapter 14: Fitting the Cox regression model","text":"Table 14.1, page 525: Table 14.2, page 533:","code":"# TODO: Make table model_A <- coxph(Surv(months, abs(censor - 1)) ~ person_crime, data = rearrest) model_B <- coxph(Surv(months, abs(censor - 1)) ~ property_crime, data = rearrest) model_C <- coxph(Surv(months, abs(censor - 1)) ~ age, data = rearrest) model_D <- coxph( Surv(months, abs(censor - 1)) ~ person_crime + property_crime + age, data = rearrest ) # TODO"},{"path":"https://mccarthy-m-g.github.io/alda/articles/chapter-14.html","id":"nonparametric-strategies-for-displaying-the-results-of-model-fitting","dir":"Articles","previous_headings":"","what":"14.4 Nonparametric Strategies for Displaying the Results of Model Fitting","title":"Chapter 14: Fitting the Cox regression model","text":"Figure 14.4, page 538: Figure 14.5, page 541:","code":"pmap( list( list(baseline = 0, average = mean(rearrest$person_crime)), list(0, mean(rearrest$property_crime)), list(0, mean(rearrest$age)) ), \\(.person_crime, .property_crime, .age) { model_D_baseline <- model_D |> survfit( newdata = tibble( person_crime = .person_crime, property_crime = .property_crime, age = .age) ) |> survfit0() |> tidy() survival <- ggplot(model_D_baseline, aes(x = time, y = estimate)) + geom_line() + geom_point() + scale_x_continuous(limits = c(0, 29)) + coord_cartesian(xlim = c(0, 36), ylim = c(0, 1)) cumulative_hazard <- ggplot( model_D_baseline, aes(x = time, y = -log(estimate)) ) + geom_line() + geom_point() + scale_x_continuous(limits = c(0, 29)) + coord_cartesian(xlim = c(0, 36), ylim = c(0, 1.5)) #TODO: Not sure if muhaz can deal with this situation with newdata hazard <- muhaz( model_D_baseline$time, 1 - model_D_baseline$n.censor, min.time = min(rearrest$months[rearrest$censor == 0]) + 8, max.time = max(rearrest$months[rearrest$censor == 0]) - 8, bw.grid = 8, bw.method = \"global\", b.cor = \"none\", kern = \"epanechnikov\" ) |> tidy() |> ggplot(aes(x = time, y = estimate)) + geom_line() + coord_cartesian(xlim = c(0, 36), ylim = c(0, 0.08)) survival + cumulative_hazard + hazard + plot_layout(ncol = 1) } ) |> patchwork::wrap_plots() #> Warning in muhaz(model_D_baseline$time, 1 - model_D_baseline$n.censor, min.time = min(rearrest$months[rearrest$censor == : minimum time > minimum Survival Time #> Warning in muhaz(model_D_baseline$time, 1 - model_D_baseline$n.censor, min.time = min(rearrest$months[rearrest$censor == : minimum time > minimum Survival Time #> Warning: Removed 6 rows containing missing values or values outside the scale range #> (`geom_line()`). #> Warning: Removed 6 rows containing missing values or values outside the scale range #> (`geom_point()`). #> Warning: Removed 6 rows containing missing values or values outside the scale range #> (`geom_line()`). #> Warning: Removed 6 rows containing missing values or values outside the scale range #> (`geom_point()`). #> Warning: Removed 6 rows containing missing values or values outside the scale range #> (`geom_line()`). #> Warning: Removed 6 rows containing missing values or values outside the scale range #> (`geom_point()`). #> Warning: Removed 6 rows containing missing values or values outside the scale range #> (`geom_line()`). #> Warning: Removed 6 rows containing missing values or values outside the scale range #> (`geom_point()`). 
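# A hedged sketch, not from the original text: one way to assemble the
# Table 14.1-style summary left as a TODO in Section 14.3 above, collecting the
# coefficients of Models A to D and exponentiating them into hazard ratios.
# The column selection is an illustrative assumption.
map_df(
  list(model_A = model_A, model_B = model_B, model_C = model_C, model_D = model_D),
  tidy,
  .id = \"model\"
) |>
  mutate(hazard_ratio = exp(estimate)) |>
  select(model, term, estimate, hazard_ratio, std.error, statistic, p.value)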
#TODO: Not sure if muhaz can deal with this situation with newdata, not sure if the # estimates can be modified after fitting to get the desired values hazard_fit <- muhaz( rearrest$months, 1 - rearrest$censor, min.time = min(rearrest$months[rearrest$censor == 0]) + 8, max.time = max(rearrest$months[rearrest$censor == 0]) - 8, bw.grid = 8, bw.method = \"global\", b.cor = \"none\", kern = \"epanechnikov\" ) #> Warning in muhaz(rearrest$months, 1 - rearrest$censor, min.time = min(rearrest$months[rearrest$censor == : minimum time > minimum Survival Time hazard_fit |> str() #> List of 7 #> $ pin :List of 13 #> ..$ times : num [1:194] 0.0657 0.1314 0.23 0.2957 0.2957 ... #> ..$ delta : num [1:194] 1 1 1 1 1 1 1 0 1 1 ... #> ..$ nobs : int 194 #> ..$ min.time : num 8.07 #> ..$ max.time : num 21 #> ..$ n.min.grid : num 51 #> ..$ min.grid : num [1:51] 8.07 8.32 8.58 8.84 9.1 ... #> ..$ n.est.grid : num 101 #> ..$ bw.pilot : num 1.03 #> ..$ bw.smooth : num 5.16 #> ..$ method : int 1 #> ..$ b.cor : num 0 #> ..$ kernel.type: num 1 #> $ est.grid : num [1:101] 8.07 8.19 8.32 8.45 8.58 ... #> $ haz.est : num [1:101] 0.0418 0.0419 0.042 0.0421 0.0421 ... #> $ imse.opt : num 0 #> $ bw.glob : num 8 #> $ glob.imse: num 0 #> $ bw.grid : num 8 #> - attr(*, \"class\")= chr \"muhaz\" prototypical_individuals <- map2_df( # .person_crime list(neither = 0, personal_only = 1, property_only = 0, both = 1), # .property_crime list(0, 0, 1, 1), \\(.person_crime, .property_crime) { model_D |> survfit( newdata = tibble( person_crime = .person_crime, property_crime = .property_crime, age = mean(rearrest$age) ) ) |> survfit0() |> tidy() }, .id = \"prototypical_individual\" ) prototypical_individuals_survival <- ggplot( prototypical_individuals, aes(x = time, y = estimate, colour = prototypical_individual)) + geom_line() + scale_x_continuous(limits = c(0, 29)) + coord_cartesian(xlim = c(0, 36), ylim = c(0, 1)) + labs( y = \"Survival\" ) prototypical_individuals_cumhaz <- ggplot( prototypical_individuals, aes(x = time, y = -log(estimate), colour = prototypical_individual)) + geom_line() + scale_x_continuous(limits = c(0, 29)) + coord_cartesian(xlim = c(0, 36), ylim = c(0, 2)) + labs( y = \"Cumulative hazard\" ) prototypical_individuals_logcumhaz <- ggplot( filter(prototypical_individuals, time != 0), aes(x = time, y = log(-log(estimate)), colour = prototypical_individual)) + geom_line() + scale_x_continuous(limits = c(0, 29)) + scale_y_continuous(breaks = -7:1) + coord_cartesian(xlim = c(0, 36), ylim = c(-7, 1)) + labs( y = \"log Cumulative hazard\" ) prototypical_individuals_survival + prototypical_individuals_cumhaz + prototypical_individuals_logcumhaz + plot_layout(ncol = 1, guides = \"collect\") #> Warning: Removed 24 rows containing missing values or values outside the scale range #> (`geom_line()`). #> Removed 24 rows containing missing values or values outside the scale range #> (`geom_line()`). 
#> Removed 24 rows containing missing values or values outside the scale range #> (`geom_line()`)."},{"path":"https://mccarthy-m-g.github.io/alda/articles/chapter-15.html","id":"time-varying-predictors","dir":"Articles","previous_headings":"","what":"15.1 Time-Varying Predictors","title":"Chapter 15: Extending the Cox regression model","text":"Table 15.1, page 548:","code":"# TODO: Clean up code and make table model_A <- coxph( Surv(used_cocaine_age, 1 - censor) ~ birthyr + early_marijuana_use + early_drug_use, data = first_cocaine ) # Model B ---- first_cocaine_pp <- first_cocaine |> group_by(id) |> reframe( # {survival} uses the counting process method for time-varying predictors, # so we need to construct intervals for the ages at which different events # occurred. These intervals are left-censored, so we start with the end # time; we also only require unique intervals, so duplicate ages should be # removed. age_end = sort(unique(c(used_cocaine_age, used_marijuana_age, used_drugs_age))), age_start = lag(age_end, default = 0), # Time-varying predictors should be lagged so that they describe an individual's # status in the immediately prior year. used_cocaine = if_else( age_end == used_cocaine_age & censor == 0, true = 1, false = 0, missing = 0 ), used_marijuana = if_else( age_end > used_marijuana_age, true = 1, false = 0, missing = 0 ), used_drugs = if_else( age_end > used_drugs_age, true = 1, false = 0, missing = 0 ), # Keep time-invariant predictors from the person-level data birthyr ) |> relocate(age_start, .before = age_end) model_B <- coxph( Surv(age_start, age_end, used_cocaine) ~ birthyr + used_marijuana + used_drugs, data = first_cocaine_pp, ties = \"efron\" ) ## This method with tmerge() also works tmerge( first_cocaine, first_cocaine, id = id, used_cocaine = event(used_cocaine_age, 1 - censor), used_marijuana = tdc(used_marijuana_age), used_drugs = tdc(used_drugs_age), options = list( tstartname = \"age_start\", tstopname = \"age_end\" ) ) |> as_tibble() |> arrange(id) #> Warning: Unknown or uninitialised column: `tstop`. #> Warning: Unknown or uninitialised column: `tstart`. #> Warning in tmerge(first_cocaine, first_cocaine, id = id, used_cocaine = #> event(used_cocaine_age, : replacement of variable 'used_marijuana' #> Warning in tmerge(first_cocaine, first_cocaine, id = id, used_cocaine = #> event(used_cocaine_age, : replacement of variable 'used_drugs' #> # A tibble: 3,086 × 18 #> id used_cocaine_age censor birthyr early_marijuana_use early_drug_use #> #> 1 5 41 1 0 0 0 #> 2 5 41 1 0 0 0 #> 3 5 41 1 0 0 0 #> 4 8 32 1 10 0 0 #> 5 9 36 1 5 0 0 #> 6 9 36 1 5 0 0 #> 7 11 41 1 0 0 0 #> 8 12 32 0 4 0 0 #> 9 12 32 0 4 0 0 #> 10 13 39 1 3 0 0 #> # ℹ 3,076 more rows #> # ℹ 12 more variables: used_marijuana , used_marijuana_age , #> # sold_marijuana , sold_marijuana_age , used_drugs , #> # used_drugs_age , sold_drugs , sold_drugs_age , rural , #> # age_start , age_end , used_cocaine coxph( Surv(age_start, age_end, used_cocaine) ~ birthyr + used_marijuana + used_drugs, data = tmerge( first_cocaine, first_cocaine, id = id, used_cocaine = event(used_cocaine_age, 1 - censor), used_marijuana = tdc(used_marijuana_age), used_drugs = tdc(used_drugs_age), options = list( tstartname = \"age_start\", tstopname = \"age_end\" ) ), ties = \"efron\" ) |> summary() #> Warning: Unknown or uninitialised column: `tstop`. #> Warning: Unknown or uninitialised column: `tstart`. 
#> Warning in tmerge(first_cocaine, first_cocaine, id = id, used_cocaine = #> event(used_cocaine_age, : replacement of variable 'used_marijuana' #> Warning in tmerge(first_cocaine, first_cocaine, id = id, used_cocaine = #> event(used_cocaine_age, : replacement of variable 'used_drugs' #> Warning: Unknown or uninitialised column: `tstop`. #> Warning: Unknown or uninitialised column: `tstart`. #> Warning in tmerge(first_cocaine, first_cocaine, id = id, used_cocaine = #> event(used_cocaine_age, : replacement of variable 'used_marijuana' #> Warning in tmerge(first_cocaine, first_cocaine, id = id, used_cocaine = #> event(used_cocaine_age, : replacement of variable 'used_drugs' #> Call: #> coxph(formula = Surv(age_start, age_end, used_cocaine) ~ birthyr + #> used_marijuana + used_drugs, data = tmerge(first_cocaine, #> first_cocaine, id = id, used_cocaine = event(used_cocaine_age, #> 1 - censor), used_marijuana = tdc(used_marijuana_age), #> used_drugs = tdc(used_drugs_age), options = list(tstartname = \"age_start\", #> tstopname = \"age_end\")), ties = \"efron\") #> #> n= 3086, number of events= 382 #> #> coef exp(coef) se(coef) z Pr(>|z|) #> birthyr 0.10741 1.11340 0.02145 5.008 5.5e-07 *** #> used_marijuana 2.55176 12.82972 0.28095 9.082 < 2e-16 *** #> used_drugs 1.85387 6.38446 0.12921 14.347 < 2e-16 *** #> --- #> Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1 #> #> exp(coef) exp(-coef) lower .95 upper .95 #> birthyr 1.113 0.89815 1.068 1.161 #> used_marijuana 12.830 0.07794 7.397 22.252 #> used_drugs 6.384 0.15663 4.956 8.225 #> #> Concordance= 0.876 (se = 0.008 ) #> Likelihood ratio test= 856 on 3 df, p=<2e-16 #> Wald test = 451.1 on 3 df, p=<2e-16 #> Score (logrank) test = 1039 on 3 df, p=<2e-16 # Model C and D ---- first_cocaine_pp_C <- first_cocaine |> group_by(id) |> reframe( age_end = sort( unique( c( used_cocaine_age, used_marijuana_age, used_drugs_age, sold_marijuana_age, sold_drugs_age ) ) ), age_start = lag(age_end, default = 0), # Time-varying predictors should be lagged so that they describe an individual's # status in the immediately prior year. used_cocaine = if_else( age_end == used_cocaine_age & censor == 0, true = 1, false = 0, missing = 0 ), used_marijuana = if_else( age_end > used_marijuana_age, true = 1, false = 0, missing = 0 ), used_drugs = if_else( age_end > used_drugs_age, true = 1, false = 0, missing = 0 ), sold_marijuana = if_else( age_end > sold_marijuana_age, true = 1, false = 0, missing = 0 ), sold_drugs = if_else( age_end > sold_drugs_age, true = 1, false = 0, missing = 0 ), # Keep time-invariant predictors from the person-level data birthyr, early_marijuana_use, early_drug_use, rural ) |> relocate(age_start, .before = age_end) first_cocaine_model_C <- coxph( Surv(age_start, age_end, used_cocaine) ~ birthyr + used_marijuana + used_drugs + sold_marijuana + sold_drugs, data = first_cocaine_pp_C, ties = \"efron\" ) model_D <- update(first_cocaine_model_C, . ~ . 
+ early_marijuana_use + early_drug_use)"},{"path":"https://mccarthy-m-g.github.io/alda/articles/chapter-15.html","id":"imputation-strategies-for-time-varying-predictors","dir":"Articles","previous_headings":"15.1 Time-Varying Predictors","what":"15.1.3 Imputation Strategies for Time-Varying Predictors","title":"Chapter 15: Extending the Cox regression model","text":"In Section 15.1.3 Singer and Willett (2003) discuss imputation strategies for time-varying predictors using a subset of unpublished data from Hall, Havassy, and Wasserman (1990), who measured the relation between the number of days until relapse to cocaine use and several predictors that might be associated with relapse in a sample of 104 newly abstinent cocaine users who had recently completed an abstinence-oriented treatment program. Former cocaine users were followed for 12 weeks post-treatment or until they used cocaine for 7 consecutive days. Self-reported abstinence was confirmed at each interview by the absence of cocaine in urine specimens. For this example we use the cocaine_relapse_2 data set, a person-period data frame with 1248 rows and 7 columns: id: Participant ID. days: Number of days until relapse to cocaine use or censoring. Relapse was defined as 4 days of cocaine use during the week preceding an interview. Study dropouts and lost participants were coded as relapsing to cocaine use, with the number of days until relapse coded as occurring the week after the last follow-up interview attended. censor: Censoring status (0 = relapsed, 1 = censored). needle: Binary indicator for whether cocaine was ever used intravenously. base_mood: Total score on the positive mood subscales (Activity and Happiness) of the Mood Questionnaire (Ryman, Biersner, & LaRocco, 1974), taken at the intake interview during the last week of treatment. Each item used a five point Likert score (ranging from 0 = not at all, to 4 = extremely). followup: Week of the follow-up interview. mood: Total score on the positive mood subscales (Activity and Happiness) of the Mood Questionnaire (Ryman, Biersner, & LaRocco, 1974), taken at follow-up interviews each week post-treatment. Each item used a five point Likert score (ranging from 0 = not at all, to 4 = extremely). Because time to relapse was measured in days but follow-up interviews were conducted weekly, the cocaine relapse data in their current form fail to meet the data requirement for time-varying predictors: at each unique event time in days, we do not know the time-varying mood scores of everyone still at risk at those moments. Thus, in order to meet this data requirement we must generate predictor histories that provide near-daily mood scores for each participant. In the proceeding steps we develop three Cox regression models fitted to the cocaine relapse data to illustrate and compare different imputation strategies for time-varying predictors. Each model includes the number of days until relapse to cocaine use as the outcome variable; the time-invariant predictor needle; and a different time-varying variable representing the predictor of interest, the total score on the positive mood subscales of the Mood Questionnaire (Ryman, Biersner, & LaRocco, 1974), for which we explore the following popular imputation strategies suggested by Singer and Willett (2003): Carry forward the mood score until the next one is available. Interpolate between adjacent mood scores.
Compute moving average based recent several past mood scores.","code":"glimpse(cocaine_relapse_2) #> Rows: 1,248 #> Columns: 7 #> $ id 550, 550, 550, 550, 550, 550, 550, 550, 550, 550, 550, 550, … #> $ censor 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, … #> $ days 83, 83, 83, 83, 83, 83, 83, 83, 83, 83, 83, 83, 83, 83, 83, … #> $ needle 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, … #> $ base_mood 29, 29, 29, 29, 29, 29, 29, 29, 29, 29, 29, 29, 25, 25, 25, … #> $ followup 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 1, 2, 3, 4, 5, 6, 7, … #> $ mood 23, 27, 28, 31, 29, 32, 33, 28, 36, 33, 33, 24, 31, 19, 29, …"},{"path":"https://mccarthy-m-g.github.io/alda/articles/chapter-15.html","id":"exploratory-data-analysis","dir":"Articles","previous_headings":"15.1 Time-Varying Predictors > 15.1.3 Imputation Strategies for Time-Varying Predictors","what":"Exploratory Data Analysis","title":"Chapter 15: Extending the Cox regression model","text":"begin exploring time-invariant variables cocaine_relapse_2 data. convenient one Cox regression models fitted later , using person-level version cocaine_relapse_2 data. total 62 newly abstinent cocaine users (59.6%) relapsed cocaine use within 12 weeks completing abstinence-oriented treatment program. users relapsed early-follow-period. Across sample 38 unique event times. total 69 participants (66.3%) reported previously used cocaine intravenously.","code":"cocaine_relapse_2_pl <- cocaine_relapse_2 |> pivot_wider( names_from = followup, names_prefix = \"mood_\", values_from = mood ) glimpse(cocaine_relapse_2_pl) #> Rows: 104 #> Columns: 17 #> $ id 550, 604, 608, 631, 513, 531, 533, 536, 599, 542, 564, 573, … #> $ censor 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, … #> $ days 83, 83, 83, 83, 82, 82, 82, 82, 82, 81, 81, 81, 81, 81, 81, … #> $ needle 1, 0, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 1, 1, 0, 1, … #> $ base_mood 29, 25, 37, 39, 33, 27, 10, 27, 28, 19, 35, 32, 32, 31, 31, … #> $ mood_1 23, 31, 40, 42, 43, 14, 16, 22, 29, 31, 30, 27, 27, 41, 40, … #> $ mood_2 27, 19, 37, 22, 25, 11, 26, 21, 28, 25, 33, 26, 27, NA, NA, … #> $ mood_3 28, 29, 36, 38, 34, 2, 37, 24, 25, 28, 33, 24, 23, 38, 31, 4… #> $ mood_4 31, 24, 36, 41, 42, 8, 17, 24, NA, 20, 33, 29, 22, 37, NA, N… #> $ mood_5 29, 22, 32, 41, 46, 3, 30, NA, NA, 23, 26, 21, 26, 24, 28, N… #> $ mood_6 32, 22, 35, 42, 42, 3, 15, 22, 16, 29, 35, 28, 28, 27, 28, 2… #> $ mood_7 33, NA, 34, 42, 46, 5, 16, NA, 22, 27, 35, 22, 24, 27, 31, 2… #> $ mood_8 28, 20, 35, 42, 46, 3, 15, 23, 23, NA, 33, 28, 17, 28, 22, 3… #> $ mood_9 36, 31, 29, 46, 47, 2, 21, 19, 24, NA, 29, 25, 14, 31, 24, 3… #> $ mood_10 33, 33, 36, NA, NA, 2, 14, 21, 16, 31, NA, NA, NA, 37, 33, N… #> $ mood_11 33, 30, 30, 47, 28, 0, 20, 15, 18, 23, 26, 25, 17, 38, 29, 2… #> $ mood_12 24, NA, 36, 43, 44, 0, 16, 18, 16, 21, NA, 25, 19, 34, 30, 2… cocaine_relapse_2_pl |> group_by(relapsed = 1 - censor) |> summarise(count = n()) |> mutate(proportion = count / sum(count)) #> # A tibble: 2 × 3 #> relapsed count proportion #> #> 1 0 42 0.404 #> 2 1 62 0.596 ggplot(cocaine_relapse_2_pl, aes(x = days)) + geom_histogram(binwidth = 7) + scale_x_continuous(breaks = c(0, 1:12 * 7)) + facet_wrap(vars(relapsed = 1 - censor), labeller = label_both) # We will use these event times later on during the imputation procedure for # Model B. It is important they are sorted in ascending order for this # procedure, so we do so here for convenience while creating the object. 
event_times <- cocaine_relapse_2_pl |> filter(1 - censor == 1) |> pull(days) |> unique() |> sort() censor_times <- cocaine_relapse_2_pl |> filter(censor == 1) |> pull(days) |> unique() event_times |> discard(\\(.x) .x %in% censor_times) |> length() #> [1] 38 cocaine_relapse_2_pl |> group_by(needle) |> summarise(count = n()) |> mutate(proportion = count / sum(count)) #> # A tibble: 2 × 3 #> needle count proportion #> #> 1 0 35 0.337 #> 2 1 69 0.663"},{"path":"https://mccarthy-m-g.github.io/alda/articles/chapter-15.html","id":"model-a-time-invariant-baseline","dir":"Articles","previous_headings":"15.1 Time-Varying Predictors > 15.1.3 Imputation Strategies for Time-Varying Predictors","what":"Model A: Time-Invariant Baseline","title":"Chapter 15: Extending the Cox regression model","text":"Model uses time-invariant predictor assessing respondent’s mood score just release treatment.","code":"model_A <- coxph( Surv(days, 1 - censor) ~ needle + base_mood, data = cocaine_relapse_2_pl, ties = \"efron\" ) summary(model_A) #> Call: #> coxph(formula = Surv(days, 1 - censor) ~ needle + base_mood, #> data = cocaine_relapse_2_pl, ties = \"efron\") #> #> n= 104, number of events= 62 #> #> coef exp(coef) se(coef) z Pr(>|z|) #> needle 1.020734 2.775232 0.314068 3.250 0.00115 ** #> base_mood -0.003748 0.996259 0.014709 -0.255 0.79886 #> --- #> Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1 #> #> exp(coef) exp(-coef) lower .95 upper .95 #> needle 2.7752 0.3603 1.4996 5.136 #> base_mood 0.9963 1.0038 0.9679 1.025 #> #> Concordance= 0.63 (se = 0.036 ) #> Likelihood ratio test= 12.51 on 2 df, p=0.002 #> Wald test = 10.6 on 2 df, p=0.005 #> Score (logrank) test = 11.51 on 2 df, p=0.003"},{"path":"https://mccarthy-m-g.github.io/alda/articles/chapter-15.html","id":"model-b","dir":"Articles","previous_headings":"15.1 Time-Varying Predictors > 15.1.3 Imputation Strategies for Time-Varying Predictors","what":"Model B:","title":"Chapter 15: Extending the Cox regression model","text":"Model B return person-period version cocaine_relapse_2 data explore first imputation strategy suggested Singer Willett (2003): Carrying forward mood score next one available. procedure also lag mood score predictor one week—associating, example, first followup baseline mood scores, second followup first followup’s mood scores, forth. Next, prepare cocaine_relapse_2_prevweek data modelling, want transform data set number days relapse counting process (start, stop) format, person-period format : row transformed data set represents “-risk” time interval (day_start, day_end], open left closed right. event variable row 1 time interval ends event 0 otherwise. Variable values row values apply time interval. start end points time interval determined vector unique event_times, defined earlier. censored data, end point final time interval determined time censorship—included vector unique event times—needs handled separately. Transforming cocaine_relapse_2_prevweek data counting process format two-step process. First create counting process structure, columns participant ID, start time, stop time, event status record. also add week variable indicating week record occurred , important second step process. Note step done using either person-period person-level versions cocaine_relapse_2 data however, readability use person-level data . result can obtained using person-period data wrapping calls days censor unique(). 
Second, join cocaine_relapse_2_prevweek data counting process structure id week, giving us counting process formatted data time-varying predictor’s values occurring appropriate time interval participant. Finally, match text, rename mood score variable week_mood. survival package also comes two utility functions, survSplit() tmerge(), can used transform data counting process format. discussion, see vignette(\"timedep\", package=\"survival\"). Now can fit Model B.","code":"cocaine_relapse_2_prevweek <- cocaine_relapse_2 |> group_by(id) |> mutate( mood_previous_week = lag(mood, default = unique(base_mood)), mood_previous_week_fill = vec_fill_missing( mood_previous_week, direction = \"down\" ) ) cocaine_relapse_2_prevweek #> # A tibble: 1,248 × 9 #> # Groups: id [104] #> id censor days needle base_mood followup mood mood_previous_week #> #> 1 550 1 83 1 29 1 23 29 #> 2 550 1 83 1 29 2 27 23 #> 3 550 1 83 1 29 3 28 27 #> 4 550 1 83 1 29 4 31 28 #> 5 550 1 83 1 29 5 29 31 #> 6 550 1 83 1 29 6 32 29 #> 7 550 1 83 1 29 7 33 32 #> 8 550 1 83 1 29 8 28 33 #> 9 550 1 83 1 29 9 36 28 #> 10 550 1 83 1 29 10 33 36 #> # ℹ 1,238 more rows #> # ℹ 1 more variable: mood_previous_week_fill cocaine_relapse_2_prevweek_cp <- cocaine_relapse_2_pl |> group_by(id) |> reframe( # For censored data the final day should be a participant's days value, so # we need to concatenate their days to the vector of event times. The call # to unique() around the vector removes the duplicate for uncensored data in # the final time interval. day_end = unique(c(event_times[event_times <= days], days)), day_start = lag(day_end, default = 0), event = if_else(day_end == days & censor == 0, true = 1, false = 0), week = floor(day_end / 7) + 1 ) |> relocate(day_start, .after = id) cocaine_relapse_2_prevweek_cp #> # A tibble: 2,805 × 5 #> id day_start day_end event week #> #> 1 501 0 1 0 1 #> 2 501 1 2 0 1 #> 3 501 2 3 0 1 #> 4 501 3 4 0 1 #> 5 501 4 6 0 1 #> 6 501 6 7 0 2 #> 7 501 7 8 0 2 #> 8 501 8 9 0 2 #> 9 501 9 10 0 2 #> 10 501 10 11 0 2 #> # ℹ 2,795 more rows cocaine_relapse_2_prevweek_cp <- cocaine_relapse_2_prevweek_cp |> left_join( cocaine_relapse_2_prevweek, by = join_by(id == id, week == followup) ) |> rename(week_mood = mood_previous_week_fill) cocaine_relapse_2_prevweek_cp #> # A tibble: 2,805 × 12 #> id day_start day_end event week censor days needle base_mood mood #> #> 1 501 0 1 0 1 0 12 1 29 34 #> 2 501 1 2 0 1 0 12 1 29 34 #> 3 501 2 3 0 1 0 12 1 29 34 #> 4 501 3 4 0 1 0 12 1 29 34 #> 5 501 4 6 0 1 0 12 1 29 34 #> 6 501 6 7 0 2 0 12 1 29 19 #> 7 501 7 8 0 2 0 12 1 29 19 #> 8 501 8 9 0 2 0 12 1 29 19 #> 9 501 9 10 0 2 0 12 1 29 19 #> 10 501 10 11 0 2 0 12 1 29 19 #> # ℹ 2,795 more rows #> # ℹ 2 more variables: mood_previous_week , week_mood model_B <- coxph( Surv(day_start, day_end, event) ~ needle + week_mood, data = cocaine_relapse_2_prevweek_cp, ties = \"efron\" ) summary(model_B) #> Call: #> coxph(formula = Surv(day_start, day_end, event) ~ needle + week_mood, #> data = cocaine_relapse_2_prevweek_cp, ties = \"efron\") #> #> n= 2805, number of events= 62 #> #> coef exp(coef) se(coef) z Pr(>|z|) #> needle 1.07959 2.94348 0.31574 3.419 0.000628 *** #> week_mood -0.03490 0.96570 0.01387 -2.517 0.011832 * #> --- #> Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 
0.1 ' ' 1 #> #> exp(coef) exp(-coef) lower .95 upper .95 #> needle 2.9435 0.3397 1.5853 5.4654 #> week_mood 0.9657 1.0355 0.9398 0.9923 #> #> Concordance= 0.662 (se = 0.037 ) #> Likelihood ratio test= 18.61 on 2 df, p=9e-05 #> Wald test = 16.62 on 2 df, p=2e-04 #> Score (logrank) test = 17.49 on 2 df, p=2e-04"},{"path":"https://mccarthy-m-g.github.io/alda/articles/chapter-15.html","id":"model-c","dir":"Articles","previous_headings":"15.1 Time-Varying Predictors > 15.1.3 Imputation Strategies for Time-Varying Predictors","what":"Model C:","title":"Chapter 15: Extending the Cox regression model","text":"Model C also start lagged weekly mood scores, however, use different imputation strategy: Interpolating adjacent mood scores. Although Singer Willett (2003) suggest “resisting temptation design sophisticated imputation algorithms,” approach interpolating adjacent mood scores somewhat complex. Consequently, need create function suit purpose, rather using existing functions like zoo::na.approx() imputeTS::na_ma(). Singer Willett (2003) describe approach text, algorithm appears based following rules: Trailing NAs imputed consecutively using recent non-missing mood score. Internal NAs imputed using mean adjacent non-missing mood scores. consecutive internal NAs, following first imputed mood score sequence, every NA thereafter imputed using mean previous NA value’s imputed mood score next non-missing mood score. Imputed mood scores rounded nearest integer. Now can impute lagged weekly mood scores using na_adjacent() function. Next, prepare cocaine_relapse_2_adjacent data modelling, transform counting process format; however, Model C “-risk” time interval one day long. Following Singer Willett (2003), construct day_mood variable linearly interpolating adjacent weekly values yield daily values, assigning given day mood value imputed immediate prior day. Now can fit Model C. Table 15.2, page 555:","code":"na_adjacent <- function(x) { # The while loop is used here to allow us to carry forward imputed mood scores # for consecutive internal NAs. x_avg <- x while (any(is.na(x_avg[2:length(x)]))) { x_avg <- pslide_dbl( list( x_avg, vec_fill_missing(x_avg, direction = \"down\"), vec_fill_missing(x_avg, direction = \"up\") ), \\(.x, .x_fill_down, .x_fill_up) { case_when( # Rule 1: all(is.na(.x[3:length(.x)])) ~ .x_fill_down[2], # Rule 2: !is.na(.x[1]) & is.na(.x[2]) ~ mean(c(.x_fill_up[1], .x_fill_up[2])), TRUE ~ .x[2] ) }, .before = 1, .after = Inf, .complete = TRUE ) # Rule 3. We are not using round() here because it goes to the even digit when # rounding off a 5, rather than always going upward. x_avg <- if_else(x_avg %% 1 < .5, floor(x_avg), ceiling(x_avg)) x_avg[1] <- x[1] } x_avg } cocaine_relapse_2_adjacent <- cocaine_relapse_2_prevweek |> group_by(id) |> mutate( # It's important to include the final follow-up when imputing between # adjacent mood scores, otherwise cases where the second last score is an # internal NA will fill down instead of using the mean between adjacent mood # scores. However, afterwards the final follow-up can be dropped. mood_adjacent_lag = na_adjacent(c(mood_previous_week, last(mood)))[-13], # We also want the non-lagged mood scores for later, which we impute using # similar logic. 
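    # Descriptive note added for clarity: na_adjacent() receives 13 values above,
    # the 12 lagged weekly scores plus the final follow-up's mood appended to them,
    # so the trailing [-13] drops that appended value and the result lines up with
    # follow-ups 1 to 12. The [-1] on the next line likewise drops the value
    # prepended to the non-lagged scores.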
mood_adjacent = na_adjacent(c(first(mood_previous_week), mood))[-1] ) # Here is a small preview of the difference between the imputation strategies # for Models B and C: cocaine_relapse_2_adjacent |> filter(id == 544) |> select(id, followup, mood_previous_week:mood_adjacent_lag) #> # A tibble: 12 × 5 #> # Groups: id [1] #> id followup mood_previous_week mood_previous_week_fill mood_adjacent_lag #> #> 1 544 1 40 40 40 #> 2 544 2 40 40 40 #> 3 544 3 38 38 38 #> 4 544 4 27 27 27 #> 5 544 5 NA 27 25 #> 6 544 6 22 22 22 #> 7 544 7 NA 22 21 #> 8 544 8 NA 22 21 #> 9 544 9 20 20 20 #> 10 544 10 NA 20 25 #> 11 544 11 30 30 30 #> 12 544 12 28 28 28 cocaine_relapse_2_adjacent_cp <- cocaine_relapse_2_adjacent |> group_by(id, followup) |> reframe( day_end = (followup - 1) * 7 + 1:7, day_start = day_end - 1, days = unique(days), censor = unique(censor), event = if_else( day_end == days & censor == 0, true = 1, false = 0 ), needle = unique(needle), mood_day = approx(c(mood_adjacent_lag, mood_adjacent), n = 8)[[2]][1:7], ) |> relocate(day_start, day_end, days, .after = id) |> filter(day_end <= days) cocaine_relapse_2_adjacent_cp #> # A tibble: 4,948 × 9 #> id day_start day_end days followup censor event needle mood_day #> #> 1 501 0 1 12 1 0 0 1 29 #> 2 501 1 2 12 1 0 0 1 29.7 #> 3 501 2 3 12 1 0 0 1 30.4 #> 4 501 3 4 12 1 0 0 1 31.1 #> 5 501 4 5 12 1 0 0 1 31.9 #> 6 501 5 6 12 1 0 0 1 32.6 #> 7 501 6 7 12 1 0 0 1 33.3 #> 8 501 7 8 12 2 0 0 1 34 #> 9 501 8 9 12 2 0 0 1 31.9 #> 10 501 9 10 12 2 0 0 1 29.7 #> # ℹ 4,938 more rows model_C <- coxph( Surv(day_start, day_end, event) ~ needle + mood_day, data = cocaine_relapse_2_adjacent_cp, ties = \"efron\" ) summary(model_C) #> Call: #> coxph(formula = Surv(day_start, day_end, event) ~ needle + mood_day, #> data = cocaine_relapse_2_adjacent_cp, ties = \"efron\") #> #> n= 4948, number of events= 62 #> #> coef exp(coef) se(coef) z Pr(>|z|) #> needle 1.12077 3.06720 0.31700 3.536 0.000407 *** #> mood_day -0.05438 0.94707 0.01489 -3.651 0.000261 *** #> --- #> Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1 #> #> exp(coef) exp(-coef) lower .95 upper .95 #> needle 3.0672 0.326 1.6478 5.7091 #> mood_day 0.9471 1.056 0.9198 0.9751 #> #> Concordance= 0.695 (se = 0.036 ) #> Likelihood ratio test= 25.52 on 2 df, p=3e-06 #> Wald test = 23.1 on 2 df, p=1e-05 #> Score (logrank) test = 24.04 on 2 df, p=6e-06 # TODO"},{"path":"https://mccarthy-m-g.github.io/alda/articles/chapter-15.html","id":"nonproportional-hazards-models-via-stratification","dir":"Articles","previous_headings":"","what":"15.2 Nonproportional Hazards Models via Stratification","title":"Chapter 15: Extending the Cox regression model","text":"Figure 15.2, page 559: Table 15.3, page 560:","code":"# FIXME: The upper limit of the data doesn't match the textbook. 
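# A possible numeric companion to the plot below (a sketch, not from the text):
# the proportional-hazards assumption for rural could also be checked with
# cox.zph() on a simple Cox model, for example:
#   cox.zph(coxph(Surv(used_cocaine_age, 1 - censor) ~ rural, data = first_cocaine))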
survfit(Surv(used_cocaine_age, 1 - censor) ~ rural, data = first_cocaine) |> tidy() |> mutate( strata = stringr::str_remove(strata, \"rural=\"), cumulative_hazard = -log(estimate), log_cumulative_hazard = log(cumulative_hazard) ) |> rename(rural = strata) |> ggplot(aes(x = time, y = log_cumulative_hazard, linetype = rural)) + geom_line() + coord_cartesian(ylim = c(-6, -1)) # first_cocaine_model_C from earlier is the first model first_cocaine_model_C #> Call: #> coxph(formula = Surv(age_start, age_end, used_cocaine) ~ birthyr + #> used_marijuana + used_drugs + sold_marijuana + sold_drugs, #> data = first_cocaine_pp_C, ties = \"efron\") #> #> coef exp(coef) se(coef) z p #> birthyr 0.08493 1.08864 0.02183 3.890 1e-04 #> used_marijuana 2.45920 11.69542 0.28357 8.672 < 2e-16 #> used_drugs 1.25110 3.49419 0.15656 7.991 1.34e-15 #> sold_marijuana 0.68989 1.99349 0.12263 5.626 1.84e-08 #> sold_drugs 0.76037 2.13908 0.13066 5.819 5.91e-09 #> #> Likelihood ratio test=944.5 on 5 df, p=< 2.2e-16 #> n= 3312, number of events= 382 first_cocaine_model_stratified <- update( first_cocaine_model_C, . ~ . + strata(rural) ) first_cocaine_model_nonrural <- update( first_cocaine_model_C, subset = rural == 0 ) first_cocaine_model_rural <- update( first_cocaine_model_C, subset = rural == 1 ) # TODO: Make table."},{"path":"https://mccarthy-m-g.github.io/alda/articles/chapter-15.html","id":"nonproportional-hazards-models-via-interactions-with-time","dir":"Articles","previous_headings":"","what":"15.3 Nonproportional Hazards Models via Interactions with Time","title":"Chapter 15: Extending the Cox regression model","text":"Table 15.4 page 566: Figure 15.3 page 567:","code":"# TODO psychiatric_discharge #> # A tibble: 174 × 4 #> id days censor treatment_plan #> #> 1 2 3 0 1 #> 2 8 46 0 0 #> 3 73 30 0 0 #> 4 76 45 0 0 #> 5 78 22 0 0 #> 6 79 50 0 0 #> 7 81 59 0 0 #> 8 83 44 0 0 #> 9 95 44 0 1 #> 10 117 22 0 0 #> # ℹ 164 more rows # FIXME: The upper limit of the data doesn't match the textbook. survfit(Surv(days, 1 - censor) ~ treatment_plan, data = psychiatric_discharge) |> tidy() |> mutate( strata = stringr::str_remove(strata, \"treatment_plan=\"), cumulative_hazard = -log(estimate), log_cumulative_hazard = log(cumulative_hazard) ) |> rename(treatment_plan = strata) |> ggplot(aes(x = time, y = log_cumulative_hazard, linetype = treatment_plan)) + geom_hline(yintercept = 0, linewidth = .25, linetype = 3) + geom_line() + coord_cartesian(xlim = c(0, 77), ylim = c(-4, 2)) # TODO: Bottom panel"},{"path":"https://mccarthy-m-g.github.io/alda/articles/chapter-15.html","id":"regression-diagnostics","dir":"Articles","previous_headings":"","what":"15.4 Regression Diagnostics","title":"Chapter 15: Extending the Cox regression model","text":"Figure 15.4 page 573: Figure 15.5 page 577: Figure 15.6 page 580: Figure 15.7 page 583:","code":"rearrest <- rearrest |> mutate(rank_time = rank(months, ties.method = \"average\"), .after = \"months\") rearrest_null_model <- coxph(Surv(months, 1 - censor) ~ 1, data = rearrest) rearrest_full_model <- update( rearrest_null_model, . ~ . 
+ person_crime + property_crime + age ) rearrest_models <- list( null = rearrest_null_model, full = rearrest_full_model ) rearrest_fits <- rearrest_models |> map( \\(.x) { map_df( list(martingale = \"martingale\", deviance = \"deviance\"), \\(.y) augment( .x, data = rearrest, type.predict = \"lp\", type.residuals = .y ), .id = \".resid_type\" ) } ) |> list_rbind(names_to = \"model\") |> mutate( model = factor(model, levels = c(\"null\", \"full\")), censored = as.logical(censor) ) rearrest_fits |> filter(.resid_type == \"martingale\") |> ggplot(aes(x = age, y = .resid)) + geom_hline(yintercept = 0, linewidth = .25, linetype = 3) + geom_point(aes(shape = censored)) + scale_shape_manual(values = c(16, 3)) + geom_smooth(se = FALSE) + facet_wrap(vars(model), ncol = 1, labeller = label_both) + coord_cartesian(ylim = c(-3, 1)) #> `geom_smooth()` using method = 'loess' and formula = 'y ~ x' stem(resid(rearrest_full_model, type = \"deviance\"), scale = 2) #> #> The decimal point is 1 digit(s) to the left of the | #> #> -22 | 09 #> -20 | 21 #> -18 | 6491 #> -16 | 969321 #> -14 | 54216 #> -12 | 87654741 #> -10 | 208776542110 #> -8 | 99852176544 #> -6 | 876322098874400 #> -4 | 444443332877644210 #> -2 | 85009955411 #> -0 | 997597551 #> 0 | 268979 #> 2 | 0318 #> 4 | 133678802337 #> 6 | 2688919 #> 8 | 1136690012233889999 #> 10 | 122334724789 #> 12 | 0115 #> 14 | 0228055578 #> 16 | 03795 #> 18 | 2336 #> 20 | 0735 #> 22 | 6 #> 24 | 00 #> 26 | 7 rearrest_fits |> filter(model == \"full\" & .resid_type == \"deviance\") |> ggplot(aes(x = .fitted, y = .resid)) + geom_hline(yintercept = 0, linewidth = .25, linetype = 3) + geom_point(aes(shape = censored)) + scale_shape_manual(values = c(16, 3)) + scale_x_continuous(breaks = -3:3) + scale_y_continuous(breaks = -3:3) + coord_cartesian(xlim = c(-3, 3), ylim = c(-3, 3)) # augment.coxph is bugged and won't return the .resid column when using # `newdata`, likely related to this issue: https://github.com/tidymodels/broom/issues/937 # So this code doesn't work: # augment( # rearrest_full_model, # newdata = filter(rearrest, censor == 0), # type.predict = \"lp\", # type.residuals = \"schoenfeld\" # ) # Likewise, `data` can't be used because it expects the full dataset; thus, it # will error out even when using the filtered data. # However, updating the model first does work: # Schoenfeld residuals only pertain to those who experience the event, so we need # to update the model before retrieving them, and only use a subset of the data # when getting predictions. 
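# An alternative sketch (not from the text): the raw Schoenfeld residuals can
# also be pulled straight from the full fit with resid(), and the scaled
# Schoenfeld residuals, together with a formal test of proportional hazards,
# are available through cox.zph(), for example:
#   resid(rearrest_full_model, type = "schoenfeld")
#   cox.zph(rearrest_full_model, transform = "rank")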
rearrest_full_model |> update(subset = censor == 0) |> augment( data = filter(rearrest, censor == 0), type.predict = \"lp\", type.residuals = \"schoenfeld\" ) |> mutate(.resid = as.data.frame(.resid)) |> unnest_wider(col = .resid, names_sep = \"_\") |> pivot_longer( cols = starts_with(\".resid\"), names_to = \"predictor\", values_to = \".resid\" ) |> mutate( predictor = stringr::str_remove(predictor, \".resid_\"), predictor = factor( predictor, levels = c(\"person_crime\", \"property_crime\", \"age\") ) ) |> ggplot(aes(x = rank_time, y = .resid)) + geom_hline(yintercept = 0, linewidth = .25, linetype = 3) + geom_point() + scale_shape_manual(values = c(16, 3)) + geom_smooth(se = FALSE, span = 1) + facet_wrap( vars(predictor), ncol = 1, scales = \"free_y\", labeller = label_both ) + scale_x_continuous(n.breaks = 8) + ggh4x::facetted_pos_scales( y = list( predictor == \"person_crime\" ~ scale_y_continuous(limits = c(-.5, 1)), predictor == \"property_crime\" ~ scale_y_continuous( n.breaks = 7, limits = c(-1, .2) ), predictor == \"age\" ~ scale_y_continuous(limits = c(-10, 20)) ) ) + coord_cartesian(xlim = c(0, 175)) #> `geom_smooth()` using method = 'loess' and formula = 'y ~ x' # TODO: set y-axis scales to match textbook. rearrest_full_model |> augment( data = rearrest, type.predict = \"lp\", type.residuals = \"score\" ) |> mutate(.resid = as.data.frame(.resid)) |> unnest_wider(col = .resid, names_sep = \"_\") |> pivot_longer( cols = starts_with(\".resid\"), names_to = \"predictor\", values_to = \".resid\" ) |> mutate( predictor = stringr::str_remove(predictor, \".resid_\"), predictor = factor( predictor, levels = c(\"person_crime\", \"property_crime\", \"age\") ), censored = as.logical(censor) ) |> ggplot(aes(x = rank_time, y = .resid)) + geom_hline(yintercept = 0, linewidth = .25, linetype = 3) + geom_point(aes(shape = censored)) + scale_shape_manual(values = c(16, 3)) + facet_wrap( vars(predictor), ncol = 1, scales = \"free_y\", labeller = label_both )"},{"path":"https://mccarthy-m-g.github.io/alda/articles/chapter-15.html","id":"competing-risks","dir":"Articles","previous_headings":"","what":"15.5 Competing Risks","title":"Chapter 15: Extending the Cox regression model","text":"Figure 15.8 page 589: Table 15.7 page 592:","code":"judges_null_models <- list( dead = survfit(Surv(tenure, dead) ~ 1, data = judges), retired = survfit(Surv(tenure, retired) ~ 1, data = judges) ) judges_null_models_tidy <- map( judges_null_models, \\(.x) { .x |> survfit0() |> tidy() |> mutate(cumulative_hazard = -log(estimate)) |> select(time, survival = estimate, cumulative_hazard) |> pivot_longer( cols = c(survival, cumulative_hazard), names_to = \"statistic\", values_to = \"estimate\" ) } ) # Estimate and tidy smoothed hazards judges_kernel_smoothed_hazards_tidy <- map( list( judges_dead = judges$dead, judges_retired = judges$retired ), \\(event) { kernel_smoothed_hazard <- muhaz( judges$tenure, event, min.time = min(judges$tenure[event == 0]) + 6, max.time = max(judges$tenure[event == 0]) - 6, bw.grid = 6, bw.method = \"global\", b.cor = \"none\", kern = \"epanechnikov\" ) kernel_smoothed_hazard |> tidy() |> mutate(statistic = \"hazard\") } ) #> Warning in muhaz(judges$tenure, event, min.time = min(judges$tenure[event == : minimum time > minimum Survival Time #> Warning in muhaz(judges$tenure, event, min.time = min(judges$tenure[event == : minimum time > minimum Survival Time # Combine estimates estimates_tidy <- map2_df( judges_null_models_tidy, judges_kernel_smoothed_hazards_tidy, \\(.x, .y) { bind_rows(.x, 
.y) |> mutate(statistic = factor( statistic, levels = c(\"survival\", \"cumulative_hazard\", \"hazard\")) ) }, .id = \"event\" ) ggplot(estimates_tidy, aes(x = time, y = estimate, linetype = event)) + geom_step(data = \\(.x) filter(.x, statistic != \"hazard\")) + geom_line(data = \\(.x) filter(.x, statistic == \"hazard\")) + facet_wrap(vars(statistic), ncol = 1, scales = \"free_y\") judges_model_A <- coxph( Surv(tenure, dead) ~ appointment_age + appointment_year, data = judges ) judges_model_B <- coxph( Surv(tenure, retired) ~ appointment_age + appointment_year, data = judges ) judges_model_C <- coxph( Surv(tenure, left_appointment) ~ appointment_age + appointment_year, data = judges ) # TODO: Make table."},{"path":"https://mccarthy-m-g.github.io/alda/articles/chapter-15.html","id":"late-entry-into-the-risk-set","dir":"Articles","previous_headings":"","what":"15.6 Late Entry into the Risk Set","title":"Chapter 15: Extending the Cox regression model","text":"Table 15.8 page 601:","code":"# Model A ---- # First we need to transform to a counting process format. physicians_event_times_A <- physicians |> filter(1 - censor == 1) |> pull(exit) |> unique() |> sort() # We'll use survSplit() this time around. physicians_cp_A <- physicians |> mutate(event = 1 - censor) |> survSplit( Surv(entry, exit, event) ~ ., data = _, cut = physicians_event_times_A, end = \"exit\" ) |> as_tibble() # The warning message here can be ignored. physicians_model_A <- coxph( Surv(entry, exit, event) ~ part_time + age + age:exit, data = physicians_cp_A ) #> Warning in coxph(Surv(entry, exit, event) ~ part_time + age + age:exit, : a #> variable appears on both the left and right sides of the formula # Model B ---- physicians_event_times_B <- physicians |> filter(1 - censor == 1 & entry == 0) |> pull(exit) |> unique() |> sort() physicians_cp_B <- physicians |> filter(entry == 0) |> mutate(event = 1 - censor) |> survSplit( Surv(entry, exit, event) ~ ., data = _, cut = physicians_event_times_B, end = \"exit\" ) |> as_tibble() physicians_model_B <- coxph( Surv(entry, exit, event) ~ part_time + age + age:exit, data = physicians_cp_B ) #> Warning in coxph(Surv(entry, exit, event) ~ part_time + age + age:exit, : a #> variable appears on both the left and right sides of the formula # Model C ---- physicians_cp_C <- physicians |> mutate( event = 1 - censor, entry = 0 ) |> survSplit( Surv(entry, exit, event) ~ ., data = _, cut = physicians_event_times_A, end = \"exit\" ) |> as_tibble() physicians_model_C <- coxph( Surv(entry, exit, event) ~ part_time + age + age:exit, data = physicians_cp_C ) #> Warning in coxph(Surv(entry, exit, event) ~ part_time + age + age:exit, : a #> variable appears on both the left and right sides of the formula # TODO: Make table and clean up code."},{"path":"https://mccarthy-m-g.github.io/alda/articles/chapter-15.html","id":"using-late-entrants-to-introduce-alternative-metrics-for-clocking-time","dir":"Articles","previous_headings":"15.6 Late Entry into the Risk Set","what":"15.6.2 Using Late Entrants to Introduce Alternative Metrics for Clocking Time","title":"Chapter 15: Extending the Cox regression model","text":"Table 15.9 page 604:","code":"monkeys_model_A <- coxph( Surv(sessions, 1 - censor) ~ initial_age + birth_weight + female, data = monkeys ) monkeys_model_B <- update(monkeys_model_A, Surv(end_age, 1 - censor) ~ .) # The warning message here can be ignored. monkeys_model_C <- update( monkeys_model_A, Surv(initial_age, end_age, 1 - censor) ~ . 
) #> Warning in coxph(formula = Surv(initial_age, end_age, 1 - censor) ~ initial_age #> + : a variable appears on both the left and right sides of the formula # TODO: Make table."},{"path":"https://mccarthy-m-g.github.io/alda/articles/chapter-2.html","id":"creating-a-longitudinal-data-set","dir":"Articles","previous_headings":"","what":"2.1 Creating a longitudinal data set","title":"Chapter 2: Exploring longitudinal data on change","text":"Section 2.1 Singer Willett (2003) introduce two distinct formats data organization longitudinal data—person-level format person-period format—using subset data National Youth Survey (NYS) measuring development tolerance towards deviant behaviour adolescents time relation self-reported sex exposure deviant peers (Raudenbush & Chan, 1992). Adolescents’ tolerance towards deviant behaviour based 9-item scale measuring attitudes tolerant deviant behaviour. scale administered year age 11 15 time-varying variable. However, adolescents’ self-reported sex exposure deviant peers recorded beginning study period time-invariant variables. example illustrate difference two formats using deviant_tolerance_pl deviant_tolerance_pp data sets, correspond adolescent tolerance deviant behaviour data organized person-level person-period formats, respectively.","code":""},{"path":"https://mccarthy-m-g.github.io/alda/articles/chapter-2.html","id":"the-person-level-data-set","dir":"Articles","previous_headings":"2.1 Creating a longitudinal data set","what":"The Person-Level Data Set","title":"Chapter 2: Exploring longitudinal data on change","text":"person-level format (also known wide multivariate format), person one row data multiple columns containing data measurement occasion time-varying variables. demonstrated deviant_tolerance_pl data set, person-level data frame 16 rows 8 columns: id: Participant ID. tolerance_11, tolerance_12, tolerance_13, tolerance_14, tolerance_15: Average score across 9-item scale assessing attitudes favourable deviant behaviour ages 11, 12, 13, 14, 15. item used four point scale (1 = wrong, 2 = wrong, 3 = little bit wrong, 4 = wrong ). male: Binary indicator whether adolescent male. exposure: Average score across 9-item scale assessing level exposure deviant peers. item used five point Likert score (ranging 0 = none, 4 = ). Although person-level format common cross-sectional research, four disadvantages make ill-suited longitudinal data analysis: restricts data analysis examining rank order wave--wave relationships, leading non-informative summaries tell us person changes time, even direction change. omits explicit time-indicator variable, rendering time unavailable data analysis. requires adding additional variable data set unique measurement occasion, making inefficient useless number spacing measurement occasions varies across individuals. requires adding additional set columns time-varying predictor (one column per measurement occasion), rendering unable easily handle presence time-varying predictors. Singer Willett (2003) exemplify first disadvantages postulating one might analyze person-level tolerance towards deviant behaviour data set. natural approach summarize wave--wave relationships among tolerance_11 tolerance_15 using bivariate correlations bivariate plots; however, tell us anything adolescent tolerance towards deviant behaviour changed time either individuals groups. 
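For instance, the wave-to-wave correlation matrix takes only a couple of lines (a minimal base-R sketch, assuming the deviant_tolerance_pl data frame from the alda package; the corrr-based version reproducing Table 2.1 appears below):

deviant_tolerance_pl |>
  select(starts_with("tolerance")) |>
  cor() |>
  round(2)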
Rather, weak positive correlation measurement occasions merely tells us rank order tolerance towards deviant behaviour remained relatively stable across occasions—, adolescents tolerant towards deviant behaviour one measurement occasion tended tolerant next. first disadvantage also apparent examining bivariate plots measurement occasions: way tell adolescent tolerance towards deviant behaviour changed time either individuals groups. Moreover, lack explicit time-indicator variable, isn’t possible plot person-level data set meaningful way, time series plot organized id. Considered together, disadvantages make person-level format ill-suited longitudinal data analyses. Fortunately, disadvantages person-level format can addressed simple conversion person-period format.","code":"deviant_tolerance_pl #> # A tibble: 16 × 8 #> id tolerance_11 tolerance_12 tolerance_13 tolerance_14 tolerance_15 male #> #> 1 9 2.23 1.79 1.9 2.12 2.66 0 #> 2 45 1.12 1.45 1.45 1.45 1.99 1 #> 3 268 1.45 1.34 1.99 1.79 1.34 1 #> 4 314 1.22 1.22 1.55 1.12 1.12 0 #> 5 442 1.45 1.99 1.45 1.67 1.9 0 #> 6 514 1.34 1.67 2.23 2.12 2.44 1 #> 7 569 1.79 1.9 1.9 1.99 1.99 0 #> 8 624 1.12 1.12 1.22 1.12 1.22 1 #> 9 723 1.22 1.34 1.12 1 1.12 0 #> 10 918 1 1 1.22 1.99 1.22 0 #> 11 949 1.99 1.55 1.12 1.45 1.55 1 #> 12 978 1.22 1.34 2.12 3.46 3.32 1 #> 13 1105 1.34 1.9 1.99 1.9 2.12 1 #> 14 1542 1.22 1.22 1.99 1.79 2.12 0 #> 15 1552 1 1.12 2.23 1.55 1.55 0 #> 16 1653 1.11 1.11 1.34 1.55 2.12 0 #> # ℹ 1 more variable: exposure # Table 2.1, page 20: deviant_tolerance_pl |> select(starts_with(\"tolerance\")) |> correlate(diagonal = 1) |> shave() |> fashion() #> term tolerance_11 tolerance_12 tolerance_13 tolerance_14 tolerance_15 #> 1 tolerance_11 1.00 #> 2 tolerance_12 .66 1.00 #> 3 tolerance_13 .06 .25 1.00 #> 4 tolerance_14 .14 .21 .59 1.00 #> 5 tolerance_15 .26 .39 .57 .83 1.00 deviant_tolerance_pl |> select(starts_with(\"tolerance\")) |> pairs()"},{"path":"https://mccarthy-m-g.github.io/alda/articles/chapter-2.html","id":"the-person-period-data-set","dir":"Articles","previous_headings":"2.1 Creating a longitudinal data set","what":"The Person-Period Data Set","title":"Chapter 2: Exploring longitudinal data on change","text":"person-period format (also known long univariate format), person one row data measurement occasion, participant identifier variable person, time-indicator variable measurement occasion. format, time-invariant variables identical values across measurement occasion; whereas time-varying variables potentially differing values. demonstrated deviant_tolerance_pp data set, person-period data frame 80 rows 5 columns: id: Participant ID. age: Adolescent age years. tolerance: Average score across 9-item scale assessing attitudes favourable deviant behaviour. item used four point scale (1 = wrong, 2 = wrong, 3 = little bit wrong, 4 = wrong ). male: Binary indicator whether adolescent male. exposure: Average score across 9-item scale assessing level exposure deviant peers. item used five point Likert score (ranging 0 = none, 4 = ). Although person-period data set contains information person-level data set, format data organization makes amenable longitudinal data analysis, specifically: includes explicit participant identifier variable, enabling data sorted person-specific subsets. includes explicit time-indicator variable, rendering time available data analysis, accommodating research designs number spacing measurement occasions varies across individuals. 
needs single column variable data set—whether time-varying time-invariant, outcome predictor—making trivial handle number variables. Indeed, R functions designed work data person-period format—falls larger umbrella tidy data format—due R’s vectorized nature. Wickham, Çetinkaya-Rundel, Grolemund (2023) explain, three interrelated rules make data set tidy: variable must column. observation must row. value must cell. Thus, person-period format simply special case tidy data format, distinguishes longitudinal nature requirements explicit participant identifier time-indicator variables.","code":"deviant_tolerance_pp #> # A tibble: 80 × 5 #> id age tolerance male exposure #> #> 1 9 11 2.23 0 1.54 #> 2 9 12 1.79 0 1.54 #> 3 9 13 1.9 0 1.54 #> 4 9 14 2.12 0 1.54 #> 5 9 15 2.66 0 1.54 #> 6 45 11 1.12 1 1.16 #> 7 45 12 1.45 1 1.16 #> 8 45 13 1.45 1 1.16 #> 9 45 14 1.45 1 1.16 #> 10 45 15 1.99 1 1.16 #> # ℹ 70 more rows"},{"path":"https://mccarthy-m-g.github.io/alda/articles/chapter-2.html","id":"converting-between-person-level-and-person-period-data-sets","dir":"Articles","previous_headings":"2.1 Creating a longitudinal data set","what":"Converting Between Person-Level and Person-Period Data Sets","title":"Chapter 2: Exploring longitudinal data on change","text":"Unfortunately, longitudinal data often initially stored person-level data set, meaning real analyses require least little tidying get data person-period format. reasons : Many people aren’t familiar principles tidy data—special cases like person person-period format—’s hard derive without spending lot time working longitudinal data. person-level format closely resembles familiar cross-sectional data-set format, making seemingly sensible default inexperienced analysts. Data often organized facilitate non-analytical goals, data entry, rather data analysis. Thus, essential skill aspiring longitudinal data analyst able convert person-level data set person-period data set. tidyr package provides two functions can easily convert longitudinal data set one format : pivot_longer() pivot_wider(). convert person-level data set person-period data set use pivot_longer(): person-level data, five key arguments: cols specifies columns need pivoted longer format—longitudinal data, always columns corresponding time-varying variables. argument uses tidy selection, small data science language selecting columns data frame (?tidyr_tidy_select), making simple select column time-varying variable based naming pattern. names_to names new column (columns) create information stored column names specified cols. named new column age. names_prefix removes matching text start column name—longitudinal data, always prefix time-varying variables separating variable name measurement occasion. argument uses regular expression select matching text. names_transform applies function new column (columns). converted new column age type character type integer. values_to names new column (columns) create data stored cell values. named new column tolerance. Note \"age\" \"tolerance\" quoted call pivot_longer() represent column names new variables ’re creating, rather already-existing variables data. Although longitudinal data analyses begin getting data person-period format, can occasionally useful go opposite direction. computations can made easier using person-period data set, certain functions analyses expect person-period data set; therefore, ’s helpful know untidy, transform, re-tidy data needed. 
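As a quick sketch of that round trip (names follow the alda data sets used above; tidyr's pivot_wider(), described next, does the untidying step):

deviant_tolerance_roundtrip <- deviant_tolerance_pp |>
  # Untidy: person-period to person-level.
  pivot_wider(
    names_from = age,
    names_prefix = "tolerance_",
    values_from = tolerance
  ) |>
  # Re-tidy: back to person-period.
  pivot_longer(
    cols = starts_with("tolerance_"),
    names_to = "age",
    names_prefix = "tolerance_",
    names_transform = as.integer,
    values_to = "tolerance"
  )

# The round trip should preserve the dimensions of the original data.
dim(deviant_tolerance_roundtrip) == dim(deviant_tolerance_pp)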
convert person-period data set person-level data set use dplyr::pivot_wider(): person-period data, three key arguments: names_from specifies column (columns) get name output columns —longitudinal data, always columns corresponding time-indicator variables. names_prefix adds specified string start output column name—longitudinal data, always prefix time-varying variables separating variable name measurement occasion. values_from specifies column (columns) get cell values —longitudinal data, always columns corresponding time-varying variables. learn principles tidy data pivoting works, see Data Tidying chapter R Data Science.","code":"# Figure 2.1, page 18: pivot_longer( deviant_tolerance_pl, cols = starts_with(\"tolerance_\"), names_to = \"age\", names_prefix = \"tolerance_\", names_transform = as.integer, values_to = \"tolerance\" ) #> # A tibble: 80 × 5 #> id male exposure age tolerance #> #> 1 9 0 1.54 11 2.23 #> 2 9 0 1.54 12 1.79 #> 3 9 0 1.54 13 1.9 #> 4 9 0 1.54 14 2.12 #> 5 9 0 1.54 15 2.66 #> 6 45 1 1.16 11 1.12 #> 7 45 1 1.16 12 1.45 #> 8 45 1 1.16 13 1.45 #> 9 45 1 1.16 14 1.45 #> 10 45 1 1.16 15 1.99 #> # ℹ 70 more rows pivot_wider( deviant_tolerance_pp, names_from = age, names_prefix = \"tolerance_\", values_from = tolerance ) #> # A tibble: 16 × 8 #> id male exposure tolerance_11 tolerance_12 tolerance_13 tolerance_14 #> #> 1 9 0 1.54 2.23 1.79 1.9 2.12 #> 2 45 1 1.16 1.12 1.45 1.45 1.45 #> 3 268 1 0.9 1.45 1.34 1.99 1.79 #> 4 314 0 0.81 1.22 1.22 1.55 1.12 #> 5 442 0 1.13 1.45 1.99 1.45 1.67 #> 6 514 1 0.9 1.34 1.67 2.23 2.12 #> 7 569 0 1.99 1.79 1.9 1.9 1.99 #> 8 624 1 0.98 1.12 1.12 1.22 1.12 #> 9 723 0 0.81 1.22 1.34 1.12 1 #> 10 918 0 1.21 1 1 1.22 1.99 #> 11 949 1 0.93 1.99 1.55 1.12 1.45 #> 12 978 1 1.59 1.22 1.34 2.12 3.46 #> 13 1105 1 1.38 1.34 1.9 1.99 1.9 #> 14 1542 0 1.44 1.22 1.22 1.99 1.79 #> 15 1552 0 1.04 1 1.12 2.23 1.55 #> 16 1653 0 1.25 1.11 1.11 1.34 1.55 #> # ℹ 1 more variable: tolerance_15 "},{"path":"https://mccarthy-m-g.github.io/alda/articles/chapter-2.html","id":"descriptive-analysis-of-individual-change-over-time","dir":"Articles","previous_headings":"","what":"2.2 Descriptive analysis of individual change over time","title":"Chapter 2: Exploring longitudinal data on change","text":"Section 2.2 Singer Willett (2003) use deviant_tolerance_pp data set demonstrate person-period format facilitates exploratory analyses describe individuals data set change time, revealing nature idiosyncrasies person’s temporal pattern change.","code":""},{"path":"https://mccarthy-m-g.github.io/alda/articles/chapter-2.html","id":"empirical-growth-plots","dir":"Articles","previous_headings":"2.2 Descriptive analysis of individual change over time","what":"Empirical Growth Plots","title":"Chapter 2: Exploring longitudinal data on change","text":"Empirical growth plots show, individual, sequence change time-varying variable. change can evaluated either absolute terms scale variable interest, relative terms comparison sample members. Singer Willett (2003) identify several questions helpful answer examining empirical growth plots: increasing? decreasing? increasing ? least? decreasing ? least? anyone increase decrease? anyone decrease increase? construct empirical growth plot ggplot2 package, put time-indicator x-axis, time-varying variable y-axis, facet individual separate panel. 
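A minimal sketch of such a plot (same data as the textbook figure reproduced below; the only departure is the added geom_line() connecting each adolescent's observations):

deviant_tolerance_pp |>
  ggplot(aes(x = age, y = tolerance)) +
  geom_point() +
  geom_line() +
  coord_cartesian(ylim = c(0, 4)) +
  facet_wrap(vars(id), labeller = label_both)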
data set large, Singer Willett (2003) suggest constructing empirical growth plots randomly selected subsample individuals—perhaps stratified groups defined values important predictors—rather using entire sample. task can easily accomplished using filter() function dplyr package prior plotting. example, sample four random adolescents. Note use set.seed() function prior sampling, sets state R’s random number generator: results random sample reproducible. approach can also extended randomly select subsample individuals within different strata combining group_split() function dplyr package split data list different groups, map() function purrr package apply filter() call previous example group. example, sample two random adolescent males two random adolescent females, combine filtered data frames list back together using list_rbind() function purrr package.","code":"# Figure 2.2, page 25: deviant_tolerance_empgrowth <- deviant_tolerance_pp |> ggplot(aes(x = age, y = tolerance)) + geom_point() + coord_cartesian(ylim = c(0, 4)) + facet_wrap(vars(id), labeller = label_both) deviant_tolerance_empgrowth set.seed(345) deviant_tolerance_pp |> filter(id %in% sample(unique(id), size = 4)) #> # A tibble: 20 × 5 #> id age tolerance male exposure #> #> 1 268 11 1.45 1 0.9 #> 2 268 12 1.34 1 0.9 #> 3 268 13 1.99 1 0.9 #> 4 268 14 1.79 1 0.9 #> 5 268 15 1.34 1 0.9 #> 6 442 11 1.45 0 1.13 #> 7 442 12 1.99 0 1.13 #> 8 442 13 1.45 0 1.13 #> 9 442 14 1.67 0 1.13 #> 10 442 15 1.9 0 1.13 #> 11 569 11 1.79 0 1.99 #> 12 569 12 1.9 0 1.99 #> 13 569 13 1.9 0 1.99 #> 14 569 14 1.99 0 1.99 #> 15 569 15 1.99 0 1.99 #> 16 1105 11 1.34 1 1.38 #> 17 1105 12 1.9 1 1.38 #> 18 1105 13 1.99 1 1.38 #> 19 1105 14 1.9 1 1.38 #> 20 1105 15 2.12 1 1.38 set.seed(123) deviant_tolerance_pp |> group_split(male) |> map(\\(.group) filter(.group, id %in% sample(unique(id), size = 2))) |> list_rbind() #> # A tibble: 20 × 5 #> id age tolerance male exposure #> #> 1 442 11 1.45 0 1.13 #> 2 442 12 1.99 0 1.13 #> 3 442 13 1.45 0 1.13 #> 4 442 14 1.67 0 1.13 #> 5 442 15 1.9 0 1.13 #> 6 918 11 1 0 1.21 #> 7 918 12 1 0 1.21 #> 8 918 13 1.22 0 1.21 #> 9 918 14 1.99 0 1.21 #> 10 918 15 1.22 0 1.21 #> 11 268 11 1.45 1 0.9 #> 12 268 12 1.34 1 0.9 #> 13 268 13 1.99 1 0.9 #> 14 268 14 1.79 1 0.9 #> 15 268 15 1.34 1 0.9 #> 16 514 11 1.34 1 0.9 #> 17 514 12 1.67 1 0.9 #> 18 514 13 2.23 1 0.9 #> 19 514 14 2.12 1 0.9 #> 20 514 15 2.44 1 0.9"},{"path":"https://mccarthy-m-g.github.io/alda/articles/chapter-2.html","id":"using-a-trajectory-to-summarize-each-persons-empirical-growth-record","dir":"Articles","previous_headings":"2.2 Descriptive analysis of individual change over time","what":"Using a Trajectory to Summarize Each Person’s Empirical Growth Record","title":"Chapter 2: Exploring longitudinal data on change","text":"person’s empirical growth record can summarized applying two standardized approaches: nonparametric approach uses nonparametric smooths summarize person’s pattern change time graphically without imposing specific functional form. primary advantage nonparametric approach requires assumptions. parametric approach uses separate parametric models fit person’s data summarize pattern change time. model uses common functional form trajectories (e.g., straight line, quadratic curve, etc.). primary advantage parametric approach provides numeric summaries trajectories can used exploration. 
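As a tiny illustration of the parametric idea (a sketch for a single adolescent, id 9, using plain lm(); the lmList() workflow for every adolescent follows below):

lm(tolerance ~ I(age - 11), data = filter(deviant_tolerance_pp, id == 9)) |>
  coef()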
Singer and Willett (2003) recommend using both approaches—beginning with the nonparametric approach—because examining the smoothed trajectories can help select a common functional form for the trajectories in the parametric approach.","code":""},{"path":"https://mccarthy-m-g.github.io/alda/articles/chapter-2.html","id":"the-nonparametric-approach","dir":"Articles","previous_headings":"2.2 Descriptive analysis of individual change over time > Using a Trajectory to Summarize Each Person’s Empirical Growth Record","what":"The Nonparametric Approach","title":"Chapter 2: Exploring longitudinal data on change","text":"The stat_smooth() function can be used to add a nonparametric smooth layer to the empirical growth record plot. The choice of a particular smoothing algorithm is primarily a matter of convenience, so we’ll use the default loess smoother. The span argument controls the amount of smoothing for the default loess smoother—smaller numbers producing wigglier lines and larger numbers producing smoother lines; here we choose a value that creates a smooth similar to the textbook figure. Singer and Willett (2003) recommend focusing on the elevation, shape, and tilt of the smoothed trajectories while answering questions like: Do the scores hover at the low, medium, or high end of the scale? Does everyone change over time, or do some people remain the same? Do the trajectories have an inflection point or plateau? Is the rate of change steep or shallow? What is the overall functional form of the trajectories at the group level? Is it linear or curvilinear? Smooth or step-like? Answering the last question is particularly important, because it helps select a common functional form for the trajectories in the parametric approach.","code":"# Figure 2.3, page 27: deviant_tolerance_empgrowth + stat_smooth(method = \"loess\", se = FALSE, span = .9)"},{"path":"https://mccarthy-m-g.github.io/alda/articles/chapter-2.html","id":"the-parametric-approach","dir":"Articles","previous_headings":"2.2 Descriptive analysis of individual change over time > Using a Trajectory to Summarize Each Person’s Empirical Growth Record","what":"The Parametric Approach","title":"Chapter 2: Exploring longitudinal data on change","text":"For the parametric approach, Singer and Willett (2003) suggest using the following three-step process: Estimate a within-person linear model for each person in the data set. Collect the summary statistics from each within-person linear model into a single data set. Add each person’s fitted trajectory to the empirical growth record plot. To begin, we’ll use the lmList() function from the lme4 package to fit a common linear model to each adolescent in the data set. The model formula for the lmList() function takes the form response ~ terms | group. We select a straight line as the common functional form for the trajectories, with age centred at age 11. Next, we’ll collect the summary statistics from each within-person linear model into a single data set using the tidy() function from the broom package. However, because lmList() returns a list of models, we need to apply the tidy() call to each model prior to collecting the summary statistics into a single data set. Ironically, we also need to tidy the result of tidy() to prepare the data for plotting. Finally, we can add each person’s fitted trajectory to the empirical growth record plot using the geom_abline() function. However, because we centred age in the linear model, we need to transform the scale of the x-axis of the empirical growth plot to be centred as well—otherwise ggplot2 will not be able to align the fitted trajectories correctly. To do so, we must create a custom transformation object using the new_transform() function from the scales package, which defines the transformation, its inverse, and methods for generating breaks and labels. Alternatively, if we only plan to examine the parametric trajectories graphically, the three-step process suggested by Singer and Willett (2003) can be skipped altogether by using the stat_smooth() function with the \"lm\" method.
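A minimal sketch of that shortcut, reusing the deviant_tolerance_empgrowth plot object defined earlier:

deviant_tolerance_empgrowth +
  stat_smooth(method = "lm", se = FALSE)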
approach also fits within-person linear model person data set; drawback makes awkward (though impossible) access summary statistics model.","code":"deviant_tolerance_fit <- lmList( tolerance ~ I(age - 11) | id, pool = FALSE, data = deviant_tolerance_pp ) # Table 2.2, page 30: summary(deviant_tolerance_fit) #> Call: #> Model: tolerance ~ I(age - 11) | NULL #> Data: deviant_tolerance_pp #> #> Coefficients: #> (Intercept) #> Estimate Std. Error t value Pr(>|t|) #> 9 1.902 0.25194841 7.549165 4.819462e-03 #> 45 1.144 0.13335666 8.578499 3.329579e-03 #> 268 1.536 0.26038049 5.899059 9.725771e-03 #> 314 1.306 0.15265648 8.555156 3.356044e-03 #> 442 1.576 0.20786534 7.581832 4.759898e-03 #> 514 1.430 0.13794927 10.366130 1.915399e-03 #> 569 1.816 0.02572936 70.580844 6.267530e-06 #> 624 1.120 0.04000000 28.000000 1.000014e-04 #> 723 1.268 0.08442748 15.018806 6.407318e-04 #> 918 1.000 0.30444376 3.284679 4.626268e-02 #> 949 1.728 0.24118043 7.164760 5.600382e-03 #> 978 1.028 0.31995000 3.213002 4.884420e-02 #> 1105 1.538 0.15115555 10.174949 2.022903e-03 #> 1542 1.194 0.18032748 6.621287 7.015905e-03 #> 1552 1.184 0.37355321 3.169562 5.049772e-02 #> 1653 0.954 0.13925516 6.850734 6.366647e-03 #> I(age - 11) #> Estimate Std. Error t value Pr(>|t|) #> 9 0.119 0.10285751 1.1569404 0.33105320 #> 45 0.174 0.05444263 3.1960249 0.04948216 #> 268 0.023 0.10629989 0.2163690 0.84257784 #> 314 -0.030 0.06232175 -0.4813729 0.66318168 #> 442 0.058 0.08486067 0.6834733 0.54336337 #> 514 0.265 0.05631755 4.7054602 0.01816360 #> 569 0.049 0.01050397 4.6649040 0.01859462 #> 624 0.020 0.01632993 1.2247449 0.30806801 #> 723 -0.054 0.03446738 -1.5666989 0.21516994 #> 918 0.143 0.12428864 1.1505476 0.33330784 #> 949 -0.098 0.09846150 -0.9953129 0.39294486 #> 978 0.632 0.13061904 4.8384983 0.01683776 #> 1105 0.156 0.06170899 2.5279945 0.08557441 #> 1542 0.237 0.07361839 3.2193045 0.04861002 #> 1552 0.153 0.15250246 1.0032625 0.38965538 #> 1653 0.246 0.05685068 4.3271249 0.02275586 deviant_tolerance_tidy <- deviant_tolerance_fit |> map(tidy) |> list_rbind(names_to = \"id\") |> mutate( id = as.factor(id), term = case_when( term == \"(Intercept)\" ~ \"intercept\", term == \"I(age - 11)\" ~ \"slope\" ) ) deviant_tolerance_abline <- deviant_tolerance_tidy |> select(id:estimate) |> pivot_wider(names_from = term, values_from = estimate) deviant_tolerance_abline #> # A tibble: 16 × 3 #> id intercept slope #>