9 Marginal Modeling of Correlated, Clustered Responses
9.1 Marginal Models Vs Subject-Specific Models
9.1.2 Example: Repeated Responses on Similar Survey Questions
<- read.table("http://users.stat.ufl.edu/~aa/cat/data/Abortion.dat",
Abortion header = TRUE, stringsAsFactors = TRUE)
library(tidyverse)
<-
a_wide %>%
Abortion ::pivot_wider(id_cols = c(person, gender), # do not pivot
tidyrnames_from=situation, # variable with situation number
names_prefix = "sit", # new name prefix
values_from = response) %>% #
mutate(pattern = paste(sit1, sit2, sit3, sep = ",")) %>% # make word with pattern
mutate(pattern = factor(pattern,
levels = c("1,1,1", "1,1,0", "0,1,1", "0,1,0",
"1,0,1", "1,0,0", "0,0,1","0,0,0"),
ordered = TRUE))
`Table 9.1` <- as.data.frame.matrix(xtabs( ~ a_wide$gender + a_wide$pattern ),
row = 2 ) %>%
bind_cols(Gender = c("Male", "Female")) %>% # add sex
select(Gender, everything()) # reorder columns
<- data.frame(col_keys = colnames(`Table 9.1`),
theHeader line1 = c("Gender",
rep("Sequene of Responses in Three Situations", 8)),
line2 = colnames(`Table 9.1`))
library(flextable)
flextable(`Table 9.1`, col_keys = theHeader$col_keys) %>%
set_header_df(
mapping = theHeader,
key = "col_keys"
%>%
) theme_booktabs() %>%
merge_v(part = "header") %>%
merge_h(part = "header") %>%
align(align = "center", part = "all") %>%
align(j="Gender", align= "left", part = "all") %>%
#autofit() %>%
empty_blanks() %>%
fix_border_issues() %>%
set_caption(caption = "Support (1 = yes, 0 = no) for legalized abortion in three situations, by gender") %>%
set_table_properties(width = .75, layout = "autofit")
Gender | Sequene of Responses in Three Situations | |||||||
---|---|---|---|---|---|---|---|---|
1,1,1 | 1,1,0 | 0,1,1 | 0,1,0 | 1,0,1 | 1,0,0 | 0,0,1 | 0,0,0 | |
Male | 342 | 26 | 6 | 21 | 11 | 32 | 19 | 356 |
Female | 440 | 25 | 14 | 18 | 14 | 47 | 22 | 457 |
9.2 Marginal Modeling: The Generalized Estimating Equations (GEE) Approach
9.2.3 Example: Opinion about Legalized Abortion Revisited
<- read.table("http://users.stat.ufl.edu/~aa/cat/data/Abortion.dat",
Abortion header = TRUE, stringsAsFactors = TRUE)
library(tidyverse)
%>%
Abortion filter(row_number() %in% c(1, 2, 3, n()-2, n()-1, n()))
person gender situation response
1 1 1 1 1
2 1 1 2 1
3 1 1 3 1
4 1850 0 1 0
5 1850 0 2 0
6 1850 0 3 0
<-
Abortion %>%
Abortion mutate(sit = factor(situation, levels = c(3, 1, 2)))
<-
fit.glm glm(response ~ sit + gender, family = binomial, data = Abortion)
# ML estimate for 5550 independent observations
summary(fit.glm)
Call:
glm(formula = response ~ sit + gender, family = binomial, data = Abortion)
Deviance Residuals:
Min 1Q Median 3Q Max
-1.189 -1.148 -1.125 1.207 1.231
Coefficients:
Estimate Std. Error z value Pr(>|z|)
(Intercept) -0.125408 0.055601 -2.255 0.0241 *
sit1 0.149347 0.065825 2.269 0.0233 *
sit2 0.052018 0.065843 0.790 0.4295
gender 0.003582 0.054138 0.066 0.9472
---
Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
(Dispersion parameter for binomial family taken to be 1)
Null deviance: 7689.5 on 5549 degrees of freedom
Residual deviance: 7684.2 on 5546 degrees of freedom
AIC: 7692.2
Number of Fisher Scoring iterations: 3
library(gee)
# cluster on "id" variable
<- gee(response ~ sit + gender, id = person, family = binomial,
fit.gee corstr = "independence", data = Abortion)
Beginning Cgee S-function, @(#) geeformula.q 4.13 98/01/27
running glm to get initial regression estimate
(Intercept) sit1 sit2 gender
-0.125407576 0.149347113 0.052017989 0.003582051
summary(fit.gee)
GEE: GENERALIZED LINEAR MODELS FOR DEPENDENT DATA
gee S-function, version 4.13 modified 98/01/27 (1998)
Model:
Link: Logit
Variance to Mean Relation: Binomial
Correlation Structure: Independent
Call:
gee(formula = response ~ sit + gender, id = person, data = Abortion,
family = binomial, corstr = "independence")
Summary of Residuals:
Min 1Q Median 3Q Max
-0.5068800 -0.4825552 -0.4686891 0.5174448 0.5313109
Coefficients:
Estimate Naive S.E. Naive z Robust S.E. Robust z
(Intercept) -0.125407576 0.05562131 -2.25466795 0.06758236 -1.85562596
sit1 0.149347113 0.06584875 2.26803253 0.02973865 5.02198759
sit2 0.052017989 0.06586692 0.78974374 0.02704704 1.92324166
gender 0.003582051 0.05415761 0.06614123 0.08784012 0.04077921
Estimated Scale Parameter: 1.000721
Number of Iterations: 1
Working Correlation
[,1] [,2] [,3]
[1,] 1 0 0
[2,] 0 1 0
[3,] 0 0 1
<-
fit.gee2 gee(response ~ sit + gender, id = person, family = binomial,
corstr = "exchangeable", data = Abortion)
Beginning Cgee S-function, @(#) geeformula.q 4.13 98/01/27
running glm to get initial regression estimate
(Intercept) sit1 sit2 gender
-0.125407576 0.149347113 0.052017989 0.003582051
summary(fit.gee2)
GEE: GENERALIZED LINEAR MODELS FOR DEPENDENT DATA
gee S-function, version 4.13 modified 98/01/27 (1998)
Model:
Link: Logit
Variance to Mean Relation: Binomial
Correlation Structure: Exchangeable
Call:
gee(formula = response ~ sit + gender, id = person, data = Abortion,
family = binomial, corstr = "exchangeable")
Summary of Residuals:
Min 1Q Median 3Q Max
-0.5068644 -0.4825396 -0.4687095 0.5174604 0.5312905
Coefficients:
Estimate Naive S.E. Naive z Robust S.E. Robust z
(Intercept) -0.125325730 0.06782579 -1.84775925 0.06758212 -1.85442135
sit1 0.149347107 0.02814374 5.30658404 0.02973865 5.02198729
sit2 0.052017986 0.02815145 1.84779075 0.02704703 1.92324179
gender 0.003437873 0.08790630 0.03910838 0.08784072 0.03913758
Estimated Scale Parameter: 1.000721
Number of Iterations: 2
Working Correlation
[,1] [,2] [,3]
[1,] 1.0000000 0.8173308 0.8173308
[2,] 0.8173308 1.0000000 0.8173308
[3,] 0.8173308 0.8173308 1.0000000
<-
fit.geeUn gee(response ~ sit + gender, id = person, family = binomial,
corstr = "unstructured", data = Abortion)
Beginning Cgee S-function, @(#) geeformula.q 4.13 98/01/27
running glm to get initial regression estimate
(Intercept) sit1 sit2 gender
-0.125407576 0.149347113 0.052017989 0.003582051
summary(fit.geeUn)$working.correlation
[,1] [,2] [,3]
[1,] 1.0000000 0.8248498 0.7958825
[2,] 0.8248498 1.0000000 0.8312594
[3,] 0.7958825 0.8312594 1.0000000
9.2.4 Limitations of GEE Compared to ML
library(geepack) # geepack library enables Wald tests comparing models
<-
fit geeglm(response ~ gender + factor(situation) , id = person, family = binomial,
corstr = "exchangeable", data = Abortion)
anova(fit)
Analysis of 'Wald statistic' Table
Model: binomial, link: logit
Response: response
Terms added sequentially (first to last)
Df X2 P(>|Chi|)
gender 1 0.0017 0.9675
factor(situation) 2 26.0171 2.241e-06 ***
---
Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
9.3 Marginal Modeling for Clustered Multinomial Responses
9.3.1 Example: Insomnia Study
\[\hat\beta_1 = 1.038\ (0.168),\ \hat\beta_2 = 0.034\ (0.238),\ \hat\beta_3 = 0.708\ (0.224)\]
<- read.table("http://users.stat.ufl.edu/~aa/cat/data/Insomnia.dat",
Insomnia header = TRUE, stringsAsFactors = TRUE)
%>%
Insomnia filter(row_number() %in% c(1, 2, n()-1, n()))
case treat occasion response
1 1 1 0 1
2 1 1 1 1
3 127 0 0 4
4 127 0 1 4
library(multgee)
<- ordLORgee(response ~ occasion + treat + occasion:treat, id = case,
fit LORstr = "independence", data = Insomnia)
summary(fit)
GEE FOR ORDINAL MULTINOMIAL RESPONSES
version 1.6.0 modified 2017-07-10
Link : Cumulative logit
Local Odds Ratios:
Structure: independence
call:
ordLORgee(formula = response ~ occasion + treat + occasion:treat,
data = Insomnia, id = case, LORstr = "independence")
Summary of residuals:
Min. 1st Qu. Median Mean 3rd Qu. Max.
-0.3804488 -0.2952858 -0.1886117 0.0002019 -0.0938856 0.9061144
Number of Iterations: 1
Coefficients:
Estimate san.se san.z Pr(>|san.z|)
beta10 -2.26709 0.21876 -10.3633 < 2e-16 ***
beta20 -0.95146 0.18092 -5.2591 < 2e-16 ***
beta30 0.35174 0.17842 1.9714 0.04868 *
occasion 1.03808 0.16759 6.1943 < 2e-16 ***
treat 0.03361 0.23844 0.1410 0.88790
occasion:treat 0.70776 0.24352 2.9064 0.00366 **
---
Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
Local Odds Ratios Estimates:
[,1] [,2] [,3] [,4] [,5] [,6]
[1,] 0 0 0 1 1 1
[2,] 0 0 0 1 1 1
[3,] 0 0 0 1 1 1
[4,] 1 1 1 0 0 0
[5,] 1 1 1 0 0 0
[6,] 1 1 1 0 0 0
p-value of Null model: < 0.0001
9.4 Transitional Modeling, Given the Past
9.4.1 Transitional Models with Explanatory Variables
\[\mathrm{logit}[P(Y_t = 1)] = \alpha + \beta y_{t-1} + \beta_1 x_{1t} + \cdots + \beta_p x_{pt},\]
9.4.2 Example: Respiratory Illness and Maternal Smoking
\[\mathrm{logit}[P(Y_t = 1)] = \alpha + \beta y_{t-1} + \beta_1 s + \beta_2 t,\ t = 8, 9, 10\]
<- read.table("http://users.stat.ufl.edu/~aa/cat/data/Respiratory.dat",
Respiratory header = TRUE, stringsAsFactors = TRUE)
Respiratory
no yes previous s t
1 283 17 0 0 10
2 30 20 1 0 10
3 140 12 0 1 10
4 21 14 1 1 10
5 274 24 0 0 9
6 26 26 1 0 9
7 134 14 0 1 9
8 18 21 1 1 9
9 266 28 0 0 8
10 32 24 1 0 8
11 134 22 0 1 8
12 14 17 1 1 8
<- glm(yes/(yes+no) ~ previous + s + t, family = binomial,
fit weights = no + yes, data = Respiratory)
summary(fit)
Call:
glm(formula = yes/(yes + no) ~ previous + s + t, family = binomial,
data = Respiratory, weights = no + yes)
Deviance Residuals:
Min 1Q Median 3Q Max
-0.98143 -0.30047 -0.09456 0.36684 0.96039
Coefficients:
Estimate Std. Error z value Pr(>|z|)
(Intercept) -0.29256 0.84603 -0.346 0.7295
previous 2.21107 0.15819 13.977 <2e-16 ***
s 0.29596 0.15634 1.893 0.0583 .
t -0.24281 0.09466 -2.565 0.0103 *
---
Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
(Dispersion parameter for binomial family taken to be 1)
Null deviance: 207.2212 on 11 degrees of freedom
Residual deviance: 3.1186 on 8 degrees of freedom
AIC: 64.392
Number of Fisher Scoring iterations: 3
::Anova(fit) car
Analysis of Deviance Table (Type II tests)
Response: yes/(yes + no)
LR Chisq Df Pr(>Chisq)
previous 192.331 1 < 2e-16 ***
s 3.546 1 0.05969 .
t 6.649 1 0.00992 **
---
Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
9.4.3 Group Comparisons Treating Initial Response as Covariate
\[\begin{equation} \mathrm{logit}[P(Y_2 \le j)] = \alpha_j + \beta_1 x + \beta_2 y_1 \tag{25} \end{equation}\]
<- read.table("http://users.stat.ufl.edu/~aa/cat/data/Insomnia2.dat",
Insomnia2 header = TRUE, stringsAsFactors = TRUE)
Insomnia2
treatment initial follow1 follow2 follow3 follow4
1 1 10 7 4 1 0
2 1 25 11 5 2 2
3 1 45 13 23 3 1
4 1 75 9 17 13 8
5 0 10 7 4 2 1
6 0 25 14 5 1 0
7 0 45 6 9 18 2
8 0 75 4 11 14 22
library(VGAM)
<- vglm(cbind(follow1, follow2, follow3, follow4) ~ treatment + initial,
fit family = cumulative(parallel = TRUE), data = Insomnia2)
summary(fit)
Call:
vglm(formula = cbind(follow1, follow2, follow3, follow4) ~ treatment +
initial, family = cumulative(parallel = TRUE), data = Insomnia2)
Coefficients:
Estimate Std. Error z value Pr(>|z|)
(Intercept):1 0.581161 0.318824 1.823 0.068330 .
(Intercept):2 2.277442 0.355043 6.415 1.41e-10 ***
(Intercept):3 3.750952 0.399804 9.382 < 2e-16 ***
treatment 0.884679 0.245552 3.603 0.000315 ***
initial -0.042106 0.005796 -7.264 3.75e-13 ***
---
Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
Names of linear predictors: logitlink(P[Y<=1]), logitlink(P[Y<=2]),
logitlink(P[Y<=3])
Residual deviance: 34.7689 on 19 degrees of freedom
Log-likelihood: -50.5177 on 19 degrees of freedom
Number of Fisher scoring iterations: 5
No Hauck-Donner effect found in any of the estimates
Exponentiated coefficients:
treatment initial
2.4222064 0.9587682
detach("package:multgee",
unload = TRUE)
detach("package:VGAM",
unload = TRUE) # contains function s that conflicts with gam