8 Models for Matched Pairs
<- read.table("http://users.stat.ufl.edu/~aa/cat/data/Envir_opinions.dat",
Opinions header = TRUE, stringsAsFactors = TRUE)
# Make contingency table
<- as.matrix(addmargins(xtabs(~y1 + y2, data = Opinions)))
tab
library(tibble)
# Add label as the first column and variable names
`Table 8.1` <- tibble(`Pay Higher Taxes` = c("Yes", "No", "Total"),
                      Yes = tab[,1], No = tab[,2], Total = tab[,3])
# horrible kludge: append cell-probability labels to the displayed counts
`Table 8.1`$Yes <- paste(`Table 8.1`$Yes, c(" (pi11)", " (pi21)", ""))
`Table 8.1`$No <- paste(`Table 8.1`$No, c(" (pi12)", " (pi22)", ""))
library(flextable)
# Two-row header: line1 spans "Cut Living Standards" over the Yes/No columns.
my_header <- data.frame(
  col_keys = colnames(`Table 8.1`),
  line1 = c("Pay Higher Taxes", rep("Cut Living Standards", 2), "Total"),
  line2 = colnames(`Table 8.1`)
)
flextable(`Table 8.1`, col_keys = my_header$col_keys) %>%
  set_header_df(
    mapping = my_header,
    key = "col_keys"
  ) %>%
  theme_booktabs() %>%
  autofit(part = "all") %>%
  align(align = "center", part = "all") %>%
  merge_h(part = "header") %>%
  merge_v(part = "header") %>%
  merge_h(part = "body") %>%
  merge_v(part = "body") %>%
  align_nottext_col(align = "center") %>%
  set_caption(caption = "Table 8.1")
Pay Higher Taxes | Cut Living Standards | Total | |
---|---|---|---|
Yes | No | ||
Yes | 227 (pi11) | 132 (pi12) | 359 |
No | 107 (pi21) | 678 (pi22) | 785 |
Total | 334 | 810 | 1,144 |
8.1 Comparing Dependent Proportions for Binary Matched Pairs
8.1.1 McNemar Test Comparing Marginal Proportions
\[H_0: P(Y_1 = 1) = P(Y_2 = 1), \mathrm{\ or\ equivalently\ } H_0: \pi_{12} = \pi_{21}.\]
\[\begin{equation} z = \frac{n_{12}-(\frac{1}{2}n^*)}{\sqrt{n^*(\frac{1}{2})(\frac{1}{2})}}= \frac{n_{12}- n_{21}}{\sqrt{n_{12}+n_{21}}} \tag{20} \end{equation}\]
<- read.table("http://users.stat.ufl.edu/~aa/cat/data/Envir_opinions.dat",
Opinions header = TRUE, stringsAsFactors = TRUE)
library(dplyr)
# Data file has 1144 lines, one for each of 1144 people.
# Each person has two binary responses (y1 and y2)
%>%
Opinions filter(row_number() %in% c(1, 2, n()))
person y1 y2
1 1 1 1
2 2 1 1
3 1144 2 2
# 2x2 table of (y1, y2) pairs; off-diagonal cells drive the McNemar test.
tab <- xtabs(~y1 + y2, data = Opinions)
tab
y2
y1 1 2
1 227 132
2 107 678
# McNemar test of marginal homogeneity for binary matched pairs:
# H0: pi_12 = pi_21 (equivalently P(Y1 = 1) = P(Y2 = 1)).
# Don't use continuity correction, which is too conservative.
mcnemar.test(tab, correct = FALSE)
McNemar's Chi-squared test
data: tab
McNemar's chi-squared = 2.6151, df = 1, p-value = 0.1059
library(PropCIs)
# specific to how to round
currentDigits <- unlist(options("digits")) # save current rounding
options(digits = 1)
# 95% Wald CI for difference of marginal probabilities
diffpropci.Wald.mp(107, 132, 1144, 0.95) # (n21, n12, n, confidence level)
data:
95 percent confidence interval:
-0.005 0.048
sample estimates:
[1] 0.02
# Same 95% Wald CI as above, but taking n21, n12, n from the table
# instead of hard-coding the counts.
diffpropci.Wald.mp(tab[2,1], tab[1,2], sum(tab), 0.95)
data:
95 percent confidence interval:
-0.005 0.048
sample estimates:
[1] 0.02
# 95% score CI for difference of marginal probabilities
# (arguments: n21 = tab[2,1], n12 = tab[1,2], n = sum(tab), conf level)
scoreci.mp(tab[2,1], tab[1,2], sum(tab), 0.95)
data:
95 percent confidence interval:
-0.005 0.048
# Restore the digits option saved before the CI computations.
options(digits = currentDigits) # reset rounding
8.2 Marginal Models and Subject-Specific Models for Matched Pairs
8.2.1 Marginal Models for Marginal Proportions
\[P(Y_1 = 1) = \alpha + \delta,\ P(Y_2= 1) = \alpha,\]
\[P(Y_1 = 1) = \alpha + \delta x_i\]
\[\begin{equation} \mathrm{logit}[P(Y_t = 1)] = \alpha + \beta x_i. \tag{21} \end{equation}\]
8.2.2 Example: Environmental Opinions Revisited
<- read.table("http://users.stat.ufl.edu/~aa/cat/data/Opinions.dat",
Opinions header = TRUE, stringsAsFactors = TRUE)
# Data file has 1144 lines, one for each of 1144 people.
# Each person has two binary responses (y1 and y2)
%>%
Opinions filter(row_number() %in% c(1, 2, 3, 4, n()-1, n()))
person question y
1 1 1 1
2 1 0 1
3 2 1 1
4 2 0 1
5 1144 1 0
6 1144 0 0
library(gee)
# id identifies variable on which observe y1, y2
# Identity link so the question coefficient is a difference of proportions.
fit <- gee(y ~ question, id = person, family = binomial(link = "identity"),
           data = Opinions)
Beginning Cgee S-function, @(#) geeformula.q 4.13 98/01/27
running glm to get initial regression estimate
(Intercept) question
0.29195804 0.02185315
# Robust (sandwich) SEs account for within-person correlation of y1, y2.
summary(fit) # question parameter for identity link is difference of proportions
GEE: GENERALIZED LINEAR MODELS FOR DEPENDENT DATA
gee S-function, version 4.13 modified 98/01/27 (1998)
Model:
Link: Identity
Variance to Mean Relation: Binomial
Correlation Structure: Independent
Call:
gee(formula = y ~ question, id = person, data = Opinions, family = binomial(link = "identity"))
Summary of Residuals:
Min 1Q Median 3Q Max
-0.3138112 -0.3138112 -0.2919580 0.6861888 0.7080420
Coefficients:
Estimate Naive S.E. Naive z Robust S.E. Robust z
(Intercept) 0.29195804 0.01344828 21.709701 0.0134424 21.719196
question 0.02185315 0.01921587 1.137245 0.0134982 1.618967
Estimated Scale Parameter: 1.000875
Number of Iterations: 1
Working Correlation
[,1] [,2]
[1,] 1 0
[2,] 0 1
# Same marginal model with a logit link; coefficient is a log odds ratio.
fit2 <- gee(y ~ question, id = person, family = binomial(link = logit),
            data = Opinions)
Beginning Cgee S-function, @(#) geeformula.q 4.13 98/01/27
running glm to get initial regression estimate
(Intercept) question
-0.8858933 0.1035319
# exp(coefficient for question) estimates the marginal odds ratio.
summary(fit2) # question parameter for logit link is log odds ratio
GEE: GENERALIZED LINEAR MODELS FOR DEPENDENT DATA
gee S-function, version 4.13 modified 98/01/27 (1998)
Model:
Link: Logit
Variance to Mean Relation: Binomial
Correlation Structure: Independent
Call:
gee(formula = y ~ question, id = person, data = Opinions, family = binomial(link = logit))
Summary of Residuals:
Min 1Q Median 3Q Max
-0.3138112 -0.3138112 -0.2919580 0.6861888 0.7080420
Coefficients:
Estimate Naive S.E. Naive z Robust S.E. Robust z
(Intercept) -0.8858933 0.06505597 -13.617401 0.06502753 -13.623357
question 0.1035319 0.09107816 1.136737 0.06397794 1.618244
Estimated Scale Parameter: 1.000875
Number of Iterations: 1
Working Correlation
[,1] [,2]
[1,] 1 0
[2,] 0 1
8.2.3 Subject-Specific and Population-Averaging Tables
# Subject-specific 2x2 table for one matched pair (contributes to n11 = 227).
`Table 8.2` <- data.frame(Question = c("yi1: Pay higher taxes?",
                                       "yi2: Cut living standards?"),
                          Yes = c(1, 1),
                          No = c(0, 0))
# Two-row header spanning "Response" over the Yes/No columns.
theHeader <- data.frame(col_keys = colnames(`Table 8.2`),
                        line1 = c("Question", rep("Response", 2)),
                        line2 = colnames(`Table 8.2`))
library(flextable)
library(dplyr)
flextable(`Table 8.2`, col_keys = theHeader$col_keys) %>%
  set_header_df(
    mapping = theHeader,
    key = "col_keys"
  ) %>%
  theme_booktabs() %>%
  merge_v(part = "header") %>%
  merge_h(part = "header") %>%
  align(align = "center", part = "all") %>%
  #autofit() %>%
  empty_blanks() %>%
  fix_border_issues() %>%
  set_caption(caption = "Table 8.2: Representation of subject-specific table for matched pair contributing to count n_11 = 227 in Table 8.1.") %>%
  set_table_properties(width = .75, layout = "autofit")
Question | Response | |
---|---|---|
Yes | No | |
yi1: Pay higher taxes? | 1 | 0 |
yi2: Cut living standards? | 1 | 0 |
8.2.4 Conditional Logistic Regression for Matched-Pairs*
\[\begin{equation} \mathrm{logit}[P(Y_{it} = 1)] = \alpha_i + \beta x_{it},\ t = 1,\ 2, \tag{22} \end{equation}\]
\[P(Y_{i1} = 1) = \frac{\mathrm{exp}(\alpha_i + \beta)}{1 + \mathrm{exp}(\alpha_i + \beta)},\ P(Y_{i2} = 1) = \frac{\mathrm{exp}(\alpha_i)}{1 + \mathrm{exp}(\alpha_i)}.\]
8.2.5 Logistic Regression for Matched Case-Control Studies*
library(tidyverse)
# Case-control pairs: rows = control's smoking status, cols = case's status.
`Table 8.3` <- tibble(`Normal Birth Weight (Controls)` =
                        c("Nonsmokers",
                          "Smokers"),
                      Nonsmokers = c(159, 8),
                      Smokers = c(22, 14))
# "blank" column key inserts a visual gap between stub and data columns.
theHeader <- data.frame(col_keys = c("Normal Birth Weight (Controls)", "blank", "Nonsmokers", "Smokers"),
                        line1 = c("Normal Birth Weight (Controls)", "", rep("Low Birth Weight (Cases)", 2)),
                        line2 = c("Normal Birth Weight (Controls)", "blank", "Nonsmokers", "Smokers"))
library(flextable)
library(dplyr)
library(officer)
big_border <- fp_border(color = "black", width = 2)

flextable(`Table 8.3`, col_keys = theHeader$col_keys) %>%
  set_header_df(
    mapping = theHeader,
    key = "col_keys"
  ) %>%
  theme_booktabs() %>%
  merge_v(part = "header") %>%
  merge_h(part = "header") %>%
  align(align = "center", part = "all") %>%
  #autofit() %>%
  empty_blanks() %>%
  fix_border_issues() %>%
  set_caption(caption = "Table 8.3") %>%
  set_table_properties(width = .75, layout = "autofit") %>%
  hline_top(part = "header", border = big_border) %>%
  hline_bottom(part = "body", border = big_border)
Normal Birth Weight (Controls) | Low Birth Weight (Cases) | ||
---|---|---|---|
Nonsmokers | Smokers | ||
Nonsmokers | 159 | 22 | |
Smokers | 8 | 14 |
library(tidyverse)
# Four possible case/control smoking patterns (a-d) for a matched pair.
`Table 8.4` <- tibble(`Smoker` =
                        c("No",
                          "Yes"),
                      aCase = c(1, 0),
                      aCon = c(1, 0),
                      bCase = c(0, 1),
                      bCon = c(1, 0),
                      cCase = c(1, 0),
                      cCon = c(0, 1),
                      dCase = c(0, 1),
                      dCon = c(0, 1))
# blank1-blank4 keys create gaps between the four pattern groups.
theHeader <- data.frame(col_keys = c("Smoker",
                                     "aCase",
                                     "aCon",
                                     "blank1",
                                     "bCase",
                                     "bCon",
                                     "blank2",
                                     "cCase",
                                     "cCon",
                                     "blank3",
                                     "dCase",
                                     "dCon",
                                     "blank4"),
                        line1 = c("Smoker", rep("a", 2), "", rep("b", 2), "",
                                  rep("c", 2), "", rep("d", 2), ""),
                        line2 = c("Smoker", rep(c("Case", "Control", " "), 4)))
library(flextable)
library(dplyr)
flextable(`Table 8.4`, col_keys = theHeader$col_keys) %>%
  set_header_df(
    mapping = theHeader,
    key = "col_keys"
  ) %>%
  theme_booktabs() %>%
  merge_v(part = "header") %>%
  merge_h(part = "header") %>%
  align(align = "center", part = "all") %>%
  #autofit() %>%
  empty_blanks() %>%
  fix_border_issues() %>%
  set_caption(caption = "Table 8.4") %>%
  set_table_properties(width = .75, layout = "autofit") %>%
  hline_top(part = "header", border = big_border) %>%
  hline_top(part = "body", border = big_border) %>%
  hline_bottom(part = "body", border = big_border)
Smoker | a | b | c | d | ||||||||
---|---|---|---|---|---|---|---|---|---|---|---|---|
Case | Control | Case | Control | Case | Control | Case | Control | |||||
No | 1 | 1 | 0 | 1 | 1 | 0 | 0 | 0 | ||||
Yes | 0 | 0 | 1 | 0 | 0 | 1 | 1 | 1 |
8.3 Comparing Proportions for Nominal Matched-Pairs Responses
\[P(Y_1 = i) = P(Y_2 = i)\ \mathrm{for}\ i = 1, \dots, c,\]
8.3.1 Marginal Homogeneity for Baseline-Category Logit Models
\[\mathrm{log}\left[\frac{P(Y_1 = j)}{P(Y_1 = c)}\right] = \alpha_j + \beta_j,\ \mathrm{log}\left[\frac{P(Y_2 = j)}{P(Y_2 = c)}\right] = \alpha_j\]

8.3.2 Example: Coffee Brand Market Share
<- read.table("http://users.stat.ufl.edu/~aa/cat/data/Coffee.dat",
Coffee header = TRUE, stringsAsFactors = TRUE)
library(dplyr)
library(tidyr)
<- Coffee %>%
coffee_wide ::pivot_wider(id_cols = person, names_from=purchase, names_prefix = "y",
tidyrvalues_from = y)
# Make contingency table
<- as.matrix(addmargins(xtabs(~y1 + y0, data = coffee_wide)))
tab
library(tibble)
# Add label as the first column and variable names
`Table 8.5` <- tibble(`First Purchase` = c("High Point", "Taster's Choice",
"Sanka", "Nescafe", "Brim", "Total"),
`High Point` = tab[, 1], `Taster's Choice` = tab[,2],
`Sanka` = tab[, 3], `Nescafe` = tab[,4], `Brim` = tab[,5],
Total = tab[, 6])
<- data.frame(col_keys = c("First Purchase", "Blank", "High Point",
theHeader "Taster's Choice", "Sanka", "Nescafe",
"Brim", "Total"),
line1 = c("First Purchase", "",
rep("Second Purchase", 6)),
line2 = c("First Purchase", "", "High Point",
"Taster's Choice", "Sanka", "Nescafe",
"Brim", "Total"))
library(flextable)
library(dplyr)
library(officer)
= fp_border(color="black", width = 2)
big_border
flextable(`Table 8.5`, col_keys = theHeader$col_keys) %>%
set_header_df(
mapping = theHeader,
key = "col_keys"
%>%
) theme_booktabs() %>%
merge_v(part = "header") %>%
merge_h(part = "header") %>%
align(align = "center", part = "all") %>%
align(j="First Purchase", align= "left", part = "all") %>%
#autofit() %>%
empty_blanks() %>%
fix_border_issues() %>%
set_caption(caption = "Table 8.5") %>%
set_table_properties(width = .75, layout = "autofit") %>%
hline_top(part="header", border = big_border) %>%
hline_bottom(part="body", border = big_border)
First Purchase | Second Purchase | ||||||
---|---|---|---|---|---|---|---|
High Point | Taster's Choice | Sanka | Nescafe | Brim | Total | ||
High Point | 93 | 17 | 44 | 7 | 10 | 171 | |
Taster's Choice | 9 | 46 | 11 | 0 | 9 | 75 | |
Sanka | 17 | 11 | 155 | 9 | 12 | 204 | |
Nescafe | 6 | 4 | 9 | 15 | 2 | 36 | |
Brim | 10 | 4 | 12 | 2 | 27 | 55 | |
Total | 135 | 82 | 231 | 33 | 60 | 541 |
Using GEE methodology, we can fit the baseline-category logit model, as shown in the following R
output.
<- read.table("http://users.stat.ufl.edu/~aa/cat/data/Coffee.dat",
Coffee header = TRUE, stringsAsFactors = TRUE)
# subject-specific data file, purchase is 1 for first purchase 0 for second
%>%
Coffee filter(row_number() %in% c(1, 2, 3, 4, n()-1, n()))
person purchase y
1 1 1 1
2 1 0 1
3 2 1 1
4 2 0 1
5 541 1 5
6 541 0 5
library(multgee) # package for multinomial GEE analysis
# Marginal baseline-category logit model; working independence structure.
fit <- nomLORgee(y ~ purchase, id = person, LORstr = "independence",
                 data = Coffee)
# san.se = robust SE
# beta01 is alpha1 in our notation
summary(fit) # nomLORgee uses baseline category logits for nominal response
GEE FOR NOMINAL MULTINOMIAL RESPONSES
version 1.6.0 modified 2017-07-10
Link : Baseline Category Logit
Local Odds Ratios:
Structure: independence
Homogenous scores: TRUE
call:
nomLORgee(formula = y ~ purchase, data = Coffee, id = person,
LORstr = "independence")
Summary of residuals:
Min. 1st Qu. Median Mean 3rd Qu. Max.
-0.4270 -0.2495 -0.1386 0.0000 -0.0610 0.9390
Number of Iterations: 1
Coefficients:
Estimate san.se san.z Pr(>|san.z|)
beta10 0.81093 0.15516 5.2265 < 2e-16 ***
purchase:1 0.32340 0.16830 1.9215 0.05466 .
beta20 0.31237 0.16989 1.8387 0.06596 .
purchase:2 -0.00222 0.18662 -0.0119 0.99051
beta30 1.34807 0.14490 9.3036 < 2e-16 ***
purchase:3 -0.03729 0.15807 -0.2359 0.81353
beta40 -0.59784 0.21672 -2.7585 0.00581 **
purchase:4 0.17402 0.23531 0.7396 0.45957
---
Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
Local Odds Ratios Estimates:
[,1] [,2] [,3] [,4] [,5] [,6] [,7] [,8]
[1,] 0 0 0 0 1 1 1 1
[2,] 0 0 0 0 1 1 1 1
[3,] 0 0 0 0 1 1 1 1
[4,] 0 0 0 0 1 1 1 1
[5,] 1 1 1 1 0 0 0 0
[6,] 1 1 1 1 0 0 0 0
[7,] 1 1 1 1 0 0 0 0
[8,] 1 1 1 1 0 0 0 0
p-value of Null model: < 0.0001
fit0 <- nomLORgee(y ~ 1, id = person, LORstr = "independence", data = Coffee)

# Model under H_0: y ~ 1 null model (marginal homogeneity)
# Wald test comparing the null model against the purchase-effect model.
waldts(fit0, fit)
Goodness of Fit based on the Wald test
Model under H_0: y ~ 1
Model under H_1: y ~ purchase
Wald Statistic = 12.4869, df = 4, p-value = 0.0141
# Cochran-Mantel-Haenszel test of purchase vs y, stratified by person.
with(Coffee, mantelhaen.test(purchase, y, person))
Cochran-Mantel-Haenszel test
data: purchase and y and person
Cochran-Mantel-Haenszel M^2 = 12.291, df = 4, p-value = 0.01531
8.3.4 Symmetry and Quasi-Symmetry Models for Square Contingency Tables
\[\pi_{ij} = \pi_{ji}\] \[\mathrm{log}(\pi_{ij}/\pi_{ji}) = 0\ \mathrm{for\ all}\ i \ \mathrm{and}\ j.\]
\[r_{ij} = (n_{ij} - n_{ji})/\sqrt{n_{ij} + n_{ji}}.\]
\[\begin{equation} \mathrm{log}(\pi_{ij}/\pi_{ji}) = \beta_i - \beta_j\ \mathrm{for\ all}\ i \ \mathrm{and}\ j. \tag{23} \end{equation}\]
8.4 Comparing Proportions for Ordinal Matched-Pairs Responses
8.4.1 Marginal Homogeneity and Cumulative Logit Marginal Model
\[\mathrm{logit}[P(Y_1 \le j)] = \alpha_j + \beta,\ \mathrm{logit}[P(Y_2 \le j)] = \alpha_j\]
8.4.2 Example: Recycle or Drive Less to Help the Environment?
<- read.table("http://users.stat.ufl.edu/~aa/cat/data/Envir.dat",
Envir header = TRUE, stringsAsFactors = TRUE)
library(dplyr)
library(tidyr)
<- Envir %>%
envir_wide ::pivot_wider(id_cols = person, names_from=question, names_prefix = "y",
tidyrvalues_from = y)
# Make contingency table
<- as.matrix(xtabs(~y1 + y0, data = envir_wide))
tab
library(tibble)
# Add label as the first column and variable names
`Table 8.6` <- tibble(`Recycle` = c("Always", "Often", "Sometimes", "Never"),
`Always` = tab[, 1], `Often` = tab[,2],
`Sometimes` = tab[, 3], `Never` = tab[,4])
<- data.frame(col_keys = c("Recycle", "Blank", "Always", "Often",
theHeader "Sometimes", "Never"),
line1 = c("Recycle", "", rep("Drive Less", 4)),
line2 = c("Recycle", "", "Always", "Often",
"Sometimes", "Never"))
library(flextable)
library(dplyr)
library(officer)
= fp_border(color="black", width = 2)
big_border
flextable(`Table 8.6`, col_keys = theHeader$col_keys) %>%
set_header_df(
mapping = theHeader,
key = "col_keys"
%>%
) theme_booktabs() %>%
merge_v(part = "header") %>%
merge_h(part = "header") %>%
align(align = "center", part = "all") %>%
align(j="Recycle", align= "left", part = "all") %>%
autofit() %>%
empty_blanks() %>%
fix_border_issues() %>%
set_caption(caption = "Table 8.6") %>%
#set_table_properties(width = .75, layout = "autofit") %>%
hline_top(part="header", border = big_border) %>%
hline_bottom(part="body", border = big_border)
Recycle | Drive Less | ||||
---|---|---|---|---|---|
Always | Often | Sometimes | Never | ||
Always | 12 | 43 | 163 | 233 | |
Often | 4 | 21 | 99 | 185 | |
Sometimes | 4 | 8 | 77 | 230 | |
Never | 0 | 1 | 18 | 132 |
<- read.table("http://users.stat.ufl.edu/~aa/cat/data/Envir.dat",
Envir header = TRUE, stringsAsFactors = TRUE)
%>%
Envir filter(row_number() %in% c(1, 2, n()-1, n()))
person question y
1 1 1 1
2 1 0 1
3 1230 1 4
4 1230 0 4
library(multgee)
# Marginal cumulative logit model for the ordinal matched-pairs responses.
fit <- ordLORgee(y ~ question, id = person, LORstr = "independence",
                 data = Envir)
summary(fit)
GEE FOR ORDINAL MULTINOMIAL RESPONSES
version 1.6.0 modified 2017-07-10
Link : Cumulative logit
Local Odds Ratios:
Structure: independence
call:
ordLORgee(formula = y ~ question, data = Envir, id = person,
LORstr = "independence")
Summary of residuals:
Min. 1st Qu. Median Mean 3rd Qu. Max.
-0.354916 -0.264742 -0.059210 -0.002021 -0.033859 0.966141
Number of Iterations: 1
Coefficients:
Estimate san.se san.z Pr(>|san.z|)
beta10 -3.35111 0.08289 -40.4287 < 2.2e-16 ***
beta20 -2.27673 0.07430 -30.6424 < 2.2e-16 ***
beta30 -0.58488 0.05882 -9.9443 < 2.2e-16 ***
question 2.75361 0.08147 33.7985 < 2.2e-16 ***
---
Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
Local Odds Ratios Estimates:
[,1] [,2] [,3] [,4] [,5] [,6]
[1,] 0 0 0 1 1 1
[2,] 0 0 0 1 1 1
[3,] 0 0 0 1 1 1
[4,] 1 1 1 0 0 0
[5,] 1 1 1 0 0 0
[6,] 1 1 1 0 0 0
p-value of Null model: < 0.0001
8.4.3 An Ordinal Quasi-Symmetry Model *
\[\begin{equation} \mathrm{log}(\pi_{ij}/\pi_{ji}) = \beta(u_j - u_i). \tag{24} \end{equation}\]
8.4.4 Example: Recycle or Drive Less Revisited?
<- read.table("http://users.stat.ufl.edu/~aa/cat/data/Environment.dat",
Envir header = TRUE, stringsAsFactors = TRUE)
# nij and nji are opposite cell counts in Table 8.6
# x = j-1 = distance between categories, 6 pairs of opposite cells
Envir
always often sometimes never x nij nji
1 1 -1 0 0 1 43 4
2 1 0 -1 0 2 163 4
3 1 0 0 -1 3 233 0
4 0 1 -1 0 1 99 8
5 0 1 0 -1 2 185 1
6 0 0 1 -1 1 230 18
# Ordinal quasi-symmetry via binomial logit on the 6 opposite-cell pairs.
# Use full argument name `weights` (original `weight` relied on partial matching).
fit <- glm(nij/(nij+nji) ~ -1 + x, family = binomial, weights = nij + nji,
           data = Envir)
summary(fit) # ordinal quasi symmetry; -1 in model sets intercept = 0
Call:
glm(formula = nij/(nij + nji) ~ -1 + x, family = binomial, data = Envir,
weights = nij + nji)
Deviance Residuals:
1 2 3 4 5 6
-0.03567 -1.82023 0.59541 0.33794 0.46512 0.64368
Coefficients:
Estimate Std. Error z value Pr(>|z|)
x 2.3936 0.1508 15.88 <2e-16 ***
---
Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
(Dispersion parameter for binomial family taken to be 1)
Null deviance: 1106.1240 on 6 degrees of freedom
Residual deviance: 4.4139 on 5 degrees of freedom
AIC: 23.35
Number of Fisher Scoring iterations: 4
# Unordered quasi-symmetry: one indicator per category; `never` is aliased
# (singular), so glm reports NA for it. `weights` spelled out in full.
fit.QS <- glm(nij/(nij+nji) ~ -1 + always + often + sometimes + never,
              family = binomial, weights = nij + nji, data = Envir)
summary(fit.QS) # quasi symmetry
Call:
glm(formula = nij/(nij + nji) ~ -1 + always + often + sometimes +
never, family = binomial, data = Envir, weights = nij + nji)
Deviance Residuals:
1 2 3 4 5 6
0.7689 -1.1439 0.6760 0.4572 0.3006 -0.1381
Coefficients: (1 not defined because of singularities)
Estimate Std. Error z value Pr(>|z|)
always 6.9269 0.4708 14.71 <2e-16 ***
often 4.9332 0.3617 13.64 <2e-16 ***
sometimes 2.5817 0.2386 10.82 <2e-16 ***
never NA NA NA NA
---
Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
(Dispersion parameter for binomial family taken to be 1)
Null deviance: 1106.1240 on 6 degrees of freedom
Residual deviance: 2.6751 on 3 degrees of freedom
AIC: 25.611
Number of Fisher Scoring iterations: 4
8.5 Analyzing Rater Agreement *
8.5.1 Example: Agreement on Carcinoma Diagnosis
<- read.table("http://users.stat.ufl.edu/~aa/cat/data/Pathologists.dat",
Pathology header = TRUE, stringsAsFactors = TRUE)
library(dplyr)
library(tibble)
# Make contingency table
<- matrix(Pathology$count, ncol = 4,byrow = T) %>%
mat addmargins()
colnames(mat) <- c("V1", "V2", "V3", "V4", "Sum")
`Table 8.7` <-
%>%
mat as_tibble() %>%
bind_cols(`Pathologist X` = c(1:4, "Total")) %>%
select(`Pathologist X`, everything())
<- data.frame(col_keys = c("Pathologist X", "Blank", "V1", "V2",
theHeader "V3", "V4", "Sum"),
line1 = c("Pathologist X", "", rep("Pathologist Y", 4),
"Total"),
line2 = c("Pathologist X", "", "1", "2", "3", "4",
"Total"))
library(flextable)
library(dplyr)
library(officer)
= fp_border(color="black", width = 2)
big_border
flextable(`Table 8.7`, col_keys = theHeader$col_keys) %>%
set_header_df(
mapping = theHeader,
key = "col_keys"
%>%
) theme_booktabs() %>%
merge_v(part = "header") %>%
merge_h(part = "header") %>%
align(align = "center", part = "all") %>%
align(j="Pathologist X", align= "left", part = "all") %>%
autofit() %>%
empty_blanks() %>%
fix_border_issues() %>%
set_caption(caption = "Table 8.7") %>%
#set_table_properties(width = .75, layout = "autofit") %>%
hline_top(part="header", border = big_border) %>%
hline_bottom(part="body", border = big_border)
Pathologist X | Pathologist Y | Total | ||||
---|---|---|---|---|---|---|
1 | 2 | 3 | 4 | |||
1 | 22 | 2 | 2 | 0 | 26 | |
2 | 5 | 7 | 14 | 0 | 26 | |
3 | 0 | 2 | 36 | 0 | 38 | |
4 | 0 | 1 | 17 | 10 | 28 | |
Total | 27 | 12 | 69 | 10 | 118 |
8.5.3 Quasi-Independence Model
<- read.table("http://users.stat.ufl.edu/~aa/cat/data/Pathologists.dat",
Pathology header = TRUE, stringsAsFactors = TRUE)
# diag: separate category (1, 2, 3, 4) for each main-diagonal cell Pathology
X Y count diag
1 1 1 22 1
2 1 2 2 0
3 1 3 2 0
4 1 4 0 0
5 2 1 5 0
6 2 2 7 2
7 2 3 14 0
8 2 4 0 0
9 3 1 0 0
10 3 2 2 0
11 3 3 36 3
12 3 4 0 0
13 4 1 0 0
14 4 2 1 0
15 4 3 17 0
16 4 4 10 4
# and a single (0) for all other cells
# Loglinear quasi-independence: independence plus a parameter per diagonal cell.
fit <- glm(count ~ factor(X) + factor(Y) + factor(diag), family = poisson,
           data = Pathology)
# quasi independence, not showing intercept or main effects
summary(fit)
Call:
glm(formula = count ~ factor(X) + factor(Y) + factor(diag), family = poisson,
data = Pathology)
Deviance Residuals:
1 2 3 4 5 6 7 8
0.00000 1.19586 -0.74802 -0.00006 1.48661 0.00000 -0.66364 -0.00014
9 10 11 12 13 14 15 16
-1.23667 0.63080 0.00000 -0.00008 -1.93254 -1.35095 1.02517 0.00000
Coefficients:
Estimate Std. Error z value Pr(>|z|)
(Intercept) -0.7700 0.6979 -1.103 0.26986
factor(X)2 1.6321 0.5604 2.912 0.00359 **
factor(X)3 0.5017 0.9261 0.542 0.58801
factor(X)4 1.3946 0.5551 2.512 0.01199 *
factor(Y)2 0.4796 0.6552 0.732 0.46414
factor(Y)3 1.9493 0.4971 3.921 8.80e-05 ***
factor(Y)4 -19.3097 4988.8789 -0.004 0.99691
factor(diag)1 3.8611 0.7297 5.291 1.22e-07 ***
factor(diag)2 0.6042 0.6900 0.876 0.38119
factor(diag)3 1.9025 0.8367 2.274 0.02298 *
factor(diag)4 20.9877 4988.8789 0.004 0.99664
---
Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
(Dispersion parameter for poisson family taken to be 1)
Null deviance: 190.398 on 15 degrees of freedom
Residual deviance: 13.178 on 5 degrees of freedom
AIC: 75.997
Number of Fisher Scoring iterations: 17
8.5.4 Quasi Independence and Odds Ratios Summarizing Agreement
\[\tau_{ab} = \mathrm{exp}(\delta_a + \delta_b).\]
8.5.5 Kappa Summary Measure of Agreement
\[\kappa = \frac{\sum_i\pi_{ii}-\sum_i\pi_{i+}\pi_{+i}}{1-\sum_i\pi_{i+}\pi_{+i}}\]
library(psych)
# Rebuild the 4x4 agreement table and compute Cohen's kappa (and weighted kappa).
dat <- matrix(Pathology$count, ncol = 4, byrow = TRUE)
cohen.kappa(dat)
Call: cohen.kappa1(x = x, w = w, n.obs = n.obs, alpha = alpha, levels = levels)
Cohen Kappa and Weighted Kappa correlation coefficients and confidence boundaries
lower estimate upper
unweighted kappa 0.38 0.49 0.60
weighted kappa 0.78 0.78 0.78
Number of subjects = 118
8.6 Bradley-Terry Model for Paired Preferences *
# Win counts: row player beat column player this many times.
`Table 8.8` <- tibble(Winner = c("Djokovic", "Federer", "Murray", "Nadal",
                                 "Wawrinka"),
                      Djokovic = c("-", 6, 3, 2, 3),
                      Federer = c(9, "-", 0, 1, 2),
                      Murray = c(14, 5, "-", 4, 2),
                      Nadal = c(9, 5, 2, "-", 3),
                      Wawrinka = c(4, 7, 2, 4, "-"))
# "Loser" corrects the original "Looser" typo in the displayed header.
theHeader <- data.frame(col_keys = c("Winner", "Blank", "Djokovic", "Federer",
                                     "Murray", "Nadal", "Wawrinka"),
                        line1 = c("Winner", "", rep("Loser", 5)),
                        line2 = c("Winner", "", "Djokovic", "Federer",
                                  "Murray", "Nadal", "Wawrinka"))
library(flextable)
library(dplyr)
library(officer)
big_border <- fp_border(color = "black", width = 2)

flextable(`Table 8.8`, col_keys = theHeader$col_keys) %>%
  set_header_df(
    mapping = theHeader,
    key = "col_keys"
  ) %>%
  theme_booktabs() %>%
  merge_v(part = "header") %>%
  merge_h(part = "header") %>%
  align(align = "center", part = "all") %>%
  align(j = "Winner", align = "left", part = "all") %>%
  autofit() %>%
  empty_blanks() %>%
  fix_border_issues() %>%
  set_caption(caption = "Results of 2014-2018 matches for men's tennis players.") %>%
  #set_table_properties(width = .75, layout = "autofit") %>%
  hline_top(part = "header", border = big_border) %>%
  hline_bottom(part = "body", border = big_border)
Winner | Loser | |||||
---|---|---|---|---|---|---|
Djokovic | Federer | Murray | Nadal | Wawrinka | ||
Djokovic | - | 9 | 14 | 9 | 4 | |
Federer | 6 | - | 5 | 5 | 7 | |
Murray | 3 | 0 | - | 2 | 2 | |
Nadal | 2 | 1 | 4 | - | 4 | |
Wawrinka | 3 | 2 | 2 | 3 | - |
8.6.1 The Bradley-Terry Model of Quasi-Symmetry
\[\mathrm{logit}(\Pi_{ij}) = \mathrm{log}(\Pi_{ij}/\Pi_{ji}) = \beta_i - \beta_j.\]
\[\hat\Pi_{ij} = \mathrm{exp}(\hat\beta_i - \hat\beta_j)/[1 + \mathrm{exp}(\hat\beta_i - \hat\beta_j)].\]
8.6.2 Example: Ranking Men Tennis Players
<- read.table("http://users.stat.ufl.edu/~aa/cat/data/Tennis.dat",
Tennis header = TRUE, stringsAsFactors = TRUE)
# Djokovic won 9 lost 6 vs Fed
# Federer always beat Murray
<-
fit glm(nij/(nij + nji) ~ -1 + Djokovic + Federer + Murray + Nadal + Wawrinka,
family = binomial, weights = nij + nji, data = Tennis)
summary(fit)
Call:
glm(formula = nij/(nij + nji) ~ -1 + Djokovic + Federer + Murray +
Nadal + Wawrinka, family = binomial, data = Tennis, weights = nij +
nji)
Deviance Residuals:
Min 1Q Median 3Q Max
-1.1225 -0.1262 0.3715 0.5386 1.2928
Coefficients: (1 not defined because of singularities)
Estimate Std. Error z value Pr(>|z|)
Djokovic 1.17612 0.49952 2.354 0.0185 *
Federer 1.13578 0.51095 2.223 0.0262 *
Murray -0.56852 0.56833 -1.000 0.3172
Nadal -0.06185 0.51487 -0.120 0.9044
Wawrinka NA NA NA NA
---
Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
(Dispersion parameter for binomial family taken to be 1)
Null deviance: 26.8960 on 10 degrees of freedom
Residual deviance: 4.3958 on 6 degrees of freedom
AIC: 34.041
Number of Fisher Scoring iterations: 4
\[\hat\Pi_{ij} = \frac{\mathrm{exp}(\hat\beta_i - \hat\beta_j)}{1 + \mathrm{exp}(\hat\beta_i - \hat\beta_j)} = \frac{\mathrm{exp}(1.136 - 1.176)}{1 + \mathrm{exp}(1.136 - 1.176)} = 0.49\]