8 Models for Matched Pairs

Opinions <- read.table("http://users.stat.ufl.edu/~aa/cat/data/Envir_opinions.dat",
                        header = TRUE, stringsAsFactors = TRUE)
# Make contingency table
tab <- as.matrix(addmargins(xtabs(~y1 + y2, data = Opinions)))

library(tibble)
# Add label as the first column and variable names
`Table 8.1` <- tibble(`Pay Higher Taxes` = c("Yes", "No", "Total"), 
                      Yes = tab[,1], No = tab[,2], Total = tab[,3])

# kludge: paste the cell-probability labels (pi11 etc.) onto the counts
`Table 8.1`$Yes <- paste(`Table 8.1`$Yes, c(" (pi11)", " (pi21)", ""))
`Table 8.1`$No <- paste(`Table 8.1`$No, c(" (pi12)", " (pi22)", ""))

library(flextable)
my_header <- data.frame(
  col_keys = colnames(`Table 8.1`),
  line1 = c("Pay Higher Taxes", rep("Cut Living Standards", 2), "Total"),
  line2 = colnames(`Table 8.1`)
)

flextable(`Table 8.1`, col_keys = my_header$col_keys) %>%
  set_header_df(
    mapping = my_header,
    key = "col_keys"
  ) %>% 
  theme_booktabs() %>% 
  autofit(part = "all") %>%    
  align(align = "center", part = "all") %>% 
  merge_h(part = "header") %>% 
  merge_v(part = "header") %>% 
  merge_h(part = "body") %>% 
  merge_v(part = "body") %>%
  align_nottext_col(align = "center") %>% 
  set_caption(caption = "Table 8.1") 
Table 2: Table 8.1

                      Cut Living Standards
Pay Higher Taxes     Yes          No          Total
Yes                  227 (pi11)   132 (pi12)    359
No                   107 (pi21)   678 (pi22)    785
Total                334          810          1,144

8.1 Comparing Dependent Proportions for Binary Matched Pairs

8.1.1 McNemar Test Comparing Marginal Proportions

\[H_0: P(Y_1 = 1) = P(Y_2 = 1), \mathrm{\ or\ equivalently\ } H_0: \pi_{12} = \pi_{21}.\]

\[\begin{equation} z = \frac{n_{12}-(\frac{1}{2}n^*)}{\sqrt{n^*(\frac{1}{2})(\frac{1}{2})}}= \frac{n_{12}- n_{21}}{\sqrt{n_{12}+n_{21}}} \tag{20} \end{equation}\]
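As a quick sanity check, equation (20) can be evaluated by hand from the discordant counts of Table 8.1 (n12 = 132, n21 = 107); the squared statistic matches the chi-squared value that `mcnemar.test()` reports below.

```r
n12 <- 132; n21 <- 107                      # discordant counts from Table 8.1
z <- (n12 - n21) / sqrt(n12 + n21)          # equation (20)
c(z = z, z.squared = z^2, p.value = 2 * pnorm(-abs(z)))
# z^2 is about 2.615 with two-sided p-value about 0.106
```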

Opinions <- read.table("http://users.stat.ufl.edu/~aa/cat/data/Envir_opinions.dat",
                        header = TRUE, stringsAsFactors = TRUE)

library(dplyr)

# Data file has 1144 lines, one for each of 1144 people. 
# Each person has two binary responses (y1 and y2)
Opinions %>% 
  filter(row_number() %in% c(1, 2, n()))  
  person y1 y2
1      1  1  1
2      2  1  1
3   1144  2  2
tab <- xtabs(~y1 + y2, data = Opinions)
tab
   y2
y1    1   2
  1 227 132
  2 107 678
# Don't use continuity correction, which is too conservative.
mcnemar.test(tab, correct = FALSE)

    McNemar's Chi-squared test

data:  tab
McNemar's chi-squared = 2.6151, df = 1, p-value = 0.1059
library(PropCIs)

# temporarily reduce the number of printed digits
currentDigits = unlist(options("digits"))  # save current rounding
options(digits = 1)

# 95% Wald CI for difference of marginal probabilities 
diffpropci.Wald.mp(107, 132, 1144, 0.95)  # (n21, n12, n, confidence level)



data:  

95 percent confidence interval:
 -0.005  0.048
sample estimates:
[1] 0.02
diffpropci.Wald.mp(tab[2,1], tab[1,2], sum(tab), 0.95)  



data:  

95 percent confidence interval:
 -0.005  0.048
sample estimates:
[1] 0.02
# 95% score CI for difference of marginal probabilities 
scoreci.mp(tab[2,1], tab[1,2], sum(tab), 0.95)



data:  

95 percent confidence interval:
 -0.005  0.048
options(digits = currentDigits)  # reset rounding
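The Wald interval from `diffpropci.Wald.mp` can also be reproduced directly: the estimated difference is (n12 - n21)/n with standard error sqrt(n12 + n21 - (n12 - n21)^2/n)/n. This is a hand computation for checking the package output, not a replacement for it.

```r
n12 <- 132; n21 <- 107; n <- 1144
d  <- (n12 - n21) / n                           # estimated difference of proportions
se <- sqrt(n12 + n21 - (n12 - n21)^2 / n) / n   # Wald standard error
d + c(-1, 1) * qnorm(0.975) * se                # 95% CI, about (-0.005, 0.048)
```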

8.1.2 Estimating the Difference Between Dependent Proportions

8.2 Marginal Models and Subject-Specific Models for Matched Pairs

8.2.1 Marginal Models for Marginal Proportions

\[P(Y_1 = 1) = \alpha + \delta,\ P(Y_2= 1) = \alpha,\]

\[P(Y_t = 1) = \alpha + \delta x_t\]

\[\begin{equation} \mathrm{logit}[P(Y_t = 1)] = \alpha + \beta x_t. \tag{21} \end{equation}\]

8.2.2 Example: Environmental Opinions Revisited

Opinions <- read.table("http://users.stat.ufl.edu/~aa/cat/data/Opinions.dat",
                        header = TRUE, stringsAsFactors = TRUE)

# Data file has 1144 lines, one for each of 1144 people. 
# Each person has two binary responses (y1 and y2)
Opinions %>% 
  filter(row_number() %in% c(1, 2, 3, 4, n()-1, n()))  
  person question y
1      1        1 1
2      1        0 1
3      2        1 1
4      2        0 1
5   1144        1 0
6   1144        0 0
library(gee)
# id identifies the subject on which y1, y2 are observed
fit <- gee(y ~ question, id = person, family = binomial(link = "identity"),
           data = Opinions)
Beginning Cgee S-function, @(#) geeformula.q 4.13 98/01/27
running glm to get initial regression estimate
(Intercept)    question 
 0.29195804  0.02185315 
summary(fit) # question parameter for identity link is difference of proportions

 GEE:  GENERALIZED LINEAR MODELS FOR DEPENDENT DATA
 gee S-function, version 4.13 modified 98/01/27 (1998) 

Model:
 Link:                      Identity 
 Variance to Mean Relation: Binomial 
 Correlation Structure:     Independent 

Call:
gee(formula = y ~ question, id = person, data = Opinions, family = binomial(link = "identity"))

Summary of Residuals:
       Min         1Q     Median         3Q        Max 
-0.3138112 -0.3138112 -0.2919580  0.6861888  0.7080420 


Coefficients:
              Estimate Naive S.E.   Naive z Robust S.E.  Robust z
(Intercept) 0.29195804 0.01344828 21.709701   0.0134424 21.719196
question    0.02185315 0.01921587  1.137245   0.0134982  1.618967

Estimated Scale Parameter:  1.000875
Number of Iterations:  1

Working Correlation
     [,1] [,2]
[1,]    1    0
[2,]    0    1
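The `question` coefficient (0.0219) from the identity-link fit is simply the difference between the two marginal proportions of yes responses, which we can confirm from the margins of Table 8.1.

```r
p1 <- 359 / 1144   # P(pay higher taxes = yes)
p2 <- 334 / 1144   # P(cut living standards = yes)
p1 - p2            # 0.02185, matching the GEE question estimate
```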
fit2 <- gee(y ~ question, id = person, family = binomial(link = logit),
           data = Opinions)
Beginning Cgee S-function, @(#) geeformula.q 4.13 98/01/27
running glm to get initial regression estimate
(Intercept)    question 
 -0.8858933   0.1035319 
summary(fit2) # question parameter for logit link is log odds ratio

 GEE:  GENERALIZED LINEAR MODELS FOR DEPENDENT DATA
 gee S-function, version 4.13 modified 98/01/27 (1998) 

Model:
 Link:                      Logit 
 Variance to Mean Relation: Binomial 
 Correlation Structure:     Independent 

Call:
gee(formula = y ~ question, id = person, data = Opinions, family = binomial(link = logit))

Summary of Residuals:
       Min         1Q     Median         3Q        Max 
-0.3138112 -0.3138112 -0.2919580  0.6861888  0.7080420 


Coefficients:
              Estimate Naive S.E.    Naive z Robust S.E.   Robust z
(Intercept) -0.8858933 0.06505597 -13.617401  0.06502753 -13.623357
question     0.1035319 0.09107816   1.136737  0.06397794   1.618244

Estimated Scale Parameter:  1.000875
Number of Iterations:  1

Working Correlation
     [,1] [,2]
[1,]    1    0
[2,]    0    1

8.2.3 Subject-Specific and Population-Averaging Tables

`Table 8.2` <- data.frame(Question = c("yi1: Pay higher taxes?", 
                                       "yi2: Cut living standards?"),
                          Yes = c(1, 1),
                          No = c(0, 0))

theHeader <- data.frame(col_keys = colnames(`Table 8.2`),
                        line1 = c("Question", rep("Response", 2)),
                        line2 = colnames(`Table 8.2`))

library(flextable)
library(dplyr)

flextable(`Table 8.2`, col_keys = theHeader$col_keys) %>% 
  set_header_df(
    mapping = theHeader,
    key = "col_keys"
  ) %>% 
  theme_booktabs() %>% 
  merge_v(part = "header") %>% 
  merge_h(part = "header") %>% 
  align(align = "center", part = "all") %>% 
  #autofit() %>% 
  empty_blanks() %>% 
  fix_border_issues()  %>%
  set_caption(caption = "Table 8.2: Representation of subject-specific table for matched pair contributing to count n_11 = 227 in Table 8.1.") %>% 
  set_table_properties(width = .75, layout = "autofit")
Table 3: Table 8.2: Representation of subject-specific table for matched pair contributing to count n_11 = 227 in Table 8.1.

                              Response
Question                      Yes   No
yi1: Pay higher taxes?         1     0
yi2: Cut living standards?     1     0

8.2.4 Conditional Logistic Regression for Matched-Pairs*

\[\begin{equation} \mathrm{logit}[P(Y_{it} = 1)] = \alpha_i + \beta x_{it},\ t = 1,\ 2, \tag{22} \end{equation}\]

\[P(Y_{i1} = 1) = \frac{\mathrm{exp}(\alpha_i + \beta)}{1 + \mathrm{exp}(\alpha_i + \beta)},\ P(Y_{i2} = 1) = \frac{\mathrm{exp}(\alpha_i)}{1 + \mathrm{exp}(\alpha_i)}.\]
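Conditioning on the alpha_i removes them from the likelihood, and for binary matched pairs the conditional ML estimate has a closed form that uses only the discordant pairs: beta-hat = log(n12/n21). A quick computation for Table 8.1:

```r
n12 <- 132; n21 <- 107                 # discordant pairs from Table 8.1
c(beta = log(n12 / n21),               # conditional ML estimate, about 0.21
  odds.ratio = n12 / n21)              # exp(beta), about 1.23
```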

8.2.5 Logistic Regression for Matched Case-Control Studies*

library(tidyverse)
`Table 8.3` <- tibble(`Normal Birth Weight (Controls)` = 
                            c("Nonsmokers", 
                              "Smokers"),
                          Nonsmokers = c(159, 8),
                          Smokers = c(22, 14))

theHeader <- data.frame(col_keys = c("Normal Birth Weight (Controls)", "blank", "Nonsmokers", "Smokers"),
                        line1 = c("Normal Birth Weight (Controls)", "", rep("Low Birth Weight (Cases)", 2)),
                        line2 = c("Normal Birth Weight (Controls)", "blank", "Nonsmokers", "Smokers"))

library(flextable)
library(dplyr)

library(officer)
big_border = fp_border(color="black", width = 2)

flextable(`Table 8.3`, col_keys = theHeader$col_keys) %>% 
  set_header_df(
    mapping = theHeader,
    key = "col_keys"
  ) %>% 
  theme_booktabs() %>% 
  merge_v(part = "header") %>% 
  merge_h(part = "header") %>% 
  align(align = "center", part = "all") %>% 
  #autofit() %>% 
  empty_blanks() %>% 
  fix_border_issues()  %>%
  set_caption(caption = "Table 8.3") %>% 
  set_table_properties(width = .75, layout = "autofit") %>% 
  hline_top(part="header", border = big_border) %>% 
  hline_bottom(part="body", border = big_border)  
Table 4: Table 8.3

                                 Low Birth Weight (Cases)
Normal Birth Weight (Controls)   Nonsmokers   Smokers
Nonsmokers                          159          22
Smokers                               8          14
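For the matched case-control pairs of Table 8.3, the conditional ML odds-ratio estimate again depends only on the discordant pairs, n12/n21 = 22/8 (a hand computation using the counts shown above):

```r
n12 <- 22; n21 <- 8                    # discordant pairs in Table 8.3
c(odds.ratio = n12 / n21,              # 2.75: estimated odds of low birth weight
  beta = log(n12 / n21))               # are 2.75 times higher for smokers
```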

library(tidyverse)
`Table 8.4` <- tibble(`Smoker` = 
                            c("No", 
                              "Yes"),
                          aCase = c(1, 0),
                          aCon = c(1, 0),
                          bCase = c(0, 1),
                          bCon = c(1, 0),
                          cCase = c(1, 0),
                          cCon = c(0, 1),
                          dCase = c(0, 1),
                          dCon = c(0, 1))

theHeader <- data.frame(col_keys = c("Smoker" ,
                         
                          "aCase",
                          "aCon", 
                          "blank1",
                          "bCase",
                          "bCon", 
                           
                          "blank2",
                          "cCase",
                          "cCon", 
                           
                          "blank3",
                          "dCase",
                          "dCon", 
                          "blank4" ),
                        line1 = c("Smoker", rep("a", 2), "", rep("b", 2), "", 
                                  rep("c", 2), "", rep("d", 2),""),
                        line2 = c("Smoker", rep(c("Case", "Control", " "), 4)))

library(flextable)
library(dplyr)

flextable(`Table 8.4`, col_keys = theHeader$col_keys) %>% 
  set_header_df(
    mapping = theHeader,
    key = "col_keys"
  ) %>% 
  theme_booktabs() %>% 
  merge_v(part = "header") %>% 
  merge_h(part = "header") %>% 
  align(align = "center", part = "all") %>% 
  #autofit() %>% 
  empty_blanks() %>% 
  fix_border_issues()  %>%
  set_caption(caption = "Table 8.4") %>% 
  set_table_properties(width = .75, layout = "autofit") %>% 
  hline_top(part="header", border = big_border) %>% 
  hline_top(part="body", border = big_border)  %>% 
  hline_bottom(part="body", border = big_border)  
Table 5: Table 8.4

              a               b               c               d
Smoker   Case  Control   Case  Control   Case  Control   Case  Control
No        1      1        0      1        1      0        0      0
Yes       0      0        1      0        0      1        1      1

8.3 Comparing Proportions for Nominal Matched-Pairs Responses

\[P(Y_1 = i) = P(Y_2 = i)\ \mathrm{for}\ i = 1, \dots, c,\]

8.3.1 Marginal Homogeneity for Baseline-Category Logit Models

\[\mathrm{log}\left[\frac{P(Y_1 = j)}{P(Y_1 = c)}\right] = \alpha_j + \beta_j,\ \mathrm{log}\left[\frac{P(Y_2 = j)}{P(Y_2 = c)}\right] = \alpha_j\]

8.3.2 Example: Coffee Brand Market Share

Coffee <- read.table("http://users.stat.ufl.edu/~aa/cat/data/Coffee.dat",
                        header = TRUE, stringsAsFactors = TRUE)

library(dplyr)
library(tidyr)
coffee_wide <- Coffee %>% 
  tidyr::pivot_wider(id_cols = person, names_from=purchase, names_prefix = "y", 
                     values_from = y)
  


# Make contingency table
tab <- as.matrix(addmargins(xtabs(~y1 + y0, data = coffee_wide)))

library(tibble)
# Add label as the first column and variable names
`Table 8.5` <- tibble(`First Purchase` = c("High Point", "Taster's Choice",
                                           "Sanka", "Nescafe", "Brim", "Total"), 
                      `High Point` = tab[, 1], `Taster's Choice` = tab[,2],
                      `Sanka` = tab[, 3], `Nescafe` = tab[,4], `Brim` = tab[,5],
                      Total = tab[, 6])


theHeader <- data.frame(col_keys = c("First Purchase", "Blank", "High Point", 
                                     "Taster's Choice", "Sanka", "Nescafe", 
                                     "Brim", "Total"),
                        line1 = c("First Purchase", "", 
                                  rep("Second Purchase", 6)),
                        line2 = c("First Purchase", "", "High Point", 
                                     "Taster's Choice", "Sanka", "Nescafe", 
                                     "Brim", "Total"))

library(flextable)
library(dplyr)

library(officer)
big_border = fp_border(color="black", width = 2)

flextable(`Table 8.5`, col_keys = theHeader$col_keys) %>% 
  set_header_df(
    mapping = theHeader,
    key = "col_keys"
  ) %>% 
  theme_booktabs() %>% 
  merge_v(part = "header") %>% 
  merge_h(part = "header") %>% 
  align(align = "center", part = "all") %>% 
  align(j="First Purchase", align= "left", part = "all") %>% 
  #autofit() %>% 
  empty_blanks() %>% 
  fix_border_issues()  %>%
  set_caption(caption = "Table 8.5") %>% 
  set_table_properties(width = .75, layout = "autofit") %>% 
  hline_top(part="header", border = big_border) %>% 
  hline_bottom(part="body", border = big_border)  
Table 6: Table 8.5

                             Second Purchase
First Purchase    High Point  Taster's Choice  Sanka  Nescafe  Brim  Total
High Point                93               17     44        7    10    171
Taster's Choice            9               46     11        0     9     75
Sanka                     17               11    155        9    12    204
Nescafe                    6                4      9       15     2     36
Brim                      10                4     12        2    27     55
Total                    135               82    231       33    60    541

Using GEE methodology, we can fit the baseline-category logit model, as shown in the following R output.

Coffee <- read.table("http://users.stat.ufl.edu/~aa/cat/data/Coffee.dat",
                        header = TRUE, stringsAsFactors = TRUE)

# subject-specific data file; purchase is 1 for first purchase, 0 for second
Coffee %>% 
  filter(row_number() %in% c(1, 2, 3, 4, n()-1, n()))  
  person purchase y
1      1        1 1
2      1        0 1
3      2        1 1
4      2        0 1
5    541        1 5
6    541        0 5
library(multgee)  # package for multinomial GEE analysis
fit <- nomLORgee(y ~ purchase, id = person, LORstr = "independence", 
                 data = Coffee)
# san.se = robust SE
# beta01 is alpha1 in our notation
summary(fit)  # nomLORgee uses baseline category logits for nominal response 
GEE FOR NOMINAL MULTINOMIAL RESPONSES 
version 1.6.0 modified 2017-07-10 

Link : Baseline Category Logit 

Local Odds Ratios:
Structure:         independence
Homogenous scores: TRUE

call:
nomLORgee(formula = y ~ purchase, data = Coffee, id = person, 
    LORstr = "independence")

Summary of residuals:
   Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
-0.4270 -0.2495 -0.1386  0.0000 -0.0610  0.9390 

Number of Iterations: 1 

Coefficients:
           Estimate   san.se   san.z Pr(>|san.z|)    
beta10      0.81093  0.15516  5.2265      < 2e-16 ***
purchase:1  0.32340  0.16830  1.9215      0.05466 .  
beta20      0.31237  0.16989  1.8387      0.06596 .  
purchase:2 -0.00222  0.18662 -0.0119      0.99051    
beta30      1.34807  0.14490  9.3036      < 2e-16 ***
purchase:3 -0.03729  0.15807 -0.2359      0.81353    
beta40     -0.59784  0.21672 -2.7585      0.00581 ** 
purchase:4  0.17402  0.23531  0.7396      0.45957    
---
Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

Local Odds Ratios Estimates:
     [,1] [,2] [,3] [,4] [,5] [,6] [,7] [,8]
[1,]    0    0    0    0    1    1    1    1
[2,]    0    0    0    0    1    1    1    1
[3,]    0    0    0    0    1    1    1    1
[4,]    0    0    0    0    1    1    1    1
[5,]    1    1    1    1    0    0    0    0
[6,]    1    1    1    1    0    0    0    0
[7,]    1    1    1    1    0    0    0    0
[8,]    1    1    1    1    0    0    0    0

p-value of Null model: < 0.0001 
fit0 <- nomLORgee(y ~ 1, id = person, LORstr = "independence", data = Coffee)

# Model under H_0: y ~ 1 null model (marginal homogeneity)
waldts(fit0, fit)
Goodness of Fit based on the Wald test 

Model under H_0: y ~ 1
Model under H_1: y ~ purchase

Wald Statistic = 12.4869, df = 4, p-value = 0.0141
with(Coffee, mantelhaen.test(purchase, y, person))

    Cochran-Mantel-Haenszel test

data:  purchase and y and person
Cochran-Mantel-Haenszel M^2 = 12.291, df = 4, p-value = 0.01531

8.3.3 Using the Cochran-Mantel-Haenszel Test to Test Marginal Homogeneity*

8.3.4 Symmetry and Quasi-Symmetry Models for Square Contingency Tables

\[\pi_{ij} = \pi_{ji}\] \[\mathrm{log}(\pi_{ij}/\pi_{ji}) = 0\ \mathrm{for\ all}\ i \ \mathrm{and}\ j.\]

\[r_{ij} = (n_{ij} - n_{ji})/\sqrt{n_{ij} + n_{ji}}.\]
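These residuals are easy to compute for each pair of opposite cells. Using the ten pairs from Table 8.5 (the same counts that appear in the Coffee2 data below):

```r
nij <- c(17, 44, 7, 10, 11, 0, 9, 9, 12, 2)   # cells above the diagonal of Table 8.5
nji <- c( 9, 17, 6, 10, 11, 4, 4, 9, 12, 2)   # opposite cells below the diagonal
round((nij - nji) / sqrt(nij + nji), 2)
# e.g. the (High Point, Sanka) pair gives (44 - 17)/sqrt(61), about 3.46
```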

\[\begin{equation} \mathrm{log}(\pi_{ij}/\pi_{ji}) = \beta_i - \beta_j\ \mathrm{for\ all}\ i \ \mathrm{and}\ j. \tag{23} \end{equation}\]

8.3.5 Example: Coffee Brand Market Share Revisited

Coffee2 <- read.table("http://users.stat.ufl.edu/~aa/cat/data/Coffee2.dat",
                        header = TRUE, stringsAsFactors = TRUE)
# quasi-symmetry model, nij and nji are opposite cell counts in table 8.5
Coffee2
   H  T  S  N  B nij nji
1  1 -1  0  0  0  17   9
2  1  0 -1  0  0  44  17
3  1  0  0 -1  0   7   6
4  1  0  0  0 -1  10  10
5  0  1 -1  0  0  11  11
6  0  1  0 -1  0   0   4
7  0  1  0  0 -1   9   4
8  0  0  1 -1  0   9   9
9  0  0  1  0 -1  12  12
10 0  0  0  1 -1   2   2
symm <- glm(nij/(nij+nji) ~ -1, family = binomial, weights = nij + nji, 
            data = Coffee2)
summary(symm)  # symmetry model, -1 in model statement sets intercept = 0

Call:
glm(formula = nij/(nij + nji) ~ -1, family = binomial, data = Coffee2, 
    weights = nij + nji)

Deviance Residuals: 
   Min      1Q  Median      3Q     Max  
-2.355   0.000   0.000   1.123   3.518  

No Coefficients

(Dispersion parameter for binomial family taken to be 1)

    Null deviance: 22.473  on 10  degrees of freedom
Residual deviance: 22.473  on 10  degrees of freedom
AIC: 52.433

Number of Fisher Scoring iterations: 0
QS <- glm(nij/(nij+nji) ~ -1 + H + T + S + N + B , family = binomial, 
          weights = nij + nji, data = Coffee2)
summary(QS)

Call:
glm(formula = nij/(nij + nji) ~ -1 + H + T + S + N + B, family = binomial, 
    data = Coffee2, weights = nij + nji)

Deviance Residuals: 
    Min       1Q   Median       3Q      Max  
-2.1010  -0.2902  -0.0804   0.7165   1.4120  

Coefficients: (1 not defined because of singularities)
   Estimate Std. Error z value Pr(>|z|)  
H  0.595440   0.293658   2.028   0.0426 *
T -0.004004   0.329359  -0.012   0.9903  
S -0.113299   0.285082  -0.397   0.6911  
N  0.302115   0.401589   0.752   0.4519  
B        NA         NA      NA       NA  
---
Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

(Dispersion parameter for binomial family taken to be 1)

    Null deviance: 22.473  on 10  degrees of freedom
Residual deviance:  9.974  on  6  degrees of freedom
AIC: 47.934

Number of Fisher Scoring iterations: 4
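Under quasi-symmetry, contrasts of the beta-hat values have a direct odds interpretation: exp(beta_i - beta_j) estimates pi_ij/pi_ji. For the High Point vs. Sanka pair, a hand computation from the printed estimates (Brim is the reference, with coefficient 0):

```r
beta <- c(H = 0.595440, T = -0.004004, S = -0.113299, N = 0.302115, B = 0)
exp(beta["H"] - beta["S"])   # about 2.03: model-smoothed version of 44/17 = 2.59
```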

8.4 Comparing Proportions for Ordinal Matched-Pairs Responses

8.4.1 Marginal Homogeneity and Cumulative Logit Marginal Model

\[\mathrm{logit}[P(Y_1 \le j)] = \alpha_j + \beta,\ \mathrm{logit}[P(Y_2 \le j)] = \alpha_j\]

8.4.2 Example: Recycle or Drive Less to Help the Environment?

Envir <- read.table("http://users.stat.ufl.edu/~aa/cat/data/Envir.dat",
                        header = TRUE, stringsAsFactors = TRUE)

library(dplyr)
library(tidyr)
envir_wide <- Envir %>% 
  tidyr::pivot_wider(id_cols = person, names_from=question, names_prefix = "y", 
                     values_from = y)
  


# Make contingency table
tab <- as.matrix(xtabs(~y1 + y0, data = envir_wide))

library(tibble)
# Add label as the first column and variable names
`Table 8.6` <- tibble(`Recycle` = c("Always", "Often", "Sometimes", "Never"), 
                      `Always` = tab[, 1], `Often` = tab[,2], 
                      `Sometimes` = tab[, 3], `Never` = tab[,4])


theHeader <- data.frame(col_keys = c("Recycle", "Blank", "Always", "Often",
                                           "Sometimes", "Never"),
                        line1 = c("Recycle", "", rep("Drive Less", 4)),
                        line2 = c("Recycle", "", "Always", "Often", 
                                  "Sometimes", "Never"))

library(flextable)
library(dplyr)

library(officer)
big_border = fp_border(color="black", width = 2)

flextable(`Table 8.6`, col_keys = theHeader$col_keys) %>% 
  set_header_df(
    mapping = theHeader,
    key = "col_keys"
  ) %>% 
  theme_booktabs() %>% 
  merge_v(part = "header") %>% 
  merge_h(part = "header") %>% 
  align(align = "center", part = "all") %>% 
  align(j="Recycle", align= "left", part = "all") %>% 
  autofit() %>% 
  empty_blanks() %>% 
  fix_border_issues()  %>%
  set_caption(caption = "Table 8.6") %>% 
  #set_table_properties(width = .75, layout = "autofit") %>% 
  hline_top(part="header", border = big_border) %>% 
  hline_bottom(part="body", border = big_border)  
Table 7: Table 8.6

                   Drive Less
Recycle     Always  Often  Sometimes  Never
Always          12     43        163    233
Often            4     21         99    185
Sometimes        4      8         77    230
Never            0      1         18    132

Envir <- read.table("http://users.stat.ufl.edu/~aa/cat/data/Envir.dat",
                        header = TRUE, stringsAsFactors = TRUE)

Envir %>% 
  filter(row_number() %in% c(1, 2, n()-1, n()))  
  person question y
1      1        1 1
2      1        0 1
3   1230        1 4
4   1230        0 4
library(multgee)
fit <- ordLORgee(y ~ question, id = person, LORstr = "independence", 
                 data = Envir)
summary(fit)
GEE FOR ORDINAL MULTINOMIAL RESPONSES 
version 1.6.0 modified 2017-07-10 

Link : Cumulative logit 

Local Odds Ratios:
Structure:         independence

call:
ordLORgee(formula = y ~ question, data = Envir, id = person, 
    LORstr = "independence")

Summary of residuals:
     Min.   1st Qu.    Median      Mean   3rd Qu.      Max. 
-0.354916 -0.264742 -0.059210 -0.002021 -0.033859  0.966141 

Number of Iterations: 1 

Coefficients:
         Estimate   san.se    san.z Pr(>|san.z|)    
beta10   -3.35111  0.08289 -40.4287    < 2.2e-16 ***
beta20   -2.27673  0.07430 -30.6424    < 2.2e-16 ***
beta30   -0.58488  0.05882  -9.9443    < 2.2e-16 ***
question  2.75361  0.08147  33.7985    < 2.2e-16 ***
---
Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

Local Odds Ratios Estimates:
     [,1] [,2] [,3] [,4] [,5] [,6]
[1,]    0    0    0    1    1    1
[2,]    0    0    0    1    1    1
[3,]    0    0    0    1    1    1
[4,]    1    1    1    0    0    0
[5,]    1    1    1    0    0    0
[6,]    1    1    1    0    0    0

p-value of Null model: < 0.0001 
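The `question` estimate of 2.754 is on the cumulative log-odds scale; exponentiating it (a hand computation from the printed estimate) gives the estimated cumulative odds ratio comparing the two questions:

```r
beta <- 2.75361
exp(beta)   # about 15.7: for each j, the estimated odds of a response in
            # category <= j for recycling are 15.7 times those for driving less
```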

8.4.3 An Ordinal Quasi-Symmetry Model *

\[\begin{equation} \mathrm{log}(\pi_{ij}/\pi_{ji}) = \beta(u_j - u_i). \tag{24} \end{equation}\]

8.4.4 Example: Recycle or Drive Less Revisited?

Envir <- read.table("http://users.stat.ufl.edu/~aa/cat/data/Environment.dat",
                        header = TRUE, stringsAsFactors = TRUE)

# nij and nji are opposite cell counts in Table 8.6
# x = j-1 = distance between categories, 6 pairs of opposite cells
Envir  
  always often sometimes never x nij nji
1      1    -1         0     0 1  43   4
2      1     0        -1     0 2 163   4
3      1     0         0    -1 3 233   0
4      0     1        -1     0 1  99   8
5      0     1         0    -1 2 185   1
6      0     0         1    -1 1 230  18
fit <- glm(nij/(nij+nji) ~ -1 + x, family = binomial, weight = nij + nji, 
           data = Envir)
summary(fit)  # ordinal quasi symmetry; -1 in model sets intercept = 0

Call:
glm(formula = nij/(nij + nji) ~ -1 + x, family = binomial, data = Envir, 
    weights = nij + nji)

Deviance Residuals: 
       1         2         3         4         5         6  
-0.03567  -1.82023   0.59541   0.33794   0.46512   0.64368  

Coefficients:
  Estimate Std. Error z value Pr(>|z|)    
x   2.3936     0.1508   15.88   <2e-16 ***
---
Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

(Dispersion parameter for binomial family taken to be 1)

    Null deviance: 1106.1240  on 6  degrees of freedom
Residual deviance:    4.4139  on 5  degrees of freedom
AIC: 23.35

Number of Fisher Scoring iterations: 4
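The estimate beta-hat = 2.394 applies per unit of distance between categories; exponentiating it (computed from the printed estimate) shows how strongly the opposite cells are skewed:

```r
beta <- 2.3936
exp(beta)       # about 11: estimated pi_ij/pi_ji for opposite cells one
                # category apart (x = 1) under ordinal quasi-symmetry
```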
fit.QS <- glm(nij/(nij+nji) ~ -1 + always + often + sometimes + never, 
              family = binomial, weight = nij + nji, data = Envir)
summary(fit.QS)  # quasi symmetry

Call:
glm(formula = nij/(nij + nji) ~ -1 + always + often + sometimes + 
    never, family = binomial, data = Envir, weights = nij + nji)

Deviance Residuals: 
      1        2        3        4        5        6  
 0.7689  -1.1439   0.6760   0.4572   0.3006  -0.1381  

Coefficients: (1 not defined because of singularities)
          Estimate Std. Error z value Pr(>|z|)    
always      6.9269     0.4708   14.71   <2e-16 ***
often       4.9332     0.3617   13.64   <2e-16 ***
sometimes   2.5817     0.2386   10.82   <2e-16 ***
never           NA         NA      NA       NA    
---
Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

(Dispersion parameter for binomial family taken to be 1)

    Null deviance: 1106.1240  on 6  degrees of freedom
Residual deviance:    2.6751  on 3  degrees of freedom
AIC: 25.611

Number of Fisher Scoring iterations: 4

8.5 Analyzing Rater Agreement *

8.5.1 Example: Agreement on Carcinoma Diagnosis

Pathology <- read.table("http://users.stat.ufl.edu/~aa/cat/data/Pathologists.dat",
                        header = TRUE, stringsAsFactors = TRUE)

library(dplyr)
library(tibble)

# Make contingency table
mat <- matrix(Pathology$count, ncol = 4, byrow = TRUE) %>% 
  addmargins()
  
colnames(mat) <- c("V1", "V2", "V3", "V4", "Sum") 
  
`Table 8.7` <-  
  mat %>% 
  as_tibble() %>% 
  bind_cols(`Pathologist X` = c(1:4, "Total")) %>% 
  select(`Pathologist X`, everything())
  
theHeader <- data.frame(col_keys = c("Pathologist X", "Blank", "V1", "V2",
                                           "V3", "V4", "Sum"),
                        line1 = c("Pathologist X", "", rep("Pathologist Y", 4),
                                  "Total"),
                        line2 = c("Pathologist X", "", "1", "2", "3", "4",
                                  "Total"))

library(flextable)
library(dplyr)

library(officer)
big_border = fp_border(color="black", width = 2)

flextable(`Table 8.7`, col_keys = theHeader$col_keys) %>% 
  set_header_df(
    mapping = theHeader,
    key = "col_keys"
  ) %>% 
  theme_booktabs() %>% 
  merge_v(part = "header") %>% 
  merge_h(part = "header") %>% 
  align(align = "center", part = "all") %>% 
  align(j="Pathologist X", align= "left", part = "all") %>% 
  autofit() %>% 
  empty_blanks() %>% 
  fix_border_issues()  %>%
  set_caption(caption = "Table 8.7") %>% 
  #set_table_properties(width = .75, layout = "autofit") %>% 
  hline_top(part="header", border = big_border) %>% 
  hline_bottom(part="body", border = big_border)  
Table 8: Table 8.7

                    Pathologist Y
Pathologist X     1    2    3    4   Total
1                22    2    2    0      26
2                 5    7   14    0      26
3                 0    2   36    0      38
4                 0    1   17   10      28
Total            27   12   69   10     118

8.5.2 Cell Residuals for Independence Model

8.5.3 Quasi-Independence Model

Pathology <- read.table("http://users.stat.ufl.edu/~aa/cat/data/Pathologists.dat",
                        header = TRUE, stringsAsFactors = TRUE)

Pathology  # diag: separate category (1, 2, 3, 4) for each main-diagonal cell 
   X Y count diag
1  1 1    22    1
2  1 2     2    0
3  1 3     2    0
4  1 4     0    0
5  2 1     5    0
6  2 2     7    2
7  2 3    14    0
8  2 4     0    0
9  3 1     0    0
10 3 2     2    0
11 3 3    36    3
12 3 4     0    0
13 4 1     0    0
14 4 2     1    0
15 4 3    17    0
16 4 4    10    4
# and a single category (0) for all off-diagonal cells

fit <- glm(count ~ factor(X) + factor(Y) + factor(diag), family = poisson,
           data = Pathology)

# quasi independence, not showing intercept or main effects
summary(fit)

Call:
glm(formula = count ~ factor(X) + factor(Y) + factor(diag), family = poisson, 
    data = Pathology)

Deviance Residuals: 
       1         2         3         4         5         6         7         8  
 0.00000   1.19586  -0.74802  -0.00006   1.48661   0.00000  -0.66364  -0.00014  
       9        10        11        12        13        14        15        16  
-1.23667   0.63080   0.00000  -0.00008  -1.93254  -1.35095   1.02517   0.00000  

Coefficients:
               Estimate Std. Error z value Pr(>|z|)    
(Intercept)     -0.7700     0.6979  -1.103  0.26986    
factor(X)2       1.6321     0.5604   2.912  0.00359 ** 
factor(X)3       0.5017     0.9261   0.542  0.58801    
factor(X)4       1.3946     0.5551   2.512  0.01199 *  
factor(Y)2       0.4796     0.6552   0.732  0.46414    
factor(Y)3       1.9493     0.4971   3.921 8.80e-05 ***
factor(Y)4     -19.3097  4988.8789  -0.004  0.99691    
factor(diag)1    3.8611     0.7297   5.291 1.22e-07 ***
factor(diag)2    0.6042     0.6900   0.876  0.38119    
factor(diag)3    1.9025     0.8367   2.274  0.02298 *  
factor(diag)4   20.9877  4988.8789   0.004  0.99664    
---
Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

(Dispersion parameter for poisson family taken to be 1)

    Null deviance: 190.398  on 15  degrees of freedom
Residual deviance:  13.178  on  5  degrees of freedom
AIC: 75.997

Number of Fisher Scoring iterations: 17

8.5.4 Quasi Independence and Odds Ratios Summarizing Agreement

\[\tau_{ab} = \mathrm{exp}(\delta_a + \delta_b).\]
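Plugging in the delta-hat values from the quasi-independence fit above (the `factor(diag)` estimates), the agreement odds ratio for any pair of categories follows directly. A hand computation for categories 1 and 2 (category 4 is omitted because its estimate is effectively infinite, due to the zero counts in its row and column):

```r
delta <- c(3.8611, 0.6042, 1.9025)   # finite diag estimates for categories 1-3
exp(delta[1] + delta[2])             # tau_12, about 87
```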

8.5.5 Kappa Summary Measure of Agreement

\[\kappa = \frac{\sum_i\pi_{ii}-\sum_i\pi_{i+}\pi_{+i}}{1-\sum_i\pi_{i+}\pi_{+i}}\]

library(psych)
dat <- matrix(Pathology$count, ncol = 4, byrow = TRUE)
cohen.kappa(dat)
Call: cohen.kappa1(x = x, w = w, n.obs = n.obs, alpha = alpha, levels = levels)

Cohen Kappa and Weighted Kappa correlation coefficients and confidence boundaries 
                 lower estimate upper
unweighted kappa  0.38     0.49  0.60
weighted kappa    0.78     0.78  0.78

 Number of subjects = 118 
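The unweighted estimate can be verified directly from the Table 8.7 counts, applying the kappa formula above by hand:

```r
tab <- matrix(c(22, 2,  2,  0,
                 5, 7, 14,  0,
                 0, 2, 36,  0,
                 0, 1, 17, 10), ncol = 4, byrow = TRUE)
p  <- tab / sum(tab)
po <- sum(diag(p))                     # observed agreement, 75/118
pe <- sum(rowSums(p) * colSums(p))     # agreement expected under independence
(po - pe) / (1 - pe)                   # kappa, about 0.49
```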

8.6 Bradley-Terry Model for Paired Preferences *

`Table 8.8` <- tibble(Winner = c("Djokovic", "Federer", "Murray", "Nadal", 
                                 "Wawrinka"),
                      Djokovic = c("-", 6, 3, 2, 3), 
                      Federer = c(9, "-", 0 ,1 , 2),
                      Murray = c(14, 5, "-", 4, 2),
                      Nadal = c(9, 5, 2, "-", 3),
                      Wawrinka = c(4, 7, 2, 4, "-"))

theHeader <- data.frame(col_keys = c("Winner", "Blank", "Djokovic", "Federer",
                                     "Murray", "Nadal", "Wawrinka"),
                        line1 = c("Winner", "", rep("Loser", 5)),
                        line2 = c("Winner", "", "Djokovic", "Federer",
                                     "Murray", "Nadal", "Wawrinka"))

library(flextable)
library(dplyr)

library(officer)
big_border = fp_border(color="black", width = 2)

flextable(`Table 8.8`, col_keys = theHeader$col_keys) %>% 
  set_header_df(
    mapping = theHeader,
    key = "col_keys"
  ) %>% 
  theme_booktabs() %>% 
  merge_v(part = "header") %>% 
  merge_h(part = "header") %>% 
  align(align = "center", part = "all") %>% 
  align(j="Winner", align= "left", part = "all") %>% 
  autofit() %>% 
  empty_blanks() %>% 
  fix_border_issues()  %>%
  set_caption(caption = "Results of 2014-2018 matches for men's tennis players.") %>% 
  #set_table_properties(width = .75, layout = "autofit") %>% 
  hline_top(part="header", border = big_border) %>% 
  hline_bottom(part="body", border = big_border)  
Table 9: Results of 2014-2018 matches for men's tennis players.

                         Loser
Winner      Djokovic  Federer  Murray  Nadal  Wawrinka
Djokovic           -        9      14      9         4
Federer            6        -       5      5         7
Murray             3        0       -      2         2
Nadal              2        1       4      -         4
Wawrinka           3        2       2      3         -

8.6.1 The Bradley-Terry Model of Quasi-Symmetry

\[\mathrm{logit}(\Pi_{ij}) = \mathrm{log}(\Pi_{ij}/\Pi_{ji}) = \beta_i - \beta_j.\]

\[\hat\Pi_{ij} = \mathrm{exp}(\hat\beta_i - \hat\beta_j)/[1 + \mathrm{exp}(\hat\beta_i - \hat\beta_j)].\]

8.6.2 Example: Ranking Men Tennis Players

Tennis <- read.table("http://users.stat.ufl.edu/~aa/cat/data/Tennis.dat",
                        header = TRUE, stringsAsFactors = TRUE)
# Djokovic won 9 lost 6 vs Fed
# Federer always beat Murray

fit <- 
  glm(nij/(nij + nji) ~ -1 + Djokovic + Federer + Murray + Nadal + Wawrinka, 
      family = binomial, weights = nij + nji, data = Tennis)
summary(fit) 

Call:
glm(formula = nij/(nij + nji) ~ -1 + Djokovic + Federer + Murray + 
    Nadal + Wawrinka, family = binomial, data = Tennis, weights = nij + 
    nji)

Deviance Residuals: 
    Min       1Q   Median       3Q      Max  
-1.1225  -0.1262   0.3715   0.5386   1.2928  

Coefficients: (1 not defined because of singularities)
         Estimate Std. Error z value Pr(>|z|)  
Djokovic  1.17612    0.49952   2.354   0.0185 *
Federer   1.13578    0.51095   2.223   0.0262 *
Murray   -0.56852    0.56833  -1.000   0.3172  
Nadal    -0.06185    0.51487  -0.120   0.9044  
Wawrinka       NA         NA      NA       NA  
---
Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

(Dispersion parameter for binomial family taken to be 1)

    Null deviance: 26.8960  on 10  degrees of freedom
Residual deviance:  4.3958  on  6  degrees of freedom
AIC: 34.041

Number of Fisher Scoring iterations: 4

\[\hat\Pi_{ij} = \frac{\mathrm{exp}(\hat\beta_i - \hat\beta_j)}{1 + \mathrm{exp}(\hat\beta_i - \hat\beta_j)} = \frac{\mathrm{exp}(1.136 - 1.176)}{1 + \mathrm{exp}(1.136 - 1.176)} = 0.49\]
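The 0.49 in this display can be computed with `plogis()` from the printed estimates (Wawrinka is the reference category, with coefficient 0); any other matchup works the same way.

```r
beta <- c(Djokovic = 1.17612, Federer = 1.13578, Murray = -0.56852,
          Nadal = -0.06185, Wawrinka = 0)
plogis(beta["Federer"] - beta["Djokovic"])   # P(Federer beats Djokovic), about 0.49
plogis(beta["Djokovic"] - beta["Murray"])    # P(Djokovic beats Murray), about 0.85
```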