11 Classification and Smoothing
11.1 Classification: Linear Discriminant Analysis
11.1.2 Example: Horseshoe Crab Satellites Revisited
Crabs <- read.table("http://users.stat.ufl.edu/~aa/cat/data/Crabs.dat",
                    header = TRUE, stringsAsFactors = TRUE)
library(MASS)
lda(y ~ width + color, data = Crabs)
Call:
lda(y ~ width + color, data = Crabs)
Prior probabilities of groups:
0 1
0.3583815 0.6416185
Group means:
width color
0 25.16935 2.725806
1 26.92973 2.279279
Coefficients of linear discriminants:
LD1
width 0.4290351
color -0.5517138
fit.lda <- lda(y ~ width + color, prior = c(0.5, 0.5), CV = TRUE, data = Crabs)
# if prior is not specified, lda() uses the sample proportions in the two categories
xtabs(~ Crabs$y + fit.lda$class) # using leave-one-out cross-validation (CV = TRUE)
fit.lda$class
Crabs$y 0 1
0 43 19
1 39 72
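The classification table above can be summarized numerically. A brief sketch (not from the book) computing overall accuracy, sensitivity, and specificity from the cross-validated predictions:
tab <- xtabs(~ Crabs$y + fit.lda$class)
sum(diag(tab)) / sum(tab)        # proportion correctly classified: (43 + 72)/173
tab["1", "1"] / sum(tab["1", ])  # sensitivity: P(yhat = 1 | y = 1) = 72/111
tab["0", "0"] / sum(tab["0", ])  # specificity: P(yhat = 0 | y = 0) = 43/62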
t1df <- as.data.frame.matrix(xtabs(~ Crabs$y + fit.lda$class)) %>%
  rename("t1_1" = `1`) %>%
  rename("t1_0" = `0`) %>%
  map_df(rev)
prop <- sum(Crabs$y) / nrow(Crabs)
prop
[1] 0.6416185
fit <- glm(y ~ width + factor(color), family = binomial, data = Crabs)
predicted <- as.numeric(fitted(fit) > 0.6416185) # predict y=1 when est. > sample proportion of y=1
t2df <- as.data.frame.matrix(xtabs(~ Crabs$y + predicted)) %>%
  rename("t2_1" = `1`) %>%
  rename("t2_0" = `0`) %>%
  map_df(rev) # purrr::map_df(rev) reverses the order of the data frame
Actual <- tibble(Actual = c("y = 1", "y = 0"))
Total <- tibble(Total = c(111, 62))
library(flextable)
suppressMessages(conflict_prefer("compose", "flextable"))
library(dplyr) # for bind_cols()
library(officer) # for fp_border()
# Make analysis table
`Table 11.1` <- bind_cols(Actual, t1df, t2df, Total)
# The header needs blank columns for the spacer columns in the displayed table.
# The wide column labels use a Unicode combining character for the hat in
# yhat. Those details are filled in later with the compose() function.
theHeader <- data.frame(
  col_keys = c("Actual", "blank", "t1_1", "t1_0",
               "blank2", "t2_1", "t2_0",
               "blank3", "Total"),
  line1 = c("Actual", "",
            rep("Discriminant Analysis", 2), "",
            rep("Logistic Regression", 2), "",
            "Total"),
  line2 = c("Actual", "", "t1_1", "t1_0", "", "t2_1", "t2_0", "", "Total"))
# Border lines
big_border <- fp_border(color = "black", width = 2)
# Make the table - compose uses Unicode character
# https://stackoverflow.com/questions/64088118/in-r-flextable-can-complex-symbols-appear-in-column-headings
flextable(`Table 11.1`, col_keys = theHeader$col_keys) %>%
set_header_df(mapping = theHeader, key = "col_keys") %>%
theme_booktabs() %>%
merge_v(part = "header") %>%
merge_h(part = "header") %>%
align(align = "center", part = "all") %>%
empty_blanks() %>%
fix_border_issues() %>%
set_caption(caption = "Classification tables for horseshoe crab data with width and factor color predictors. Logistic model does not match book.") %>%
set_table_properties(width = 1, layout = "autofit") %>%
hline_top(part="header", border = big_border) %>%
hline_bottom(part="body", border = big_border) %>%
# i = 2 refers to second row of column headings
compose(part = "header", i = 2, j = "t1_1",
value = as_paragraph("y\U0302 = 1")) %>%
compose(part = "header", i = 2, j = "t1_0",
value = as_paragraph("y\U0302 = 0")) %>%
compose(part = "header", i = 2, j = "t2_1",
value = as_paragraph("y\U0302 = 1")) %>%
compose(part = "header", i = 2, j = "t2_0",
value = as_paragraph("y\U0302 = 0"))
| Actual | Discriminant Analysis: ŷ = 1 | Discriminant Analysis: ŷ = 0 | Logistic Regression: ŷ = 1 | Logistic Regression: ŷ = 0 | Total |
|---|---|---|---|---|---|
| y = 1 | 72 | 39 | 75 | 36 | 111 |
| y = 0 | 19 | 43 | 19 | 43 | 62 |
11.2 Classification: Tree-Based Prediction
11.2.2 Example: A Classification Tree for Horseshoe Crab Mating
Crabs <- read.table("http://users.stat.ufl.edu/~aa/cat/data/Crabs.dat",
                    header = TRUE, stringsAsFactors = TRUE)
library(rpart)
fit <- rpart(y ~ color + width, method = "class", data = Crabs)
p.fit <- prune(fit, cp = 0.02)
library(rpart.plot)
rpart.plot(p.fit, extra = 1, digits = 4, box.palette = 0)
11.2.4 Pruning a Tree and Checking Prediction Accuracy
Crabs <- read.table("http://users.stat.ufl.edu/~aa/cat/data/Crabs.dat",
                    header = TRUE, stringsAsFactors = TRUE)
library(rpart)
# method = "class" for categorical y
fit <- rpart(y ~ color + width, method = "class", data = Crabs)
# plots error rate by cp = complexity parameter for pruning
plotcp(fit)
# select leftmost cp with mean error below horizontal line (1 SE above minimum)
p.fit <- prune(fit, cp = 0.056)
library(rpart.plot)
rpart.plot(p.fit, extra = 1, digits = 4, box.palette = 0)
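The one-standard-error rule can also be applied programmatically from the rpart cptable rather than read off the plotcp() display. A hedged sketch (cross-validation error is random, so the selected cp can differ from the 0.056 used above; p.fit.1se is a name chosen for illustration):
cpt <- fit$cptable
# threshold = minimum cross-validated error + its standard error
thresh <- min(cpt[, "xerror"]) + cpt[which.min(cpt[, "xerror"]), "xstd"]
# the first qualifying row has the largest cp, i.e., the simplest tree
best.cp <- cpt[cpt[, "xerror"] <= thresh, , drop = FALSE][1, "CP"]
p.fit.1se <- prune(fit, cp = best.cp)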
11.3 Cluster Analysis for Categorical Responses
11.3.1 Measuring Dissimilarity Between Observations
library(flextable)
suppressMessages(conflict_prefer("compose", "flextable"))
library(dplyr) # for bind_cols()
library(officer) # for fp_border()
# Make analysis table
`Table 11.2` <- bind_cols(`Observation 1` = c("1", "0"),
`1` = c("a", "c"),
`0` = c("b", "d"))
# The header needs a blank column for the spacer column in the displayed table.
theHeader <- data.frame(
  col_keys = c("Observation 1", "blank", "1", "0"),
  line1 = c("Observation 1", "",
            rep("Observation 2", 2)),
  line2 = c("Observation 1", "", "1", "0"))
# Border lines
big_border <- fp_border(color = "black", width = 2)
# Make the table
flextable(`Table 11.2`, col_keys = theHeader$col_keys) %>%
set_header_df(mapping = theHeader, key = "col_keys") %>%
theme_booktabs() %>%
merge_v(part = "header") %>%
merge_h(part = "header") %>%
align(align = "center", part = "all") %>%
empty_blanks() %>%
fix_border_issues() %>%
set_caption(caption = "Cross-classification of two observations on p binary response variables, where p = a + b + c + d") %>%
set_table_properties(width = .75, layout = "autofit") %>%
hline_top(part="header", border = big_border) %>%
hline_top(part="body", border = big_border) %>%
hline_bottom(part="body", border = big_border)
| Observation 1 | Observation 2: 1 | Observation 2: 0 |
|---|---|---|
| 1 | a | b |
| 0 | c | d |
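For p binary variables, the Manhattan distance between two observations equals b + c, the number of variables on which they differ. A small worked example (toy data, not from the book):
obs1 <- c(1, 1, 0, 0, 1)
obs2 <- c(1, 0, 0, 1, 1)
sum(obs1 != obs2)                              # b + c = 2 variables differ
dist(rbind(obs1, obs2), method = "manhattan")  # same value, 2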
11.3.3 Example: Clustering States on Presidential Elections
Elections <- read.table("http://users.stat.ufl.edu/~aa/cat/data/Elections2.dat",
                        header = TRUE, stringsAsFactors = TRUE)
names(Elections) <- c("Number", "State", seq(1980, 2016, by = 4))
# function to recode "1" to "D" and "0" to "R"
rd <- function(x){
  if_else(x == 1, "D", "R")
}
`Table 11.3` <- Elections %>%
  select(-"Number") %>%
  mutate(across(where(is.numeric), rd))
knitr::kable(`Table 11.3`)
| State | 1980 | 1984 | 1988 | 1992 | 1996 | 2000 | 2004 | 2008 | 2012 | 2016 |
|---|---|---|---|---|---|---|---|---|---|---|
| Arizona | R | R | R | R | D | R | R | R | R | R |
| California | R | R | R | D | D | D | D | D | D | D |
| Colorado | R | R | R | D | R | R | R | D | D | D |
| Florida | R | R | R | R | D | R | R | D | D | R |
| Illinois | R | R | R | D | D | D | D | D | D | D |
| Massachusetts | R | R | D | D | D | D | D | D | D | D |
| Minnesota | D | D | D | D | D | D | D | D | D | D |
| Missouri | R | R | R | D | D | R | R | R | R | R |
| NewMexico | R | R | R | D | D | D | R | D | D | D |
| NewYork | R | R | D | D | D | D | D | D | D | D |
| Ohio | R | R | R | D | D | R | R | D | D | R |
| Texas | R | R | R | R | R | R | R | R | R | R |
| Virginia | R | R | R | R | R | R | R | D | D | D |
| Wyoming | R | R | R | R | R | R | R | R | R | R |
Elections <- read.table("http://users.stat.ufl.edu/~aa/cat/data/Elections2.dat",
                        header = TRUE, stringsAsFactors = TRUE)
distances <- dist(Elections[, 3:12], method = "manhattan")
# manhattan measures dissimilarity by the number of election outcomes that differ
democlust <- hclust(distances, "average") # hierarchical clustering
plot(democlust, labels = Elections$state)
library(gplots)
Registered S3 method overwritten by 'gplots':
method from
reorder.factor gdata
heatmap.2(as.matrix(Elections[, 3:12]), labRow = Elections$state,
dendrogram = "row", Colv=FALSE)
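To recover explicit cluster memberships from the dendrogram, cutree() can cut it at a chosen number of clusters. A minimal sketch (two clusters is an illustrative choice):
clusters <- cutree(democlust, k = 2)
split(Elections$state, clusters)  # states grouped by cluster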
11.4 Smoothing: Generalized Additive Models
11.4.4 Example: Smoothing to Portray Probability of Kyphosis
Kyphosis <- read.table("http://users.stat.ufl.edu/~aa/cat/data/Kyphosis.dat",
                       header = TRUE, stringsAsFactors = TRUE)
Kyphosis %>%
  filter(row_number() %in% c(1, n()))
x y
1 12 1
2 206 0
library(gam)
gam.fit1 <- gam(y ~ s(x, df = 1), family = binomial, data = Kyphosis) # linear complexity
gam.fit2 <- gam(y ~ s(x, df = 2), family = binomial, data = Kyphosis) # quadratic
gam.fit3 <- gam(y ~ s(x, df = 3), family = binomial, data = Kyphosis) # cubic
anova(gam.fit1, gam.fit2, gam.fit3)
Analysis of Deviance Table
Model 1: y ~ s(x, df = 1)
Model 2: y ~ s(x, df = 2)
Model 3: y ~ s(x, df = 3)
Resid. Df Resid. Dev Df Deviance Pr(>Chi)
1 38 54.504
2 37 49.216 0.99988 5.2879 0.02147 *
3 36 48.231 1.00017 0.9852 0.32097
---
Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
plot(y ~ x, xlab = "Age", ylab = "Presence of Kyphosis", data = Kyphosis)
curve(predict(gam.fit2, data.frame(x = x), type = "resp"), add = TRUE)
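The fitted curve can also be evaluated at specific ages. A short sketch (the ages are chosen only for illustration):
# estimated P(kyphosis) at ages 20, 100, and 180 from the df = 2 fit
predict(gam.fit2, data.frame(x = c(20, 100, 180)), type = "response")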
11.5 Regularization for High-Dimensional Categorical Data (Large p)
11.5.1 Penalized-Likelihood Methods and \(L_q\)-Norm Smoothing
\[L^*(\beta) = L(\beta) - s(\beta), \qquad s(\beta) = \lambda\sum_{j=1}^p |\beta_j|^q,\]
where \(L(\beta)\) is the ordinary log-likelihood and \(\lambda \ge 0\) controls the strength of the penalty; \(q = 1\) gives the lasso penalty and \(q = 2\) the ridge penalty.
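A minimal sketch of the penalty term (s_penalty is a hypothetical helper written for this note, not a library function):
s_penalty <- function(beta, lambda, q) lambda * sum(abs(beta)^q)
beta <- c(0.5, -1.2, 0)
s_penalty(beta, lambda = 0.1, q = 1)  # lasso penalty: 0.1 * (0.5 + 1.2 + 0) = 0.17
s_penalty(beta, lambda = 0.1, q = 2)  # ridge penalty: 0.1 * (0.25 + 1.44 + 0) = 0.169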
11.5.3 Example: Predicting Opinion on Abortion with Student Survey
Students <- read.table("http://users.stat.ufl.edu/~aa/cat/data/Students.dat",
                       header = TRUE, stringsAsFactors = TRUE)
fit <- glm(abor ~ gender + age + hsgpa + cogpa + dhome + dres + tv + sport +
             news + aids + veg + ideol + relig + affirm,
           family = binomial, data = Students)
# news LR P-value = 0.0003
# ideol LR P-value = 0.0010
summary(fit)
Call:
glm(formula = abor ~ gender + age + hsgpa + cogpa + dhome + dres +
tv + sport + news + aids + veg + ideol + relig + affirm,
family = binomial, data = Students)
Deviance Residuals:
Min 1Q Median 3Q Max
-1.48649 0.00009 0.04055 0.20125 1.97028
Coefficients:
Estimate Std. Error z value Pr(>|z|)
(Intercept) 10.1014276 10.8914049 0.927 0.3537
gender 1.0021615 1.8655342 0.537 0.5911
age -0.0783427 0.1274839 -0.615 0.5389
hsgpa -3.7344482 2.8093160 -1.329 0.1837
cogpa 2.5112750 3.7399060 0.671 0.5019
dhome 0.0005574 0.0006789 0.821 0.4116
dres -0.3388230 0.2953830 -1.147 0.2514
tv 0.2659760 0.2531643 1.051 0.2934
sport 0.0272104 0.2551460 0.107 0.9151
news 1.3868778 0.6986772 1.985 0.0471 *
aids 0.3966764 0.5663706 0.700 0.4837
veg 4.3213535 3.8614639 1.119 0.2631
ideol -1.6377902 0.7892505 -2.075 0.0380 *
relig -0.7245665 0.7820724 -0.926 0.3542
affirm -2.7481550 2.6898822 -1.022 0.3069
---
Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
(Dispersion parameter for binomial family taken to be 1)
Null deviance: 62.719 on 59 degrees of freedom
Residual deviance: 21.368 on 45 degrees of freedom
AIC: 51.368
Number of Fisher Scoring iterations: 9
# LR test that all 14 betas = 0
1 - pchisq(62.719 - 21.368, 59-45)
[1] 0.0001566051
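The LR P-values quoted in the comments above come from likelihood-ratio tests of dropping one term at a time (with this near-separated fit they can differ sharply from the Wald P-values in summary()). A sketch that should reproduce them:
drop1(fit, test = "LRT")  # single-term deletion likelihood-ratio tests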
library(tidyverse)
Students <- read.table("http://users.stat.ufl.edu/~aa/cat/data/Students.dat",
                       header = TRUE, stringsAsFactors = TRUE)
# explanatory variables for the lasso
x <- with(Students, cbind(gender, age, hsgpa, cogpa, dhome, dres, tv, sport,
                          news, aids, veg, ideol, relig, affirm))
abor <- Students$abor
library(glmnet)
Loaded glmnet 4.1-6
# alpha = 1 is the lasso
fit.lasso <- glmnet(x, abor, alpha = 1, family = "binomial")
plot(fit.lasso)
set.seed(1)
est <- cv.glmnet(x, abor, alpha = 1, family = "binomial",
                 type.measure = "class") # , maxit = 1000000000)
Warning: from glmnet C++ code (error code -85); Convergence for 85th lambda
value not reached after maxit=100000 iterations; solutions for larger lambdas
returned
# lambda.min is a random variable that changes from run to run; it is 0.0661 in the book.
est$lambda.min # best lambda by 10-fold cross-validation
[1] 0.05000413
# also a random variable; it is 0.1267 in the book.
theEst <- est$lambda.1se # lambda suggested by the one-standard-error rule
theEst
[1] 0.07962071
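As a visual check (not shown in the book), plotting the cv.glmnet object displays the cross-validation error curve, with vertical lines marking lambda.min and lambda.1se:
plot(est)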
coef(glmnet(x, abor, alpha = 1, family = "binomial", lambda = 0.1267787))
15 x 1 sparse Matrix of class "dgCMatrix"
s0
(Intercept) 2.3671144
gender .
age .
hsgpa .
cogpa .
dhome .
dres .
tv .
sport .
news .
aids .
veg .
ideol -0.2599409
relig -0.1831115
affirm .
coef(glmnet(x, abor, alpha = 1, family = "binomial", lambda = theEst))
15 x 1 sparse Matrix of class "dgCMatrix"
s0
(Intercept) 2.66049060
gender .
age .
hsgpa .
cogpa .
dhome .
dres .
tv .
sport .
news 0.08634866
aids .
veg .
ideol -0.37767205
relig -0.32335761
affirm .
summary(glm(abor ~ ideol + relig + news, family = binomial, data = Students))
Call:
glm(formula = abor ~ ideol + relig + news, family = binomial,
data = Students)
Deviance Residuals:
Min 1Q Median 3Q Max
-2.29381 0.00082 0.10156 0.33555 1.65503
Coefficients:
Estimate Std. Error z value Pr(>|z|)
(Intercept) 3.5205 1.2513 2.814 0.00490 **
ideol -1.2515 0.4671 -2.679 0.00738 **
relig -0.7198 0.4982 -1.445 0.14854
news 1.1292 0.4574 2.469 0.01356 *
---
Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
(Dispersion parameter for binomial family taken to be 1)
Null deviance: 62.719 on 59 degrees of freedom
Residual deviance: 29.791 on 56 degrees of freedom
AIC: 37.791
Number of Fisher Scoring iterations: 7
11.5.6 Controlling the False Discovery Rate
pvals <- c(0.0001, 0.0004, 0.0019, 0.0095, 0.020, 0.028, 0.030, 0.034,
           0.046, 0.32, 0.43, 0.57, 0.65, 0.76, 1.00)
p.adjust(pvals, method = "bonferroni")
[1] 0.0015 0.0060 0.0285 0.1425 0.3000 0.4200 0.4500 0.5100 0.6900 1.0000
[11] 1.0000 1.0000 1.0000 1.0000 1.0000
p.adjust(pvals, method = "fdr")
[1] 0.00150000 0.00300000 0.00950000 0.03562500 0.06000000 0.06375000
[7] 0.06375000 0.06375000 0.07666667 0.48000000 0.58636364 0.71250000
[13] 0.75000000 0.81428571 1.00000000
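As a check on what method = "fdr" computes, a sketch of the Benjamini-Hochberg adjustment done by hand: multiply the ith smallest P-value by m/i, then enforce monotonicity working down from the largest P-value and cap at 1.
m <- length(pvals)
i <- seq_len(m)
bh <- pmin(1, rev(cummin(rev(sort(pvals) * m / i))))
all.equal(bh, sort(p.adjust(pvals, method = "fdr")))  # TRUE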