Announcement

Acknowledgement

Dr. Hua Zhou’s slides

Galápagos data

The gala data set records the number of plant species and the number of endemic species for 30 Galápagos islands. We are interested in modeling the number of species using a regression.

gala <- as_tibble(gala, rownames = "Island") %>%
  print(n = Inf)
## # A tibble: 30 x 8
##    Island       Species Endemics     Area Elevation Nearest Scruz Adjacent
##    <chr>          <dbl>    <dbl>    <dbl>     <dbl>   <dbl> <dbl>    <dbl>
##  1 Baltra            58       23   25.1         346     0.6   0.6    1.84 
##  2 Bartolome         31       21    1.24        109     0.6  26.3  572.   
##  3 Caldwell           3        3    0.21        114     2.8  58.7    0.78 
##  4 Champion          25        9    0.1          46     1.9  47.4    0.18 
##  5 Coamano            2        1    0.05         77     1.9   1.9  904.   
##  6 Daphne.Major      18       11    0.34        119     8     8      1.84 
##  7 Daphne.Minor      24        0    0.08         93     6    12      0.34 
##  8 Darwin            10        7    2.33        168    34.1 290.     2.85 
##  9 Eden               8        4    0.03         71     0.4   0.4   18.0  
## 10 Enderby            2        2    0.18        112     2.6  50.2    0.1  
## 11 Espanola          97       26   58.3         198     1.1  88.3    0.570
## 12 Fernandina        93       35  634.         1494     4.3  95.3 4669.   
## 13 Gardner1          58       17    0.570        49     1.1  93.1   58.3  
## 14 Gardner2           5        4    0.78        227     4.6  62.2    0.21 
## 15 Genovesa          40       19   17.4          76    47.4  92.2  129.   
## 16 Isabela          347       89 4669.         1707     0.7  28.1  634.   
## 17 Marchena          51       23  129.          343    29.1  85.9   59.6  
## 18 Onslow             2        2    0.01         25     3.3  45.9    0.1  
## 19 Pinta            104       37   59.6         777    29.1 120.   129.   
## 20 Pinzon           108       33   18.0         458    10.7  10.7    0.03 
## 21 Las.Plazas        12        9    0.23         94     0.5   0.6   25.1  
## 22 Rabida            70       30    4.89        367     4.4  24.4  572.   
## 23 SanCristobal     280       65  552.          716    45.2  66.6    0.570
## 24 SanSalvador      237       81  572.          906     0.2  19.8    4.89 
## 25 SantaCruz        444       95  904.          864     0.6   0      0.52 
## 26 SantaFe           62       28   24.1         259    16.5  16.5    0.52 
## 27 SantaMaria       285       73  171.          640     2.6  49.2    0.1  
## 28 Seymour           44       16    1.84        147     0.6   9.6   25.1  
## 29 Tortuga           16        8    1.24        186     6.8  50.9   18.0  
## 30 Wolf              21       12    2.85        253    34.1 255.     2.33

A histogram of the number of species

gala %>%
  ggplot() + 
  geom_histogram(mapping = aes(x = Species)) + 
  labs(title = "Species on Galápagos Islands")
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

Poisson regession

modp <- glm(Species ~ . - Endemics - Island, family = poisson, data = gala)
summary(modp)
## 
## Call:
## glm(formula = Species ~ . - Endemics - Island, family = poisson, 
##     data = gala)
## 
## Deviance Residuals: 
##     Min       1Q   Median       3Q      Max  
## -8.2752  -4.4966  -0.9443   1.9168  10.1849  
## 
## Coefficients:
##               Estimate Std. Error z value Pr(>|z|)    
## (Intercept)  3.155e+00  5.175e-02  60.963  < 2e-16 ***
## Area        -5.799e-04  2.627e-05 -22.074  < 2e-16 ***
## Elevation    3.541e-03  8.741e-05  40.507  < 2e-16 ***
## Nearest      8.826e-03  1.821e-03   4.846 1.26e-06 ***
## Scruz       -5.709e-03  6.256e-04  -9.126  < 2e-16 ***
## Adjacent    -6.630e-04  2.933e-05 -22.608  < 2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for poisson family taken to be 1)
## 
##     Null deviance: 3510.73  on 29  degrees of freedom
## Residual deviance:  716.85  on 24  degrees of freedom
## AIC: 889.68
## 
## Number of Fisher Scoring iterations: 5

Interpretation

library(gtsummary)
modp %>%
  tbl_regression(intercept = TRUE)
Characteristic log(IRR)1 95% CI1 p-value
(Intercept) 3.2 3.1, 3.3 <0.001
Area 0.00 0.00, 0.00 <0.001
Elevation 0.00 0.00, 0.00 <0.001
Nearest 0.01 0.01, 0.01 <0.001
Scruz -0.01 -0.01, 0.00 <0.001
Adjacent 0.00 0.00, 0.00 <0.001

1 IRR = Incidence Rate Ratio, CI = Confidence Interval

modp %>%
  tbl_regression(intercept = TRUE, exponentiate = TRUE)
Characteristic IRR1 95% CI1 p-value
(Intercept) 23.4 21.2, 25.9 <0.001
Area 1.00 1.00, 1.00 <0.001
Elevation 1.00 1.00, 1.00 <0.001
Nearest 1.01 1.01, 1.01 <0.001
Scruz 0.99 0.99, 1.00 <0.001
Adjacent 1.00 1.00, 1.00 <0.001

1 IRR = Incidence Rate Ratio, CI = Confidence Interval

Goodness of fit

# predmu = predict(modp, type = "response")
# sum((gala$Species - predmu)^2 / predmu)
(px2 <- sum(residuals(modp, type = "pearson")^2))
## [1] 761.9792

Diagnostics

gala %>%
  mutate(devres  = residuals(modp, type = "deviance"), 
         linpred = predict(modp, type = "link")) %>%
  ggplot + 
  geom_point(mapping = aes(x = linpred, y = devres)) + 
  labs(x = "Linear predictor", y = "Deviance residual")

gala %>%
  mutate(perres  = residuals(modp, type = "pearson"),
         linpred = predict(modp, type = "link")) %>%
  ggplot + 
  geom_point(mapping = aes(x = linpred, y = perres)) + 
  labs(x = "Linear predictor", y = "Pearson residual")

halfnorm(hatvalues(modp))

gala %>%
  slice(c(12, 16))
## # A tibble: 2 x 8
##   Island     Species Endemics  Area Elevation Nearest Scruz Adjacent
##   <chr>        <dbl>    <dbl> <dbl>     <dbl>   <dbl> <dbl>    <dbl>
## 1 Fernandina      93       35  634.      1494     4.3  95.3    4669.
## 2 Isabela        347       89 4669.      1707     0.7  28.1     634.
halfnorm(cooks.distance((modp)))

gala %>%
  mutate(predmu = predict(modp, type = "response"), 
         res    = Species - predmu) %>%
  ggplot() +
  geom_point(mapping = aes(x = predmu, y = res^2)) +
  geom_abline(intercept = 0, slope = 1) +
  labs(x = expression(hat(mu)), y = expression((y - hat(mu))^2)) + 
  scale_x_log10() + 
  scale_y_log10()

Overdispersion

(dp <- px2 / modp$df.residual)
## [1] 31.74914
summary(modp, dispersion = dp)
## 
## Call:
## glm(formula = Species ~ . - Endemics - Island, family = poisson, 
##     data = gala)
## 
## Deviance Residuals: 
##     Min       1Q   Median       3Q      Max  
## -8.2752  -4.4966  -0.9443   1.9168  10.1849  
## 
## Coefficients:
##               Estimate Std. Error z value Pr(>|z|)    
## (Intercept)  3.1548079  0.2915897  10.819  < 2e-16 ***
## Area        -0.0005799  0.0001480  -3.918 8.95e-05 ***
## Elevation    0.0035406  0.0004925   7.189 6.53e-13 ***
## Nearest      0.0088256  0.0102621   0.860    0.390    
## Scruz       -0.0057094  0.0035251  -1.620    0.105    
## Adjacent    -0.0006630  0.0001653  -4.012 6.01e-05 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for poisson family taken to be 31.74914)
## 
##     Null deviance: 3510.73  on 29  degrees of freedom
## Residual deviance:  716.85  on 24  degrees of freedom
## AIC: 889.68
## 
## Number of Fisher Scoring iterations: 5
drop1(modp, scale = dp, test = "F")

Quasi-Poisson

modd <- glm(Species ~ . - Endemics - Island, family = quasipoisson, data = gala)
summary(modd)
## 
## Call:
## glm(formula = Species ~ . - Endemics - Island, family = quasipoisson, 
##     data = gala)
## 
## Deviance Residuals: 
##     Min       1Q   Median       3Q      Max  
## -8.2752  -4.4966  -0.9443   1.9168  10.1849  
## 
## Coefficients:
##               Estimate Std. Error t value Pr(>|t|)    
## (Intercept)  3.1548079  0.2915901  10.819 1.03e-10 ***
## Area        -0.0005799  0.0001480  -3.918 0.000649 ***
## Elevation    0.0035406  0.0004925   7.189 1.98e-07 ***
## Nearest      0.0088256  0.0102622   0.860 0.398292    
## Scruz       -0.0057094  0.0035251  -1.620 0.118380    
## Adjacent    -0.0006630  0.0001653  -4.012 0.000511 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for quasipoisson family taken to be 31.74921)
## 
##     Null deviance: 3510.73  on 29  degrees of freedom
## Residual deviance:  716.85  on 24  degrees of freedom
## AIC: NA
## 
## Number of Fisher Scoring iterations: 5
drop1(modd, test = "F")
## Single term deletions
## 
## Model:
## Species ~ (Island + Endemics + Area + Elevation + Nearest + Scruz + 
##     Adjacent) - Endemics - Island
##           Df Deviance F value    Pr(>F)    
## <none>         716.85                      
## Area       1  1204.35 16.3217 0.0004762 ***
## Elevation  1  2389.57 56.0028 1.007e-07 ***
## Nearest    1   739.41  0.7555 0.3933572    
## Scruz      1   813.62  3.2400 0.0844448 .  
## Adjacent   1  1341.45 20.9119 0.0001230 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

Negative bionomial regression

library(MASS)
## 
## Attaching package: 'MASS'
## The following object is masked from 'package:gtsummary':
## 
##     select
## The following object is masked from 'package:dplyr':
## 
##     select
modn <- glm(Species ~ . - Endemics - Island, family = negative.binomial(1), data = gala)
summary(modn)
## 
## Call:
## glm(formula = Species ~ . - Endemics - Island, family = negative.binomial(1), 
##     data = gala)
## 
## Deviance Residuals: 
##     Min       1Q   Median       3Q      Max  
## -1.7056  -0.6807  -0.1151   0.3578   1.4398  
## 
## Coefficients:
##               Estimate Std. Error t value Pr(>|t|)    
## (Intercept)  2.9040718  0.2726617  10.651 1.41e-10 ***
## Area        -0.0006338  0.0003143  -2.017  0.05506 .  
## Elevation    0.0038575  0.0007561   5.102 3.21e-05 ***
## Nearest      0.0027834  0.0149036   0.187  0.85342    
## Scruz       -0.0018568  0.0030569  -0.607  0.54929    
## Adjacent    -0.0007617  0.0002492  -3.057  0.00542 ** 
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for Negative Binomial(1) family taken to be 0.7232273)
## 
##     Null deviance: 54.069  on 29  degrees of freedom
## Residual deviance: 20.741  on 24  degrees of freedom
## AIC: 305.75
## 
## Number of Fisher Scoring iterations: 16
glm.nb(Species ~ . - Endemics - Island, data = gala) %>%
  summary()
## 
## Call:
## glm.nb(formula = Species ~ . - Endemics - Island, data = gala, 
##     init.theta = 1.674602286, link = log)
## 
## Deviance Residuals: 
##     Min       1Q   Median       3Q      Max  
## -2.1344  -0.8597  -0.1476   0.4576   1.8416  
## 
## Coefficients:
##               Estimate Std. Error z value Pr(>|z|)    
## (Intercept)  2.9065247  0.2510344  11.578  < 2e-16 ***
## Area        -0.0006336  0.0002865  -2.211 0.027009 *  
## Elevation    0.0038551  0.0006916   5.574 2.49e-08 ***
## Nearest      0.0028264  0.0136618   0.207 0.836100    
## Scruz       -0.0018976  0.0028096  -0.675 0.499426    
## Adjacent    -0.0007605  0.0002278  -3.338 0.000842 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for Negative Binomial(1.6746) family taken to be 1)
## 
##     Null deviance: 88.431  on 29  degrees of freedom
## Residual deviance: 33.196  on 24  degrees of freedom
## AIC: 304.22
## 
## Number of Fisher Scoring iterations: 1
## 
## 
##               Theta:  1.675 
##           Std. Err.:  0.442 
## 
##  2 x log-likelihood:  -290.223

Zero-inflated count models

library(pscl)
## Classes and Methods for R developed in the
## Political Science Computational Laboratory
## Department of Political Science
## Stanford University
## Simon Jackman
## hurdle and zeroinfl functions by Achim Zeileis
(bioChemists <- as_tibble(bioChemists))
## # A tibble: 915 x 6
##      art fem   mar      kid5   phd  ment
##    <int> <fct> <fct>   <dbl> <dbl> <int>
##  1     0 Men   Married     0  2.52     7
##  2     0 Women Single      0  2.05     6
##  3     0 Women Single      0  3.75     6
##  4     0 Men   Married     1  1.18     3
##  5     0 Women Single      0  3.75    26
##  6     0 Women Married     2  3.59     2
##  7     0 Women Single      0  3.19     3
##  8     0 Men   Married     2  2.96     4
##  9     0 Men   Single      0  4.62     6
## 10     0 Women Married     0  1.25     0
## # … with 905 more rows
bioChemists %>%
  ggplot() + 
  geom_bar(mapping = aes(x = art, fill = fem)) + 
  labs(x = "Articles", y = "Count", title = "Publications by PhD Students")

bioChemists %>%
  ggplot() + 
  geom_bar(mapping = aes(x = art, fill = mar))

modp <- glm(art ~ ., family = poisson, data = bioChemists)
summary(modp)
## 
## Call:
## glm(formula = art ~ ., family = poisson, data = bioChemists)
## 
## Deviance Residuals: 
##     Min       1Q   Median       3Q      Max  
## -3.5672  -1.5398  -0.3660   0.5722   5.4467  
## 
## Coefficients:
##              Estimate Std. Error z value Pr(>|z|)    
## (Intercept)  0.304617   0.102981   2.958   0.0031 ** 
## femWomen    -0.224594   0.054613  -4.112 3.92e-05 ***
## marMarried   0.155243   0.061374   2.529   0.0114 *  
## kid5        -0.184883   0.040127  -4.607 4.08e-06 ***
## phd          0.012823   0.026397   0.486   0.6271    
## ment         0.025543   0.002006  12.733  < 2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for poisson family taken to be 1)
## 
##     Null deviance: 1817.4  on 914  degrees of freedom
## Residual deviance: 1634.4  on 909  degrees of freedom
## AIC: 3314.1
## 
## Number of Fisher Scoring iterations: 5
tibble(ocount = table(bioChemists$art)[1:8],    # observed count for 0:7
       pcount = colSums(predprob(modp)[, 1:8]), # expected count for 0:7
       count  = 0:7) %>%
  ggplot(mapping = aes(x = pcount, y = ocount, label = count)) + 
  geom_point() + 
  geom_text(nudge_y = 8) +
  labs(x = "Predicted", y = "Observed")
## Don't know how to automatically pick scale for object of type table. Defaulting to continuous.

library(gtsummary)
modh <- hurdle(art ~ ., data = bioChemists)
summary(modh)
## 
## Call:
## hurdle(formula = art ~ ., data = bioChemists)
## 
## Pearson residuals:
##     Min      1Q  Median      3Q     Max 
## -2.4105 -0.8913 -0.2817  0.5530  7.0324 
## 
## Count model coefficients (truncated poisson with log link):
##             Estimate Std. Error z value Pr(>|z|)    
## (Intercept)  0.67114    0.12246   5.481 4.24e-08 ***
## femWomen    -0.22858    0.06522  -3.505 0.000457 ***
## marMarried   0.09649    0.07283   1.325 0.185209    
## kid5        -0.14219    0.04845  -2.934 0.003341 ** 
## phd         -0.01273    0.03130  -0.407 0.684343    
## ment         0.01875    0.00228   8.222  < 2e-16 ***
## Zero hurdle model coefficients (binomial with logit link):
##             Estimate Std. Error z value Pr(>|z|)    
## (Intercept)  0.23680    0.29552   0.801   0.4230    
## femWomen    -0.25115    0.15911  -1.579   0.1144    
## marMarried   0.32623    0.18082   1.804   0.0712 .  
## kid5        -0.28525    0.11113  -2.567   0.0103 *  
## phd          0.02222    0.07956   0.279   0.7800    
## ment         0.08012    0.01302   6.155 7.52e-10 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1 
## 
## Number of iterations in BFGS optimization: 12 
## Log-likelihood: -1605 on 12 Df
modz <- zeroinfl(art ~ ., data = bioChemists)
summary(modz)
## 
## Call:
## zeroinfl(formula = art ~ ., data = bioChemists)
## 
## Pearson residuals:
##     Min      1Q  Median      3Q     Max 
## -2.3253 -0.8652 -0.2826  0.5404  7.2976 
## 
## Count model coefficients (poisson with log link):
##              Estimate Std. Error z value Pr(>|z|)    
## (Intercept)  0.640839   0.121307   5.283 1.27e-07 ***
## femWomen    -0.209144   0.063405  -3.299 0.000972 ***
## marMarried   0.103750   0.071111   1.459 0.144567    
## kid5        -0.143320   0.047429  -3.022 0.002513 ** 
## phd         -0.006166   0.031008  -0.199 0.842376    
## ment         0.018098   0.002294   7.888 3.07e-15 ***
## 
## Zero-inflation model coefficients (binomial with logit link):
##              Estimate Std. Error z value Pr(>|z|)   
## (Intercept) -0.577060   0.509386  -1.133  0.25728   
## femWomen     0.109752   0.280082   0.392  0.69517   
## marMarried  -0.354018   0.317611  -1.115  0.26501   
## kid5         0.217095   0.196483   1.105  0.26920   
## phd          0.001275   0.145263   0.009  0.99300   
## ment        -0.134114   0.045243  -2.964  0.00303 **
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1 
## 
## Number of iterations in BFGS optimization: 19 
## Log-likelihood: -1605 on 12 Df

The coefficients for the binary response part take opposite signs from those from the hurdle model, because hurdle model models the probability of observing a nonzero and ZIP models the probability of always observing zero.

tibble(fitted_h = fitted(modh),
       fitted_z = fitted(modz)) %>%
  ggplot() + 
  geom_point(mapping = aes(x = fitted_h, y = fitted_z)) + 
  geom_abline(intercept = 0, slope = 1) +
  labs(x = "Hurdle predictions", y = "ZIP predictions")

modz2 <- zeroinfl(art ~ fem + kid5 + ment | ment, data = bioChemists)
summary(modz2)
## 
## Call:
## zeroinfl(formula = art ~ fem + kid5 + ment | ment, data = bioChemists)
## 
## Pearson residuals:
##     Min      1Q  Median      3Q     Max 
## -2.2802 -0.8807 -0.2718  0.5131  7.4788 
## 
## Count model coefficients (poisson with log link):
##              Estimate Std. Error z value Pr(>|z|)    
## (Intercept)  0.694517   0.053025  13.098  < 2e-16 ***
## femWomen    -0.233857   0.058400  -4.004 6.22e-05 ***
## kid5        -0.126516   0.039668  -3.189  0.00143 ** 
## ment         0.018004   0.002224   8.096 5.67e-16 ***
## 
## Zero-inflation model coefficients (binomial with logit link):
##             Estimate Std. Error z value Pr(>|z|)    
## (Intercept) -0.68488    0.20529  -3.336 0.000849 ***
## ment        -0.12680    0.03981  -3.185 0.001448 ** 
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1 
## 
## Number of iterations in BFGS optimization: 22 
## Log-likelihood: -1608 on 6 Df

The LRT yields a large p-value, indicating the smaller model is justifiable.

lrt <- 2 * (modz$loglik - modz2$loglik)
pchisq(lrt, 6, lower.tail = FALSE)
## [1] 0.4041153
exp(coef(modz2))
## count_(Intercept)    count_femWomen        count_kid5        count_ment 
##         2.0027411         0.7914748         0.8811604         1.0181669 
##  zero_(Intercept)         zero_ment 
##         0.5041522         0.8809081
tibble(fem = "Men", mar = "Single", kid5 = 0, ment = 6) %>%
  predict(modz2, newdata = ., type = "prob")
##           0         1       2         3          4          5          6
## 1 0.2775879 0.1939403 0.21636 0.1609142 0.08975799 0.04005363 0.01489462
##             7           8            9           10           11           12
## 1 0.004747556 0.001324094 0.0003282578 7.324092e-05 1.485593e-05 2.762214e-06
##             13           14           15           16          17          18
## 1 4.740812e-07 7.555503e-08 1.123857e-08 1.567219e-09 2.05693e-10 2.54968e-11
##             19
## 1 2.994131e-12

The zero part of model contributes 0.191 and the Poisson part contributes 0.087.

tibble(fem = "Men", mar = "Single", kid5 = 0, ment = 6) %>%
  predict(modz2, newdata = ., type = "zero")
##        1 
## 0.190666