10: Bootstrapping and Confidence Intervals

Based on Chapter 8 of ModernDive. Code for Quiz 12.

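Load the R packages we will use. The code below calls functions from dplyr and ggplot2 (both part of the tidyverse) and from infer, and it uses the `congress_age` data set (from the fivethirtyeight package).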

What is the average age of members who have served in Congress?

# Fix the random seed so the same rows are drawn every time this is run.
set.seed(123)

# Draw a sample of 100 rows from congress_age, then print it.
congress_age_100 <- rep_sample_n(congress_age, size = 100)
congress_age_100
# A tibble: 100 x 14
# Groups:   replicate [1]
   replicate congress chamber bioguide firstname middlename lastname 
       <int>    <int> <chr>   <chr>    <chr>     <chr>      <chr>    
 1         1       85 house   F000209  Daniel    John       Flood    
 2         1       83 house   R000187  George    Milton     Rhodes   
 3         1       86 house   C001011  Laurence  NA         Curtis   
 4         1      101 house   M000842  Guy       Victor     Molinari 
 5         1       88 house   G000504  James     Russell    Grover   
 6         1       92 house   A000189  Glenn     Malcolm    Anderson 
 7         1      109 house   M000388  James     O.         McCrery  
 8         1       84 senate  G000267  Barry     Morris     Goldwater
 9         1      102 senate  N000102  Don       NA         Nickles  
10         1       96 house   H000074  Tony      P.         Hall     
# ... with 90 more rows, and 7 more variables: suffix <chr>,
#   birthday <date>, state <chr>, party <chr>, incumbent <lgl>,
#   termstart <date>, age <dbl>

Construct the confidence interval

1. Use specify to indicate the variable from congress_age_100 that you are interested in

# Declare `age` as the response variable of interest.
specify(congress_age_100, response = age)
Response: age (numeric)
# A tibble: 100 x 1
     age
   <dbl>
 1  53.1
 2  54.9
 3  65.3
 4  60.1
 5  43.8
 6  57.9
 7  55.3
 8  46  
 9  42.1
10  37  
# ... with 90 more rows

2. Generate 1000 bootstrap replicates of your sample of 100

# Resample the specified variable 1000 times using the bootstrap.
specify(congress_age_100, response = age) %>%
  generate(
    reps = 1000,
    type = "bootstrap"
  )
Response: age (numeric)
# A tibble: 100,000 x 2
# Groups:   replicate [1,000]
   replicate   age
       <int> <dbl>
 1         1  42.1
 2         1  71.2
 3         1  45.6
 4         1  39.6
 5         1  56.8
 6         1  71.6
 7         1  60.5
 8         1  56.4
 9         1  43.3
10         1  53.1
# ... with 99,990 more rows

The output has 100,000 rows: 1,000 replicates of 100 rows each.

3. Calculate the mean for each replicate

# Bootstrap distribution of the sample mean: one mean of `age` for each of
# the 1000 bootstrap replicates.
bootstrap_distribution_mean_age <-
  specify(congress_age_100, response = age) %>%
  generate(
    reps = 1000,
    type = "bootstrap"
  ) %>%
  calculate(stat = "mean")

bootstrap_distribution_mean_age
# A tibble: 1,000 x 2
   replicate  stat
 *     <int> <dbl>
 1         1  53.6
 2         2  53.2
 3         3  52.8
 4         4  51.5
 5         5  53.0
 6         6  54.2
 7         7  52.0
 8         8  52.8
 9         9  53.8
10        10  52.4
# ... with 990 more rows

4. Visualize the bootstrap distribution

# Plot the bootstrap distribution of the mean age.
bootstrap_distribution_mean_age %>%
  visualize()

Calculate the 95% confidence interval using the percentile method

# 95% confidence interval taken from the percentiles of the bootstrap
# distribution.
congress_ci_percentile <- get_confidence_interval(
  bootstrap_distribution_mean_age,
  type = "percentile",
  level = 0.95
)
congress_ci_percentile
# A tibble: 1 x 2
  lower_ci upper_ci
     <dbl>    <dbl>
1     51.5     55.2
# Observed mean age in the sample of 100, extracted as a plain number.
obs_mean_age <-
  specify(congress_age_100, response = age) %>%
  calculate(stat = "mean") %>%
  pull()

obs_mean_age
[1] 53.36
# Bootstrap distribution with the confidence interval shaded and the
# observed sample mean marked by a pink vertical line.
# NOTE(review): ggplot2 >= 3.4.0 deprecates `size` for lines in favour of
# `linewidth` - confirm the installed version before switching.
visualize(bootstrap_distribution_mean_age) +
  shade_confidence_interval(endpoints = congress_ci_percentile) +
  geom_vline(
    xintercept = obs_mean_age,
    color = "hotpink",
    size = 1
  )

# Mean age over the full congress_age data set, extracted as a plain number.
pop_mean_age <- pull(
  summarize(congress_age, pop_mean = mean(age))
)

pop_mean_age
[1] 53.31373
# Same plot as before, plus a thicker purple line at the mean of the full
# congress_age data set so it can be compared with the shaded interval.
# NOTE(review): ggplot2 >= 3.4.0 deprecates `size` for lines in favour of
# `linewidth` - confirm the installed version before switching.
visualize(bootstrap_distribution_mean_age) +
  shade_confidence_interval(endpoints = congress_ci_percentile) +
  geom_vline(
    xintercept = obs_mean_age,
    color = "hotpink",
    size = 1
  ) +
  geom_vline(
    xintercept = pop_mean_age,
    color = "purple",
    size = 3
  )

Save the previous plot to preview.png and add it to the YAML header at the top

# No `plot` argument is given, so ggsave() writes the last plot displayed.
# The file goes into this post's folder, resolved from the project root.
ggsave(
  filename = "preview.png",
  path = here::here("_posts", "2021-05-08-bootstrapping")
)