nycflights13
flights
data
library(nycflights13)
library(dplyr)
library(ggplot2)
library(stringr)
library(infer)
# Fix the RNG seed so the 500-row sample (and everything derived from it)
# is reproducible.
set.seed(2017)

# Working data set: 500 randomly chosen complete-case flights, with two
# derived two-level categorical columns (season, day_hour), reduced to the
# columns used by the examples.
fli_small <- flights %>%
  na.omit() %>%
  sample_n(size = 500) %>%
  mutate(
    # October-March is labelled "winter", April-September "summer".
    season = case_when(
      month %in% c(10:12, 1:3) ~ "winter",
      month %in% 4:9 ~ "summer"
    ),
    # Hours 1-12 are "morning", hours 13-24 are "not morning".
    day_hour = case_when(
      between(hour, 1, 12) ~ "morning",
      between(hour, 13, 24) ~ "not morning"
    )
  ) %>%
  select(
    arr_delay, dep_delay, season,
    day_hour, origin, carrier
  )
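Variables kept in fli_small:
- Two numeric variables: arr_delay, dep_delay
- Two two-level categorical variables: season ("winter", "summer") and day_hour ("morning", "not morning")
- One three-level categorical variable: origin ("EWR", "JFK", "LGA")
- One further categorical variable: carrier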
Observed stat
stat |
---|
11.49 |
# Null distribution of the sample mean of dep_delay under H0: mu = 10.
# `type` is deliberately not passed to generate(); infer picks it itself.
null_distn <- fli_small %>%
  specify(response = dep_delay) %>%
  hypothesize(null = "point", mu = 10) %>%
  generate(reps = 1000) %>%
  calculate(stat = "mean")
## Setting `type = "bootstrap"` in `generate()`.
p_value |
---|
0.356 |
Observed stat
stat |
---|
6.827 |
# Null distribution of the one-sample t statistic for dep_delay
# under H0: mu = 8.
null_distn <- fli_small %>%
  specify(response = dep_delay) %>%
  hypothesize(null = "point", mu = 8) %>%
  generate(reps = 1000) %>%
  calculate(stat = "t")
## Setting `type = "bootstrap"` in `generate()`.
p_value |
---|
0 |
Observed stat
stat |
---|
-2 |
# Null distribution of the sample median of dep_delay under H0: median = -1.
null_distn <- fli_small %>%
  specify(response = dep_delay) %>%
  hypothesize(null = "point", med = -1) %>%
  generate(reps = 1000) %>%
  calculate(stat = "median")
## Setting `type = "bootstrap"` in `generate()`.
p_value |
---|
0.018 |
Observed stat
# Observed proportion of "morning" flights.
p_hat <- fli_small %>%
  specify(response = day_hour, success = "morning") %>%
  calculate(stat = "prop")
p_hat
stat |
---|
0.452 |
# Null distribution of the proportion of "morning" flights under H0: p = 0.5.
null_distn <- fli_small %>%
  specify(response = day_hour, success = "morning") %>%
  hypothesize(null = "point", p = 0.5) %>%
  generate(reps = 1000) %>%
  calculate(stat = "prop")
## Setting `type = "simulate"` in `generate()`.
p_value |
---|
0.036 |
Logical variables will be coerced to factors:
# Same one-proportion test, but with a logical response column; the success
# level is therefore given as the string "TRUE".
null_distn <- fli_small %>%
  mutate(day_hour_logical = day_hour == "morning") %>%
  specify(response = day_hour_logical, success = "TRUE") %>%
  hypothesize(null = "point", p = 0.5) %>%
  generate(reps = 1000) %>%
  calculate(stat = "prop")
## Setting `type = "simulate"` in `generate()`.
Not yet implemented.
Observed stat
# Observed difference in the proportion of "morning" flights,
# winter minus summer.
d_hat <- fli_small %>%
  specify(formula = day_hour ~ season, success = "morning") %>%
  calculate(stat = "diff in props", order = c("winter", "summer"))
d_hat
stat |
---|
0.0044 |
# Null distribution of the difference in proportions (winter - summer)
# under independence of day_hour and season.
null_distn <- fli_small %>%
  specify(formula = day_hour ~ season, success = "morning") %>%
  hypothesize(null = "independence") %>%
  generate(reps = 1000) %>%
  calculate(stat = "diff in props", order = c("winter", "summer"))
## Setting `type = "permute"` in `generate()`.
p_value |
---|
0.954 |
Standardized observed stat
# Observed z statistic for the difference in proportions (winter - summer).
z_hat <- fli_small %>%
  specify(formula = day_hour ~ season, success = "morning") %>%
  calculate(stat = "z", order = c("winter", "summer"))
z_hat
stat |
---|
0.0985 |
# Null distribution of the z statistic (winter - summer) under independence
# of day_hour and season.
null_distn <- fli_small %>%
  specify(formula = day_hour ~ season, success = "morning") %>%
  hypothesize(null = "independence") %>%
  generate(reps = 1000) %>%
  calculate(stat = "z", order = c("winter", "summer"))
## Setting `type = "permute"` in `generate()`.
p_value |
---|
0.95 |
Note the similarities in this plot and the previous one.
Observed stat
Note the need to add in the hypothesized values here to compute the observed statistic.
# Observed chi-square goodness-of-fit statistic for origin. The hypothesized
# proportions have to be supplied here as well, since the statistic is
# computed against them.
Chisq_hat <- fli_small %>%
  specify(response = origin) %>%
  hypothesize(
    null = "point",
    p = c("EWR" = 0.33, "JFK" = 0.33, "LGA" = 0.34)
  ) %>%
  calculate(stat = "Chisq")
Chisq_hat
stat |
---|
7.009 |
# Simulated null distribution of the chi-square goodness-of-fit statistic
# for origin under the hypothesized airport proportions.
null_distn <- fli_small %>%
  specify(response = origin) %>%
  hypothesize(
    null = "point",
    p = c("EWR" = 0.33, "JFK" = 0.33, "LGA" = 0.34)
  ) %>%
  generate(reps = 1000, type = "simulate") %>%
  calculate(stat = "Chisq")

# Plot the null distribution and shade the right tail beyond the observed
# statistic.
visualize(null_distn) +
  shade_p_value(obs_stat = Chisq_hat, direction = "greater")
p_value |
---|
0.037 |
Observed stat
stat |
---|
0.5284 |
# Chi-square test of independence between day_hour and origin.
#
# Recompute the observed statistic for THIS test first. Without this step,
# Chisq_hat still holds the goodness-of-fit statistic computed for origin
# alone, and the shaded p-value region below would be anchored at the
# statistic of a different test.
Chisq_hat <- fli_small %>%
  specify(formula = day_hour ~ origin) %>%
  calculate(stat = "Chisq")

# Permutation null distribution of the chi-square statistic under
# independence of day_hour and origin.
null_distn <- fli_small %>%
  specify(formula = day_hour ~ origin) %>%
  hypothesize(null = "independence") %>%
  generate(reps = 1000, type = "permute") %>%
  calculate(stat = "Chisq")

# Plot the null distribution and shade the right tail beyond the observed
# statistic.
visualize(null_distn) +
  shade_p_value(obs_stat = Chisq_hat, direction = "greater")
p_value |
---|
0.77 |
Observed stat
# Observed difference in mean dep_delay, summer minus winter.
d_hat <- fli_small %>%
  specify(formula = dep_delay ~ season) %>%
  calculate(stat = "diff in means", order = c("summer", "winter"))
d_hat
stat |
---|
3 |
# Permutation null distribution of the difference in mean dep_delay
# (summer - winter) under independence of dep_delay and season.
null_distn <- fli_small %>%
  specify(formula = dep_delay ~ season) %>%
  hypothesize(null = "independence") %>%
  generate(reps = 1000, type = "permute") %>%
  calculate(stat = "diff in means", order = c("summer", "winter"))

# Plot the null distribution with a two-sided shaded region at the
# observed difference.
visualize(null_distn) +
  shade_p_value(obs_stat = d_hat, direction = "two_sided")
p_value |
---|
0.338 |
Standardized observed stat
# Observed two-sample t statistic for dep_delay, summer versus winter.
t_hat <- fli_small %>%
  specify(formula = dep_delay ~ season) %>%
  calculate(stat = "t", order = c("summer", "winter"))
t_hat
stat |
---|
0.8909 |
# Permutation null distribution of the two-sample t statistic for dep_delay
# (summer versus winter) under independence of dep_delay and season.
null_distn <- fli_small %>%
  specify(formula = dep_delay ~ season) %>%
  hypothesize(null = "independence") %>%
  generate(reps = 1000, type = "permute") %>%
  calculate(stat = "t", order = c("summer", "winter"))

# Plot the null distribution with a two-sided shaded region at the
# observed t statistic.
visualize(null_distn) +
  shade_p_value(obs_stat = t_hat, direction = "two_sided")
p_value |
---|
0.4 |
Note the similarities in this plot and the previous one.
Observed stat
# Observed difference in median dep_delay, summer minus winter.
d_hat <- fli_small %>%
  specify(formula = dep_delay ~ season) %>%
  calculate(stat = "diff in medians", order = c("summer", "winter"))
d_hat
stat |
---|
1 |
# Permutation null distribution of the difference in median dep_delay
# (summer - winter) under independence of dep_delay and season.
null_distn <- fli_small %>%
  # Equivalent formula spelling: specify(dep_delay ~ season)
  specify(response = dep_delay, explanatory = season) %>%
  hypothesize(null = "independence") %>%
  generate(reps = 1000, type = "permute") %>%
  calculate(stat = "diff in medians", order = c("summer", "winter"))

# Plot the null distribution with a two-sided shaded region at the
# observed difference.
visualize(null_distn) +
  shade_p_value(obs_stat = d_hat, direction = "two_sided")
p_value |
---|
0.64 |
Observed stat
stat |
---|
0.6858 |
# ANOVA-style F test of arr_delay across origin.
#
# Observed F statistic. F_hat is required by shade_p_value() below and is
# not defined anywhere else, so it is computed here.
F_hat <- fli_small %>%
  specify(formula = arr_delay ~ origin) %>%
  calculate(stat = "F")

# Permutation null distribution of the F statistic under independence of
# arr_delay and origin.
null_distn <- fli_small %>%
  specify(formula = arr_delay ~ origin) %>%
  hypothesize(null = "independence") %>%
  generate(reps = 1000, type = "permute") %>%
  calculate(stat = "F")

# Plot the null distribution and shade the right tail beyond the observed
# statistic.
visualize(null_distn) +
  shade_p_value(obs_stat = F_hat, direction = "greater")
p_value |
---|
0.529 |
Observed stat
stat |
---|
0.9916 |
# Test for the slope of arr_delay regressed on dep_delay.
#
# Observed slope. slope_hat is required by shade_p_value() below and is not
# defined anywhere else, so it is computed here.
slope_hat <- fli_small %>%
  specify(formula = arr_delay ~ dep_delay) %>%
  calculate(stat = "slope")

# Permutation null distribution of the slope under independence of
# arr_delay and dep_delay.
null_distn <- fli_small %>%
  specify(formula = arr_delay ~ dep_delay) %>%
  hypothesize(null = "independence") %>%
  generate(reps = 1000, type = "permute") %>%
  calculate(stat = "slope")

# Plot the null distribution with a two-sided shaded region at the
# observed slope.
visualize(null_distn) +
  shade_p_value(obs_stat = slope_hat, direction = "two_sided")
p_value |
---|
0 |
Observed stat
# Observed correlation between arr_delay and dep_delay.
correlation_hat <- fli_small %>%
  specify(formula = arr_delay ~ dep_delay) %>%
  calculate(stat = "correlation")
correlation_hat
stat |
---|
0.8951 |
# Permutation null distribution of the correlation between arr_delay and
# dep_delay under independence.
null_distn <- fli_small %>%
  specify(formula = arr_delay ~ dep_delay) %>%
  hypothesize(null = "independence") %>%
  generate(reps = 1000, type = "permute") %>%
  calculate(stat = "correlation")

# Plot the null distribution with a two-sided shaded region at the
# observed correlation.
visualize(null_distn) +
  shade_p_value(obs_stat = correlation_hat, direction = "two_sided")
p_value |
---|
0 |
Not currently implemented since \(t\) could refer to standardized slope or standardized correlation.
Point estimate
stat |
---|
6.154 |
# Bootstrap distribution of the mean arr_delay.
boot <- fli_small %>%
  specify(response = arr_delay) %>%
  generate(reps = 1000, type = "bootstrap") %>%
  calculate(stat = "mean")

# Confidence interval from the bootstrap distribution, using get_ci()'s
# default settings.
percentile_ci <- get_ci(boot)
percentile_ci
2.5% | 97.5% |
---|---|
2.606 | 9.602 |
lower | upper |
---|---|
2.609 | 9.699 |
Point estimate
stat |
---|
3.3 |
# Bootstrap distribution of the one-sample t statistic for arr_delay.
boot <- fli_small %>%
  specify(response = arr_delay) %>%
  generate(reps = 1000, type = "bootstrap") %>%
  calculate(stat = "t")

# Confidence interval from the bootstrap distribution, using get_ci()'s
# default settings.
percentile_ci <- get_ci(boot)
percentile_ci
2.5% | 97.5% |
---|---|
1.625 | 4.879 |
lower | upper |
---|---|
1.7 | 4.9 |
Point estimate
# Point estimate: proportion of "morning" flights.
p_hat <- fli_small %>%
  specify(response = day_hour, success = "morning") %>%
  calculate(stat = "prop")
p_hat
stat |
---|
0.452 |
# Bootstrap distribution of the proportion of "morning" flights.
boot <- fli_small %>%
  specify(response = day_hour, success = "morning") %>%
  generate(reps = 1000, type = "bootstrap") %>%
  calculate(stat = "prop")

# Confidence interval from the bootstrap distribution, using get_ci()'s
# default settings.
percentile_ci <- get_ci(boot)
percentile_ci
2.5% | 97.5% |
---|---|
0.406 | 0.496 |
lower | upper |
---|---|
0.4079 | 0.4961 |
Not yet implemented.
Point estimate
# Point estimate: difference in mean arr_delay, summer minus winter.
d_hat <- fli_small %>%
  specify(formula = arr_delay ~ season) %>%
  calculate(stat = "diff in means", order = c("summer", "winter"))
d_hat
stat |
---|
5.629 |
# Bootstrap distribution of the difference in mean arr_delay
# (summer - winter).
boot <- fli_small %>%
  specify(formula = arr_delay ~ season) %>%
  generate(reps = 1000, type = "bootstrap") %>%
  calculate(stat = "diff in means", order = c("summer", "winter"))

# Confidence interval from the bootstrap distribution, using get_ci()'s
# default settings.
percentile_ci <- get_ci(boot)
percentile_ci
2.5% | 97.5% |
---|---|
-2.025 | 12.54 |
lower | upper |
---|---|
-1.605 | 12.86 |
Standardized point estimate
# Standardized point estimate: two-sample t statistic for arr_delay,
# summer versus winter.
t_hat <- fli_small %>%
  specify(formula = arr_delay ~ season) %>%
  calculate(stat = "t", order = c("summer", "winter"))
t_hat
stat |
---|
1.511 |
# Bootstrap distribution of the two-sample t statistic for arr_delay
# (summer versus winter).
boot <- fli_small %>%
  specify(formula = arr_delay ~ season) %>%
  generate(reps = 1000, type = "bootstrap") %>%
  calculate(stat = "t", order = c("summer", "winter"))

# Confidence interval from the bootstrap distribution, using get_ci()'s
# default settings.
percentile_ci <- get_ci(boot)
percentile_ci
2.5% | 97.5% |
---|---|
-0.3589 | 3.736 |
lower | upper |
---|---|
-0.5783 | 3.601 |
Point estimate
# Point estimate: difference in the proportion of "morning" flights,
# summer minus winter.
d_hat <- fli_small %>%
  specify(formula = day_hour ~ season, success = "morning") %>%
  calculate(stat = "diff in props", order = c("summer", "winter"))
d_hat
stat |
---|
-0.0044 |
# Bootstrap distribution of the difference in the proportion of "morning"
# flights (summer - winter).
boot <- fli_small %>%
  specify(formula = day_hour ~ season, success = "morning") %>%
  generate(reps = 1000, type = "bootstrap") %>%
  calculate(stat = "diff in props", order = c("summer", "winter"))

# Confidence interval from the bootstrap distribution, using get_ci()'s
# default settings.
percentile_ci <- get_ci(boot)
percentile_ci
2.5% | 97.5% |
---|---|
-0.0957 | 0.0818 |
lower | upper |
---|---|
-0.0914 | 0.0826 |
Standardized point estimate
# Standardized point estimate: z statistic for the difference in the
# proportion of "morning" flights, summer versus winter.
z_hat <- fli_small %>%
  specify(formula = day_hour ~ season, success = "morning") %>%
  calculate(stat = "z", order = c("summer", "winter"))
z_hat
stat |
---|
-0.0985 |
# Bootstrap distribution of the z statistic for the difference in the
# proportion of "morning" flights (summer versus winter).
boot <- fli_small %>%
  specify(formula = day_hour ~ season, success = "morning") %>%
  generate(reps = 1000, type = "bootstrap") %>%
  calculate(stat = "z", order = c("summer", "winter"))

# Confidence interval from the bootstrap distribution, using get_ci()'s
# default settings.
percentile_ci <- get_ci(boot)
percentile_ci
2.5% | 97.5% |
---|---|
-1.962 | 1.788 |
lower | upper |
---|---|
-2.042 | 1.845 |
Point estimate
stat |
---|
0.9916 |
# Bootstrap distribution of the slope of arr_delay on dep_delay.
boot <- fli_small %>%
  specify(formula = arr_delay ~ dep_delay) %>%
  generate(reps = 1000, type = "bootstrap") %>%
  calculate(stat = "slope")

# Confidence interval from the bootstrap distribution, using get_ci()'s
# default settings.
percentile_ci <- get_ci(boot)
percentile_ci
2.5% | 97.5% |
---|---|
0.9463 | 1.032 |
lower | upper |
---|---|
0.9468 | 1.036 |
Point estimate
# Point estimate: correlation between arr_delay and dep_delay.
correlation_hat <- fli_small %>%
  specify(formula = arr_delay ~ dep_delay) %>%
  calculate(stat = "correlation")
correlation_hat
stat |
---|
0.8951 |
# Bootstrap distribution of the correlation between arr_delay and dep_delay.
boot <- fli_small %>%
  specify(formula = arr_delay ~ dep_delay) %>%
  generate(reps = 1000, type = "bootstrap") %>%
  calculate(stat = "correlation")

# Confidence interval from the bootstrap distribution, using get_ci()'s
# default settings.
percentile_ci <- get_ci(boot)
percentile_ci
2.5% | 97.5% |
---|---|
0.827 | 0.9332 |
lower | upper |
---|---|
0.8418 | 0.9485 |
Not currently implemented since \(t\) could refer to standardized slope or standardized correlation.