# See blog post for inspiration for this exercise.
# http://svmiller.com/blog/2020/03/normal-distribution-central-limit-theorem-inference/
library(tidyverse)
## ── Attaching packages ─────────────────────────────────────── tidyverse 1.3.1 ──
## ✓ ggplot2 3.3.5 ✓ purrr 0.3.4
## ✓ tibble 3.1.4 ✓ dplyr 1.0.7
## ✓ tidyr 1.1.4 ✓ stringr 1.4.0
## ✓ readr 2.0.2 ✓ forcats 0.5.1
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## x dplyr::filter() masks stats::filter()
## x dplyr::lag() masks stats::lag()
# library(stevemisc) # optional for this exercise
# RUN PREVIOUSLY:
# Population <- rbnorm(250000, mean =40.01578, sd = 40.24403,
# lowerbound = 0,
# upperbound = 100,
# round = TRUE,
# seed = 8675309) # Jenny, I got your number...
#
#
#
# Population %>% as_tibble() %>%
# mutate(uid = 1:n()) %>%
# rename(therm = value) %>%
# select(uid, therm) -> Population
Population <- readRDS(url("http://posc3410.svmiller.com/toy-data/population.rds"))
Population %>%
group_by(therm) %>%
tally() %>%
ggplot(.,aes(therm, n)) + geom_bar(stat="identity", fill="#619cff",color="black")
Population %>%
summarize(mean = mean(therm),
median = median(therm),
sd = sd(therm))
## # A tibble: 1 × 3
## mean median sd
## <dbl> <dbl> <dbl>
## 1 40.0 23 40.3
# Get five samples of size 10
# don't forget your seed!
set.seed(8675309) # Jenny, I got your number...
# Get the actual samples and conver them to a data frame.
Samp5_10 <- sapply(1:5, function(i){ x <-sample(Population$therm, 10, replace = TRUE) }) %>%
# coerce matrix to data frame, then tibble
as.data.frame %>% as_tibble() %>%
gather(sample, therm)
# Get sample mean of each of the five samples
Samp5_10 %>% group_by(sample) %>% summarize(meantherm = mean(therm))
## # A tibble: 5 × 2
## sample meantherm
## <chr> <dbl>
## 1 V1 27.4
## 2 V2 28.3
## 3 V3 12.9
## 4 V4 23.2
## 5 V5 61.2
# Get a million samples of size 10
# don't forget your seed!
set.seed(8675309) # Jenny, I got your number...
# Get 100,000 samples of size 10.
# Btw, this might take a while and it's fine that it does.
# There are faster sampling approaches in R, but they require more add-on packages.
Samp100k10 <- sapply(1:1e5, function(i){ x <- mean(sample(Population$therm, 10, replace = TRUE)) }) %>%
as_tibble() %>% mutate(iter = 1:n()) %>% select(iter, value) %>% rename(sampmean = value)
Samp100k10 %>%
ggplot(.,aes(sampmean)) +
geom_density(linetype="dashed") +
geom_vline(xintercept = mean(Population$therm), linetype="dotted") +
stat_function(fun=dnorm,
color="black", size=1.5,
args=list(mean=mean(Samp100k10$sampmean),
sd=sd(Samp100k10$sampmean))) +
labs(caption = "Density plot in dashed lines. Normal function with sample mean and standard deviation overlayed in thick, solid line.")
# Extra credit
sample_sizes <- c(10, 25, 100, 400, 2000, 4000, 10000)
Samps = list()
# don't forget your seed!
set.seed(8675309) # Jenny, I got your number...
for (j in sample_sizes) {
Samps[[paste0("Sample size: ", j)]] = data.frame(sampsize=j, samp=sapply(1:10, function(i){ x <- sample(Population$therm, j, replace = FALSE) }))
}
Samps %>%
map_df(as_tibble) %>%
gather(samp, value, samp.1:samp.10) -> Samps
Samps %>%
group_by(sampsize, samp) %>%
mutate(sampmean = mean(value),
se = sd(Population$therm)/sqrt((sampsize)),
lb95 = sampmean - 1.96*se,
ub95 = sampmean + 1.96*se) %>%
distinct(sampsize, samp, sampmean, se, lb95, ub95) %>%
ungroup() %>%
mutate(sampsize = fct_inorder(paste0("Sample Size: ", sampsize)),
samp = as.numeric(str_replace(samp, "samp.", ""))) %>%
ggplot(.,aes(as.factor(samp), sampmean, ymax=ub95, ymin=lb95)) +
facet_wrap(~sampsize) +
geom_hline(yintercept = mean(Population$therm), linetype="dashed") +
geom_pointrange() + coord_flip() +
labs(y = "Sample Mean (with 95% Intervals)",
x = "Sample Number [1:10]",
title = "Ten Sample Means of Varying Sizes (with 95% Intervals) from a Population",
caption = "Simulated data for Homework #3",
subtitle = "This is extra credit, so the students should interpret this plot for themselves. :P")