library(tidyverse)
## ── Attaching packages ─────────────────────────────────────── tidyverse 1.3.1 ──
## ✓ ggplot2 3.3.5 ✓ purrr 0.3.4
## ✓ tibble 3.1.4 ✓ dplyr 1.0.7
## ✓ tidyr 1.1.4 ✓ stringr 1.4.0
## ✓ readr 2.0.2 ✓ forcats 0.5.1
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## x dplyr::filter() masks stats::filter()
## x dplyr::lag() masks stats::lag()
# Read the SCP16 toy data set (an .rds file) from the course website.
SCP16 <-
  "http://posc3410.svmiller.com/toy-data/SCP16.rds" %>%
  url() %>%
  readRDS()
# Mean and median of Trump's vote share, printed as a plain data frame
SCP16 %>%
  summarise(
    meanvotetrump = mean(trump),
    medianvotetrump = median(trump)
  ) %>%
  data.frame()
## meanvotetrump medianvotetrump
## 1 36.13043 35.55
# Density plot of Trump's vote share across counties,
# with the x-axis limited to the 20-60 range
ggplot(SCP16, aes(x = trump)) +
  geom_density() +
  scale_x_continuous(limits = c(20, 60))
# Mean and median of county population size, printed as a plain data frame
SCP16 %>%
  summarise(
    meanpopsize = mean(population),
    medianpopsize = median(population)
  ) %>%
  data.frame()
## meanpopsize medianpopsize
## 1 105054 58398.5
# Density plot of county population size,
# with comma-formatted labels on the x-axis
ggplot(SCP16, aes(x = population)) +
  geom_density() +
  scale_x_continuous(labels = scales::comma)
# Show the five most populous and five least populous counties side by side
# for comparison.
SCP16 %>%
  # Sort so the most populous counties come first
  arrange(desc(population)) %>%
  # Add a rank variable for context (1 = most populous)
  mutate(rank = row_number()) %>%
  # Keep the top 5 and bottom 5 by population; n() is the number of rows,
  # so the number of counties does not have to be hard-coded
  filter(rank <= 5 | rank > n() - 5) %>%
  # Select just what we want to look at
  select(rank, county, population)
## # A tibble: 10 × 3
## rank county population
## <int> <chr> <dbl>
## 1 1 greenville 482752
## 2 2 richland 401566
## 3 3 charleston 381015
## 4 4 horry 298832
## 5 5 spartanburg 293542
## 6 42 lee 18343
## 7 43 bamberg 15182
## 8 44 calhoun 14878
## 9 45 mccormick 9846
## 10 46 allendale 9695
# Add the natural log of county population size as a new column, logpop.
# The assignment leads the pipeline so the overwrite of SCP16 is visible
# up front.
SCP16 <- SCP16 %>%
  mutate(logpop = log(population))
# Compare the mean and median of population with those of log(population)
SCP16 %>%
  summarise(
    meanpop = mean(population),
    medianpop = median(population),
    meanlogpop = mean(logpop),
    medianlogpop = median(logpop)
  ) %>%
  # Reshape to long form (one row per statistic); pivot_longer() replaces
  # the superseded gather()
  pivot_longer(everything(), names_to = "var", values_to = "val") %>%
  data.frame()
## var val
## 1 meanpop 105053.95652
## 2 medianpop 58398.50000
## 3 meanlogpop 11.01882
## 4 medianlogpop 10.97422
# Density plot of the natural log of county population size,
# with the x-axis limited to the 8-14 range
ggplot(SCP16, aes(x = logpop)) +
  geom_density() +
  scale_x_continuous(limits = c(8, 14))
# For comparison: the density plot of the untransformed population size
ggplot(SCP16, aes(x = population)) +
  geom_density() +
  scale_x_continuous(labels = scales::comma)
# Extra Credit
# Fit a linear model of trump on illiteracy, unemployment and perblack,
# keep the fit as M1, then print its summary.
M1 <- lm(trump ~ illiteracy + unemployment + perblack, data = SCP16)
summary(M1)
##
## Call:
## lm(formula = trump ~ illiteracy + unemployment + perblack, data = SCP16)
##
## Residuals:
## Min 1Q Median 3Q Max
## -7.626 -2.388 0.348 1.612 10.446
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 12.31072 2.70773 4.547 4.57e-05 ***
## illiteracy 0.81136 0.18401 4.409 7.06e-05 ***
## unemployment 2.50525 0.50059 5.005 1.05e-05 ***
## perblack -0.20685 0.06187 -3.343 0.00175 **
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 3.481 on 42 degrees of freedom
## Multiple R-squared: 0.6658, Adjusted R-squared: 0.6419
## F-statistic: 27.89 on 3 and 42 DF, p-value: 4.392e-10