# 1-sc-primary-2016.R

library(tidyverse)

## ── Attaching packages ─────────────────────────────────────── tidyverse 1.3.1 ──

## ✓ ggplot2 3.3.5     ✓ purrr   0.3.4
## ✓ tibble  3.1.4     ✓ dplyr   1.0.7
## ✓ tidyr   1.1.4     ✓ stringr 1.4.0
## ✓ readr   2.0.2     ✓ forcats 0.5.1

## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## x dplyr::filter() masks stats::filter()
## x dplyr::lag()    masks stats::lag()

# Load the South Carolina 2016 primary toy data from the course website.
# The url connection is held in a variable so it can be closed explicitly;
# passing an anonymous url() to readRDS() leaves the connection registered
# until the garbage collector reclaims it ("closing unused connection").
scp16_con <- url("http://posc3410.svmiller.com/toy-data/SCP16.rds")
SCP16 <- readRDS(scp16_con)
close(scp16_con)

# Mean and median of Trump's vote share across the rows of SCP16,
# printed as a plain data frame

data.frame(
  summarize(
    SCP16,
    meanvotetrump = mean(trump),
    medianvotetrump = median(trump)
  )
)

##   meanvotetrump medianvotetrump
## 1      36.13043           35.55

# Density plot of the distribution of Trump's vote share by county,
# with the x-axis restricted to the 20-60 range

ggplot(SCP16, aes(x = trump)) +
  geom_density() +
  scale_x_continuous(limits = c(20, 60))

# Mean and median of county population size, printed as a plain data frame

data.frame(
  summarize(
    SCP16,
    meanpopsize = mean(population),
    medianpopsize = median(population)
  )
)

##   meanpopsize medianpopsize
## 1      105054       58398.5

# Density plot of county population size, with comma-formatted axis labels

ggplot(SCP16, aes(x = population)) +
  geom_density() +
  scale_x_continuous(labels = scales::comma)

# Get the five most populous and five least populous counties for
# comparison's sake (if that helps you)

SCP16 %>%
  # Reorder the data so that the most populous counties come first
  arrange(desc(population)) %>%
  # Create a rank variable for context
  mutate(rank = row_number()) %>%
  # Keep the first 5 and last 5 rows by population. The positions are derived
  # from the number of rows rather than hard-coded, and union() drops repeats
  # if the data ever has fewer than 10 rows.
  slice(union(head(seq_len(n()), 5), tail(seq_len(n()), 5))) %>%
  # Select just what we want to look at
  select(rank, county, population)

## # A tibble: 10 × 3
##     rank county      population
##    <int> <chr>            <dbl>
##  1     1 greenville      482752
##  2     2 richland        401566
##  3     3 charleston      381015
##  4     4 horry           298832
##  5     5 spartanburg     293542
##  6    42 lee              18343
##  7    43 bamberg          15182
##  8    44 calhoun          14878
##  9    45 mccormick         9846
## 10    46 allendale         9695

# Add the natural logarithm of county population size as a new column, logpop.
# The assignment target leads the statement so it is obvious that SCP16 is
# being overwritten.

SCP16 <- SCP16 %>%
  mutate(logpop = log(population))

# Get and compare means and medians for population and log(population).
# The one-row summary is reshaped to long form (one row per statistic) so the
# four values can be read in a column; pivot_longer() replaces the superseded
# gather().
SCP16 %>%
  summarize(meanpop = mean(population),
            medianpop = median(population),
            meanlogpop = mean(logpop),
            medianlogpop = median(logpop)) %>%
  pivot_longer(everything(), names_to = "var", values_to = "val") %>%
  as.data.frame()

##            var          val
## 1      meanpop 105053.95652
## 2    medianpop  58398.50000
## 3   meanlogpop     11.01882
## 4 medianlogpop     10.97422

# Density plot of the natural log of county population size,
# with the x-axis restricted to the 8-14 range

ggplot(SCP16, aes(x = logpop)) +
  geom_density() +
  scale_x_continuous(limits = c(8, 14))

# ...and compare with the density of the untransformed population

ggplot(SCP16, aes(x = population)) +
  geom_density() +
  scale_x_continuous(labels = scales::comma)

# Extra Credit: linear model of Trump's vote share on illiteracy,
# unemployment, and perblack; the fit is stored as M1 and then summarized

M1 <- lm(trump ~ illiteracy + unemployment + perblack, data = SCP16)
summary(M1)

## 
## Call:
## lm(formula = trump ~ illiteracy + unemployment + perblack, data = SCP16)
## 
## Residuals:
##    Min     1Q Median     3Q    Max 
## -7.626 -2.388  0.348  1.612 10.446 
## 
## Coefficients:
##              Estimate Std. Error t value Pr(>|t|)    
## (Intercept)  12.31072    2.70773   4.547 4.57e-05 ***
## illiteracy    0.81136    0.18401   4.409 7.06e-05 ***
## unemployment  2.50525    0.50059   5.005 1.05e-05 ***
## perblack     -0.20685    0.06187  -3.343  0.00175 ** 
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 3.481 on 42 degrees of freedom
## Multiple R-squared:  0.6658, Adjusted R-squared:  0.6419 
## F-statistic: 27.89 on 3 and 42 DF,  p-value: 4.392e-10

# 1-sc-primary-2016.R
#
# steve
#
# 2022-01-21