Notes for Data Visualization: A Practical Introduction

2020/01/26

This post includes most of the code and output from the book Data Visualization: A Practical Introduction by Kieran Healy.

preface

my_packages <- c("tidyverse", "broom", "coefplot", "cowplot",
                 "gapminder", "GGally", "ggrepel", "ggridges", "gridExtra",
                 "here", "interplot", "margins", "maps", "mapproj",
                 "mapdata", "MASS", "quantreg", "rlang", "scales",
                 "survey", "srvyr", "viridis", "viridisLite", "devtools")

install.packages(my_packages, repos = "http://cran.rstudio.com")

devtools::install_github("kjhealy/socviz")

library(tidyverse)
## -- Attaching packages ------------------------------- tidyverse 1.2.1 --
## v ggplot2 3.2.1     v purrr   0.3.3
## v tibble  2.1.3     v dplyr   0.8.3
## v tidyr   1.0.0     v stringr 1.4.0
## v readr   1.3.1     v forcats 0.4.0
## -- Conflicts ---------------------------------- tidyverse_conflicts() --
## x dplyr::filter() masks stats::filter()
## x dplyr::lag()    masks stats::lag()
library(socviz)
library(gapminder)
library(broom)
theme_set(theme_minimal()) 

chapter 2

2.6 Make my first graph

head(gapminder)  
## # A tibble: 6 x 6
##   country     continent  year lifeExp      pop gdpPercap
##   <fct>       <fct>     <int>   <dbl>    <int>     <dbl>
## 1 Afghanistan Asia       1952    28.8  8425333      779.
## 2 Afghanistan Asia       1957    30.3  9240934      821.
## 3 Afghanistan Asia       1962    32.0 10267083      853.
## 4 Afghanistan Asia       1967    34.0 11537966      836.
## 5 Afghanistan Asia       1972    36.1 13079460      740.
## 6 Afghanistan Asia       1977    38.4 14880372      786.
p <- gapminder %>% ggplot(
  aes(x = gdpPercap, y = lifeExp)
) 

chapter 3

p + geom_smooth()
## `geom_smooth()` using method = 'gam' and formula 'y ~ s(x, bs = "cs")'

p + geom_point() + geom_smooth()  
## `geom_smooth()` using method = 'gam' and formula 'y ~ s(x, bs = "cs")'

p + geom_point() + geom_smooth(method = "lm")

p + geom_smooth(method = "gam") + scale_x_log10() + geom_point()

p + geom_smooth(method = "gam") + scale_x_log10(labels = scales::dollar) + geom_point()

# bad way!! aes() is for mapping variables to aesthetics; putting color = "purple" inside it creates a bogus one-level variable instead of setting the color
p <- ggplot(data = gapminder, aes(x = gdpPercap, y = lifeExp, color = "purple"))   

p + geom_point() + geom_smooth(method = "loess") + scale_x_log10()   

# good way: set a fixed color as an argument to the geom, outside aes()

p <- ggplot(data = gapminder, aes(x = gdpPercap, y = lifeExp))   
p + geom_point(color = "purple") + geom_smooth(method = "loess") + scale_x_log10()   

p <- ggplot(data = gapminder, aes(x = gdpPercap, y = lifeExp))  

p + geom_point(alpha = 0.3) + geom_smooth(color = "orange", se = FALSE, size = 8, method = "lm") + scale_x_log10()

# final one 
p <- ggplot(data = gapminder, aes(x = gdpPercap, y = lifeExp)) 

p + geom_point(alpha = 0.3) + geom_smooth(method = "gam") + scale_x_log10(labels = scales::dollar) + labs(
  x = "GDP per capita", y = "Life expectancy in years", title = "Economic growth and life expectancy", subtitle = "Data points are country-years", caption = "Source: gapminder."
)

p <- ggplot(data = gapminder, aes(x = gdpPercap, y = lifeExp, color = continent))   

p + geom_point() + geom_smooth(method = "loess") + scale_x_log10()   

p <- ggplot(data = gapminder, aes(x = gdpPercap, y = lifeExp, color = continent, fill = continent))   

p + geom_point() + geom_smooth(method = "loess") + scale_x_log10()   

p <- ggplot(data = gapminder, mapping = aes(x = gdpPercap, y = lifeExp))

p + geom_point(mapping = aes(color = continent)) +
    geom_smooth(method = "loess") +
    scale_x_log10()

p <- ggplot(data = gapminder,
            mapping = aes(x = gdpPercap,
                          y = lifeExp))

p + geom_point(mapping = aes(color = log(pop))) +
    scale_x_log10()    

p + geom_point(mapping = aes(color = log(pop))) +
    scale_x_reverse()    

p + geom_point(mapping = aes(color = log(pop))) +
    scale_x_sqrt()    

chapter 4 - Show the right numbers

# wrong graph: without a group aesthetic, geom_line connects all the observations as if they were a single series
p <- ggplot(data = gapminder, aes(x = year, y = gdpPercap))

p + geom_line()

# right - group by country

p <- ggplot(data = gapminder, aes(x = year, y = gdpPercap))

p + geom_line(aes(group = country))

# facet to make small multiples (calling facet_wrap() on its own just returns a ggproto facet specification, as shown below)
facet_wrap(~ continent)  
## <ggproto object: Class FacetWrap, Facet, gg>
##     compute_layout: function
##     draw_back: function
##     draw_front: function
##     draw_labels: function
##     draw_panels: function
##     finish_data: function
##     init_scales: function
##     map_data: function
##     params: list
##     setup_data: function
##     setup_params: function
##     shrink: TRUE
##     train_scales: function
##     vars: function
##     super:  <ggproto object: Class FacetWrap, Facet, gg>
p <- ggplot(data = gapminder, aes(x = year, y = gdpPercap))

p + geom_line(aes(group = country)) + facet_wrap(~ continent)

p <- ggplot(gapminder, aes(x = year, y = gdpPercap))  

p + geom_line(color = "gray70", aes(group = country)) + geom_smooth(size = 1.1, method = "loess", se = FALSE) + scale_y_log10(labels = scales::dollar) + facet_wrap(~ continent, ncol = 5) + labs(
  x = "Year", 
  y = "GDP per capita", 
  title = "GDP per capita on five continents"
)

# use the GSS data (gss_sm, included in the socviz package)
glimpse(gss_sm)
## Observations: 2,867
## Variables: 32
## $ year        <dbl> 2016, 2016, 2016, 2016, 2016, 2016, 2016, 2016, 20...
## $ id          <dbl> 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15,...
## $ ballot      <dbl> 1, 2, 3, 1, 3, 2, 1, 3, 1, 3, 2, 1, 2, 3, 2, 3, 3,...
## $ age         <dbl> 47, 61, 72, 43, 55, 53, 50, 23, 45, 71, 33, 86, 32...
## $ childs      <dbl> 3, 0, 2, 4, 2, 2, 2, 3, 3, 4, 5, 4, 3, 5, 7, 2, 6,...
## $ sibs        <dbl> 2, 3, 3, 3, 2, 2, 2, 6, 5, 1, 4, 4, 3, 6, 0, 1, 3,...
## $ degree      <fct> Bachelor, High School, Bachelor, High School, Grad...
## $ race        <fct> White, White, White, White, White, White, White, O...
## $ sex         <fct> Male, Male, Male, Female, Female, Female, Male, Fe...
## $ region      <fct> New England, New England, New England, New England...
## $ income16    <fct> $170000 or over, $50000 to 59999, $75000 to $89999...
## $ relig       <fct> None, None, Catholic, Catholic, None, None, None, ...
## $ marital     <fct> Married, Never Married, Married, Married, Married,...
## $ padeg       <fct> Graduate, Lt High School, High School, NA, Bachelo...
## $ madeg       <fct> High School, High School, Lt High School, High Sch...
## $ partyid     <fct> "Independent", "Ind,near Dem", "Not Str Republican...
## $ polviews    <fct> Moderate, Liberal, Conservative, Moderate, Slightl...
## $ happy       <fct> Pretty Happy, Pretty Happy, Very Happy, Pretty Hap...
## $ partners    <fct> NA, 1 Partner, 1 Partner, NA, 1 Partner, 1 Partner...
## $ grass       <fct> NA, Legal, Not Legal, NA, Legal, Legal, NA, Not Le...
## $ zodiac      <fct> Aquarius, Scorpio, Pisces, Cancer, Scorpio, Scorpi...
## $ pres12      <dbl> 3, 1, 2, 2, 1, 1, NA, NA, NA, 2, NA, NA, 1, 1, 2, ...
## $ wtssall     <dbl> 0.9570, 0.4785, 0.9570, 1.9140, 1.4355, 0.9570, 1....
## $ income_rc   <fct> Gt $170000, Gt $50000, Gt $75000, Gt $170000, Gt $...
## $ agegrp      <fct> Age 45-55, Age 55-65, Age 65+, Age 35-45, Age 45-5...
## $ ageq        <fct> Age 34-49, Age 49-62, Age 62+, Age 34-49, Age 49-6...
## $ siblings    <fct> 2, 3, 3, 3, 2, 2, 2, 6+, 5, 1, 4, 4, 3, 6+, 0, 1, ...
## $ kids        <fct> 3, 0, 2, 4+, 2, 2, 2, 3, 3, 4+, 4+, 4+, 3, 4+, 4+,...
## $ religion    <fct> None, None, Catholic, Catholic, None, None, None, ...
## $ bigregion   <fct> Northeast, Northeast, Northeast, Northeast, Northe...
## $ partners_rc <fct> NA, 1, 1, NA, 1, 1, NA, 1, NA, 3, 1, NA, 1, NA, 0,...
## $ obama       <dbl> 0, 1, 0, 0, 1, 1, NA, NA, NA, 0, NA, NA, 1, 1, 0, ...
p <- ggplot(gss_sm, aes(x = age, y = childs))

p + geom_point(alpha = 0.2) + geom_smooth() + facet_grid(sex ~ race)
## `geom_smooth()` using method = 'gam' and formula 'y ~ s(x, bs = "cs")'
## Warning: Removed 18 rows containing non-finite values (stat_smooth).
## Warning: Removed 18 rows containing missing values (geom_point).

p <- ggplot(gss_sm, aes(x = bigregion))

p + geom_bar()

p <- ggplot(gss_sm, aes(x = bigregion))  

# wrong way: with no group set, ..prop.. is computed within each bigregion, so every bar is 1
p + geom_bar(aes(y = ..prop..))

# right way: group = 1 computes the proportions over the whole sample
p + geom_bar(aes(y = ..prop.., group = 1))

table(gss_sm$religion)   
## 
## Protestant   Catholic     Jewish       None      Other 
##       1371        649         51        619        159
p <- ggplot(gss_sm, aes(x = religion, color = religion))  

p + geom_bar()

p <- ggplot(gss_sm, aes(x = religion, fill = religion))  

p + geom_bar() + guides(fill = FALSE) # guides() function controls whether guiding information about any particular mapping appears or not.  If we set guides(fill = FALSE), the legend is removed, in effect saying that the viewer of the figure does not need to be shown any guiding information about this mapping.

p <- ggplot(gss_sm, aes(x = bigregion, fill = religion)) 

p + geom_bar() # the default output is a stacked bar chart 

p + geom_bar(position = "fill")

p + geom_bar(position = "dodge")

p + geom_bar(position = "dodge", aes(y = ..prop..)) # wrong way: each region-religion combination forms its own group, so every bar's proportion is 1

p + geom_bar(position = "dodge", aes(y = ..prop.., group = religion)) # this gives us a bar chart where the values of religion are broken down across regions, with a proportion shown on the y axis. The bars for any particular religion sum to one across regions

p <- ggplot(gss_sm, aes(x = religion))  

p + geom_bar(position = "dodge", aes(y = ..prop.., group = bigregion)) + facet_wrap(~ bigregion, ncol = 2)

p <- ggplot(midwest, aes(x = area))

p + geom_histogram()
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

# change bins  
p <- ggplot(midwest, aes(x = area))

p + geom_histogram(bins = 10)

oh_wi <- c("OH", "WI")

p <- ggplot(data = subset(midwest, subset = state %in% oh_wi),
            mapping = aes(x = percollege, fill = state))

p + geom_histogram(alpha = 0.4, bins = 20)  

p <- ggplot(data = midwest, aes(x = area))  

p + geom_density()

p <- ggplot(data = midwest, aes(x = area, fill = state, color = state))  
p + geom_density(alpha = 0.3)

p <- ggplot(data = subset(midwest, subset = state %in% oh_wi), 
            aes(x = area, fill = state, color = state))  

p + geom_density(alpha = 0.3, aes(y = ..scaled..)) # For geom_density(), the stat_density() function can return its default ..density.. statistic, or ..scaled.., which will give a proportional density estimate. It can also return a statistic called ..count.., which is the density times the number of points. This can be used in stacked density plots.  
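
The comment above mentions ..count.. for stacked density plots. As a quick sketch of my own (not from the book), reusing the oh_wi subset defined earlier:

p <- ggplot(data = subset(midwest, subset = state %in% oh_wi),
            aes(x = area, fill = state, color = state))

# stack the two state densities, scaled by the number of observations (counties) in each state
p + geom_density(alpha = 0.3, aes(y = ..count..), position = "stack")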

titanic
##       fate    sex    n percent
## 1 perished   male 1364    62.0
## 2 perished female  126     5.7
## 3 survived   male  367    16.7
## 4 survived female  344    15.6
p <- ggplot(titanic, aes(x = fate, y = percent, fill = sex))

p + geom_bar(position = "dodge", stat = "identity") + theme(legend.position = "top")

head(oecd_sum)
## # A tibble: 6 x 5
## # Groups:   year [6]
##    year other   usa  diff hi_lo
##   <int> <dbl> <dbl> <dbl> <chr>
## 1  1960  68.6  69.9 1.3   Below
## 2  1961  69.2  70.4 1.2   Below
## 3  1962  68.9  70.2 1.30  Below
## 4  1963  69.1  70   0.9   Below
## 5  1964  69.5  70.3 0.800 Below
## 6  1965  69.6  70.3 0.7   Below
p <- ggplot(oecd_sum, aes(x = year, y = diff, fill = hi_lo)) 

p + geom_col() + guides(fill = FALSE) + labs(x = NULL, y = "Difference in Years", title = "The US life expectancy gap", subtitle = "Difference between US and OECD average life expectancies, 1960-2015", caption = "Data: OECD. After a chart by Christopher Ingraham, Washington Post, December 27th 2017.")
## Warning: Removed 1 rows containing missing values (position_stack).

# Investigate the difference between a formula written as facet_grid(sex ~ race) versus one written as facet_grid(~ sex + race)  


p <- ggplot(gss_sm, aes(x = age, y = childs))

p + geom_point(alpha = 0.2) + geom_smooth() + facet_grid(sex ~ race)  
## `geom_smooth()` using method = 'gam' and formula 'y ~ s(x, bs = "cs")'
## Warning: Removed 18 rows containing non-finite values (stat_smooth).
## Warning: Removed 18 rows containing missing values (geom_point).

p + geom_point(alpha = 0.2) + geom_smooth() + facet_grid(~ sex + race)  
## `geom_smooth()` using method = 'gam' and formula 'y ~ s(x, bs = "cs")'
## Warning: Removed 18 rows containing non-finite values (stat_smooth).

## Warning: Removed 18 rows containing missing values (geom_point).

p + geom_point(alpha = 0.2) + geom_smooth() + facet_wrap(~ sex + race)  
## `geom_smooth()` using method = 'gam' and formula 'y ~ s(x, bs = "cs")'
## Warning: Removed 18 rows containing non-finite values (stat_smooth).

## Warning: Removed 18 rows containing missing values (geom_point).

# Frequency polygons are closely related to histograms. Instead of displaying the count of observations using bars, they display it with a series of connected lines instead. You can try the various geom_histogram() calls in this chapter using geom_freqpoly() instead.


p <- ggplot(midwest, aes(x = area))

p + geom_freqpoly()
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

# Density estimates can also be drawn in two dimensions. The geom_density_2d() function draws contour lines estimating the joint distribution of two variables. Try it with the midwest data, for example, plotting percent below the poverty line (percbelowpoverty) against percent college-educated (percollege). Try it with and without a geom_point() layer.  

p <- ggplot(midwest, aes(y = percbelowpoverty, x = percollege))

p + geom_density_2d() 

p + geom_density_2d() + geom_point()

chapter 5 - Graph tables, add labels, make notes

Use the pipe to summarize data

rel_by_region <- gss_sm %>% 
  group_by(bigregion, religion) %>% 
  summarize(N = n()) %>% 
  mutate(freq = N / sum(N), pct = round((freq * 100), 0))
## Warning: Factor `religion` contains implicit NA, consider using
## `forcats::fct_explicit_na`
rel_by_region
## # A tibble: 24 x 5
## # Groups:   bigregion [4]
##    bigregion religion       N    freq   pct
##    <fct>     <fct>      <int>   <dbl> <dbl>
##  1 Northeast Protestant   158 0.324      32
##  2 Northeast Catholic     162 0.332      33
##  3 Northeast Jewish        27 0.0553      6
##  4 Northeast None         112 0.230      23
##  5 Northeast Other         28 0.0574      6
##  6 Northeast <NA>           1 0.00205     0
##  7 Midwest   Protestant   325 0.468      47
##  8 Midwest   Catholic     172 0.247      25
##  9 Midwest   Jewish         3 0.00432     0
## 10 Midwest   None         157 0.226      23
## # ... with 14 more rows
# sanity check: within each region the percentages should total roughly 100 (rounding can give 99-101)
rel_by_region %>% group_by(bigregion) %>% summarize(total = sum(pct))
## # A tibble: 4 x 2
##   bigregion total
##   <fct>     <dbl>
## 1 Northeast   100
## 2 Midwest     101
## 3 South       100
## 4 West        101
p <- ggplot(rel_by_region, aes(x = bigregion, y = pct, fill = religion))

p + geom_col(position = "dodge2") + labs(x = "Region", y = "Percent", fill = "Religion") + theme(legend.position = "top") 

p + geom_col(position = "dodge") + labs(x = "Region", y = "Percent", fill = "Religion") + theme(legend.position = "top") 

p <- ggplot(rel_by_region, aes(x = religion, y = pct, fill = religion))

p + geom_col(position = "dodge2") + labs(x = NULL, y = "Percent", fill = "Religion") + guides(fill = FALSE) + coord_flip() + facet_grid(~ bigregion)

5.2 Continuous variables by group or category

organdata %>% select(1:6) %>% sample_n(size = 10)    
## # A tibble: 10 x 6
##    country       year       donors    pop pop_dens   gdp
##    <chr>         <date>      <dbl>  <int>    <dbl> <int>
##  1 Spain         NA           NA    38850    7.68  12971
##  2 Netherlands   1991-01-01   14.9  15070   36.3   18708
##  3 Belgium       NA           NA       NA   NA        NA
##  4 Canada        1997-01-01   14.2  29987    0.301 23949
##  5 Finland       1999-01-01   16.5   5166    1.53  23702
##  6 Switzerland   1999-01-01   14.4   7144   17.3   28562
##  7 Germany       2001-01-01   12.8  82350   23.1   25436
##  8 Spain         1997-01-01   29    39348    7.78  17203
##  9 United States 2001-01-01   21.3 285318    2.96  35118
## 10 Denmark       2002-01-01   12.7   5376   12.5   29228
p <- ggplot(data = organdata, aes(x = year, y = donors))

p + geom_point()
## Warning: Removed 34 rows containing missing values (geom_point).

p + geom_line(aes(group = country)) + facet_wrap(~ country) # by default, the facets are ordered alphabetically by country  
## Warning: Removed 34 rows containing missing values (geom_path).

p <- ggplot(organdata, aes(x = country, y = donors))  

p + geom_boxplot()
## Warning: Removed 34 rows containing non-finite values (stat_boxplot).

p + geom_boxplot() + coord_flip()
## Warning: Removed 34 rows containing non-finite values (stat_boxplot).

# reorder country  

p <- ggplot(data = organdata, aes(x = reorder(country, donors, na.rm = TRUE), y = donors))  

p + geom_boxplot() + labs(x = NULL) + coord_flip()  
## Warning: Removed 34 rows containing non-finite values (stat_boxplot).

# another way: a violin plot

p + geom_violin() + labs(x = NULL) + coord_flip()    
## Warning: Removed 34 rows containing non-finite values (stat_ydensity).

p <- ggplot(organdata, aes(x = reorder(country, donors, na.rm = TRUE), y = donors, fill = world))  

p + geom_boxplot() + labs(x = NULL) + coord_flip() + theme(legend.position = "top")
## Warning: Removed 34 rows containing non-finite values (stat_boxplot).

p <- ggplot(organdata, aes(x = reorder(country, donors, na.rm = TRUE), y = donors, color = world))  

p + geom_point() + labs(x = NULL) + coord_flip() + theme(legend.position = "top")
## Warning: Removed 34 rows containing missing values (geom_point).

p <- ggplot(organdata, aes(x = reorder(country, donors, na.rm = TRUE), y = donors, color = world))  

p + geom_jitter() + labs(x = NULL) + coord_flip() + theme(legend.position = "top") # the default jitter is a little too much  
## Warning: Removed 34 rows containing missing values (geom_point).

p + geom_jitter(position = position_jitter(width = 0.15)) + labs(x = NULL) + coord_flip() + theme(legend.position = "top")  
## Warning: Removed 34 rows containing missing values (geom_point).

# When we want to summarize a categorical variable that just has one point per category, we should use this approach as well. The result will be a Cleveland dotplot, a simple and extremely effective method of presenting data that is usually better than either a bar chart or a table.  

# bad way: spell out mean() and sd() separately for every variable
by_country <- organdata %>% group_by(consent_law, country) %>%
    summarize(donors_mean= mean(donors, na.rm = TRUE),
              donors_sd = sd(donors, na.rm = TRUE),
              gdp_mean = mean(gdp, na.rm = TRUE),
              health_mean = mean(health, na.rm = TRUE),
              roads_mean = mean(roads, na.rm = TRUE),
              cerebvas_mean = mean(cerebvas, na.rm = TRUE))  

by_country  
## # A tibble: 17 x 8
## # Groups:   consent_law [2]
##    consent_law country donors_mean donors_sd gdp_mean health_mean
##    <chr>       <chr>         <dbl>     <dbl>    <dbl>       <dbl>
##  1 Informed    Austra~        10.6     1.14    22179.       1958.
##  2 Informed    Canada         14.0     0.751   23711.       2272.
##  3 Informed    Denmark        13.1     1.47    23722.       2054.
##  4 Informed    Germany        13.0     0.611   22163.       2349.
##  5 Informed    Ireland        19.8     2.48    20824.       1480.
##  6 Informed    Nether~        13.7     1.55    23013.       1993.
##  7 Informed    United~        13.5     0.775   21359.       1561.
##  8 Informed    United~        20.0     1.33    29212.       3988.
##  9 Presumed    Austria        23.5     2.42    23876.       1875.
## 10 Presumed    Belgium        21.9     1.94    22500.       1958.
## 11 Presumed    Finland        18.4     1.53    21019.       1615.
## 12 Presumed    France         16.8     1.60    22603.       2160.
## 13 Presumed    Italy          11.1     4.28    21554.       1757 
## 14 Presumed    Norway         15.4     1.11    26448.       2217.
## 15 Presumed    Spain          28.1     4.96    16933        1289.
## 16 Presumed    Sweden         13.1     1.75    22415.       1951.
## 17 Presumed    Switze~        14.2     1.71    27233        2776.
## # ... with 2 more variables: roads_mean <dbl>, cerebvas_mean <dbl>
# good way: summarize_if() applies mean and sd to all numeric columns at once
by_country <- organdata %>% group_by(consent_law, country) %>%
    summarize_if(is.numeric, funs(mean, sd), na.rm = TRUE) %>%
    ungroup()
## Warning: funs() is soft deprecated as of dplyr 0.8.0
## Please use a list of either functions or lambdas: 
## 
##   # Simple named list: 
##   list(mean = mean, median = median)
## 
##   # Auto named with `tibble::lst()`: 
##   tibble::lst(mean, median)
## 
##   # Using lambdas
##   list(~ mean(., trim = .2), ~ median(., na.rm = TRUE))
## This warning is displayed once per session.
by_country  
## # A tibble: 17 x 28
##    consent_law country donors_mean pop_mean pop_dens_mean gdp_mean
##    <chr>       <chr>         <dbl>    <dbl>         <dbl>    <dbl>
##  1 Informed    Austra~        10.6   18318.         0.237   22179.
##  2 Informed    Canada         14.0   29608.         0.297   23711.
##  3 Informed    Denmark        13.1    5257.        12.2     23722.
##  4 Informed    Germany        13.0   80255.        22.5     22163.
##  5 Informed    Ireland        19.8    3674.         5.23    20824.
##  6 Informed    Nether~        13.7   15548.        37.4     23013.
##  7 Informed    United~        13.5   58187.        24.0     21359.
##  8 Informed    United~        20.0  269330.         2.80    29212.
##  9 Presumed    Austria        23.5    7927.         9.45    23876.
## 10 Presumed    Belgium        21.9   10153.        30.7     22500.
## 11 Presumed    Finland        18.4    5112.         1.51    21019.
## 12 Presumed    France         16.8   58056.        10.5     22603.
## 13 Presumed    Italy          11.1   57360.        19.0     21554.
## 14 Presumed    Norway         15.4    4386.         1.35    26448.
## 15 Presumed    Spain          28.1   39666.         7.84    16933 
## 16 Presumed    Sweden         13.1    8789.         1.95    22415.
## 17 Presumed    Switze~        14.2    7037.        17.0     27233 
## # ... with 22 more variables: gdp_lag_mean <dbl>, health_mean <dbl>,
## #   health_lag_mean <dbl>, pubhealth_mean <dbl>, roads_mean <dbl>,
## #   cerebvas_mean <dbl>, assault_mean <dbl>, external_mean <dbl>,
## #   txp_pop_mean <dbl>, donors_sd <dbl>, pop_sd <dbl>, pop_dens_sd <dbl>,
## #   gdp_sd <dbl>, gdp_lag_sd <dbl>, health_sd <dbl>, health_lag_sd <dbl>,
## #   pubhealth_sd <dbl>, roads_sd <dbl>, cerebvas_sd <dbl>,
## #   assault_sd <dbl>, external_sd <dbl>, txp_pop_sd <dbl>
p <- ggplot(by_country, aes(x = donors_mean, y = reorder(country, donors_mean), color = consent_law))  

p + geom_point(size = 3) + labs(x = "Donor Procurement Rate", y = "", color = "Consent Law") + theme(legend.position = "top")

# instead of mapping color = consent_law, we could facet by it
# scales = "free_y" keeps only the relevant countries in each panel
p <- ggplot(by_country, aes(x = donors_mean, y = reorder(country, donors_mean)))  

p + geom_point(size = 3) + labs(x = "Donor Procurement Rate", y = "") + facet_wrap(~ consent_law, scales = "free_y", ncol = 1)

p <- ggplot(by_country, aes(x = reorder(country, donors_mean), y = donors_mean))  

p + geom_pointrange(aes(ymin = donors_mean - donors_sd, ymax = donors_mean + donors_sd)) + labs(x = "", y = "Donor Procurement Rate") + coord_flip()

# geom_linerange(), geom_crossbar(), and geom_errorbar().  
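
As a sketch of my own (not from the book), here is the same summary drawn with geom_linerange() plus a point for the mean:

p <- ggplot(by_country, aes(x = reorder(country, donors_mean), y = donors_mean))

p + geom_linerange(aes(ymin = donors_mean - donors_sd, ymax = donors_mean + donors_sd)) + geom_point() + labs(x = "", y = "Donor Procurement Rate") + coord_flip()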

5.3 Plot text directly

p <- ggplot(by_country, aes(x = roads_mean, y = donors_mean)) 
p + geom_point() + geom_text(aes(label = country))  

# We can left- or right-justify the labels using the hjust argument to geom_text(). Setting hjust=0 will left justify the label, and hjust=1 will right justify it.    

p <- ggplot(by_country, aes(x = roads_mean, y = donors_mean))  
p + geom_point() + geom_text(aes(label = country), hjust = 0)

# a better way is to use the ggrepel package
library(ggrepel)
elections_historic %>% select(2:7)
## # A tibble: 49 x 6
##     year winner                 win_party ec_pct popular_pct popular_margin
##    <int> <chr>                  <chr>      <dbl>       <dbl>          <dbl>
##  1  1824 John Quincy Adams      D.-R.      0.322       0.309        -0.104 
##  2  1828 Andrew Jackson         Dem.       0.682       0.559         0.122 
##  3  1832 Andrew Jackson         Dem.       0.766       0.547         0.178 
##  4  1836 Martin Van Buren       Dem.       0.578       0.508         0.142 
##  5  1840 William Henry Harrison Whig       0.796       0.529         0.0605
##  6  1844 James Polk             Dem.       0.618       0.495         0.0145
##  7  1848 Zachary Taylor         Whig       0.562       0.473         0.0479
##  8  1852 Franklin Pierce        Dem.       0.858       0.508         0.0695
##  9  1856 James Buchanan         Dem.       0.588       0.453         0.122 
## 10  1860 Abraham Lincoln        Rep.       0.594       0.396         0.101 
## # ... with 39 more rows
p_title <- "Presidential Elections: Popular and Electoral College Margins"  
p_subtitle <- "1824-2016"
p_caption <- "Data for 2016 are provisional"
x_label <- "Winner's share of popular vote"  
y_label <- "Winner's share of Electoral College Votes"

p <- ggplot(elections_historic, aes(x = popular_pct, y = ec_pct, label = winner_label)) 

# We use two new geoms, geom_hline() and geom_vline() to make the lines. They take yintercept and xintercept arguments, respectively, and the lines can also be sized and colored as you please. There is also a geom_abline() geom that draws straight lines based on a supplied slope and intercept. This is useful for plotting, for example, 45 degree reference lines in scatterplots.


p + geom_hline(yintercept = 0.5, size = 1.4, color = "gray80") + 
  geom_vline(xintercept = 0.5, size = 1.4, color = "gray80") + 
  geom_point() + 
  geom_text_repel() + 
  scale_x_continuous(labels = scales::percent) + 
  scale_y_continuous(labels = scales::percent) + 
  labs(x = x_label, y = y_label, title = p_title, subtitle = p_subtitle, caption = p_caption)
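
The note above also mentions geom_abline(). A minimal sketch of my own (not from the book) adds a 45-degree reference line; points above it are elections where the winner's Electoral College share exceeded their popular-vote share:

p + geom_abline(intercept = 0, slope = 1, color = "gray80") +
  geom_point() +
  geom_text_repel() +
  labs(x = x_label, y = y_label)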

5.4 Label outliers

p <- ggplot(data = by_country,
            mapping = aes(x = gdp_mean, y = health_mean))

p + geom_point() +
    geom_text_repel(data = subset(by_country, gdp_mean > 25000),
                    mapping = aes(label = country))

p <- ggplot(data = by_country,
            mapping = aes(x = gdp_mean, y = health_mean))

p + geom_point() +
    geom_text_repel(data = subset(by_country,
                                  gdp_mean > 25000 | health_mean < 1500 |
                                  country %in% "Belgium"),
                    mapping = aes(label = country))

organdata$ind <- organdata$ccode %in% c("Ita", "Spa") &
                    organdata$year > 1998

p <- ggplot(data = organdata,
            mapping = aes(x = roads,
                          y = donors, color = ind))
p + geom_point() +
    geom_text_repel(data = subset(organdata, ind),
                    mapping = aes(label = ccode)) +
    guides(label = FALSE, color = FALSE)
## Warning: Removed 34 rows containing missing values (geom_point).

5.5 Write and draw in the plot area

p <- ggplot(data = organdata, mapping = aes(x = roads, y = donors))
p + geom_point() + annotate(geom = "text", x = 91, y = 33,
                            label = "A surprisingly high \n recovery rate.",
                            hjust = 0)
## Warning: Removed 34 rows containing missing values (geom_point).

# The annotate() function can work with other geoms, too. Use it to draw rectangles, line segments, and arrows. Just remember to pass along the right arguments to the geom you use. We can add a rectangle to this plot, for instance, with a second call to the function.

p <- ggplot(data = organdata,
            mapping = aes(x = roads, y = donors))
p + geom_point() +
    annotate(geom = "rect", xmin = 125, xmax = 155,
             ymin = 30, ymax = 35, fill = "red", alpha = 0.2) + 
    annotate(geom = "text", x = 157, y = 33,
             label = "A surprisingly high \n recovery rate.", hjust = 0)
## Warning: Removed 34 rows containing missing values (geom_point).
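
The same annotate() approach works for line segments and arrows; the coordinates below are purely illustrative (my own example, not from the book):

p + geom_point() +
    annotate(geom = "segment", x = 95, xend = 120, y = 32, yend = 30.5,
             arrow = grid::arrow(length = grid::unit(0.2, "cm")))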

5.6 Understanding scales, guides, and themes

p <- ggplot(data = organdata,
            mapping = aes(x = roads,
                          y = donors,
                          color = world))
p + geom_point() +
    scale_x_log10() +
    scale_y_continuous(breaks = c(5, 15, 25),
                       labels = c("Five", "Fifteen", "Twenty Five"))
## Warning: Removed 34 rows containing missing values (geom_point).

p <- ggplot(data = organdata,
            mapping = aes(x = roads,
                          y = donors,
                          color = world))
p + geom_point() +
    scale_color_discrete(labels =
                             c("Corporatist", "Liberal",
                               "Social Democratic", "Unclassified")) +
    labs(x = "Road Deaths",
         y = "Donor Procurement",
        color = "Welfare State")  
## Warning: Removed 34 rows containing missing values (geom_point).

# move the legend to the top
p + geom_point() +
    scale_color_discrete(labels =
                             c("Corporatist", "Liberal",
                               "Social Democratic", "Unclassified")) +
    labs(x = "Road Deaths",
         y = "Donor Procurement",
        color = "Welfare State")  +
  theme(legend.position = "top") 
## Warning: Removed 34 rows containing missing values (geom_point).

# make legend disappear
p <- ggplot(data = organdata,
            mapping = aes(x = roads,
                          y = donors,
                          color = world))
p + geom_point() +
    labs(x = "Road Deaths",
         y = "Donor Procurement") +
    guides(color = FALSE)
## Warning: Removed 34 rows containing missing values (geom_point).

chapter 6 - Work with models

p <- ggplot(data = gapminder, mapping = aes(x = log(gdpPercap), y = lifeExp))  

p + geom_point(alpha = 0.1) + geom_smooth(color = "tomato", fill = "tomato", method = MASS::rlm) + geom_smooth(color = "steelblue", fill = "steelblue", method = "lm")  

p + geom_point(alpha = 0.1) +  geom_smooth(color = "tomato", method = "lm", size = 1.2, formula = y ~ splines::bs(x, 3), se = FALSE)

p + geom_point(alpha = 0.1) +  geom_quantile(color = "tomato", method = "rqss", lambda = 1, quantiles = c(0.2, 0.5, 0.85))  
## Smoothing formula not specified. Using: y ~ qss(x, lambda = 1)

model_colors <- RColorBrewer::brewer.pal(3, "Set1")
model_colors
## [1] "#E41A1C" "#377EB8" "#4DAF4A"
p0 <- ggplot(data = gapminder,
            mapping = aes(x = log(gdpPercap), y = lifeExp))

p1 <- p0 + geom_point(alpha = 0.2) +
    geom_smooth(method = "lm", aes(color = "OLS", fill = "OLS")) +
    geom_smooth(method = "lm", formula = y ~ splines::bs(x, df = 3),
                aes(color = "Cubic Spline", fill = "Cubic Spline")) +
    geom_smooth(method = "loess",
                aes(color = "LOESS", fill = "LOESS"))


p1 + scale_color_manual(name = "Models", values = model_colors) +
    scale_fill_manual(name = "Models", values = model_colors) +
    theme(legend.position = "top")

6.2 Look inside model objects

out <- lm(formula = lifeExp ~ gdpPercap + pop + continent,
          data = gapminder)
out
## 
## Call:
## lm(formula = lifeExp ~ gdpPercap + pop + continent, data = gapminder)
## 
## Coefficients:
##       (Intercept)          gdpPercap                pop  
##          4.78e+01           4.50e-04           6.57e-09  
## continentAmericas      continentAsia    continentEurope  
##          1.35e+01           8.19e+00           1.75e+01  
##  continentOceania  
##          1.81e+01
summary(out)
## 
## Call:
## lm(formula = lifeExp ~ gdpPercap + pop + continent, data = gapminder)
## 
## Residuals:
##    Min     1Q Median     3Q    Max 
## -49.16  -4.49   0.30   5.11  25.17 
## 
## Coefficients:
##                   Estimate Std. Error t value Pr(>|t|)
## (Intercept)       4.78e+01   3.40e-01  140.82   <2e-16
## gdpPercap         4.50e-04   2.35e-05   19.16   <2e-16
## pop               6.57e-09   1.98e-09    3.33    9e-04
## continentAmericas 1.35e+01   6.00e-01   22.46   <2e-16
## continentAsia     8.19e+00   5.71e-01   14.34   <2e-16
## continentEurope   1.75e+01   6.25e-01   27.97   <2e-16
## continentOceania  1.81e+01   1.78e+00   10.15   <2e-16
## 
## Residual standard error: 8.37 on 1697 degrees of freedom
## Multiple R-squared:  0.582,  Adjusted R-squared:  0.581 
## F-statistic:  394 on 6 and 1697 DF,  p-value: <2e-16

6.4 Generate predictions to graph

min_gdp <- min(gapminder$gdpPercap)
max_gdp <- max(gapminder$gdpPercap)
med_pop <- median(gapminder$pop)

pred_df <- expand.grid(gdpPercap = (seq(from = min_gdp,
                                        to = max_gdp,
                                        length.out = 100)),
                       pop = med_pop,
                       continent = c("Africa", "Americas",
                                     "Asia", "Europe", "Oceania"))

dim(pred_df)  
## [1] 500   3
head(pred_df)
##   gdpPercap     pop continent
## 1     241.2 7023596    Africa
## 2    1385.4 7023596    Africa
## 3    2529.7 7023596    Africa
## 4    3674.0 7023596    Africa
## 5    4818.2 7023596    Africa
## 6    5962.5 7023596    Africa
pred_out <- predict(object = out,
                    newdata = pred_df,
                    interval = "predict") # If we specify interval = 'predict' as an argument, it will calculate 95% prediction intervals in addition to the point estimate.
head(pred_out)  
##     fit   lwr   upr
## 1 47.97 31.55 64.39
## 2 48.48 32.06 64.90
## 3 49.00 32.58 65.42
## 4 49.51 33.09 65.93
## 5 50.03 33.60 66.45
## 6 50.54 34.12 66.96
pred_df <- cbind(pred_df, pred_out)

head(pred_df) 
##   gdpPercap     pop continent   fit   lwr   upr
## 1     241.2 7023596    Africa 47.97 31.55 64.39
## 2    1385.4 7023596    Africa 48.48 32.06 64.90
## 3    2529.7 7023596    Africa 49.00 32.58 65.42
## 4    3674.0 7023596    Africa 49.51 33.09 65.93
## 5    4818.2 7023596    Africa 50.03 33.60 66.45
## 6    5962.5 7023596    Africa 50.54 34.12 66.96
p <- ggplot(data = subset(pred_df, continent %in% c("Europe", "Africa")),
            aes(x = gdpPercap,
                y = fit, ymin = lwr, ymax = upr,
                color = continent,
                fill = continent,
                group = continent))

p + geom_point(data = subset(gapminder,
                             continent %in% c("Europe", "Africa")),
               aes(x = gdpPercap, y = lifeExp,
                   color = continent),
               alpha = 0.5,
               inherit.aes = FALSE) + 
    geom_line() +
    geom_ribbon(alpha = 0.2, color = FALSE) +
    scale_x_log10(labels = scales::dollar) # We use a new geom here to draw the area covered by the prediction intervals: geom_ribbon(). 

6.5 Tidy model objects with broom

out_comp <- tidy(out)
out_comp %>% round_df()
## # A tibble: 7 x 5
##   term              estimate std.error statistic p.value
##   <chr>                <dbl>     <dbl>     <dbl>   <dbl>
## 1 (Intercept)          47.8      0.34     141.         0
## 2 gdpPercap             0        0         19.2        0
## 3 pop                   0        0          3.33       0
## 4 continentAmericas    13.5      0.6       22.5        0
## 5 continentAsia         8.19     0.570     14.3        0
## 6 continentEurope      17.5      0.62      28.0        0
## 7 continentOceania     18.1      1.78      10.2        0
p <- ggplot(out_comp, mapping = aes(x = term,
                                    y = estimate))

p + geom_point() + coord_flip() 

out_conf <- tidy(out, conf.int = TRUE)
out_conf %>% round_df()
## # A tibble: 7 x 7
##   term              estimate std.error statistic p.value conf.low conf.high
##   <chr>                <dbl>     <dbl>     <dbl>   <dbl>    <dbl>     <dbl>
## 1 (Intercept)          47.8      0.34     141.         0    47.2      48.5 
## 2 gdpPercap             0        0         19.2        0     0         0   
## 3 pop                   0        0          3.33       0     0         0   
## 4 continentAmericas    13.5      0.6       22.5        0    12.3      14.6 
## 5 continentAsia         8.19     0.570     14.3        0     7.07      9.31
## 6 continentEurope      17.5      0.62      28.0        0    16.2      18.7 
## 7 continentOceania     18.1      1.78      10.2        0    14.6      21.6
out_conf <- subset(out_conf, term %nin% "(Intercept)")  
# The convenience “not in” operator %nin% is available via the socviz library. It does the opposite of %in% and selects only the items in a first vector of characters that are not in the second. We’ll use it to drop the intercept term from the table. 
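
A quick check of what %nin% does (my own example, not from the book):

c("(Intercept)", "gdpPercap", "pop") %nin% "(Intercept)"  # should return FALSE TRUE TRUE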


out_conf$nicelabs <- prefix_strip(out_conf$term, "continent")  
# R will create coefficient names based on the variable name and the category name, like continentAmericas. Normally we like to clean these up before plotting. Most commonly, we just want to strip away the variable name at the beginning of the coefficient label. For this we can use prefix_strip(), a convenience function in the socviz library. We tell it which prefixes to drop, using it to create a new column variable in out_conf that corresponds to the terms column, but that has nicer labels.
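
For instance (my own illustration, not from the book), prefix_strip() should turn coefficient names like "continentAmericas" into plain category labels such as "Americas":

prefix_strip(c("continentAmericas", "continentAsia"), "continent")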

p <- ggplot(out_conf, mapping = aes(x = reorder(nicelabs, estimate),
                                    y = estimate, ymin = conf.low, ymax = conf.high))  

p + geom_pointrange() + coord_flip() + labs(x="", y="OLS Estimate")  

Get observation-level statistics using augment()

out_aug <- augment(out)  

head(out_aug) %>% round_df()
## # A tibble: 6 x 11
##   lifeExp gdpPercap    pop continent .fitted .se.fit .resid  .hat .sigma
##     <dbl>     <dbl>  <dbl> <fct>       <dbl>   <dbl>  <dbl> <dbl>  <dbl>
## 1    28.8      779. 8.43e6 Asia         56.4    0.47  -27.6     0   8.34
## 2    30.3      821. 9.24e6 Asia         56.4    0.47  -26.1     0   8.34
## 3    32        853. 1.03e7 Asia         56.5    0.47  -24.5     0   8.35
## 4    34.0      836. 1.15e7 Asia         56.5    0.47  -22.4     0   8.35
## 5    36.1      740. 1.31e7 Asia         56.4    0.47  -20.3     0   8.35
## 6    38.4      786. 1.49e7 Asia         56.5    0.47  -18.0     0   8.36
## # ... with 2 more variables: .cooksd <dbl>, .std.resid <dbl>
out_aug <- augment(out, data = gapminder)
head(out_aug) %>% round_df()
## # A tibble: 6 x 13
##   country continent  year lifeExp    pop gdpPercap .fitted .se.fit .resid
##   <fct>   <fct>     <dbl>   <dbl>  <dbl>     <dbl>   <dbl>   <dbl>  <dbl>
## 1 Afghan~ Asia       1952    28.8 8.43e6      779.    56.4    0.47  -27.6
## 2 Afghan~ Asia       1957    30.3 9.24e6      821.    56.4    0.47  -26.1
## 3 Afghan~ Asia       1962    32   1.03e7      853.    56.5    0.47  -24.5
## 4 Afghan~ Asia       1967    34.0 1.15e7      836.    56.5    0.47  -22.4
## 5 Afghan~ Asia       1972    36.1 1.31e7      740.    56.4    0.47  -20.3
## 6 Afghan~ Asia       1977    38.4 1.49e7      786.    56.5    0.47  -18.0
## # ... with 4 more variables: .hat <dbl>, .sigma <dbl>, .cooksd <dbl>,
## #   .std.resid <dbl>
p <- ggplot(data = out_aug,
            mapping = aes(x = .fitted, y = .resid))
p + geom_point()

Get model-level statistics with glance()

glance(out) %>% round_df()  
## # A tibble: 1 x 11
##   r.squared adj.r.squared sigma statistic p.value    df logLik    AIC
##       <dbl>         <dbl> <dbl>     <dbl>   <dbl> <dbl>  <dbl>  <dbl>
## 1     0.580         0.580  8.37      394.       0     7 -6034. 12084.
## # ... with 3 more variables: BIC <dbl>, deviance <dbl>, df.residual <dbl>
library(survival)
## Warning: package 'survival' was built under R version 3.6.2
out_cph <- coxph(Surv(time, status) ~ age + sex, data = lung)

out_surv <- survfit(out_cph) # predict values 

summary(out_cph)
## Call:
## coxph(formula = Surv(time, status) ~ age + sex, data = lung)
## 
##   n= 228, number of events= 165 
## 
##         coef exp(coef) se(coef)     z Pr(>|z|)
## age  0.01705   1.01719  0.00922  1.85   0.0646
## sex -0.51322   0.59857  0.16746 -3.06   0.0022
## 
##     exp(coef) exp(-coef) lower .95 upper .95
## age     1.017      0.983     0.999     1.036
## sex     0.599      1.671     0.431     0.831
## 
## Concordance= 0.603  (se = 0.025 )
## Likelihood ratio test= 14.1  on 2 df,   p=9e-04
## Wald test            = 13.5  on 2 df,   p=0.001
## Score (logrank) test = 13.7  on 2 df,   p=0.001
summary(out_surv)
## Call: survfit(formula = out_cph)
## 
##  time n.risk n.event survival std.err lower 95% CI upper 95% CI
##     5    228       1   0.9958 0.00417       0.9877        1.000
##    11    227       3   0.9833 0.00831       0.9671        1.000
##    12    224       1   0.9791 0.00928       0.9611        0.997
##    13    223       2   0.9706 0.01096       0.9494        0.992
##    15    221       1   0.9664 0.01170       0.9438        0.990
##    26    220       1   0.9622 0.01240       0.9382        0.987
##    30    219       1   0.9579 0.01305       0.9327        0.984
##    31    218       1   0.9537 0.01368       0.9273        0.981
##    53    217       2   0.9452 0.01484       0.9165        0.975
##    54    215       1   0.9409 0.01538       0.9112        0.972
##    59    214       1   0.9366 0.01590       0.9060        0.968
##    60    213       2   0.9280 0.01689       0.8955        0.962
##    61    211       1   0.9237 0.01735       0.8903        0.958
##    62    210       1   0.9194 0.01780       0.8852        0.955
##    65    209       2   0.9109 0.01865       0.8751        0.948
##    71    207       1   0.9066 0.01906       0.8700        0.945
##    79    206       1   0.9023 0.01945       0.8650        0.941
##    81    205       2   0.8937 0.02020       0.8550        0.934
##    88    203       2   0.8851 0.02091       0.8451        0.927
##    92    201       1   0.8809 0.02126       0.8402        0.924
##    93    199       1   0.8766 0.02159       0.8352        0.920
##    95    198       2   0.8679 0.02224       0.8254        0.913
##   105    196       1   0.8636 0.02256       0.8205        0.909
##   107    194       2   0.8549 0.02317       0.8107        0.902
##   110    192       1   0.8506 0.02346       0.8058        0.898
##   116    191       1   0.8462 0.02375       0.8009        0.894
##   118    190       1   0.8419 0.02403       0.7961        0.890
##   122    189       1   0.8375 0.02431       0.7912        0.887
##   131    188       1   0.8332 0.02458       0.7863        0.883
##   132    187       2   0.8244 0.02510       0.7767        0.875
##   135    185       1   0.8201 0.02535       0.7719        0.871
##   142    184       1   0.8157 0.02560       0.7671        0.867
##   144    183       1   0.8113 0.02584       0.7622        0.864
##   145    182       2   0.8026 0.02631       0.7527        0.856
##   147    180       1   0.7982 0.02654       0.7479        0.852
##   153    179       1   0.7939 0.02676       0.7431        0.848
##   156    178       2   0.7852 0.02719       0.7336        0.840
##   163    176       3   0.7720 0.02780       0.7194        0.828
##   166    173       2   0.7632 0.02819       0.7099        0.821
##   167    171       1   0.7588 0.02837       0.7052        0.817
##   170    170       1   0.7545 0.02856       0.7005        0.813
##   175    167       1   0.7500 0.02874       0.6958        0.809
##   176    165       1   0.7456 0.02892       0.6910        0.804
##   177    164       1   0.7411 0.02910       0.6862        0.800
##   179    162       2   0.7321 0.02946       0.6766        0.792
##   180    160       1   0.7276 0.02963       0.6717        0.788
##   181    159       2   0.7185 0.02996       0.6621        0.780
##   182    157       1   0.7140 0.03012       0.6573        0.776
##   183    156       1   0.7095 0.03028       0.6525        0.771
##   186    154       1   0.7049 0.03044       0.6477        0.767
##   189    152       1   0.7003 0.03059       0.6428        0.763
##   194    149       1   0.6957 0.03075       0.6379        0.759
##   197    147       1   0.6910 0.03091       0.6330        0.754
##   199    145       1   0.6863 0.03106       0.6280        0.750
##   201    144       2   0.6769 0.03136       0.6181        0.741
##   202    142       1   0.6722 0.03150       0.6132        0.737
##   207    139       1   0.6674 0.03165       0.6082        0.732
##   208    138       1   0.6627 0.03179       0.6032        0.728
##   210    137       1   0.6579 0.03192       0.5982        0.724
##   212    135       1   0.6531 0.03206       0.5932        0.719
##   218    134       1   0.6483 0.03219       0.5882        0.715
##   222    132       1   0.6435 0.03232       0.5832        0.710
##   223    130       1   0.6386 0.03246       0.5781        0.706
##   226    126       1   0.6336 0.03260       0.5728        0.701
##   229    125       1   0.6286 0.03274       0.5676        0.696
##   230    124       1   0.6236 0.03287       0.5624        0.691
##   239    121       2   0.6133 0.03314       0.5517        0.682
##   245    117       1   0.6081 0.03327       0.5463        0.677
##   246    116       1   0.6030 0.03340       0.5409        0.672
##   267    112       1   0.5977 0.03354       0.5354        0.667
##   268    111       1   0.5924 0.03367       0.5299        0.662
##   269    110       1   0.5871 0.03379       0.5244        0.657
##   270    108       1   0.5817 0.03392       0.5189        0.652
##   283    104       1   0.5762 0.03405       0.5132        0.647
##   284    103       1   0.5707 0.03419       0.5075        0.642
##   285    101       2   0.5595 0.03444       0.4959        0.631
##   286     99       1   0.5538 0.03456       0.4901        0.626
##   288     98       1   0.5482 0.03468       0.4843        0.621
##   291     97       1   0.5426 0.03479       0.4785        0.615
##   293     94       1   0.5368 0.03490       0.4726        0.610
##   301     91       1   0.5310 0.03502       0.4666        0.604
##   303     89       1   0.5250 0.03514       0.4604        0.599
##   305     87       1   0.5189 0.03526       0.4542        0.593
##   306     86       1   0.5129 0.03537       0.4480        0.587
##   310     85       2   0.5007 0.03558       0.4356        0.576
##   320     82       1   0.4946 0.03568       0.4294        0.570
##   329     81       1   0.4884 0.03577       0.4231        0.564
##   337     79       1   0.4822 0.03586       0.4168        0.558
##   340     78       1   0.4760 0.03594       0.4105        0.552
##   345     77       1   0.4698 0.03601       0.4043        0.546
##   348     76       1   0.4636 0.03607       0.3981        0.540
##   350     75       1   0.4575 0.03612       0.3919        0.534
##   351     74       1   0.4514 0.03617       0.3858        0.528
##   353     73       2   0.4391 0.03623       0.3735        0.516
##   361     70       1   0.4329 0.03626       0.3674        0.510
##   363     69       2   0.4205 0.03629       0.3551        0.498
##   364     67       1   0.4143 0.03629       0.3489        0.492
##   371     65       2   0.4017 0.03629       0.3365        0.479
##   387     60       1   0.3952 0.03629       0.3301        0.473
##   390     59       1   0.3887 0.03629       0.3237        0.467
##   394     58       1   0.3822 0.03627       0.3173        0.460
##   426     55       1   0.3753 0.03628       0.3106        0.454
##   428     54       1   0.3685 0.03627       0.3038        0.447
##   429     53       1   0.3616 0.03625       0.2971        0.440
##   433     52       1   0.3547 0.03622       0.2904        0.433
##   442     51       1   0.3478 0.03618       0.2837        0.426
##   444     50       1   0.3409 0.03612       0.2770        0.420
##   450     48       1   0.3338 0.03607       0.2701        0.413
##   455     47       1   0.3267 0.03600       0.2633        0.405
##   457     46       1   0.3196 0.03592       0.2564        0.398
##   460     44       1   0.3123 0.03584       0.2494        0.391
##   473     43       1   0.3049 0.03575       0.2423        0.384
##   477     42       1   0.2976 0.03564       0.2353        0.376
##   519     39       1   0.2899 0.03554       0.2280        0.369
##   520     38       1   0.2822 0.03543       0.2206        0.361
##   524     37       2   0.2668 0.03514       0.2061        0.345
##   533     34       1   0.2590 0.03498       0.1987        0.337
##   550     32       1   0.2510 0.03481       0.1913        0.329
##   558     30       1   0.2428 0.03463       0.1836        0.321
##   567     28       1   0.2344 0.03445       0.1757        0.313
##   574     27       1   0.2259 0.03425       0.1678        0.304
##   583     26       1   0.2173 0.03401       0.1599        0.295
##   613     24       1   0.2083 0.03378       0.1516        0.286
##   624     23       1   0.1992 0.03351       0.1433        0.277
##   641     22       1   0.1901 0.03320       0.1350        0.268
##   643     21       1   0.1811 0.03283       0.1269        0.258
##   654     20       1   0.1719 0.03243       0.1187        0.249
##   655     19       1   0.1627 0.03196       0.1107        0.239
##   687     18       1   0.1535 0.03145       0.1027        0.229
##   689     17       1   0.1444 0.03088       0.0950        0.220
##   705     16       1   0.1352 0.03025       0.0872        0.210
##   707     15       1   0.1263 0.02954       0.0799        0.200
##   728     14       1   0.1172 0.02878       0.0725        0.190
##   731     13       1   0.1083 0.02794       0.0653        0.180
##   735     12       1   0.0995 0.02703       0.0584        0.169
##   765     10       1   0.0904 0.02606       0.0514        0.159
##   791      9       1   0.0817 0.02499       0.0449        0.149
##   814      7       1   0.0719 0.02387       0.0375        0.138
##   883      4       1   0.0577 0.02303       0.0264        0.126
out_tidy <- tidy(out_surv)

p <- ggplot(data = out_tidy, mapping = aes(time, estimate))
p + geom_line() +
    geom_ribbon(mapping = aes(ymin = conf.low, ymax = conf.high), 
                alpha = .2)

6.6 Grouped analysis and list columns

eu77 <- gapminder %>% filter(continent == "Europe", year == 1977)  
fit <- lm(lifeExp ~ log(gdpPercap), data = eu77)
summary(fit)
## 
## Call:
## lm(formula = lifeExp ~ log(gdpPercap), data = eu77)
## 
## Residuals:
##    Min     1Q Median     3Q    Max 
## -7.496 -1.031  0.093  1.176  3.712 
## 
## Coefficients:
##                Estimate Std. Error t value Pr(>|t|)
## (Intercept)      29.489      7.161    4.12  0.00031
## log(gdpPercap)    4.488      0.756    5.94  2.2e-06
## 
## Residual standard error: 2.11 on 28 degrees of freedom
## Multiple R-squared:  0.557,  Adjusted R-squared:  0.541 
## F-statistic: 35.2 on 1 and 28 DF,  p-value: 2.17e-06
# With dplyr and broom we can do this for every continent-year slice of the data in a compact and tidy way 
out_le <- gapminder %>%
    group_by(continent, year) %>%
    nest()

out_le
## # A tibble: 60 x 3
## # Groups:   continent, year [60]
##    continent  year           data
##    <fct>     <int> <list<df[,4]>>
##  1 Asia       1952       [33 x 4]
##  2 Asia       1957       [33 x 4]
##  3 Asia       1962       [33 x 4]
##  4 Asia       1967       [33 x 4]
##  5 Asia       1972       [33 x 4]
##  6 Asia       1977       [33 x 4]
##  7 Asia       1982       [33 x 4]
##  8 Asia       1987       [33 x 4]
##  9 Asia       1992       [33 x 4]
## 10 Asia       1997       [33 x 4]
## # ... with 50 more rows
# Our “Europe 1977” fit is in there. We can look at it, if we like, by filtering the data and then unnesting the list column.  

out_le %>% filter(continent == "Europe" & year == 1977) %>% unnest()
## Warning: `cols` is now required.
## Please use `cols = c(data)`
## # A tibble: 30 x 6
## # Groups:   continent, year [5]
##    continent  year country                lifeExp      pop gdpPercap
##    <fct>     <int> <fct>                    <dbl>    <int>     <dbl>
##  1 Europe     1977 Albania                   68.9  2509048     3533.
##  2 Europe     1977 Austria                   72.2  7568430    19749.
##  3 Europe     1977 Belgium                   72.8  9821800    19118.
##  4 Europe     1977 Bosnia and Herzegovina    69.9  4086000     3528.
##  5 Europe     1977 Bulgaria                  70.8  8797022     7612.
##  6 Europe     1977 Croatia                   70.6  4318673    11305.
##  7 Europe     1977 Czech Republic            70.7 10161915    14800.
##  8 Europe     1977 Denmark                   74.7  5088419    20423.
##  9 Europe     1977 Finland                   72.5  4738902    15605.
## 10 Europe     1977 France                    73.8 53165019    18293.
## # ... with 20 more rows

List columns are useful because we can act on them in a compact and tidy way. In particular, we can pass functions along to each row of the list column and make something happen. For example, a moment ago we ran a regression of life expectancy and logged GDP for European countries in 1977. We can do that for every continent-year combination in the data. We first create a convenience function called fit_ols() that takes a single argument, df (for data frame) and that fits the linear model we are interested in. Then we map that function to each of our list column rows in turn. Recall from Chapter 4 that mutate creates new variables or columns on the fly within a pipeline.

The map action is an important idea in functional programming. If you have written code in other, more imperative languages you can think of it as a compact alternative to writing for … next loops. You can of course write loops like this in R. Computationally they are often not any less efficient than their functional alternatives. But mapping functions to arrays is more easily integrated into a sequence of data transformations.
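
As a tiny illustration of that idea (my own example, not from the book), these two snippets build the same list of squares:

squares_loop <- vector("list", 3)
for (i in 1:3) squares_loop[[i]] <- i^2   # explicit loop

squares_map <- purrr::map(1:3, ~ .x^2)    # the same thing as a single map() call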

fit_ols <- function(df) {
    lm(lifeExp ~ log(gdpPercap), data = df)
}

out_le <- gapminder %>%
    group_by(continent, year) %>%
    nest() %>% 
    mutate(model = map(data, fit_ols)) 

out_le    
## # A tibble: 60 x 4
## # Groups:   continent, year [60]
##    continent  year           data model 
##    <fct>     <int> <list<df[,4]>> <list>
##  1 Asia       1952       [33 x 4] <lm>  
##  2 Asia       1957       [33 x 4] <lm>  
##  3 Asia       1962       [33 x 4] <lm>  
##  4 Asia       1967       [33 x 4] <lm>  
##  5 Asia       1972       [33 x 4] <lm>  
##  6 Asia       1977       [33 x 4] <lm>  
##  7 Asia       1982       [33 x 4] <lm>  
##  8 Asia       1987       [33 x 4] <lm>  
##  9 Asia       1992       [33 x 4] <lm>  
## 10 Asia       1997       [33 x 4] <lm>  
## # ... with 50 more rows
# First we extract summary statistics from each model by mapping the tidy() function from broom to the model list column. Then we unnest the result, dropping the other columns in the process. Finally, we filter out all the Intercept terms, and also drop all observations from Oceania. In the case of the Intercepts we do this just out of convenience. Oceania we drop just because there are so few observations. We put the results in an object called out_tidy.


out_tidy <- gapminder %>%
    group_by(continent, year) %>%
    nest() %>% 
    mutate(model = map(data, fit_ols),
           tidied = map(model, tidy)) %>%
    unnest(tidied, .drop = TRUE) %>%
    filter(term %nin% "(Intercept)" &
           continent %nin% "Oceania")
## Warning: The `.drop` argument of `unnest()` is deprecated as of tidyr 1.0.0.
## All list-columns are now preserved.
## This warning is displayed once per session.
## Call `lifecycle::last_warnings()` to see where this warning was generated.
out_tidy %>% head()
## # A tibble: 6 x 9
## # Groups:   continent, year [6]
##   continent  year     data model term  estimate std.error statistic p.value
##   <fct>     <int> <list<d> <lis> <chr>    <dbl>     <dbl>     <dbl>   <dbl>
## 1 Asia       1952 [33 x 4] <lm>  log(~     4.16      1.25      3.33 2.28e-3
## 2 Asia       1957 [33 x 4] <lm>  log(~     4.17      1.28      3.26 2.71e-3
## 3 Asia       1962 [33 x 4] <lm>  log(~     4.59      1.24      3.72 7.94e-4
## 4 Asia       1967 [33 x 4] <lm>  log(~     4.50      1.15      3.90 4.77e-4
## 5 Asia       1972 [33 x 4] <lm>  log(~     4.44      1.01      4.41 1.16e-4
## 6 Asia       1977 [33 x 4] <lm>  log(~     4.87      1.03      4.75 4.42e-5
p <- ggplot(data = out_tidy,
            mapping = aes(x = year, y = estimate,
                          ymin = estimate - 2*std.error,
                          ymax = estimate + 2*std.error,
                          group = continent, color = continent))

# The call to position_dodge() within geom_pointrange() allows the point ranges for each continent to be near each other within years, instead of being plotted right on top of one another. 

p + geom_pointrange(position = position_dodge(width = 1)) + 
    scale_x_continuous(breaks = unique(gapminder$year)) + 
    theme(legend.position = "top") +
    labs(x = "Year", y = "Estimate", color = "Continent")

6.7 Plot marginal effects

library(margins)
## Warning: package 'margins' was built under R version 3.6.2

We will fit a logistic regression on obama, with age, polviews, race, and sex as the predictors. The age variable is the respondent’s age in years. The sex variable is coded as “Male” or “Female” with “Male” as the reference category. The race variable is coded as “White”, “Black”, or “Other” with “White” as the reference category. The polviews measure is a self-reported scale of the respondent’s political orientation from “Extremely Conservative” through “Extremely Liberal”, with “Moderate” in the middle. We take polviews and create a new variable, polviews_m, using the relevel() function to recode “Moderate” to be the reference category. We fit the model with the glm() function, and specify an interaction between race and sex.

gss_sm$polviews_m <- relevel(gss_sm$polviews, ref = "Moderate") 

out_bo <- glm(obama ~ polviews_m + sex*race, family = "binomial", data = gss_sm)

summary(out_bo)
## 
## Call:
## glm(formula = obama ~ polviews_m + sex * race, family = "binomial", 
##     data = gss_sm)
## 
## Deviance Residuals: 
##    Min      1Q  Median      3Q     Max  
## -2.905  -0.554   0.177   0.542   2.244  
## 
## Coefficients:
##                                  Estimate Std. Error z value Pr(>|z|)
## (Intercept)                       0.29649    0.13409    2.21   0.0270
## polviews_mExtremely Liberal       2.37295    0.52504    4.52  6.2e-06
## polviews_mLiberal                 2.60003    0.35667    7.29  3.1e-13
## polviews_mSlightly Liberal        1.29317    0.24843    5.21  1.9e-07
## polviews_mSlightly Conservative  -1.35528    0.18129   -7.48  7.7e-14
## polviews_mConservative           -2.34746    0.20038  -11.71  < 2e-16
## polviews_mExtremely Conservative -2.72738    0.38721   -7.04  1.9e-12
## sexFemale                         0.25487    0.14537    1.75   0.0796
## raceBlack                         3.84953    0.50132    7.68  1.6e-14
## raceOther                        -0.00214    0.43576    0.00   0.9961
## sexFemale:raceBlack              -0.19751    0.66007   -0.30   0.7648
## sexFemale:raceOther               1.57483    0.58766    2.68   0.0074
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 2247.9  on 1697  degrees of freedom
## Residual deviance: 1345.9  on 1686  degrees of freedom
##   (1169 observations deleted due to missingness)
## AIC: 1370
## 
## Number of Fisher Scoring iterations: 6
# Using margins(), we can calculate the marginal effects for each variable.
bo_m <- margins(out_bo)

summary(bo_m)
##                            factor     AME     SE        z      p   lower
##            polviews_mConservative -0.4119 0.0283 -14.5394 0.0000 -0.4674
##  polviews_mExtremely Conservative -0.4538 0.0420 -10.7971 0.0000 -0.5361
##       polviews_mExtremely Liberal  0.2681 0.0295   9.0996 0.0000  0.2103
##                 polviews_mLiberal  0.2768 0.0229  12.0736 0.0000  0.2319
##   polviews_mSlightly Conservative -0.2658 0.0330  -8.0596 0.0000 -0.3304
##        polviews_mSlightly Liberal  0.1933 0.0303   6.3896 0.0000  0.1340
##                         raceBlack  0.4032 0.0173  23.3568 0.0000  0.3694
##                         raceOther  0.1247 0.0386   3.2297 0.0012  0.0490
##                         sexFemale  0.0443 0.0177   2.5073 0.0122  0.0097
##    upper
##  -0.3564
##  -0.3714
##   0.3258
##   0.3218
##  -0.2011
##   0.2526
##   0.4371
##   0.2005
##   0.0789
plot(bo_m) # to see a plot of the average marginal effects  

bo_gg <- as_tibble(summary(bo_m))
bo_gg
## # A tibble: 9 x 7
##   factor                       AME     SE      z         p    lower   upper
##   <chr>                      <dbl>  <dbl>  <dbl>     <dbl>    <dbl>   <dbl>
## 1 polviews_mConservative   -0.412  0.0283 -14.5  6.82e- 48 -0.467   -0.356 
## 2 polviews_mExtremely Con~ -0.454  0.0420 -10.8  3.55e- 27 -0.536   -0.371 
## 3 polviews_mExtremely Lib~  0.268  0.0295   9.10 9.07e- 20  0.210    0.326 
## 4 polviews_mLiberal         0.277  0.0229  12.1  1.46e- 33  0.232    0.322 
## 5 polviews_mSlightly Cons~ -0.266  0.0330  -8.06 7.65e- 16 -0.330   -0.201 
## 6 polviews_mSlightly Libe~  0.193  0.0303   6.39 1.66e- 10  0.134    0.253 
## 7 raceBlack                 0.403  0.0173  23.4  1.18e-120  0.369    0.437 
## 8 raceOther                 0.125  0.0386   3.23 1.24e-  3  0.0490   0.200 
## 9 sexFemale                 0.0443 0.0177   2.51 1.22e-  2  0.00967  0.0789
prefixes <- c("polviews_m", "sex")
bo_gg$factor <- prefix_strip(bo_gg$factor, prefixes) 
bo_gg$factor
## [1] "Conservative"           "Extremely Conservative"
## [3] "Extremely Liberal"      "Liberal"               
## [5] "Slightly Conservative"  "Slightly Liberal"      
## [7] "raceBlack"              "raceOther"             
## [9] "Female"
bo_gg$factor <- prefix_replace(bo_gg$factor, "race", "Race: ")
bo_gg$factor
## [1] "Conservative"           "Extremely Conservative"
## [3] "Extremely Liberal"      "Liberal"               
## [5] "Slightly Conservative"  "Slightly Liberal"      
## [7] "Race: Black"            "Race: Other"           
## [9] "Female"
bo_gg %>% select(factor, AME, lower, upper)
## # A tibble: 9 x 4
##   factor                     AME    lower   upper
##   <chr>                    <dbl>    <dbl>   <dbl>
## 1 Conservative           -0.412  -0.467   -0.356 
## 2 Extremely Conservative -0.454  -0.536   -0.371 
## 3 Extremely Liberal       0.268   0.210    0.326 
## 4 Liberal                 0.277   0.232    0.322 
## 5 Slightly Conservative  -0.266  -0.330   -0.201 
## 6 Slightly Liberal        0.193   0.134    0.253 
## 7 Race: Black             0.403   0.369    0.437 
## 8 Race: Other             0.125   0.0490   0.200 
## 9 Female                  0.0443  0.00967  0.0789
# now we have a table that we can plot as we have learned  
p <- ggplot(data = bo_gg, aes(x = reorder(factor, AME), y = AME, 
                              ymin = lower, ymax = upper)) 

p + geom_hline(yintercept = 0, color = "gray80") + geom_pointrange() + 
  coord_flip() + labs(x = NULL, y = "Average Marginal Effect")

# getting conditional effects for a particular variable 
pv_cp <- cplot(out_bo, x = "sex", draw = FALSE)
##    xvals  yvals  upper  lower
## 1   Male 0.5736 0.6379 0.5093
## 2 Female 0.6345 0.6888 0.5801
pv_cp  
##    xvals  yvals  upper  lower
## 1   Male 0.5736 0.6379 0.5093
## 2 Female 0.6345 0.6888 0.5801
p <- ggplot(data = pv_cp, aes(x = reorder(xvals, yvals), y = yvals, 
                              ymin = lower, ymax = upper))

p + geom_hline(yintercept = 0, color = "gray80") + 
  geom_pointrange() + coord_flip() + 
  labs(x = NULL, y = "Conditional Effect")

6.8 Plots from complex surveys

library(survey)
## Loading required package: grid
## Loading required package: Matrix
## 
## Attaching package: 'Matrix'
## The following objects are masked from 'package:tidyr':
## 
##     expand, pack, unpack
## 
## Attaching package: 'survey'
## The following object is masked from 'package:graphics':
## 
##     dotchart
library(srvyr)
## Warning: package 'srvyr' was built under R version 3.6.2
## 
## Attaching package: 'srvyr'
## The following object is masked from 'package:stats':
## 
##     filter
# These two options provide some information to the survey library
# about how to behave; consult the survey package documentation for details.
options(survey.lonely.psu = "adjust")  
options(na.action = "na.pass")

# The subsequent operations create gss_wt, an object with one additional column (stratvar), describing the yearly sampling strata. We use the interaction() function to do this. It crosses the vstrat variable with the year variable to get a vector of stratum information for each year. In the next step, we use the as_survey_design() function to add the key pieces of information about the survey design. It adds information about the sampling identifiers (ids), the strata (strata), and the sampling weights (weights).
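# A tiny illustration (not from the book): interaction() crosses two factors
# into a single combined factor of their level combinations.
interaction(factor(c(1976, 1976, 1980)), factor(c(1, 2, 1)))
# gives the values 1976.1, 1976.2, 1980.1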

gss_wt <- subset(gss_lon, year > 1974) %>% 
  mutate(stratvar = interaction(year, vstrat)) %>% 
  as_survey_design(ids = vpsu, 
                   strata = stratvar, 
                   weights = wtssall, 
                   nest = TRUE)

head(gss_wt)
## Stratified 1 - level Cluster Sampling design (with replacement)
## With (2) clusters.
## Called via srvyr
## Sampling variables:
##  - ids: vpsu
##  - strata: stratvar
##  - weights: wtssall
## Data variables: year (dbl), id (dbl), ballot (dbl), age (dbl), degree
##   (fct), race (fct), sex (fct), siblings (fct), kids (fct), bigregion
##   (fct), income16 (fct), religion (fct), marital (fct), padeg (fct), madeg
##   (fct), partyid (fct), polviews (fct), happy (fct), partners_rc (fct),
##   grass (fct), zodiac (fct), pres12 (dbl), wtssall (dbl), vpsu (dbl),
##   vstrat (dbl), stratvar (fct)
out_grp <- gss_wt %>% filter(year %in% seq(1976, 2016, by = 4)) %>% 
  group_by(year, race, degree) %>% 
  summarize(prop = survey_mean(na.rm = TRUE))
## Warning: Factor `degree` contains implicit NA, consider using
## `forcats::fct_explicit_na`
out_grp
## # A tibble: 162 x 5
##     year race  degree            prop  prop_se
##    <dbl> <fct> <fct>            <dbl>    <dbl>
##  1  1976 White Lt High School  0.328   0.0160 
##  2  1976 White High School     0.518   0.0162 
##  3  1976 White Junior College  0.0129  0.00298
##  4  1976 White Bachelor        0.101   0.00960
##  5  1976 White Graduate        0.0393  0.00644
##  6  1976 White <NA>           NA      NA      
##  7  1976 Black Lt High School  0.562   0.0611 
##  8  1976 Black High School     0.337   0.0476 
##  9  1976 Black Junior College  0.0426  0.0193 
## 10  1976 Black Bachelor        0.0581  0.0239 
## # ... with 152 more rows
out_mrg <- gss_wt %>%
    filter(year %in% seq(1976, 2016, by = 4)) %>%
    mutate(racedeg = interaction(race, degree)) %>%
    group_by(year, racedeg) %>%
    summarize(prop = survey_mean(na.rm = TRUE))
## Warning: Factor `racedeg` contains implicit NA, consider using
## `forcats::fct_explicit_na`
out_mrg  
## # A tibble: 155 x 4
##     year racedeg                 prop prop_se
##    <dbl> <fct>                  <dbl>   <dbl>
##  1  1976 White.Lt High School 0.298   0.0146 
##  2  1976 Black.Lt High School 0.0471  0.00840
##  3  1976 Other.Lt High School 0.00195 0.00138
##  4  1976 White.High School    0.471   0.0160 
##  5  1976 Black.High School    0.0283  0.00594
##  6  1976 Other.High School    0.00325 0.00166
##  7  1976 White.Junior College 0.0117  0.00268
##  8  1976 Black.Junior College 0.00357 0.00162
##  9  1976 White.Bachelor       0.0919  0.00888
## 10  1976 Black.Bachelor       0.00487 0.00214
## # ... with 145 more rows
out_mrg <- gss_wt %>%
    filter(year %in% seq(1976, 2016, by = 4)) %>%
    mutate(racedeg = interaction(race, degree)) %>%
    group_by(year, racedeg) %>%
    summarize(prop = survey_mean(na.rm = TRUE)) %>% 
  separate(racedeg, sep = "\\.", into = c("race", "degree"))  
## Warning: Factor `racedeg` contains implicit NA, consider using
## `forcats::fct_explicit_na`
p <- ggplot(data = subset(out_grp, race %nin% "Other"),
            mapping = aes(x = degree, y = prop,
                          ymin = prop - 2*prop_se,
                          ymax = prop + 2*prop_se,
                          fill = race,
                          color = race,
                          group = race))

dodge <- position_dodge(width=0.9)

p + geom_col(position = dodge, alpha = 0.2) +
    geom_errorbar(position = dodge, width = 0.2) +
    scale_x_discrete(labels = scales::wrap_format(10)) +
    scale_y_continuous(labels = scales::percent) +
    scale_color_brewer(type = "qual", palette = "Dark2") +
    scale_fill_brewer(type = "qual", palette = "Dark2") +
    labs(title = "Educational Attainment by Race",
         subtitle = "GSS 1976-2016",
         fill = "Race",
         color = "Race",
         x = NULL, y = "Percent") +
    facet_wrap(~ year, ncol = 2) +
    theme(legend.position = "top")
## Warning: Removed 13 rows containing missing values (geom_col).
## Warning: Removed 13 rows containing missing values (geom_errorbar).

p <- ggplot(data = subset(out_grp, race %nin% "Other"), 
            aes(x = year, y = prop, ymin = prop - 2*prop_se, 
                ymax = prop + 2*prop_se, fill = race, color = race, 
                group = race))
 
p + geom_ribbon(alpha = 0.3, aes(color = NULL)) + 
  geom_line() + 
  facet_wrap(~ degree, ncol = 1) + 
  scale_y_continuous(labels = scales::percent)  +
  scale_color_brewer(type = "qual", palette = "Dark2") + 
  scale_fill_brewer(type = "qual", palette = "Dark2") + 
  labs(title = "Educational Attainment\nby Race", 
       subtitle = "GSS 1976 - 2016", fill = "Race", 
       color = "Race", x = NULL, y = "Percent") + 
  theme(legend.position = "top")
## Warning: Removed 16 rows containing missing values (geom_path).

6.9 Where to go next

default plots for models

out <- lm(lifeExp ~ log(gdpPercap) + pop + continent, data = gapminder)

plot(out, which = c(1, 2), ask = FALSE) 

# The which argument here selects the first two of the four default diagnostic plots for this kind of model.

# The ggfortify library works along lines similar to broom;
# its workhorse function is ggfortify::autoplot()
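# A sketch (not from the book), assuming ggfortify is installed:
# autoplot() renders the standard lm diagnostics as ggplot2 figures
library(ggfortify)
autoplot(out)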
# The coefplot library quickly plots point estimates and confidence intervals:
library(coefplot)
## Warning: package 'coefplot' was built under R version 3.6.2
out <- lm(lifeExp ~ log(gdpPercap) + log(pop) + continent, data = gapminder)

coefplot(out, sort = "magnitude", intercept = FALSE)

extensions to ggplot

Extensions such as GGally's generalized pairs plot are most often a useful tool for the working researcher to quickly investigate aspects of a data set. The goal is not to pithily summarize a single point one already knows, but to open things up for further exploration.

This sort of plot is like the visual version of a correlation matrix. It shows a bivariate plot for all pairs of variables in the data. This is relatively straightforward when all the variables are continuous measures. Things get more complex when, as is often the case in the social sciences, some or all variables are categorical or otherwise limited in the range of values they can take.

library(GGally)  
## Registered S3 method overwritten by 'GGally':
##   method from   
##   +.gg   ggplot2
## 
## Attaching package: 'GGally'
## The following object is masked from 'package:dplyr':
## 
##     nasa
organdata_sum <- organdata %>% select(donors, pop_dens, pubhealth, roads, consent_law)

ggpairs(data = organdata_sum, 
        aes(color = consent_law), 
        upper = list(continuous = wrap("density"), combo = "box_no_facet"),
        lower = list(continuous = wrap("points"), combo = wrap("dot_no_facet")))
## Warning: Removed 34 rows containing non-finite values (stat_density).
## Warning: Removed 34 rows containing non-finite values (stat_density2d).
## Warning: Removed 37 rows containing non-finite values (stat_density2d).
## Warning: Removed 34 rows containing non-finite values (stat_density2d).
## Warning: Removed 34 rows containing non-finite values (stat_boxplot).
## Warning: Removed 34 rows containing missing values (geom_point).
## Warning: Removed 17 rows containing non-finite values (stat_density).
## Warning: Removed 21 rows containing non-finite values (stat_density2d).
## Warning: Removed 17 rows containing non-finite values (stat_density2d).
## Warning: Removed 17 rows containing non-finite values (stat_boxplot).
## Warning: Removed 37 rows containing missing values (geom_point).
## Warning: Removed 21 rows containing missing values (geom_point).
## Warning: Removed 21 rows containing non-finite values (stat_density).
## Warning: Removed 21 rows containing non-finite values (stat_density2d).
## Warning: Removed 21 rows containing non-finite values (stat_boxplot).
## Warning: Removed 34 rows containing missing values (geom_point).
## Warning: Removed 17 rows containing missing values (geom_point).
## Warning: Removed 21 rows containing missing values (geom_point).
## Warning: Removed 17 rows containing non-finite values (stat_density).
## Warning: Removed 17 rows containing non-finite values (stat_boxplot).
## Warning: Removed 34 rows containing missing values (geom_point).
## Warning: Removed 17 rows containing missing values (geom_point).
## Warning: Removed 21 rows containing missing values (geom_point).
## Warning: Removed 17 rows containing missing values (geom_point).

chapter 7 draw maps

election %>% select(state, total_vote,
                    r_points, pct_trump, party, census) %>%
    sample_n(5)
## # A tibble: 5 x 6
##   state                total_vote r_points pct_trump party      census   
##   <chr>                     <dbl>    <dbl>     <dbl> <chr>      <chr>    
## 1 New Hampshire            744296   -0.370     46.5  Democrat   Northeast
## 2 Arizona                 2604657    3.5       48.1  Republican West     
## 3 District of Columbia     311268  -86.8        4.09 Democrat   South    
## 4 South Dakota             370093   29.8       61.5  Republican Midwest  
## 5 California             14237884  -30.0       31.5  Democrat   West
# Hex color codes for Dem Blue and Rep Red
party_colors <- c("#2E74C0", "#CB454A") 

p0 <- ggplot(data = subset(election, st %nin% "DC"),
             mapping = aes(x = r_points,
                           y = reorder(state, r_points),
                           color = party))

p1 <- p0 + geom_vline(xintercept = 0, color = "gray30") +
    geom_point(size = 2)

p1 

p2 <- p1 + scale_color_manual(values = party_colors)

p2 

p3 <- p2 + scale_x_continuous(breaks = c(-30, -20, -10, 0, 10, 20, 30, 40),
                              labels = c("30\n (Clinton)", "20", "10", "0",
                                         "10", "20", "30", "40\n(Trump)"))
p3 

p3 + facet_wrap(~ census, ncol=1, scales="free_y") +
    guides(color=FALSE) + labs(x = "Point Margin", y = "") +
    theme(axis.text=element_text(size=8))  

library(maps)
## 
## Attaching package: 'maps'
## The following object is masked from 'package:purrr':
## 
##     map
us_states <- map_data("state")
head(us_states)
##     long   lat group order  region subregion
## 1 -87.46 30.39     1     1 alabama      <NA>
## 2 -87.48 30.37     1     2 alabama      <NA>
## 3 -87.53 30.37     1     3 alabama      <NA>
## 4 -87.53 30.33     1     4 alabama      <NA>
## 5 -87.57 30.33     1     5 alabama      <NA>
## 6 -87.59 30.33     1     6 alabama      <NA>
dim(us_states)
## [1] 15537     6
p <- ggplot(data = us_states,
            mapping = aes(x = long, y = lat,
                          group = group))

p + geom_polygon(fill = "white", color = "black")

p <- ggplot(data = us_states,
            aes(x = long, y = lat,
                group = group, fill = region))

p + geom_polygon(color = "gray90", size = 0.1) + guides(fill = FALSE)

p <- ggplot(data = us_states,
            mapping = aes(x = long, y = lat,
                          group = group, fill = region))

p + geom_polygon(color = "gray90", size = 0.1) +
    coord_map(projection = "albers", lat0 = 39, lat1 = 45) +
    guides(fill = FALSE)  

# merge data  
election$region <- tolower(election$state)  

us_states_elec <- left_join(us_states, election)  
## Joining, by = "region"
p <- ggplot(data = us_states_elec,
            aes(x = long, y = lat,
                group = group, fill = party))

p + geom_polygon(color = "gray90", size = 0.1) +
    coord_map(projection = "albers", lat0 = 39, lat1 = 45) 

p0 <- ggplot(data = us_states_elec,
             mapping = aes(x = long, y = lat,
                           group = group, fill = party))
p1 <- p0 + geom_polygon(color = "gray90", size = 0.1) +
    coord_map(projection = "albers", lat0 = 39, lat1 = 45) 
p2 <- p1 + scale_fill_manual(values = party_colors) +
    labs(title = "Election Results 2016", fill = NULL)

library(ggthemes)  

p2 + theme_map() 

# Map other variables to fill, e.g. the percent Trump vote
p0 <- ggplot(data = us_states_elec,
             mapping = aes(x = long, y = lat, group = group, fill = pct_trump))

p1 <- p0 + geom_polygon(color = "gray90", size = 0.1) +
    coord_map(projection = "albers", lat0 = 39, lat1 = 45) 

p1 + labs(title = "Trump vote") + theme_map() + labs(fill = "Percent")

p2 <- p1 + scale_fill_gradient(low = "white", high = "#CB454A") +
        labs(title = "Trump vote") 

p2 + theme_map() + labs(fill = "Percent")

The scale_fill_gradient2() function gives us a blue-red spectrum that passes through white by default. Alternatively, we can re-specify the mid-level color along with the high and low colors. We will make purple our midpoint, and use the muted() function from the scales library to tone down the color a little.

p0 <- ggplot(data = us_states_elec,
             mapping = aes(x = long, y = lat, group = group, fill = d_points))

p1 <- p0 + geom_polygon(color = "gray90", size = 0.1) +
    coord_map(projection = "albers", lat0 = 39, lat1 = 45) 

p2 <- p1 + scale_fill_gradient2() + labs(title = "Winning margins") 
p2 + theme_map() + labs(fill = "Percent")

p3 <- p1 + scale_fill_gradient2(low = "red", mid = scales::muted("purple"),
                                high = "blue", breaks = c(-25, 0, 25, 50, 75)) +
    labs(title = "Winning margins") 

p3 + theme_map() + labs(fill = "Percent")

# omit DC to see the difference
p0 <- ggplot(data = subset(us_states_elec,
                           region %nin% "district of columbia"),
             aes(x = long, y = lat, group = group, fill = d_points))

p1 <- p0 + geom_polygon(color = "gray90", size = 0.1) +
    coord_map(projection = "albers", lat0 = 39, lat1 = 45) 

p2 <- p1 + scale_fill_gradient2(low = "red",
                                mid = scales::muted("purple"),
                                high = "blue") +
    labs(title = "Winning margins") 
p2 + theme_map() + labs(fill = "Percent")

7.2 America’s ur-choropleths

county_map %>% sample_n(5)  
##       long     lat  order  hole piece            group    id
## 1  1954188 -411909 170052 FALSE     1 0500000US51013.1 51013
## 2  -321340 -647892  32314 FALSE     1 0500000US08073.1 08073
## 3 -1099211   52795  49037 FALSE     1 0500000US16037.1 16037
## 4   838935 -396424  53724 FALSE     1 0500000US17143.1 17143
## 5   880714 -925626  19916 FALSE     1 0500000US05021.1 05021
county_data %>%
    select(id, name, state, pop_dens, pct_black) %>%
    sample_n(5)  
##      id             name state      pop_dens   pct_black
## 1 12119    Sumter County    FL [  100,  500) [ 5.0,10.0)
## 2 48395 Robertson County    TX [   10,   50) [15.0,25.0)
## 3 46107    Potter County    SD [    0,   10) [ 0.0, 2.0)
## 4 38091    Steele County    ND [    0,   10) [ 0.0, 2.0)
## 5 47089 Jefferson County    TN [  100,  500) [ 2.0, 5.0)
county_full <- left_join(county_map, county_data, by = "id")   

p <- ggplot(data = county_full,
            mapping = aes(x = long, y = lat,
                          fill = pop_dens, 
                          group = group))

p1 <- p + geom_polygon(color = "gray90", size = 0.05) + coord_equal()

p2 <- p1 + scale_fill_brewer(palette="Blues",
                             labels = c("0-10", "10-50", "50-100", "100-500",
                                        "500-1,000", "1,000-5,000", ">5,000"))

# We also tweak how the legend is drawn using the guides() function to make sure each element of the key appears on the same row.   

# The use of coord_equal() makes sure that the relative scale of our map does not change even if we alter the overall dimensions of the plot.  

p2 + labs(fill = "Population per\nsquare mile") +
    theme_map() +
    guides(fill = guide_legend(nrow = 1)) + 
    theme(legend.position = "bottom")  

# choose a different range of hues  
p <- ggplot(data = county_full,
            mapping = aes(x = long, y = lat, fill = pct_black, 
                          group = group))
p1 <- p + geom_polygon(color = "gray90", size = 0.05) + coord_equal()
p2 <- p1 + scale_fill_brewer(palette="Greens")

p2 + labs(fill = "US Population, Percent Black") +
    guides(fill = guide_legend(nrow = 1)) + 
    theme_map() + theme(legend.position = "bottom")

Visualize a binned measure of the rate of all firearm-related suicides between 1999 and 2015.

orange_pal <- RColorBrewer::brewer.pal(n = 6, name = "Oranges")
orange_pal 
## [1] "#FEEDDE" "#FDD0A2" "#FDAE6B" "#FD8D3C" "#E6550D" "#A63603"
orange_rev <- rev(orange_pal)
orange_rev 
## [1] "#A63603" "#E6550D" "#FD8D3C" "#FDAE6B" "#FDD0A2" "#FEEDDE"
gun_p <- ggplot(data = county_full,
            mapping = aes(x = long, y = lat,
                          fill = su_gun6, 
                          group = group))

gun_p1 <- gun_p + geom_polygon(color = "gray90", size = 0.05) + coord_equal()

gun_p2 <- gun_p1 + scale_fill_manual(values = orange_pal)

gun_p2 + labs(title = "Gun-Related Suicides, 1999-2015",
              fill = "Rate per 100,000 pop.") +
    theme_map() + theme(legend.position = "bottom")

pop_p <- ggplot(data = county_full, mapping = aes(x = long, y = lat,
                                                  fill = pop_dens6, 
                                                  group = group))

pop_p1 <- pop_p + geom_polygon(color = "gray90", size = 0.05) + coord_equal()

pop_p2 <- pop_p1 + scale_fill_manual(values = orange_rev)

pop_p2 + labs(title = "Reverse-coded Population Density",
              fill = "People per square mile") +
    theme_map() + theme(legend.position = "bottom")

7.3 statebins

library(statebins)
## Warning: package 'statebins' was built under R version 3.6.2
statebins_continuous(state_data = election, state_col = "state",
                     text_color = "white", value_col = "pct_trump",
                     brewer_pal="Reds", font_size = 3,
                     legend_title="Percent Trump")
## Warning: `show_guide` has been deprecated. Please use `show.legend`
## instead.

statebins_continuous(state_data = subset(election, st %nin% "DC"),
                     state_col = "state",
                     text_color = "black", value_col = "pct_clinton",
                     brewer_pal="Blues", font_size = 3,
                     legend_title="Percent Clinton")  
## Warning: `show_guide` has been deprecated. Please use `show.legend`
## instead.

election <- election %>% mutate(color = recode(party, Republican = "darkred",
                                               Democrat = "royalblue"))

statebins_manual(state_data = election, state_col = "st",
                 color_col = "color", text_color = "white",
                 font_size = 3, legend_title="Winner",
                 labels=c("Trump", "Clinton"), legend_position = "right")
## Warning: `show_guide` has been deprecated. Please use `show.legend`
## instead.

# Alternatively, we can have statebins() cut the data for us using the breaks argument  

statebins(state_data = election,          
          state_col = "state", value_col = "pct_trump",
          text_color = "white", breaks = 4,
          labels = c("4-21", "21-37", "37-53", "53-70"),
          brewer_pal="Reds", font_size = 3, legend_title="Percent Trump")  
## Warning: `show_guide` has been deprecated. Please use `show.legend`
## instead.

7.4 small multiple maps

head(opiates)
## # A tibble: 6 x 11
##    year state  fips deaths population crude adjusted adjusted_se region
##   <int> <chr> <int>  <int>      <int> <dbl>    <dbl>       <dbl> <ord> 
## 1  1999 Alab~     1     37    4430141   0.8      0.8         0.1 South 
## 2  1999 Alas~     2     27     624779   4.3      4           0.8 West  
## 3  1999 Ariz~     4    229    5023823   4.6      4.7         0.3 West  
## 4  1999 Arka~     5     28    2651860   1.1      1.1         0.2 South 
## 5  1999 Cali~     6   1474   33499204   4.4      4.5         0.1 West  
## 6  1999 Colo~     8    164    4226018   3.9      3.7         0.3 West  
## # ... with 2 more variables: abbr <chr>, division_name <chr>
opiates$region <- tolower(opiates$state)
opiates_map <- left_join(us_states, opiates)
## Joining, by = "region"
library(viridis)
## Loading required package: viridisLite
p0 <- ggplot(data = subset(opiates_map, year > 1999),
             mapping = aes(x = long, y = lat,
                 group = group,
                 fill = adjusted))

p1 <- p0 + geom_polygon(color = "gray90", size = 0.05) +
    coord_map(projection = "albers", lat0 = 39, lat1 = 45) 

p2 <- p1 + scale_fill_viridis_c(option = "plasma")

p2 + theme_map() + facet_wrap(~ year, ncol = 3) +
    theme(legend.position = "bottom",
          strip.background = element_blank()) +
    labs(fill = "Death rate per 100,000 population ",
         title = "Opiate Related Deaths by State, 2000-2014")  

7.5 Is your data really spatial?

p <- ggplot(data = opiates,
            mapping = aes(x = year, y = adjusted,
                          group = state))
p + geom_line(color = "gray70") 
## Warning: Removed 17 rows containing missing values (geom_path).

p0 <- ggplot(data = drop_na(opiates, division_name),
            mapping = aes(x = year, y = adjusted))
            
p1 <- p0 + geom_line(color = "gray70", 
              mapping = aes(group = state)) 

p2 <- p1 + geom_smooth(mapping = aes(group = division_name),
                       se = FALSE)

p3 <- p2 + geom_text_repel(data = subset(opiates,
                                         year == max(year) & abbr !="DC"),
                     mapping = aes(x = year, y = adjusted, label = abbr),
                     size = 1.8, segment.color = NA, nudge_x = 30) +
         coord_cartesian(c(min(opiates$year), 
                    max(opiates$year)))  

p3 + labs(x = "", y = "Rate per 100,000 population",
       title = "State-Level Opiate Death Rates by Census Division, 1999-2014") +
    facet_wrap(~ reorder(division_name, -adjusted, na.rm = TRUE), nrow  = 3)
## `geom_smooth()` using method = 'loess' and formula 'y ~ x'
## Warning: Removed 27 rows containing non-finite values (stat_smooth).

## Warning: Removed 17 rows containing missing values (geom_path).

This is changing fast. Brunsdon & Comber (2015) provide an introduction to some of R's mapping capabilities. Meanwhile, very recently these tools have become much more easily accessible via the tidyverse. Of particular interest to social scientists is Edzer Pebesma's ongoing development of the sf package, which implements the standard Simple Features data model for spatial features in a tidyverse-friendly way; see news and updates at r-spatial.org. Relatedly, Kyle Walker and Bob Rudis's tigris package (github.com/walkerke/tigris) allows for sf-compatible access to the U.S. Census Bureau's TIGER/Line shapefiles, which let you map data for many different geographical, administrative, and Census-related subdivisions of the United States, as well as things like roads and water features. Finally, Kyle Walker's tidycensus package (walkerke.github.io/tidycensus; Walker, 2018) makes it much easier to tidily get both substantive and spatial feature data from the U.S. Census and the American Community Survey.
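As a small taste of the sf approach (a sketch, not from the book, assuming sf is installed), geom_sf() plots simple-features objects directly. Here we use the North Carolina demo shapefile that ships with the package:

library(sf)
nc <- st_read(system.file("shape/nc.shp", package = "sf"), quiet = TRUE)

ggplot(nc) + geom_sf(aes(fill = BIR74)) +
    scale_fill_viridis_c(option = "plasma") +
    theme_map()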

chapter 8 Refine your plots

head(asasec)  
##                                Section         Sname Beginning Revenues
## 1      Aging and the Life Course (018)         Aging     12752    12104
## 2     Alcohol, Drugs and Tobacco (030) Alcohol/Drugs     11933     1144
## 3 Altruism and Social Solidarity (047)      Altruism      1139     1862
## 4            Animals and Society (042)       Animals       473      820
## 5             Asia/Asian America (024)          Asia      9056     2116
## 6            Body and Embodiment (048)          Body      3408     1618
##   Expenses Ending Journal Year Members
## 1    12007  12849      No 2005     598
## 2      400  12677      No 2005     301
## 3     1875   1126      No 2005      NA
## 4     1116    177      No 2005     209
## 5     1710   9462      No 2005     365
## 6     1920   3106      No 2005      NA
p <- ggplot(data = subset(asasec, Year == 2014), aes(x = Members, y = Revenues, label = Sname)) 

p + geom_point() + geom_smooth()
## `geom_smooth()` using method = 'loess' and formula 'y ~ x'

# identify some outliers  
p <- ggplot(data = subset(asasec, Year == 2014), aes(x = Members, y = Revenues, label = Sname)) 

p + geom_point(aes(color = Journal)) + geom_smooth(method = "lm")

# add some text labels  
p0 <- ggplot(data = subset(asasec, Year == 2014),
             mapping = aes(x = Members, y = Revenues, label = Sname))

p1 <- p0 + geom_smooth(method = "lm", se = FALSE, color = "gray80") +
    geom_point(mapping = aes(color = Journal)) 

p2 <- p1 + geom_text_repel(data=subset(asasec,
                                       Year == 2014 & Revenues > 7000),
                           size = 2)

p3 <- p2 + labs(x="Membership",
        y="Revenues",
        color = "Section has own Journal",
        title = "ASA Sections",
        subtitle = "2014 Calendar year.",
        caption = "Source: ASA annual report.")
p4 <- p3 + scale_y_continuous(labels = scales::dollar) +
     theme(legend.position = "bottom")
p4

Use the RColorBrewer package to make a wide range of named color palettes available to you, via the scale_color_brewer() or scale_fill_brewer() functions.

p <- ggplot(data = organdata,
            mapping = aes(x = roads, y = donors, color = world))
p + geom_point(size = 2) + scale_color_brewer(palette = "Set2") +
    theme(legend.position = "top")
## Warning: Removed 46 rows containing missing values (geom_point).

p + geom_point(size = 2) + scale_color_brewer(palette = "Pastel2") +
        theme(legend.position = "top")
## Warning: Removed 46 rows containing missing values (geom_point).

p + geom_point(size = 2) + scale_color_brewer(palette = "Dark2") +
    theme(legend.position = "top")
## Warning: Removed 46 rows containing missing values (geom_point).

You can also specify colors manually, via scale_color_manual() or scale_fill_manual().

# manually introduce a palette from Chang (2013) that’s friendly to color-blind viewers. 
cb_palette <- c("#999999", "#E69F00", "#56B4E9", "#009E73",
                "#F0E442", "#0072B2", "#D55E00", "#CC79A7")

p4 + scale_color_manual(values = cb_palette)  

library(RColorBrewer)

Default <- brewer.pal(5, "Set2") 


library(dichromat) # transform these colors to new values that simulate different kinds of color blindness

types <- c("deutan", "protan", "tritan")
names(types) <- c("Deuteronopia", "Protanopia", "Tritanopia")

color_table <- types %>%
    purrr::map(~ dichromat(Default, .x)) %>%
    as_tibble() %>%
    add_column(Default, .before = TRUE)

color_table
## # A tibble: 5 x 4
##   Default Deuteronopia Protanopia Tritanopia
##   <chr>   <chr>        <chr>      <chr>     
## 1 #66C2A5 #AEAEA7      #BABAA5    #82BDBD   
## 2 #FC8D62 #B6B661      #9E9E63    #F29494   
## 3 #8DA0CB #9C9CCB      #9E9ECB    #92ABAB   
## 4 #E78AC3 #ACACC1      #9898C3    #DA9C9C   
## 5 #A6D854 #CACA5E      #D3D355    #B6C8C8
color_comp(color_table)

In this code, we create a vector of types of color blindness that the dichromat() function knows about, and give them proper names. Then we make a table of colors for each type using the purrr library’s map() function. The rest of the pipeline converts the results from a list to a tibble and adds the original colors as the first column in the table. We can now plot them to see how they compare, using a convenience function from the socviz library.
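As a quick check (a sketch, not from the book), each column of color_table is just a vector of hex colors, so we can preview an earlier plot under one of the simulated palettes:

p4 + scale_color_manual(values = color_table$Deuteronopia)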

8.2 Layer color and text together

# Democrat Blue and Republican Red
party_colors <- c("#2E74C0", "#CB454A")

p0 <- ggplot(data = subset(county_data,
                           flipped == "No"),
             mapping = aes(x = pop,
                           y = black/100))

p1 <- p0 + geom_point(alpha = 0.15, color = "gray50") +
    scale_x_log10(labels=scales::comma) 

p1  

p2 <- p1 + geom_point(data = subset(county_data,
                                    flipped == "Yes"),
                      mapping = aes(x = pop, y = black/100,
                                    color = partywinner16)) +
    scale_color_manual(values = party_colors)

p2

p3 <- p2 + scale_y_continuous(labels=scales::percent) +
    labs(color = "County flipped to ... ",
         x = "County Population (log scale)",
         y = "Percent Black Population",
         title = "Flipped counties, 2016",
         caption = "Counties in gray did not flip.")

p3 

p4 <- p3 + geom_text_repel(data = subset(county_data,
                                      flipped == "Yes" &
                                      black  > 25),
                           mapping = aes(x = pop,
                                   y = black/100,
                                   label = state), size = 2)

p4 + theme_minimal() +
    theme(legend.position="top") 

8.3 Change the appearance of plots with themes

theme_set(theme_bw())
p4 + theme(legend.position="top")

theme_set(theme_dark())
p4 + theme(legend.position="top")

# many other options from ggthemes
library(ggthemes)

theme_set(theme_economist())
p4 + theme(legend.position="top")

theme_set(theme_wsj())

p4 + theme(plot.title = element_text(size = rel(0.6)),
           legend.title = element_text(size = rel(0.35)),
           plot.caption = element_text(size = rel(0.35)),
           legend.position = "top")

Claus O. Wilke's cowplot package, for instance, contains a well-developed theme suitable for figures whose final destination is a journal article. It also contains some convenience functions for laying out several plot objects in a single figure, amongst other features, as we shall see below in one of the case studies. Bob Rudis's hrbrthemes package, meanwhile, has a distinctive and compact look and feel that takes advantage of some freely available typefaces. Both are available via install.packages().
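For a quick look (a sketch, not from the book, assuming both packages are installed), either theme can be applied to an existing plot object in the usual way:

p4 + cowplot::theme_cowplot()

p4 + hrbrthemes::theme_ipsum()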

Use the theme() function to adjust the appearance of individual plot elements:

p4 + theme(legend.position = "top")

p4 + theme(legend.position = "top",
           plot.title = element_text(size=rel(2),
                                     lineheight=.5,
                                     family="Times",
                                     face="bold.italic",
                                     colour="orange"),
           axis.text.x = element_text(size=rel(1.1),
                                      family="Courier",
                                      face="bold",
                                      color="purple"))
## Warning in grid.Call(C_stringMetric, as.graphicsAnnot(x$label)): font
## family not found in Windows font database

## Warning in grid.Call(C_stringMetric, as.graphicsAnnot(x$label)): font
## family not found in Windows font database
## Warning in grid.Call(C_textBounds, as.graphicsAnnot(x$label), x$x, x$y, :
## font family not found in Windows font database

## Warning in grid.Call(C_textBounds, as.graphicsAnnot(x$label), x$x, x$y, :
## font family not found in Windows font database

## Warning in grid.Call(C_textBounds, as.graphicsAnnot(x$label), x$x, x$y, :
## font family not found in Windows font database

## Warning in grid.Call(C_textBounds, as.graphicsAnnot(x$label), x$x, x$y, :
## font family not found in Windows font database

## Warning in grid.Call(C_textBounds, as.graphicsAnnot(x$label), x$x, x$y, :
## font family not found in Windows font database

8.4 Use theme elements in a substantive way

library(socviz)

yrs <- c(seq(1972, 1988, 4), 1993, seq(1996, 2016, 4))

mean_age <- gss_lon %>%
    filter(age %nin% NA & year %in% yrs) %>%
    group_by(year) %>%
    summarize(xbar = round(mean(age, na.rm = TRUE), 0))
mean_age$y <- 0.3

yr_labs <- data.frame(x = 85, y = 0.8,
                      year = yrs)  


p <- ggplot(data = subset(gss_lon, year %in% yrs),
            mapping = aes(x = age))

p1 <- p + geom_density(fill = "gray20", color = FALSE,
                       alpha = 0.9, mapping = aes(y = ..scaled..)) +
    geom_vline(data = subset(mean_age, year %in% yrs),
               aes(xintercept = xbar), color = "white", size = 0.5) +
    geom_text(data = subset(mean_age, year %in% yrs),
              aes(x = xbar, y = y, label = xbar), nudge_x = 7.5,
              color = "white", size = 3.5, hjust = 1) +
    geom_text(data = subset(yr_labs, year %in% yrs),
              aes(x = x, y = y, label = year)) +
    facet_grid(year ~ ., switch = "y")  



p1  +
    theme(plot.title = element_text(size = 16),
          axis.text.x= element_text(size = 12),
          axis.title.y=element_blank(),
          axis.text.y=element_blank(),
          axis.ticks.y = element_blank(),
          strip.background = element_blank(),
          strip.text.y = element_blank(),
          panel.grid.major = element_blank(),
          panel.grid.minor = element_blank()) +
    labs(x = "Age",
         y = NULL,
         title = "Age Distribution of\nGSS Respondents") 
## Warning: Removed 83 rows containing non-finite values (stat_density).

library(ggridges)
## 
## Attaching package: 'ggridges'
## The following object is masked from 'package:ggplot2':
## 
##     scale_discrete_manual
p <- ggplot(data = gss_lon, aes(x = age, y = factor(year, levels = rev(unique(year)), ordered = TRUE)))  
            
p + geom_density_ridges(alpha = 0.6, fill = "lightblue", scale = 1.5) + scale_x_continuous(breaks = c(25, 50, 75)) + 
  scale_y_discrete(expand = c(0.01, 0)) + 
  labs(x = "Age", y = NULL, title = "Age Distribution of\nGSS Respondents") + 
  theme_ridges() + 
  theme(title = element_text(size = 16, face = "bold"))
## Picking joint bandwidth of 3.49
## Warning: Removed 221 rows containing non-finite values
## (stat_density_ridges).

8.5 Case studies

two y-axes

head(fredts) 
##         date sp500 monbase sp500_i monbase_i
## 1 2009-03-11 696.7 1542228   100.0     100.0
## 2 2009-03-18 766.7 1693133   110.1     109.8
## 3 2009-03-25 799.1 1693133   114.7     109.8
## 4 2009-04-01 809.1 1733017   116.1     112.4
## 5 2009-04-08 830.6 1733017   119.2     112.4
## 6 2009-04-15 852.2 1789878   122.3     116.1
fredts_m <- fredts %>% select(date, sp500_i, monbase_i) %>%
    gather(key = series, value = score, sp500_i:monbase_i)  


p <- ggplot(data = fredts_m,
            mapping = aes(x = date, y = score,
                          group = series,
                          color = series))
p1 <- p + geom_line() + theme(legend.position = "top") +
    labs(x = "Date",
         y = "Index",
         color = "Series")

p <- ggplot(data = fredts,
            mapping = aes(x = date, y = sp500_i - monbase_i))

p2 <- p + geom_line() +
    labs(x = "Date",
         y = "Difference")  

cowplot::plot_grid(p1, p2, nrow = 2, rel_heights = c(0.75, 0.25), align = "v")
## Don't know how to automatically pick scale for object of type ts. Defaulting to continuous.
## Don't know how to automatically pick scale for object of type ts. Defaulting to continuous.

the Yahoo example

head(yahoo)
##   Year Revenue Employees Mayer
## 1 2004    3574      7600    No
## 2 2005    5257      9800    No
## 3 2006    6425     11400    No
## 4 2007    6969     14300    No
## 5 2008    7208     13600    No
## 6 2009    6460     13900    No
p <- ggplot(data = yahoo, aes(x = Employees, y = Revenue))

p + geom_path(color = "gray80") + 
  geom_text(aes(color = Mayer, label = Year), 
            size = 3, fontface = "bold") + 
  theme(legend.position = "bottom") + 
  labs(color = "Mayer is CEO",
       x = "Employees", 
       y = "Revenue (Millions)", 
       title = "Yahoo Employees vs Revenues, 2004-2014") + 
  scale_y_continuous(labels = scales::dollar) + 
  scale_x_continuous(labels = scales::comma)

p <- ggplot(data = yahoo,
            mapping = aes(x = Year, y = Revenue/Employees))

p + geom_vline(xintercept = 2012) +
    geom_line(color = "gray60", size = 2) +
    annotate("text", x = 2013, y = 0.44,
             label = " Mayer becomes CEO", size = 2.5) +
    labs(x = "Year\n",
         y = "Revenue/Employees",
         title = "Yahoo Revenue to Employee Ratio, 2004-2014")

saying no to pie

head(studebt)
## # A tibble: 6 x 4
##   Debt     type        pct Debtrc  
##   <ord>    <fct>     <int> <ord>   
## 1 Under $5 Borrowers    20 Under $5
## 2 $5-$10   Borrowers    17 $5-$10  
## 3 $10-$25  Borrowers    28 $10-$25 
## 4 $25-$50  Borrowers    19 $25-$50 
## 5 $50-$75  Borrowers     8 $50-$75 
## 6 $75-$100 Borrowers     3 $75-$100
p_xlab <- "Amount Owed, in thousands of Dollars"
p_title <- "Outstanding Student Loans"
p_subtitle <- "44 million borrowers owe a total of $1.3 trillion"
p_caption <- "Source: FRB NY"

f_labs <- c(`Borrowers` = "Percent of\nall Borrowers",
            `Balances` = "Percent of\nall Balances")

p <- ggplot(data = studebt,
            mapping = aes(x = Debt, y = pct/100, fill = type))
p + geom_bar(stat = "identity") +
    scale_fill_brewer(type = "qual", palette = "Dark2") +
    scale_y_continuous(labels = scales::percent) +
    guides(fill = FALSE) +
    theme(strip.text.x = element_text(face = "bold")) +
    labs(y = NULL, x = p_xlab,
      caption = p_caption,
      title = p_title,
      subtitle = p_subtitle) +
    facet_grid(~ type, labeller = as_labeller(f_labs)) +
    coord_flip()

library(viridis)

p <- ggplot(studebt, aes(y = pct/100, x = type, fill = Debtrc))
p + geom_bar(stat = "identity", color = "gray80") +
  scale_x_discrete(labels = as_labeller(f_labs)) +
  scale_y_continuous(labels = scales::percent) +
  scale_fill_viridis(discrete = TRUE) +
  guides(fill = guide_legend(reverse = TRUE,
                             title.position = "top",
                             label.position = "bottom",
                             keywidth = 3,
                             nrow = 1)) +
  labs(x = NULL, y = NULL,
       fill = "Amount Owed, in thousands of dollars",
       caption = p_caption,
       title = p_title,
       subtitle = p_subtitle) +
  theme(legend.position = "top",
        axis.text.y = element_text(face = "bold", hjust = 1, size = 12),
        axis.ticks.length = unit(0, "cm"),
        panel.grid.major.y = element_blank()) +
  coord_flip()

Appendix.

Write your own functions

plot_section <- function(section="Culture", x = "Year",
                         y = "Members", data = asasec,
                         smooth=FALSE){
    require(ggplot2)
    require(splines)
    # Note use of aes_string() rather than aes() 
    p <- ggplot(subset(data, Sname==section),
            mapping = aes_string(x=x, y=y))

    if(smooth == TRUE) {
        p0 <- p + geom_smooth(color = "#999999",
                              size = 1.2, method = "lm",
                              formula = y ~ ns(x, 3)) +
            scale_x_continuous(breaks = c(seq(2005, 2015, 4))) +
            labs(title = section)
    } else {
    p0 <- p + geom_line(color= "#E69F00", size=1.2) +
        scale_x_continuous(breaks = c(seq(2005, 2015, 4))) +
        labs(title = section)
    }

    print(p0)
}

plot_section("Rationality")  
## Loading required package: splines

plot_section("Sexualities", smooth = TRUE)   

plot_section <- function(section="Culture", x = "Year",
                         y = "Members", data = asasec,
                         smooth=FALSE, ...){
    require(ggplot2)
    require(splines)
    # Note use of aes_string() rather than aes() 
    p <- ggplot(subset(data, Sname==section),
            mapping = aes_string(x=x, y=y))

    if(smooth == TRUE) {
        p0 <- p + geom_smooth(color = "#999999",
                              size = 1.2, ...) +
            scale_x_continuous(breaks = c(seq(2005, 2015, 4))) +
            labs(title = section)
    } else {
    p0 <- p + geom_line(color= "#E69F00", size=1.2) +
        scale_x_continuous(breaks = c(seq(2005, 2015, 4))) +
        labs(title = section)
    }

    print(p0)
}
plot_section("Comm/Urban",
             smooth = TRUE,
             method = "loess")

plot_section("Children",
             smooth = TRUE,
             method = "lm",
             formula = y ~ ns(x, 2))

Managing projects and files

An absolute file path.
Notice the leading “/” that starts at the very top of the computer’s file hierarchy.
my_data <- read_csv("/Users/kjhealy/projects/gss/data/gss.csv")

Instead, because you have an R project file started in the gss folder, you can use the here() library to specify a relative path, like this:

my_data <- read_csv(here("data", "gss.csv"))
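As a quick sanity check (a sketch, assuming the here package is installed), calling here() with no arguments shows the project root that these relative paths are resolved against:

library(here)
here()
here("data", "gss.csv")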