A map of the US election results

  1. Upload libraries:
rm(list = ls(all=T)) #clear workspace
library(dplyr)
library(readr)
library(stringr)
library(tidyr)
library(readxl)
library(classInt)
library(RColorBrewer)
library(maptools) #to read shapefiles

2. Download the data files (note they are not ready for use but need some cleaning as there are more areas in the excel files than polygons in the shape file). I copy here the code as I have used it in my script but it’s available at RPubs thanks to David Robinson.

download.file("http://www2.census.gov/prod2/statcomp/usac/excel/LND01.xls", "LND01.xls")
download.file("http://www2.census.gov/prod2/statcomp/usac/excel/POP01.xls", "POP01.xls")

according to metadata, this is Land Area in 2010 and resident population in 2010:

us_county_area <- read_excel("LND01.xls") 
transmute(CountyCode = as.character(as.integer(STCOU)), Area = LND110210D)

us_county_population <- read_excel("POP01.xls") 
transmute(CountyCode = as.character(as.integer(STCOU)),Population = POP010210D)

3. Adjust data

election_url <- "https://raw.githubusercontent.com/Prooffreader/election_2016_data/master/data/presidential_general_election_2016_by_county.csv"
county_data <- read_csv(election_url) 
group_by(CountyCode = as.character(fips)) 
mutate(TotalVotes = sum(votes)) 
ungroup() 
mutate(name = str_replace(name, ".\\. ", "")) 
filter(name %in% c("Trump", "Clinton", "Johnson", "Stein")) 
transmute(County = str_replace(geo_name, " County", ""),
State = state,
CountyCode = as.character(fips),
Candidate = name,
Percent = vote_pct / 100,
TotalVotes) 
spread(Candidate, Percent, fill = 0) 
inner_join(us_county_population, by = "CountyCode") 
inner_join(us_county_area, by = "CountyCode")

you can save the data into a csv file:

# write_csv(county_data, "county_election_2016.csv")

You can download the cleaned datafile here: data_election_2016_by_county

4. Upload data and shape files

setwd("/Users/...")
dt <- read.csv("new_county_election_2016.csv", header=T)
us <- readShapePoly("./USA_adm/USA_adm2.shp")
us0 <- readShapePoly("./USA_adm/USA_adm0.shp")
us.m <- us[-c(which(us$NAME_1=="Alaska")),] #get rid of Alaska
us.d <- us.m[-c(67:71),]

5. Prepare the color palette(s)

nclassint <- 5 #number of colors to be used in the palette
cat.T <- classIntervals(dt$Trump[-c(67:71)], nclassint,style = "jenks") #style refers to how the breaks are created
colpal.T <- brewer.pal(nclassint,"Reds")
color.T <- findColours(cat.T,colpal.T) #sequential
bins.T <- cat.T$brks
lb.T <- length(bins.T)

5. Plot the maps with map basic

# pdf("Where are the trump voters.pdf")
# plot(us.d, col=color.T, border=F)
# plot(us0,add=T, lwd=0.1)
# legend("bottomleft",fill=colpal.T,legend=paste(round(bins[-length(bins.T)],1),":",round(bins.T[-1],1)),cex=1, bg="white")
# dev.off()
clinton-voters
% Votes for Clinton
where-are-the-trump-voters
% Votes for Trump

… or ggplot2

library(ggplot2)
library(scales)
theme_set(theme_bw())

ggplot(county_data, aes(Population / Area, Trump)) +
  geom_point() +
  geom_point(data=county_data[which(county_data$State=="Texas"),], aes(x=Population/Area, y=Trump), colour="red")+
  scale_x_log10() +
  scale_y_continuous(labels = percent_format()) +
  xlab("Population density (ppl / square mile)") +
  ylab("% of votes going to Trump") +
  geom_text(aes(label = County), vjust = 1, hjust = 1, check_overlap = TRUE) +
  geom_smooth(method = "lm") +
  ggtitle("Population density vs Trump voters by county (Texas Counties in red)")

This is the code to plot in red points according to State (in red) and to add red labels to those points. The check_overlap=T avoids overlapping labels.

# ggplot(county_data, aes(Population / Area, Trump)) +
#   geom_point() +
#   geom_point(data=county_data[which(county_data$State=="California"),], aes(x=Population/Area, y=Trump), colour="red")+
#   scale_x_log10() +
#   scale_y_continuous(labels = percent_format()) +
#   xlab("Population density (ppl / square mile)") +
#   ylab("% of votes going to Trump") +
#   geom_text(data=county_data[which(county_data$State=="California"),], aes(label = ifelse(Trump&amp;gt;.5, as.character(dt$County), "" )), color= "red",size=5,vjust = 1, hjust = 1, check_overlap = TRUE) +
#   geom_smooth(method = "lm") +
#   ggtitle("Population density vs Trump voters by county (California in red)")

rplot1geom_point_texas

californiaclinton

Arranging ggplot2 graphs on a page

How to arrange graphs in ggplot2 without the help of the layout matrix

How do you arrange non-simmetric plots in ggplot2?
With the print command:

After installing these two packages: install.packages(“grid”, “ggplot2”), load the  libraries:
library(grid)
library(ggplot2)

The data and code for the three graphs is taken from this website:

# create factors with value labels
mtcars$gear <- factor(mtcars$gear,levels=c(3,4,5), labels=c("3gears","4gears","5gears"))
mtcars$am <- factor(mtcars$am,levels=c(0,1), labels=c("Automatic","Manual"))
mtcars$cyl <- factor(mtcars$cyl,levels=c(4,6,8), labels=c("4cyl","6cyl","8cyl"))

# Kernel density plots for mpg
# grouped by number of gears (indicated by color)
a <- qplot(mpg, data=mtcars, geom="density", fill=gear, alpha=I(.5),
main="Distribution of Gas Milage", xlab="Miles Per Gallon",
ylab="Density")

# Scatterplot of mpg vs. hp for each combination of gears and cylinders
# in each facet, transmittion type is represented by shape and color
b <- qplot(hp, mpg, data=mtcars, shape=am, color=am,
facets=gear~cyl, size=I(3),
xlab="Horsepower", ylab="Miles per Gallon")

c <- qplot(gear, mpg, data=mtcars, geom=c("boxplot", "jitter"),
fill=gear, main="Mileage by Gear Number",
xlab="", ylab="Miles per Gallon")

a, b, and c are our graphs. Here we decide how to place the plots on the plotting surface:

grid.newpage() # Open a new page on grid device
pushViewport(viewport(layout = grid.layout(3, 1))) #this can really be anything... just remember to change accordingly the print commands below
print(a, vp = viewport(layout.pos.row = 1, layout.pos.col = 1:1))
print(b, vp = viewport(layout.pos.row = 2, layout.pos.col = 1:1))
print(c, vp = viewport(layout.pos.row = 3, layout.pos.col = 1:1))

The layout=grid.layout is the command dividing the plotting surface, in the example I have divided it into three rows and one column, hence the layout.pos.row = 1, 2, 3 and the layout.pos.row = 1:1 equal for all three plots.

image-29-09-2016-at-16-07

What if I need something asymmetrical? For instance two small plots on one column and one taking up more space… The reasoning is very similar to that of the layout matrix: divide the space into 4 squares grid.layout(2, 2) and then plot the third graph over two rows layout.pos.row=1:2

grid.newpage() # Open a new page on grid device
pushViewport(viewport(layout = grid.layout(2, 2))) #this can really be anything... just remember to change accordingly the print commands below
print(a, vp = viewport(layout.pos.row = 1, layout.pos.col = 1:1))
print(b, vp = viewport(layout.pos.row = 2, layout.pos.col = 1:1))
print(c, vp = viewport(layout.pos.row = 1:2, layout.pos.col = 2:2))

image-29-09-2016-at-16-18

How to get good maps in R and avoid the expensive softwares

How to convey as much information as possible in a clear and simple way? Producing maps for social sciences is not difficult, there are a plethora of softwares that can help us. But there are a few issues to consider when choosing your to go program:
(1) Do I want to do all my analysis in one (or more) program(s) and then switch to another one to make those maps?
(2) Are those programs freely accessible to me?
1. Using more than one softwares usually implies spending time to learn different syntax:  why do your analysis in (insert name here ____) and then plot in R when you can do everything in R?
2. The availability of mapping softwares is no trivial issue. Not all researchers have powerful computers, not all institutes have bottomless funds to buy licences, and sometimes having the possibility to map on your laptop while bingeing on Netflix is way nicer than waiting for the one computer with the one licence.

Probably the best and most elegant mapping tool available to Geographers is ArcGIS (to my knowledge, but again, I use R and own a Mac), however it does not come for free. What to do? Well, R is a very good alternative, you can produce elegant maps, customizable to the very last detail. The only drawback I have encountered is the time you would spend to get the first map, but then you would have the syntax and any other map would be pretty quick to plot, and you can always for loop all graphics (although I do not recommend it). Moreover, R runs on your Mac (and Linux), it allows for way more control over features, and has great color palettes (see here and here).

Here are some useful libraries:
library(maps) #for creating geographical maps
library(RColorBrewer) #contains color palettes
library(classInt) #defines the class intervals for the color palettes
library(maptools) #tools for handling spatial objects
library(raster) #tools to deal with raster maps
library(ggplot2) #to create maps, quick and painless

Some stuff to keep in mind:
(1) add a scale with scale.map (or a nice  scalebar);
(2) it is sometimes required to add a north arrow, you can find many versions for that (see this document on page 4 for  examples, I use the same with no labels);
(3) locator() is a very useful tool to get the coordinates when adding labels, arrows, scales and so on.

Part 1: get a plain map.

Below is a very simple example produced using EUROSTAT shape files for world countries (world) and DIVA-GIS for Spain at NUTS3 level (spain). In this map I have removed the Canary Islands, but you can always cut it and paste it in the map using either par(fig=c(…)) or par(fin(…)), inset, or something more elaborated with layout, and framing it using box() or rectangle.

world is the shapefile for the whole world, where I select the neighboring countries I want to appear in the map, in this case Spain, France, Portugal, France, Morocco, and Algeria.
spain is the Spain NUTS3 shapefile where I remove the Canary Islands (45)


plot(spain[-c(45),], border=F) #this first line does not plot anything, it just centers my graph on Spain, the -c(45) removes the Canary Islands
plot(world[c(6, 67,74, 132, 177),], border="lightblue",add=T, col="beige") #plotting the countries appearing in the map
plot(spain[-c(45),], border="brown", lwd=0.2, add=T, col="lightblue") #plot spain, removing the Canary Islands
map.scale(3,35.81324, ratio=F, cex=0.7, relwidth=0.1) # scale map
map.axes(cex.axis=0.9)
northarrow(c(4.8,42.9),0.7, cex=0.8)

Spain

Part 2: Add labels

Using the function shadow text to avoid labels overlapping.

coords<- coordinates(spain) # get goordinates of the centroids, it's where you center your labels
# p.names is a data frame containing the coordinates and all the names of the provinces (remember to get rid of those you don't want to use if using only a selection). Usually you can find the names in the shapefile, but I didn't have them.
shadowtext(p.names[,1],p.names[,2], label=paste(p.names[,3]), cex=0.7,col="black", bg="white",r=0.1)

Spain3

 

A view of Spanish fertility by age groups (with the help of log scales)

I have been working a lot with the demography library in R, it is a great teaching tool for demography, modeling, life tables, graphic visualization of demographic data, and for many other things (see demography ).
There are a lot of examples available using data from the Human Fertility and Mortality Database.
Here I am using data that I have obtained from Spanish Statistics, a fertility rates time series consisting of 5 years age groups (available from download from here).
It is very nice to plot fertility rates by age groups as one can appreciate the changes in fertility occurred over time (in terms of quantum) and how much each age group contributes to fertility. In the case of Spain,.


library(demography)
plot(spain,plot.type="time",xlab="Year",lwd=2)
legend("topright",legend=c("15-19","20-24","25-29",
"30-34","35-39","40-44","45-49"),
col=c("red","yellow","lightgreen","green","lightblue",
"blue","violet"),bty="n",lty=1,cex=0.8,lwd=2)

sp_fert_by5

The very same plot can be obtained through ggplot2 library (given an appropriate theme (see ggplot themes):

ggplot(ddfert, aes(Year, Female, group= Age,col= Age))+
geom_line()+
scale_color_manual(values= c("red", "yellow", "lightgreen", "green","lightblue", "blue", "violet"))+
scale_x_continuous(labels = c(1975, 1985, 1995, 2005, 2015))+
scale_y_continuous("Fertility Rate")

GGPLOTsp_fert_by5.png

I find it often interesting to plot using a log scale, so that small values don’t get compressed to the end of the graph. In this case it would be sufficient to add to the demography code:
plot(spain, plot.type="time", xlab= "Year", lwd=2, transform=T)...
and to ggplot :
ggplot(ddfert, aes(Year, log(Female), group= Age,col= Age))+...

GGPLOTsp_fert_by5LOG.png

The gap between desired and observed fertility in Europe. Part 2: Childlessness levels.

To better understand the effect of postponement we tried to measure it by calculating the effect of time spent on contraception while in a union by women who want to have children, a ‘conscious’ way to postpone childbearing.

Involuntary childlessness has gained momentum in mainstream media, which attribute a large part (if not the totality) of the blame on the postponement of childbearing: women wait too long to have children, they don’t hear their biological clock ticking and bam! no children. Ever.

Delaying childbearing to later ages has undoubtedly a repercussion on the biological ability to have children, but it is hardly a simple component of the total effect. What the mainstream discussion is often missing on is that the great majority of children are conceived in unions, hence it is a couple’s decision to have children. Indeed, being single is an important if not pivotal deterrent to motherhood, usually delayed until union formation.

This is why it is important to consider factors such as union dissolution risk to appreciate the variation in involuntary childlessness. To better understand the effect of postponement we tried to measure it by calculating the effect of time spent on contraception while in a union by women who want to have children, a ‘conscious’ way to postpone childbearing.

This is a preview of average population childlessness obtained through simulation using 3 variables: celibacy (%of women ending up single and never entering a union), divorce (%women previously in a union but currently without a partner), and waiting time, the average time spent on contraception at the beginning of a union by a woman who wishes to have children.

childlessness

>ggplot(dt, aes( Age, value, linetype=Variable, col=Variable))+
> geom_line( size=1) +
> scale_color_manual( values=c( "black", "#666666", "grey","black", "#666666", "grey"), guide=guide_legend( nrow=3, byrow=F, title =  "Childlessness" )) +
> xlab("")+
>ylab("")+
>scale_linetype_manual( values=c("solid", "solid",  "solid", "twodash", "dotted", "dashed"), guide=guide_legend( nrow=3, byrow= F, title =  "Childlessness" ))+
>theme( plot.margin= unit(c(1,4,1,1), "cm"), legend.position="bottom", legend.direction= "vertical")

1. ggplot(dt, aes( Age, value, linetype= Variable, col=Variable))

linetype= Variable and col=Variable set in the aes tell ggplot to automatically divide the lines based on the number of Variable(s);

2. scale_color_manual sets the colors of the lines contained in values. I was not satisfied with what I got with scale_color_grey so I set my colors manually (_manual!);

3. since I want the legend at the bottom AND in two columns (or 3 rows) AND I have two features specified in the aes I need to add a guide=guide_legend(nrow=3) to each scale_blablabla_manual (that is to say scale_color_manual AND scale_linetype_manual);

4. In guide=guide_legend the byrow=F means that I do not want the legend to appear ordered by row, but rather by columns;

5. in theme( legend.position=”bottom”) tells ggplot to put the legend below the graph and legend.direction to plot it in a vertical way (which I divide in 3 rows)

1887 crude mortality rate in Spain using classInt package

TBM_1887 jenks
Crude Mortality Rate in Spain, 1887 Census

TBM_1887 quantile TBM_1887 bclust TBM_1887 fisher

>nclassint <- 5 #number of colors to be used in the palette
>cat <- classIntervals(dt$TBM, nclassint,style = "jenks")
>colpal <- brewer.pal(nclassint,"Reds")
>color <- findColours(cat,colpal) #sequential
>bins <- cat$brks
>lb <- length(bins)
>cat

style: jenks
[20.3,25.9] (25.9,30.5] (30.5,34.4] (34.4,38.4] (38.4,58.2]
68         114         130         115          35

Save the categories into a data.frame (dat)

type first second third fourth fifth
1 quantile    91     93    92     91    95
2       sd    10    202   244      5     0
3    equal   100    246   113      2     1
4   kmeans    68    115   142    118    19
5    jenks    68    114   130    115    35
6   hclust   100    174   153     34     1
7   bclust    53    120   275     13     1
8   fisher    68    114   130    115    35

and melt it into a long format (required by ggplot):

dat1 <- melt(dat,id.vars=c("type"),value.name="n.breaks")

ggplot(dat1,aes(x=variable,y=n.breaks,fill=type))+
geom_bar(stat="identity", position=position_dodge())

Rplot

A match made in R: checking the order of geographical areas in shape files and in your data frames

Not every shape file is as nice as those provided in libraries. Sometimes we have to deal with historical maps, which have been hand-drawn, re-touched and what not. To work with geo-referenced data it is essential to have a variable in both shape file and dataframe with unique coding that has exactly the same number of areas and the same ordering in both files.

A quick way to check if shapefile and dataframe have the same number of areas:

nrow(df) == length(shape.file$Code)

In the shapefile, one can also select a couple of areas big enough so that they can easily be located, and plot them as “control” areas.
For instance, I want to select the area with code “15078” in the shapefile:
>which(shape.file$Code=="15078",arr.Ind=T)
[1] 271

which is the area in the 271-th position (same way shape.file$Code[271] gives the code of area 271).
plot(shape.file)
plot(shape.file[c(271,898),],col="red",border="red",add=T)

this is an easy way to locate your “control” area(s).
Rplot
Ideally, you should have some variable that is identical to the one in the shapefile, a codification of some sort, providing a unique Code, the name of the area or some factors that allow you to locate the area in space.

An easy way to check if both shape file and data frame have the same ordering of geographical areas is to test it:
>code.sh <- cbind(c(1:length(shape.file$Code)),as.vector(shape.file$Code))
>code.df <- cbind(c(1:nrow(df)),df$Code)
>code.df==code
.sh
[,1]  [,2]
[1,] TRUE  TRUE
[2,] TRUE  TRUE
[3,] TRUE  TRUE

What if it’s not?
First option: the inelegant solution
Manually change the order of the areas in a csv file according to the exact order they have in the shape file. It’s easy as you can create an ordinal index for the shapefile codes, paste it in excel, and assign it with a vlookup function.
Second option: the smart R match
In R there is a function called match that returns a vector of the positions of first matches of the first argument in its second:
>my.match <- match(df$Code, shape.file$Code)
NB: to use match the two variables providing the code for the areas have to have the very same unique and identical codes, or else funny stuff happens. To check that everything is in its right place, you can plot the two “control” spatial polygons we chose in the beginning, using their position in the dataframe rather than in the shapefile:
>plot(shape.file)
>plot(shape.file[c(which(df$Code=="305"),which(df$Code=="15078")),],col="orange",add=T)