Plot made in ggplot2 using geofacet, personal elaboration of Census data.

# Category: spatial demography

## Georgia Mapping in R

You can download the session 9 files for constructing the population pyramids of Georgia here: https://github.com/rladies/meetup-presentations_tbilisi and specify your working directory with setwd("/Users/mydomain/myfolder/")

# Set the working directory (adjust to your own path)
mypath <- "/Users/DrSpengler/The rectification of the Vuldrini/"

# Load the shapefiles (readOGR comes from the rgdal package).
# Country boundary (admin level 0)
georgia <- readOGR("./GEO_adm/", "GEO_adm0")
## OGR data source with driver: ESRI Shapefile
## Source: "./GEO_adm/", layer: "GEO_adm0"
## with 1 features
## It has 70 fields
# plot(georgia, lwd = 1.5)

# Regions (admin level 1)
georgia1 <- readOGR("./GEO_adm/", "GEO_adm1")
## OGR data source with driver: ESRI Shapefile
## Source: "./GEO_adm/", layer: "GEO_adm1"
## with 12 features
## It has 16 fields
# plot(georgia1)

# Districts (admin level 2)
georgia2 <- readOGR("./GEO_adm/", "GEO_adm2")
## OGR data source with driver: ESRI Shapefile
## Source: "./GEO_adm/", layer: "GEO_adm2"
## with 69 features
## It has 18 fields
# plot(georgia2)

# Water lines (rivers)
gwat <- readOGR("./GEO_wat/", "GEO_water_lines_dcw")
## OGR data source with driver: ESRI Shapefile
## Source: "./GEO_wat/", layer: "GEO_water_lines_dcw"
## with 559 features
## It has 5 fields
# plot(gwat)

# Population and elevation rasters (raster package)
gpop <- raster("./GEO_pop/geo_pop.grd")
# plot(gpop)
galt <- raster("./GEO_msk_alt/GEO_msk_alt.grd")
# plot(galt)

# Maps 1-3: country, region and district boundaries
plot(georgia, lwd = 1.5)   # n1
plot(georgia1, lwd = 1.5)  # n2
plot(georgia2, lwd = 1.5)  # n3

# Map 4: country boundary with the rivers overlaid
plot(georgia, lwd = 1.5)                         # n4
plot(gwat, lwd = 1.5, col = "blue", add = TRUE)  # n4

# Map 5: population raster with the country boundary on top
plot(gpop)                            # n5
plot(georgia, lwd = 1.5, add = TRUE)  # n5

# Map 6: elevation raster
plot(galt, lwd = 1.5)  # n6

### Plot neighbouring countries

# Country boundaries (admin level 0) of Georgia's neighbours
tur <- readOGR("./TUR_adm", "TUR_adm0")
## OGR data source with driver: ESRI Shapefile
## Source: "./TUR_adm", layer: "TUR_adm0"
## with 1 features
## It has 70 fields
## Integer64 fields read as strings: ID_0 OBJECTID_1

arm <- readOGR("./ARM_adm", "ARM_adm0")
## OGR data source with driver: ESRI Shapefile
## Source: "./ARM_adm", layer: "ARM_adm0"
## with 1 features
## It has 70 fields
## Integer64 fields read as strings: ID_0 OBJECTID_1

rus <- readOGR("./RUS_adm", "RUS_adm0")
## OGR data source with driver: ESRI Shapefile
## Source: "./RUS_adm", layer: "RUS_adm0"
## with 1 features
## It has 70 fields
## Integer64 fields read as strings: ID_0 OBJECTID_1

aze <- readOGR("./AZE_adm", "AZE_adm0")
## OGR data source with driver: ESRI Shapefile
## Source: "./AZE_adm", layer: "AZE_adm0"
## with 1 features
## It has 70 fields
## Integer64 fields read as strings: ID_0 OBJECTID_1

### plot maps

# Georgia on a light-blue (sea) background, regions dashed,
# neighbouring countries filled white
plot(georgia, lwd = 1.5, col = "white", bg = "lightblue")
plot(georgia1, add = TRUE, lty = 2)
plot(tur, add = TRUE, col = "white")
plot(arm, add = TRUE, col = "white")
plot(rus, add = TRUE, col = "white")
plot(aze, add = TRUE, col = "white")

### add labels for the countries

# Label positions (longitude / latitude) and names for the neighbours
x.loc <- c(44.32002, 46.35746, 44.40421, 42.18156, 40.71662)
y.loc <- c(43.42472, 40.87209, 40.82228, 40.90945, 41.99276)
nb.lab <- c("Russia", "Azerbaijan", "Armenia", "Turkey", "Black Sea")

# Same base map as above, with the country names printed on top
plot(georgia, lwd = 1.5, col = "white", bg = "lightblue")
plot(georgia1, add = TRUE, lty = 2)
plot(tur, add = TRUE, col = "white")
plot(arm, add = TRUE, col = "white")
plot(rus, add = TRUE, col = "white")
plot(aze, add = TRUE, col = "white")
text(x.loc, y.loc, nb.lab)

### let’s add everything (or almost everything) together

# Rivers plus all three administrative layers on one map.
# (In the original export everything after the inline "#" comment was
# swallowed by it and never executed; reconstructed one call per line.)
plot(gwat, col = "blue")
# plot(georgia1[1, ], lwd = 1, col = "lightblue", border = "black", add = TRUE)
plot(georgia2, lwd = 0.5, border = "black", lty = 3, add = TRUE)
plot(georgia1, border = "black", lty = 2, add = TRUE)
plot(georgia, lwd = 1.5, add = TRUE)

### check georgia@data

# Inspect the first rows of the region-level attribute table (georgia1@data)
head(georgia1)

## ID_0 ISO NAME_0 ID_1 NAME_1 VARNAME_1 NL_NAME_1 HASC_1 CC_1 ## 0 81 GEO Georgia 1034 Abkhazia Sokhumi <NA> GE.AB <NA> ## 1 81 GEO Georgia 1035 Ajaria Batumi <NA> GE.AJ <NA> ## 2 81 GEO Georgia 1036 Guria Ozurgeti <NA> GE.GU <NA> ## 3 81 GEO Georgia 1037 Imereti Kutaisi <NA> GE.IM <NA> ## 4 81 GEO Georgia 1038 Kakheti Telavi <NA> GE.KA <NA> ## 5 81 GEO Georgia 1039 Kvemo Kartli Rustavi <NA> GE.KK <NA> ## TYPE_1 ENGTYPE_1 VALIDFR_1 VALIDTO_1 REMARKS_1 ## 0 Avtonomiuri Respublika Autonomous Republic 1994 Present <NA> ## 1 Avtonomiuri Respublika Autonomous Republic 1994 Present <NA> ## 2 Region Region 1994 Present <NA> ## 3 Region Region 1994 Present <NA> ## 4 Region Region 1994 Present <NA> ## 5 Region Region 1994 Present <NA> ## Shape_Leng Shape_Area ## 0 6.643211 0.9744622 ## 1 3.055014 0.3074264 ## 2 2.880653 0.2092665 ## 3 4.214567 0.6783179 ## 4 6.820519 1.2485036 ## 5 5.219352 0.6807876

# print labels on the map

### labels for admin 2

# District (admin 2) label positions and names.
# NOTE(review): the original took coordinates from rows 2:6 but names from
# rows 1:5, so each label was shifted by one polygon; both now use rows 1:5
# to match the printed names below — confirm against the map.
coords2 <- coordinates(georgia2[1:5, ])
admin2 <- as.character(georgia2$NAME_2[1:5])
admin2
## [1] "Gagra" "Gali" "Gudauta" "Gulripshi" "Ochamchire"

## Upload data from World Bank

# Load the World Bank subnational malnutrition extract and keep only the
# rows flagged for this analysis (type == 1 and select == 1)
dt <- read.csv("/Users/ac1y15/Google Drive/blog/RLadies_Georgia_files/Session_3/Data_Extract_From_Subnational_Malnutrition/3f075abc-c51c-40c5-afb1-f8fbcfa30f23_Data.csv", header = TRUE)
dt.1 <- subset(dt, type == 1 & select == 1)
head(dt.1)

## Admin.Region.Name select order ## 6 1 1 ## 7 Georgia, Adjara Aut. Rep. 1 2 ## 16 Georgia, Guria 1 3 ## 26 Georgia, Imereti 1 4 ## 31 Georgia, Kakheti 1 5 ## 36 Georgia, Kvemo Kartli 1 6 ## Admin.Region.Code type ## 6 1 ## 7 GEO_Adjara_Aut._Rep._GE.AR_1297_GEO002 1 ## 16 GEO_Guria_GE.GU_1298_GEO003 1 ## 26 GEO_Imereti_GE.IM_1299_GEO004 1 ## 31 GEO_Kakheti_GE.KA_1300_GEO005 1 ## 36 GEO_Kvemo_Kartli_GE.KK_1301_GEO006 1 ## Series.Name ## 6 ## 7 Prevalence of overweight, weight for height (% of children under 5) ## 16 Prevalence of overweight, weight for height (% of children under 5) ## 26 Prevalence of overweight, weight for height (% of children under 5) ## 31 Prevalence of overweight, weight for height (% of children under 5) ## 36 Prevalence of overweight, weight for height (% of children under 5) ## Series.Code YR2000 YR2005 YR2009 ## 6 NA NA NA ## 7 SN.SH.STA.OWGH.ZS NA 28.1 NA ## 16 SN.SH.STA.OWGH.ZS NA 7.9 NA ## 26 SN.SH.STA.OWGH.ZS 9.9 21.5 NA ## 31 SN.SH.STA.OWGH.ZS 7.0 19.6 13.2 ## 36 SN.SH.STA.OWGH.ZS 9.5 28.2 19.1

### Map the prevalence overweight w/h

library(classInt)

# Number of classes (colours) for the choropleth
nclassint <- 3
# Quantile breaks of the 2005 values ("style" controls how breaks are made)
cat <- classIntervals(dt.1$YR2005, nclassint, style = "quantile")

# Sequential green palette, then one colour per polygon
colpal <- brewer.pal(nclassint, "Greens")
color.palette <- findColours(cat, colpal)
is.na(color.palette)
## [1] TRUE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE TRUE FALSE
## [12] FALSE

bins <- cat$brks
lb <- length(bins)
# Polygons 1 and 10 have missing values (see is.na above): colour them gray
color.palette[c(1, 10)] <- "gray"
# Lower and upper bound of each interval, rounded for the legend
value.vec <- round(bins[-length(bins)], 2)
value.vec.tail <- round(bins[-1], 2)

### Plot and SAVE map:

# Choropleth with legend, scale bar and north arrow
plot(georgia1, col = color.palette, border = TRUE,
     main = "Prevalence of overweight, \nweight for height (% of children under 5)")
legend("topright",
       fill = c("gray", "#E5F5E0", "#A1D99B", "#31A354"),
       legend = c("NA", paste(value.vec, ":", value.vec.tail)),
       cex = 1.1, bg = "white", bty = "n")
# map.scale(41, 41, 2, "km", 2, 100)
map.scale(x = 40.1, y = 41.2, relwidth = 0.1, metric = TRUE, ratio = FALSE, cex = 0.8)
SpatialPolygonsRescale(layout.north.arrow(2), offset = c(40.1, 41.6),
                       scale = 0.5, plot.grid = FALSE)

## See you at IUSSP to talk about the fantastic work we do at WorldPop! (plus Demotrends and R-Ladies)

As IUSSP is approaching, I'm looking forward to talking more about fine grid scale mapping research at WorldPop (University of Southampton) and Flowminder.

I will present my research on **High resolution mapping of fertility and mortality from national household survey data in low income settings** the *30th of October* in *session 5 at 8:30am* **Integrating spatial and statistical methods in demographic research, **meeting room 1.41 and 1.42.

Prof. Andy Tatem will host a side meeting **Geospatial Demography: ****Combining Satellite, Survey, Census and Cellphone Data to Provide Small-area Estimates** on the* 29 October 2017, 8:30-16:00.*

**Liili Abuladze** and **I** will be contributing to the Cape Town R-Ladies chapter Saturday 4 November (details here) with a talk on R-Ladies and data visualization in ggplot2. Come talk to us and become an R-Lady, we are looking forward to sharing our experiences.

A few Demotrend(ers) will also be presenting at IUSSP, come talk to us 🙂

See you in Cape Town!

## Global and Local Measures of Spatial Autocorrelation

This post aims to be a summary of the techniques available to investigate spatial autocorrelation for the social sciences, rather than presenting the theory behind spatial autocorrelation. For that there are great books available online, like Anselin's, Le Page's, and Bivand's, just to cite a few.

The techniques presented here work for a spatial polygons data frame. The difference between spatial points and polygons data frames is not that big, the idea is the same and most of what I am doing here can be applied to data points.

Why do we look at spatial autocorrelation at all? Spatial autocorrelation leads to biased results in regressions; this is the reason why we compute Moran's I, and why we include spatial autocorrelation in the model if its measurement proves to be significant and non-random.

Spatial autocorrelation can be investigated **globally** or **locally**. “**Globally”**, implies that the measure you’re going to obtain refers to the dataset as a whole, whether it is a whole country, continent or region. “**Locally”**, means that you are taking into consideration each and every polygon and getting a measure for each one of them.

We start by uploading the data, projecting them (important when considering the distance based measures -earth is not flat, whatever they may say!), and construct neighborhood relations (in this case Queen and Rook, but could be any other). For more detail see this post on how to construct neighbor relations.

library(maptools)

library(spdep)

# North Carolina SIDS data shipped with maptools.
# NOTE(review): readShapePoly is deprecated and maptools has been retired;
# sf::st_read() is the modern replacement — confirm before porting.
NC <- readShapePoly(
  system.file("shapes/sids.shp", package = "maptools")[1],
  IDvar = "FIPSNO",
  proj4string = CRS("+proj=longlat +ellps=clrk66")
)

# Queen contiguity: polygons sharing at least one boundary point
nb.FOQ <- poly2nb(NC, queen = TRUE, row.names = NC$FIPSNO)
# Rook contiguity: polygons sharing a boundary segment
nb.RK <- poly2nb(NC, queen = FALSE, row.names = NC$FIPSNO)

### Global measures of spatial autocorrelation

There are two main measures of global spatial autocorrelation: **Moran's I** and **Geary's C**. **Moran's I** is the most used in my experience, but both are perfectly acceptable.

**Moran’s I** ranges between -1 (strong negative spatial autocorrelation with a dispersed pattern) and 1 (strong positive spatial autocorrelation with a clustered pattern) with 0 being the absence of spatial autocorrelation.

**Geary’s C** ranges between 0 and 2, with positive spatial autocorrelation ranging from 0 to 1 and negative spatial autocorrelation between 1 and 2.

Of course, since these are inferential measures, if the p-value is not significant we cannot exclude that the pattern is random(!)

In this case Moran’s I is positive and significant, the z-score (not provided by moran.test) is positive implying spatial clusters, so we can reject the null hypothesis.

library(spdep)

# Variable of interest: non-white births, 1974
nwb <- NC$NWBIR74

# Global Moran's I under the rook neighbourhood (row-standardised weights)
moran.test(nwb, listw = nb2listw(nb.RK))
## Moran I test under randomisation
## data: nwb
## weights: nb2listw(nb.RK)
## Moran I statistic standard deviate = 3.0787, p-value = 0.001039
## alternative hypothesis: greater
## sample estimates:
## Moran I statistic Expectation Variance
## 0.185965551 -0.010101010 0.004055701

# Global Geary's C under the same weights
geary.test(nwb, listw = nb2listw(nb.RK))
## Geary C test under randomisation
## data: nwb
## weights: nb2listw(nb.RK)
## Geary C statistic standard deviate = 2.0324, p-value = 0.02106
## alternative hypothesis: Expectation greater than statistic
## sample estimates:
## Geary C statistic Expectation Variance
## 0.83274888 1.00000000 0.00677185

If you have polygons with no neighbors, remember to specify `zero.policy=TRUE` (the default, `NULL`, falls back to the global option and errors on no-neighbor observations).

# Moran scatterplot: nwb against its spatial lag
moran.plot(nwb, listw = nb2listw(nb.RK))

# Permutation (Monte-Carlo) test of Moran's I with 100 simulations.
# NOTE(review): no set.seed(), so the rank/p-value vary slightly across runs
moran.mc(nwb, listw = nb2listw(nb.RK), nsim = 100)
## Monte-Carlo simulation of Moran I
## data: nwb
## weights: nb2listw(nb.RK)
## number of simulations + 1: 101
## statistic = 0.18597, observed rank = 100, p-value = 0.009901
## alternative hypothesis: greater

# Density of the simulated statistics with the observed value marked
plot(moran.mc(nwb, listw = nb2listw(nb.RK), nsim = 100))

Same thing can be done for Geary’s C:

# Permutation (Monte-Carlo) test of Geary's C with 100 simulations
geary.mc(nwb, listw = nb2listw(nb.RK), nsim = 100)
## Monte-Carlo simulation of Geary C
## data: nwb
## weights: nb2listw(nb.RK)
## number of simulations + 1: 101
## statistic = 0.83275, observed rank = 5, p-value = 0.0495
## alternative hypothesis: greater

# Density of the simulated statistics with the observed value marked
plot(geary.mc(nwb, listw = nb2listw(nb.RK), nsim = 100))

### Local measures of spatial autocorrelation

**Local Moran and Local G**

# Local Moran's I and Getis-Ord local G, one value per county
locm <- localmoran(nwb, listw = nb2listw(nb.RK))
locG <- localG(nwb, listw = nb2listw(nb.RK))

Get the neighbor matrix into a listwise format with `listw`

: there are two options here. A row-standardized weights matrix `style = "W"`

creates proportional weights when polygons have an unequal number of neighbors, balancing out observations with few neighbors. Binary weights `style = "B"`

upweight observations with many neighbors.

library(classInt)
library(dplyr)

# Variable and neighbourhood used for the LISA map
myvar <- NC$NWBIR74
nb <- nb.RK
# Weight style: "W" = row-standardised
ws <- "W"
# Significance threshold for the maps
significance <- 0.05
plot.only.significant <- TRUE
# Neighbour list -> listw object
listw <- nb2listw(nb, style = ws)
# Spatially lagged variable
lagvar <- lag.listw(listw, myvar)
# Means of the variable and of its lag, used to define the quadrants
m.myvar <- mean(myvar)
m.lagvar <- mean(lagvar)

The next step is to derive the quadrants and set the coloring scheme. I like to color the border of each polygon with the color of their local moran score, regardless of their pvalue, and then fill only the significant ones.

# LISA cluster map: each county is classified by its Moran-scatterplot
# quadrant; significant counties are filled, non-significant ones only get
# a coloured border.
# Fixes vs. the original: removed the dead `pos` loop (it compared the
# numeric NC$NWBIR74 to colors2["blue 0"], an NA name lookup, so it always
# produced empty results), removed the unused `lb <- 6`, dropped the
# unreachable q.all == 0 / q.all > 4 branches (q is always 1..4), and used
# `<-`, TRUE/FALSE and seq_len throughout.
n <- length(NC)

# 1 = significant local Moran's I (p-value in column 5 of locm), 0 = not
vec <- ifelse(locm[, 5] < significance, 1, 0)

# Moran-scatterplot quadrant of each county:
# 1 = High-High, 2 = Low-Low, 3 = Low-High, 4 = High-Low
q <- integer(n)
for (i in seq_len(n)) {
  if (myvar[[i]] >= m.myvar & lagvar[[i]] >= m.lagvar) q[i] <- 1
  if (myvar[[i]] <  m.myvar & lagvar[[i]] <  m.lagvar) q[i] <- 2
  if (myvar[[i]] <  m.myvar & lagvar[[i]] >= m.lagvar) q[i] <- 3
  if (myvar[[i]] >= m.myvar & lagvar[[i]] <  m.lagvar) q[i] <- 4
}

# One colour per quadrant
colors <- c("red", "blue", "lightblue", "pink")[q]

# Mark all non-significant regions white
locm.dt <- q * vec
colors1 <- colors
colors1[!is.na(locm.dt) & locm.dt == 0] <- "white"

# Indices of the NON-significant polygons in each quadrant, used below to
# draw their borders in the quadrant colour
colors2 <- paste(colors, vec)
blue0 <- which(colors2 == "blue 0")
red0 <- which(colors2 == "red 0")
lightblue0 <- which(colors2 == "lightblue 0")
pink0 <- which(colors2 == "pink 0")

labels <- c("High-High", "High-Low", "Low-High", "Low-Low")

# Plot the map: filled polygons first, coloured borders on top
if (plot.only.significant == TRUE) {
  plot(NC, col = colors1, border = FALSE)
} else {
  plot(NC, col = colors, border = FALSE)
}
plot(NC[blue0, ], border = "blue", lwd = 0.5, add = TRUE)
plot(NC[lightblue0, ], border = "lightblue", add = TRUE, lwd = 0.5)
plot(NC[red0, ], border = "red", add = TRUE, lwd = 0.5)
plot(NC[pink0, ], border = "pink", add = TRUE, lwd = 0.5)
legend("bottomleft", legend = labels,
       fill = c("red", "pink", "lightblue", "blue"), bty = "n")

Local G returns z-scores: high values indicate the possibility of a local cluster of high values of the variable being analysed, while very low values indicate a similar cluster of low values.

library(RColorBrewer)

# Three Jenks classes of the local G z-scores, diverging palette
nclassint <- 3
colpal <- brewer.pal(nclassint, "PiYG")
cat <- classIntervals(locG, nclassint, style = "jenks", na.ignore = TRUE)
color.z <- findColours(cat, colpal)
plot(NC, col = color.z, border = TRUE)

# Colour only the polygons flagged as significant (vec == 1)
sig <- which(vec == 1)
plot(NC, border = TRUE)
plot(NC[sig, ], col = color.z[sig], border = TRUE, add = TRUE)

## A nice example of hafen/geofacet from Washington Post to ggplot2

I recently came across the hafen/geofacet function and was pondering blogging an example. Then I came across a perfect one — thanks to **kanishkamisra** for working on the dataset & code and making it available via github here!

## High Resolution Mapping of Fertility and Mortality from Household Survey Data in Low Income Settings – PAA presentation

I will present at PAA my WorldPop mapping of Demographic indicators in low-income settings at PAA in Chicago. “Advances in Mathematical, Spatial, and Small-Area Demography”, Thursday, April 27, 2017: 10:15 AM – 11:45 AM, Hilton, Joliet Room.

## Find color breaks for mapping (fast)

I’ve stumbled upon a little trick to compute **jenks** breaks faster than with the **classInt** package, just be sure to use **n+1** instead of **n** as the breaks are computed a little bit differently. That is to say, if you want 5 breaks, **n=6**, no biggie there.

For more on the Bayesian Analysis of Macroevolutionary Mixtures see BAMMtools library

`install.packages("BAMMtools")`

library(BAMMtools)

# Time the BAMMtools Jenks implementation; note n = 6 here yields 5 classes
# (BAMMtools counts break points, classInt counts classes)
system.time(getJenksBreaks(mydata$myvar, 6))

> user system elapsed

> 0.970 0.001 0.971

On the other hand this takes way more time with large datasets

`library(classInt)`

# The classInt equivalent (n = 5 classes) — far slower on large vectors
system.time(classIntervals(mydata$myvar, n=5, style="jenks"))

> Timing stopped at: 1081.894 1.345 1083.511