Bangladesh geofacet plot: Female population projections by upazilas

BGD_plot

Bangladesh 2020 population pyramids by upazila

BGD_pyr.png

Georgia Mapping in R

You can download session 9 files for constructing the population pyramids of Georgia here: https://github.com/rladies/meetup-presentations_tbilisi and specify your working directory with setwd(“/Users/mydomain/myfolder/”)

#set working directory
mypath<-"/Users/DrSpengler/The rectification of the Vuldrini/"
#upload shape files
georgia <- readOGR("./GEO_adm/","GEO_adm0")
## OGR data source with driver: ESRI Shapefile
## Source: "./GEO_adm/", layer: "GEO_adm0"
## with 1 features
## It has 70 fields
# plot(georgia, lwd=1.5)

georgia1 <- readOGR("./GEO_adm/","GEO_adm1")
## OGR data source with driver: ESRI Shapefile
## Source: "./GEO_adm/", layer: "GEO_adm1"
## with 12 features
## It has 16 fields
# plot(georgia1)

georgia2 <- readOGR("./GEO_adm/","GEO_adm2")
## OGR data source with driver: ESRI Shapefile
## Source: "./GEO_adm/", layer: "GEO_adm2"
## with 69 features
## It has 18 fields
# plot(georgia2)

gwat <- readOGR("./GEO_wat/" , "GEO_water_lines_dcw")
## OGR data source with driver: ESRI Shapefile
## Source: "./GEO_wat/", layer: "GEO_water_lines_dcw"
## with 559 features
## It has 5 fields
# plot(gwat)

gpop <- raster("./GEO_pop/geo_pop.grd")
# plot(gpop)

galt <- raster("./GEO_msk_alt/GEO_msk_alt.grd")
# plot(galt)
 plot(georgia, lwd=1.5) #n1

map1

 plot(georgia1, lwd=1.5) #n2

map2

 plot(georgia2, lwd=1.5) #n3

map3

 plot(georgia, lwd=1.5) #n4
 plot(gwat, lwd=1.5, col="blue", add=T) #n4

map4

 plot(gpop) #n5
 plot(georgia, lwd=1.5,  add=T) #n5

map5

 plot(galt, lwd=1.5) #n6

map6

Plot neighbouring countries

tur <- readOGR("./TUR_adm" , "TUR_adm0")
## OGR data source with driver: ESRI Shapefile
## Source: "./TUR_adm", layer: "TUR_adm0"
## with 1 features
## It has 70 fields
## Integer64 fields read as strings:  ID_0 OBJECTID_1
arm <- readOGR("./ARM_adm" , "ARM_adm0")
## OGR data source with driver: ESRI Shapefile
## Source: "./ARM_adm", layer: "ARM_adm0"
## with 1 features
## It has 70 fields
## Integer64 fields read as strings:  ID_0 OBJECTID_1
rus <- readOGR("./RUS_adm" , "RUS_adm0")
## OGR data source with driver: ESRI Shapefile
## Source: "./RUS_adm", layer: "RUS_adm0"
## with 1 features
## It has 70 fields
## Integer64 fields read as strings:  ID_0 OBJECTID_1
aze <- readOGR("./AZE_adm" , "AZE_adm0")
## OGR data source with driver: ESRI Shapefile
## Source: "./AZE_adm", layer: "AZE_adm0"
## with 1 features
## It has 70 fields
## Integer64 fields read as strings:  ID_0 OBJECTID_1

plot maps

plot(georgia, lwd=1.5, col="white", bg="lightblue")
plot(georgia1, add=T, lty=2)
plot(tur, add=T, col="white")
plot(arm, add=T, col="white")
plot(rus, add=T, col="white")
plot(aze, add=T, col="white")

map7

add labels for the countries

x.loc <- c(44.32002, 46.35746, 44.40421, 42.18156, 40.71662)
y.loc <- c(43.42472, 40.87209, 40.82228, 40.90945, 41.99276)
nb.lab <- c("Russia", "Azerbaijan", "Armenia", "Turkey", "Black Sea")
plot(georgia, lwd=1.5, col="white", bg="lightblue")
plot(georgia1, add=T, lty=2)
plot(tur, add=T, col="white")
plot(arm, add=T, col="white")
plot(rus, add=T, col="white")
plot(aze, add=T, col="white")
text(x.loc, y.loc, nb.lab)

let’s add everything (or almost everything) together

plot(gwat, col="blue")
# plot(georgia1[1,], lwd=1, col="lightblue", border="black", add=T)
plot(georgia2, lwd=0.5, border="black", lty=3, add=T)
plot(georgia1, border="black", lty=2, add=T)
plot(georgia, lwd=1.5, add=T)

map8

check georgia@data

head(georgia1)
##   ID_0 ISO  NAME_0 ID_1       NAME_1 VARNAME_1 NL_NAME_1 HASC_1 CC_1
## 0   81 GEO Georgia 1034     Abkhazia   Sokhumi      <NA>  GE.AB <NA>
## 1   81 GEO Georgia 1035       Ajaria    Batumi      <NA>  GE.AJ <NA>
## 2   81 GEO Georgia 1036        Guria  Ozurgeti      <NA>  GE.GU <NA>
## 3   81 GEO Georgia 1037      Imereti   Kutaisi      <NA>  GE.IM <NA>
## 4   81 GEO Georgia 1038      Kakheti    Telavi      <NA>  GE.KA <NA>
## 5   81 GEO Georgia 1039 Kvemo Kartli   Rustavi      <NA>  GE.KK <NA>
##                   TYPE_1           ENGTYPE_1 VALIDFR_1 VALIDTO_1 REMARKS_1
## 0 Avtonomiuri Respublika Autonomous Republic      1994   Present      <NA>
## 1 Avtonomiuri Respublika Autonomous Republic      1994   Present      <NA>
## 2                 Region              Region      1994   Present      <NA>
## 3                 Region              Region      1994   Present      <NA>
## 4                 Region              Region      1994   Present      <NA>
## 5                 Region              Region      1994   Present      <NA>
##   Shape_Leng Shape_Area
## 0   6.643211  0.9744622
## 1   3.055014  0.3074264
## 2   2.880653  0.2092665
## 3   4.214567  0.6783179
## 4   6.820519  1.2485036
## 5   5.219352  0.6807876

print labels on the map

labels for admin 2

coords2<- coordinates(georgia2[2:6,])
admin2 <- c(as.character(georgia2$NAME_2[1:5]))
admin2
## [1] "Gagra"      "Gali"       "Gudauta"    "Gulripshi"  "Ochamchire"

Upload data from World Bank

dt <- read.csv("/Users/ac1y15/Google Drive/blog/RLadies_Georgia_files/Session_3/Data_Extract_From_Subnational_Malnutrition/3f075abc-c51c-40c5-afb1-f8fbcfa30f23_Data.csv", header=T)
dt.1 <- subset(dt, dt$type==1&dt$select==1)

head(dt.1)
##            Admin.Region.Name select order
## 6                                 1     1
## 7  Georgia, Adjara Aut. Rep.      1     2
## 16            Georgia, Guria      1     3
## 26          Georgia, Imereti      1     4
## 31          Georgia, Kakheti      1     5
## 36     Georgia, Kvemo Kartli      1     6
##                         Admin.Region.Code type
## 6                                            1
## 7  GEO_Adjara_Aut._Rep._GE.AR_1297_GEO002    1
## 16            GEO_Guria_GE.GU_1298_GEO003    1
## 26          GEO_Imereti_GE.IM_1299_GEO004    1
## 31          GEO_Kakheti_GE.KA_1300_GEO005    1
## 36     GEO_Kvemo_Kartli_GE.KK_1301_GEO006    1
##                                                            Series.Name
## 6
## 7  Prevalence of overweight, weight for height (% of children under 5)
## 16 Prevalence of overweight, weight for height (% of children under 5)
## 26 Prevalence of overweight, weight for height (% of children under 5)
## 31 Prevalence of overweight, weight for height (% of children under 5)
## 36 Prevalence of overweight, weight for height (% of children under 5)
##          Series.Code YR2000 YR2005 YR2009
## 6                        NA     NA     NA
## 7  SN.SH.STA.OWGH.ZS     NA   28.1     NA
## 16 SN.SH.STA.OWGH.ZS     NA    7.9     NA
## 26 SN.SH.STA.OWGH.ZS    9.9   21.5     NA
## 31 SN.SH.STA.OWGH.ZS    7.0   19.6   13.2
## 36 SN.SH.STA.OWGH.ZS    9.5   28.2   19.1

Map the prevalence overweight w/h

library(classInt)
nclassint <- 3 #number of colors to be used in the palette
cat <- classIntervals(dt.1$YR2005, nclassint,style = "quantile") #style refers to how the breaks are created
colpal <- brewer.pal(nclassint,"Greens") #sequential
color.palette <- findColours(cat,colpal)
is.na(color.palette)
##  [1]  TRUE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE  TRUE FALSE
## [12] FALSE
bins <- cat$brks
lb <- length(bins)

color.palette[c(1, 10)] <- "gray"
value.vec <- c(round(bins[-length(bins)],2))
value.vec.tail <- c(round(bins[-1],2))

Plot and SAVE map:

plot(georgia1, col=color.palette, border=T, main="Prevalence of overweight, \nweight for height (% of children under 5)")
legend("topright",fill=c("gray", "#E5F5E0", "#A1D99B", "#31A354"),legend=c("NA",paste(value.vec,":",value.vec.tail)),cex=1.1, bg="white", bty = "n")
# map.scale(41, 41, 2, "km", 2, 100)
map.scale(x=40.1, y=41.2, relwidth=0.1 , metric=T, ratio=F, cex=0.8)
SpatialPolygonsRescale(layout.north.arrow(2), offset= c(40.1, 41.6), scale = 0.5, plot.grid=F)

map12

See you at IUSSP to talk about the fantastic work we do at WorldPop! (plus Demotrends and R-Ladies)

As IUSSP is approaching, I’m looking forward to talk more about fine grid scale mapping research at WorldPop (University of Southampton) and Flowminder.

I will present my research on  the 30th of October in session 5 at 8:30am Integrating spatial and statistical methods in demographic research, meeting room 1.41 and 1.42.

Prof. Andy Tatem will host a side meeting Geospatial Demography: Combining Satellite, Survey, Census and Cellphone Data to Provide Small-area Estimates on the 29 October 2017, 8:30-16:00.

and  will be contributing to the Cape Town R-Ladies chapter Saturday 4 November (details here) with a talk on R-Ladies and data visualization in ggplot2. Come talk to us and become an R-Lady, we are looking forward to sharing our experiences.

A few Demotrend(ers) will also be presenting at IUSSP, come talk to us 🙂

See you in Cape Town!

 

Global and Local Measures of Spatial Autocorrelation

This post aims at being a summary of the available techniques to investigate spatial autocorrelation for the social sciences, rather than presenting the theory behind spatial autocorrelation. For that there are great books available on line, like Anselin’s, Le Page, and Bivand just to cite a few.

The techniques presented here work for a spatial polygons data frame. The difference between spatial points and polygons data frames is not that big, the idea is the same and most of what I am doing here can be applied to data points.

Why do we look at spatial autocorrelation at all? Spatial autocorrelation leads to biased results in regressions, this is the reason why we want to compute Moran’s I and why we include spatial autocorrelation if its measurement proves to be significative and non-random.

Spatial autocorrelation can be investigated globally or locally. “Globally”, implies that the measure you’re going to obtain refers to the dataset as a whole, whether it is a whole country, continent or region. “Locally”, means that you are taking into consideration each and every polygon and getting a measure for each one of them.

We start by uploading the data, projecting them (important when considering the distance based measures -earth is not flat, whatever they may say!), and construct neighborhood relations (in this case Queen and Rook, but could be any other). For more detail see this post on how to construct neighbor relations.

library(maptools)
library(spdep)
NC= readShapePoly(system.file("shapes/sids.shp", package="maptools")[1], IDvar="FIPSNO", proj4string=CRS("+proj=longlat +ellps=clrk66"))
nb.FOQ = poly2nb(NC, queen=TRUE, row.names=NC$FIPSNO)
nb.RK = poly2nb(NC, queen=FALSE, row.names=NC$FIPSNO)

Global measures of spatial autocorrelation

There are two main measures of global spatial autocorrelation: Moran’s I and Geary’s C. Moran’s I is the most used in my experience, but both work are perfectly acceptable.
Moran’s I ranges between -1 (strong negative spatial autocorrelation with a dispersed pattern) and 1 (strong positive spatial autocorrelation with a clustered pattern) with 0 being the absence of spatial autocorrelation.
Geary’s C ranges between 0 and 2, with positive spatial autocorrelation ranging from 0 to 1 and negative spatial autocorrelation between 1 and 2.
Of course being these inferential measures, if the p-value is non significant we cannot exclude that the patterns could be random(!)

In this case Moran’s I is positive and significant, the z-score (not provided by moran.test) is positive implying spatial clusters, so we can reject the null hypothesis.

library(spdep)
nwb <- NC$NWBIR74
moran.test(nwb, listw = nb2listw(nb.RK))
##
##  Moran I test under randomisation
##
## data:  nwb
## weights: nb2listw(nb.RK)
##
## Moran I statistic standard deviate = 3.0787, p-value = 0.001039
## alternative hypothesis: greater
## sample estimates:
## Moran I statistic       Expectation          Variance
##       0.185965551      -0.010101010       0.004055701
geary.test(nwb, listw = nb2listw(nb.RK))
##
##  Geary C test under randomisation
##
## data:  nwb
## weights: nb2listw(nb.RK)
##
## Geary C statistic standard deviate = 2.0324, p-value = 0.02106
## alternative hypothesis: Expectation greater than statistic
## sample estimates:
## Geary C statistic       Expectation          Variance
##        0.83274888        1.00000000        0.00677185

if you have polygons with no neighbors remember to specify zero.policy=NULL

moran.plot(nwb, listw = nb2listw(nb.RK))

moran plot

moran.mc(nwb, listw = nb2listw(nb.RK), nsim=100)
##
##  Monte-Carlo simulation of Moran I
##
## data:  nwb
## weights: nb2listw(nb.RK)
## number of simulations + 1: 101
##
## statistic = 0.18597, observed rank = 100, p-value = 0.009901
## alternative hypothesis: greater

permutation Moran_s I

plot(moran.mc(nwb, listw = nb2listw(nb.RK), nsim=100))


Same thing can be done for Geary’s C:

geary.mc(nwb, listw = nb2listw(nb.RK), nsim=100)
##
##  Monte-Carlo simulation of Geary C
##
## data:  nwb
## weights: nb2listw(nb.RK)
## number of simulations + 1: 101
##
## statistic = 0.83275, observed rank = 5, p-value = 0.0495
## alternative hypothesis: greater
plot(geary.mc(nwb, listw = nb2listw(nb.RK), nsim=100))

Local measures of spatial autocorrelation

Local Moran and Local G

locm <- localmoran(nwb, listw = nb2listw(nb.RK))
locG <- localG(nwb, listw = nb2listw(nb.RK))

Get the neighbor matrix into a listwise format with listw: there’s two options here, row-standardized weights matrix style = "W" creates proportional weights when polygons have an unequal number of neighbors, balancing out observations with few neighbors. Binary weights style = "B" upweight observations with many neighbors.

library(classInt)
library(dplyr)
myvar <- NC$NWBIR74
nb <- nb.RK
# Define weight style
ws <- c("W")

# Define significance for the maps
significance <- 0.05
plot.only.significant <- TRUE

# Transform the neigh mtx into a listwise object
listw <- nb2listw(nb, style=ws)

# Create the lagged variable
lagvar <- lag.listw(listw, myvar)

# get the mean of each
m.myvar <- mean(myvar)
m.lagvar <- mean(lagvar)

The next step is to derive the quadrants and set the coloring scheme. I like to color the border of each polygon with the color of their local moran score, regardless of their pvalue, and then fill only the significant ones.

n <- length(NC)
#
vec <- c(1:n)
vec <- ifelse(locm[,5] < significance, 1,0)

# Derive quadrants
q <- c(1:n) for (i in 1:n) {   if (myvar[[i]]>=m.myvar & lagvar[[i]]>=m.lagvar) q[i] <- 1
  if (myvar[[i]]<m.myvar & lagvar[[i]]<m.lagvar) q[i] <- 2
  if (myvar[[i]]<m.myvar & lagvar[[i]]>=m.lagvar) q[i] <- 3   if (myvar[[i]]>=m.myvar & lagvar[[i]]<m.lagvar) q[i] <- 4
}

# set coloring scheme
q.all <- q
colors <- c(1:n)
for (i in 1:n) {
  if (q.all[i]==1) colors[i] <- "red"
  if (q.all[i]==2) colors[i] <- "blue"
  if (q.all[i]==3) colors[i] <- "lightblue"
  if (q.all[i]==4) colors[i] <- "pink"
  if (q.all[i]==0) colors[i] <- "white"   if (q.all[i]>4) colors[i] <- "white"
}

# Mark all non-significant regions white
locm.dt <- q*vec
colors1 <- colors
for (i in 1:n)
{
  if ( !(is.na (locm.dt[i])) )  {
  if (locm.dt[i]==0) colors1[i] <- "white"
}
}
colors2 <- colors
colors2 <- paste(colors2,vec)
pos = list()
for (i in 1:n) {
  pos[[i]] <- c(which(NC$NWBIR74==colors2["blue 0"]))
}

blue0 <- which(colors2=="blue 0")
red0 <- which(colors2=="red 0")
lightblue0 <- which(colors2=="lightblue 0")
pink0 <- which(colors2=="pink 0")
lb <- 6
labels=c("High-High", "High-Low", "Low-High", "Low-Low")

# plot the map
if (plot.only.significant==TRUE) plot(NC, col=colors1,border=F) else
  plot(NC, col=colors,border=F)
plot(NC[blue0,],border="blue",lwd=0.5,add=T)
plot(NC[lightblue0,],border="lightblue",add=T,lwd=0.5)
plot(NC[red0,],border="red",add=T,lwd=0.5)
plot(NC[pink0,],border="pink",add=T,lwd=0.5)
legend("bottomleft", legend = labels, fill = c("red", "pink", "lightblue", "blue"), bty = "n")

lisa map
Local G gives back z-scores values and indicate the posibility of a local cluster of high values of the variable being analysed, very low values indicate a similar cluster of low values.

library(RColorBrewer)

nclassint <- 3
colpal <- brewer.pal(nclassint,"PiYG")
cat <- classIntervals(locG, nclassint, style = "jenks", na.ignore=T)
color.z <- findColours(cat, colpal)

plot(NC, col= color.z, border=T)

Rplot

# color only significant polygons
plot(NC, border=T)
plot(NC[which(vec==1),], col=color.z[which(vec==1)], border=T, add=T)

lisa map2

Population Pyramids of Georgia in ggplot2

You can download session 9 files for constructing the population pyramids of Georgia here: RLadies Tbilisi 

rm(list=ls(all=TRUE))
cat("\014")
mypath <- "/Users/GozerTheGozerian/Keymasters Folder/"
setwd(paste(mypath)) #set your working directory
##  [1] "Index"
##  [2] "Variant"
##  [3] "Major.area..region..country.or.area.."
##  [4] "sex"
##  [5] "Notes"
##  [6] "Country.code"
##  [7] "Reference.date..as.of.1.July."
##  [8] "X0.4"
##  [9] "X05.Sep"
## [10] "Oct.14"
## [11] "X15.19"
[...]
## [23] "X75.79"
## [24] "X80."
## [25] "X80.84"
## [26] "X85.89"
## [27] "X90.94"
## [28] "X95.99"
## [29] "X100."
head(pyr)
##   Index   Variant Major.area..region..country.or.area..  sex Notes
## 1     1 Estimates                                 WORLD both
## 2     2 Estimates                                 WORLD both
## 3     3 Estimates                                 WORLD both
## 4     4 Estimates                                 WORLD both
## 5     5 Estimates                                 WORLD both
## 6     6 Estimates                                 WORLD both
##   Country.code Reference.date..as.of.1.July.   X0.4 X05.Sep Oct.14 X15.19
## 1          900                          1950 337432  269550 260286 238628
## 2          900                          1955 402845  315055 263266 254815
## 3          900                          1960 430565  380319 309276 257899
## 4          900                          1965 477798  409020 372817 303891
## 5          900                          1970 522641  458298 403911 367789
## 6          900                          1975 543225  503753 452706 398384
##   X20.24 X25.29 X30.34 X35.39 X40.44 X45.49 X50.54 X55.59 X60.64 X65.69
## 1 221781 194424 166937 162917 147483 127415 107608  88601  73422  55106
## 2 231892 214878 187941 160385 155546 138743 119084  97441  76843  59322
## 3 248413 225957 208747 181632 153398 147699 130210 108435  85064  62665
## 4 251897 242692 219978 202499 174884 145701 138601 119505  95085  70256
## 5 297557 246921 237657 214330 196585 168438 137799 128954 107201  81023
## 6 361883 293531 243384 233137 209181 190299 161058 128755 116704  92385
##   X70.74 X75.79  X80. X80.84 X85.89 X90.94 X95.99 X100.
## 1  37360  21997 14202     NA     NA     NA     NA    NA
## 2  40346  23755 16158     NA     NA     NA     NA    NA
## 3  44018  25986 18061     NA     NA     NA     NA    NA
## 4  47382  29457 21032     NA     NA     NA     NA    NA
## 5  55168  32876 25340     NA     NA     NA     NA    NA
## 6  64337  38934 29743     NA     NA     NA     NA    NA
pyr <- read.csv("Session_2_POPULATION_BY_AGE_BOTH_SEXES.csv", header=T)
names(pyr)
##  [1] "Index"
##  [2] "Variant"
##  [3] "Major.area..region..country.or.area.."
##  [4] "sex"
##  [5] "Notes"
##  [6] "Country.code"
##  [7] "Reference.date..as.of.1.July."
##  [8] "X0.4"
##  [9] "X05.Sep"
## [10] "Oct.14"
## [11] "X15.19"
[...]
## [22] "X70.74"
## [23] "X75.79"
## [24] "X80."
## [25] "X80.84"
## [26] "X85.89"
## [27] "X90.94"
## [28] "X95.99"
## [29] "X100."
#make a new variable with names of all variables:
#make a new variable with names of all variables:
vars <- names(pyr)
#and change those variables names that start with an X
age <- c(paste(seq(0, 75, by=5), "-", seq(4, 79, by=5)), "80+", paste(seq(80, 95, by=5), "-", seq(84, 99, by=5)), "100+")
age
##  [1] "0 - 4"   "5 - 9"   "10 - 14" "15 - 19" "20 - 24" "25 - 29" "30 - 34"
##  [8] "35 - 39" "40 - 44" "45 - 49" "50 - 54" "55 - 59" "60 - 64" "65 - 69"
## [15] "70 - 74" "75 - 79" "80+"     "80 - 84" "85 - 89" "90 - 94" "95 - 99"
## [22] "100+"
names(pyr) <- c(vars[1], vars[2], "Major.Area", "sex", vars[5], vars[6], "year", age)
names(pyr)[1:15]
##  [1] "Index"        "Variant"      "Major.Area"   "sex"
##  [5] "Notes"        "Country.code" "year"         "0 - 4"
##  [9] "5 - 9"        "10 - 14"      "15 - 19"      "20 - 24"
## [13] "25 - 29"      "30 - 34"      "35 - 39"
library(tidyr)
# transform the data from wide to long format
pyr <- gather(pyr, "age.group", "value", 8:29)
head(pyr)
##   Index   Variant Major.Area  sex Notes Country.code year age.group  value
## 1     1 Estimates      WORLD both                900 1950     0 - 4 337432
## 2     2 Estimates      WORLD both                900 1955     0 - 4 402845
## 3     3 Estimates      WORLD both                900 1960     0 - 4 430565
## 4     4 Estimates      WORLD both                900 1965     0 - 4 477798
## 5     5 Estimates      WORLD both                900 1970     0 - 4 522641
## 6     6 Estimates      WORLD both                900 1975     0 - 4 543225
#replace all NA with 0
library(dplyr)
is.na(pyr$value) <- 0
pyr.g <- pyr %>%
 filter(Major.Area=="Georgia"&sex!="both") # exclude "both"

#create an order vector to sort data
o <- seq(1,22, by=1) # 22 is the number of age groups length(unique(pyr$age.group))
oo <- rep(o,28) # 28 number of years
order <- as.vector(sort(oo, decreasing=F))
pyr.g$order <- order
breaks <- pyr.g$age.group
library(ggplot2)
###
# get rid of the 80+ abridged age group
pyr.g1 <- pyr.g[-c(which(pyr.g$age.group=="80+")),]
### simple pyramid plot
p <- ggplot(pyr.g1, aes(x=age.group, y=value, fill=factor(sex)))+
geom_bar(data=pyr.g1 %>%
filter(sex=="female"&year=="2015"),
aes(x=reorder(age.group, order), y=value), stat="identity")+
geom_bar(data=pyr.g1 %>%
filter(sex=="male"&year=="2015"),
aes(x=reorder(age.group, order), y=-value), stat="identity")+ #negative value for males not to overlap; reorder values of age group by order; "identity" is only for bar charts
coord_flip()+ #bending function: flip the coordinates
labs(x = "", y = "")+
scale_fill_manual(values = c(female = "red", male = "blue"), name="")+
scale_x_discrete(breaks=c(paste(seq(0,90, by=10),"-", seq(4,94, by=10)), "100+"),labels=c(paste(seq(0,90, by=10),"-", seq(4,94, by=10)), "100+" ))+ #not to show all the age groups all the time
scale_y_continuous(breaks=seq(-200,200,25),labels=abs(seq(-200,200,25)))+ #tell R t paste absolute numbers of values not to have negative values on graph
theme_bw()+
theme(axis.text.x = element_text(size=10, color="black"), # size of x axis text
axis.text.y = element_text(size=10, color="black"))

pyr1

#############################################
### STEP 2: add lines/bars to compare other years
#############################################
p+
geom_line(data=pyr.g1 %>%
filter(sex=="male"&year=="1975"),
aes(x=reorder(age.group, order), y=-value), colour="lightblue", group=1)+
geom_line(data=pyr.g1 %>%
filter(sex=="female"&year=="1975"),
aes(x=reorder(age.group, order), y=value), colour="pink", group=1)

pyr2

# bars: since in ggplot the last plot is
#the one that appears on top (hiding everything underneath),
#we can add alpha=0.5 to add some transparence, 1 being the
#full color
p+
geom_bar(data=pyr.g1 %>%
filter(sex=="male"&year=="1975"),
aes(x=reorder(age.group, order), y=-value), fill="lightblue", alpha=.5,stat="identity")+
geom_bar(data=pyr.g1 %>%
filter(sex=="female"&year=="1975"),
aes(x=reorder(age.group, order), y=value), fill="pink", alpha=.5, stat="identity")

pyr3

#######################################################################
# STEP 3: add different legends for the two years: now we only have one for the sex, as the fill factors for all 4 geom_bar(s) is the same
#
ggplot(pyr.g1, aes(x=age.group, y=value, fill=factor(sex), col=factor(year)))+ # add different colors for the two years 1975 and 2015 by adding col=factor(year)
# this part stays the same
geom_bar(data=pyr.g1 %>%
filter(sex=="female"&year=="2015"),
aes(x=reorder(age.group, order), y=value), stat="identity")+
geom_bar(data=pyr.g1 %>%
filter(sex=="male"&year=="2015"),
aes(x=reorder(age.group, order), y=-value), stat="identity")+
geom_bar(data=pyr.g1 %>%
filter(sex=="male"&year=="1975"),
aes(x=reorder(age.group, order),y=-value), alpha=.5,stat="identity")+
geom_bar(data=pyr.g1 %>%
filter(sex=="female"&year=="1975"),
aes(x=reorder(age.group, order), y=value), alpha=.5, stat="identity")+
coord_flip()+
labs(x = "", y = "")+
scale_x_discrete(breaks=c(paste(seq(0,90, by=10),"-", seq(4,94, by=10)) , "100+"),labels=c(paste(seq(0,90, by=10),"-", seq(4,94, by=10)), "100+" ))+
scale_y_continuous(breaks=seq(-200,200,25),labels=abs(seq(-200,200,25)))+
theme_bw()+
theme(axis.text.x = element_text(size=10, color="black"),
axis.text.y = element_text(size=10, color="black"))+
# add the legends with scale_fill_manual which controls the filling colors for sex and scale_color_manual which controls the border color that distinguisces the two years
scale_fill_manual(values = c(female = "red", male = "blue"), name="")+
scale_color_manual(values=c("1975"="black", "2015"="grey"), name="" )+
# and I want the year legend squares to look empty
guides(colour = guide_legend(override.aes = list(alpha = 0))) #makes the squares for the years legend empty of any color

pyr4

################################################################
## STEP 4: one pyramid plot for each year in one page with facet_wrap
##
ggplot(pyr.g1, aes(x=age.group, y=value, fill=factor(sex)))+
geom_bar(data=pyr.g1 %>%
filter(sex=="male"),
aes(x=reorder(age.group, order), y=-value), stat="identity")+
geom_bar(data=pyr.g1 %>%
filter(sex=="female"),
aes(x=reorder(age.group, order), y=value), stat="identity")+
coord_flip()+
labs(x = "", y = "")+
scale_x_discrete(breaks=c( paste(seq(0,90, by=10),"-", seq(4,94, by=10)), "100+" ))+
scale_y_continuous(breaks=seq(-300,300,100),labels=abs(seq(-300,300,100)))+
scale_fill_manual(values = c(female = "red", male = "blue"), name="")+
theme_bw()+
theme(axis.text.x = element_text(size=10, color="black"),
axis.text.y = element_text(size=10, color="black"))+
facet_wrap(~year)

Untitled

pyr.ar <- pyr %>%
filter(Major.Area=="Armenia"&sex!="both") # exclude "both"
pyr.az <- pyr %>%
filter(Major.Area=="Azerbaijan"&sex!="both") # exclude "both"
pyr.ar$order <- order
pyr.az$order <- order
pyr.c <- rbind(pyr.g, pyr.ar, pyr.az)
pyr.c1 <- pyr.c[-c(which(pyr.c$age.group=="80+")),] 

ggplot(pyr.c1,
aes(x=age.group, y=value,
fill=factor(Major.Area)))+
 geom_bar(data=pyr.c1 %>%
filter(sex=="female"&year=="2015"),
aes(x=reorder(age.group, order), y=value), stat="identity")+
geom_bar(data=pyr.c1 %>%
filter(sex=="male"&year=="2015"),
aes(x=reorder(age.group, order), y=-value),
stat="identity")+
coord_flip()+
 labs(x = "", y = "")+
scale_x_discrete(breaks=c(paste(seq(0,90, by=10),"-", seq(4,94, by=10)) , "100+"),
labels=c(paste(seq(0,90, by=10),"-", seq(4,94, by=10)), "100+" ))+
 scale_y_continuous(breaks=seq(-400,400,200),
labels=abs(seq(-400,400,200)))+
 theme_bw()+
scale_fill_manual(values = c(Armenia = "red",
Georgia="green", Azerbaijan = "blue"), name="")+
theme(axis.text.x = element_text(size=10, color="black"),
axis.text.y = element_text(size=10, color="black"),
legend.position="none")+
facet_wrap(~Major.Area)
#facet_wrap(~Major.Area, scales="free_x")

Untitled3

And with scales=”free_x”

Untitled1


Long to wide format with tidyr (and save it in n files)

The data comes from the https://esa.un.org/unpd/wpp/UN population projections

library(tidyr) #load tidyr or <a href="https://www.tidyverse.org/">tidyverse</a>, the latter being a collection of libraries

setwd("/Users/...") #set your working directory

dt <- read.csv("mydataset.csv", header=T) #read data

head(dt) #look at data

##   Index       Country Year Age Male_Pop Female_Pop
## 1     1 AmericanSamoa 2000   0      874        836
## 2     2 AmericanSamoa 2000   1      773        747
## 3     3 AmericanSamoa 2000   2      760        735
## 4     4 AmericanSamoa 2000   3      783        760
## 5     5 AmericanSamoa 2000   4      820        796
## 6     6 AmericanSamoa 2000   5      851        825

The idea would be to have a KEY column with the variables names and a VALUE column with the values. Since we have 2 value columns (male_pop and female_pop) we first need to gather them into 1 value column (Pop_sex) and then paste Pop_sex with Age.

# get it into the right format for "spread"
dt1 <- dt %>%

  gather(Pop_sex, value, 5:6) %>%

  unite(Pop_age, 5, 4, sep="_", remove=T) %>% # paste cols 5 and 4

  spread(Pop_age, value) %>% # spread into wide format

  write.csv(., file = "~/My folder of choice/nameofmyfile.csv") # this is optional

There’s a useful trick I’ve been using to get n csv files out of one long format dataset (eg. 1 file per year), I’ve found this somewhere in stackoverflow:

customFun  = function(mydt) {
  write.csv(mydt,paste0("name_",unique(mydt$year),".csv"))
  return(mydt)
}

mydt %>% 
  unite(newvar, 3:4, sep="_", remove=T) %>%
  spread(newvar, value) %>%
  group_by(year) %>% 
  do(customFun(.))

Note of the author: wide formats are never very useful but in case you really need them (linear regression &co) tidyr is a very compact solution. Be mindful that spreading over >1000 cols takes time. To get back from wide to long format use gather