Popupation pyramids updated

Upload the relevant packages and dataset. You can find the data on github here

library(tidyverse)
options(scipen = 9)
setwd("/myworkingdirectory/")
mydt % filter(iso=='UGA')

The dataset includes population estimates at subnational level for Uganda.

# reformat the dataset using tidy

newdf % gather(variable, value,6:761) %>% separate(variable,c('year','sex', 'age'), sep='_') %>% mutate(sex=if_else(sex=='F','female','male')) %>%
spread(year, value) %>%
mutate(age2=recode(age, '1'='0-4', '4'='0-4', '5'='5-9','10'='10-14','15'='15-19', '20'='20-24', '25'= '25-29', '30'='30-34', '35'='35-39', '40'='40-44', '45'='45-49', '50'='50-54', '55'='55-59', '60'='60-64', '65'='65-69', '70'='70-74', '75'='75-79', '80'='80+')) %>%
mutate(age=recode(age, '1'='0', '4'='0'))

newdf$age %
gather(key = year, value = pop, 10:14) %>%
# mutate(pop = pop/1e03) %>%
filter(iso == "UGA"&adm_id==c("UGMIS2014452022"), year %in% c(2000, 2005, 2010, 2015, 2020))

newdf4 %
group_by(iso, adm_id, id, year, sex, age, age2, ageno) %>%
summarise(pop= sum(pop)) %>%
mutate(ageno = ageno + 1)

library(ggthemes)
ggplot(data = newdf4, aes(x = age, y = pop/1000, fill = year)) +
#bars for all but 2100
geom_bar(data = newdf4 %>% filter(sex == "female", year != 2100) %>% arrange(rev(year)),
stat = "identity",
position = "identity", width = 4.5) +
geom_bar(data = newdf4 %>% filter(sex == "male", year != 2100) %>% arrange(rev(year)),
stat = "identity",
position = "identity",
mapping = aes(y = -pop/1000)) +
coord_flip() +
scale_y_continuous(labels = abs, breaks = seq(-600, 600, 250)) +
geom_hline(yintercept = 0) +
theme_economist_white(horizontal = FALSE) +
scale_fill_economist() +
labs(fill = "", x = "", y = "")

Screen Shot 2019-07-14 at 15.46.36

 

Long to wide format with tidyr (and save it in n files)

The data comes from the https://esa.un.org/unpd/wpp/UN population projections

library(tidyr) #load tidyr or <a href="https://www.tidyverse.org/">tidyverse</a>, the latter being a collection of libraries

setwd("/Users/...") #set your working directory

dt <- read.csv("mydataset.csv", header=T) #read data

head(dt) #look at data

##   Index       Country Year Age Male_Pop Female_Pop
## 1     1 AmericanSamoa 2000   0      874        836
## 2     2 AmericanSamoa 2000   1      773        747
## 3     3 AmericanSamoa 2000   2      760        735
## 4     4 AmericanSamoa 2000   3      783        760
## 5     5 AmericanSamoa 2000   4      820        796
## 6     6 AmericanSamoa 2000   5      851        825

The idea would be to have a KEY column with the variables names and a VALUE column with the values. Since we have 2 value columns (male_pop and female_pop) we first need to gather them into 1 value column (Pop_sex) and then paste Pop_sex with Age.

# get it into the right format for "spread"
dt1 <- dt %>%

  gather(Pop_sex, value, 5:6) %>%

  unite(Pop_age, 5, 4, sep="_", remove=T) %>% # paste cols 5 and 4

  spread(Pop_age, value) %>% # spread into wide format

  write.csv(., file = "~/My folder of choice/nameofmyfile.csv") # this is optional

There’s a useful trick I’ve been using to get n csv files out of one long format dataset (eg. 1 file per year), I’ve found this somewhere in stackoverflow:

customFun  = function(mydt) {
  write.csv(mydt,paste0("name_",unique(mydt$year),".csv"))
  return(mydt)
}

mydt %>% 
  unite(newvar, 3:4, sep="_", remove=T) %>%
  spread(newvar, value) %>%
  group_by(year) %>% 
  do(customFun(.))

Note of the author: wide formats are never very useful but in case you really need them (linear regression &co) tidyr is a very compact solution. Be mindful that spreading over >1000 cols takes time. To get back from wide to long format use gather