Refugees Analysis

Tidytuesday 2023-08-22

Juha Päällysaho


November 13, 2023


November 23, 2023

data prep

# Option 1: tidytuesdayR package
## install.packages("tidytuesdayR")

read data

# Fetch the TidyTuesday release dated 2023-08-22, then take its first element
# as the raw data set.
tuesdata <- tidytuesdayR::tt_load("2023-08-22")
data_raw <- pluck(tuesdata, 1)

skim the data

Check the data range and the missing data from origin_continent. `oip` and `hst` are almost completely missing, so I'll remove them from the dataset. Date values run from 2010 to 2022.

Data summary
Name data_raw
Number of rows 64809
Number of columns 16
Column type frequency:
character 6
numeric 10
Group variables None

Variable type: character

skim_variable n_missing complete_rate min max empty n_unique whitespace
coo_name 0 1 4 52 0 210 0
coo 0 1 3 3 0 210 0
coo_iso 0 1 3 3 0 210 0
coa_name 0 1 4 52 0 189 0
coa 0 1 3 3 0 189 0
coa_iso 0 1 3 3 0 189 0

Variable type: numeric

skim_variable n_missing complete_rate mean sd p0 p25 p50 p75 p100 hist
year 0 1.00 2016.39 3.72 2010 2013.00 2017 2020 2022 ▆▅▇▅▇
refugees 0 1.00 3440.01 55255.97 0 5.00 12 88 3737369 ▇▁▁▁▁
asylum_seekers 0 1.00 564.17 7455.73 0 0.00 8 57 940668 ▇▁▁▁▁
returned_refugees 0 1.00 73.48 2460.28 0 0.00 0 0 381275 ▇▁▁▁▁
idps 0 1.00 7088.70 163174.63 0 0.00 0 0 8252788 ▇▁▁▁▁
returned_idps 0 1.00 706.16 23654.24 0 0.00 0 0 2134349 ▇▁▁▁▁
stateless 0 1.00 756.63 19980.22 0 0.00 0 0 955399 ▇▁▁▁▁
ooc 0 1.00 437.50 20182.68 0 0.00 0 0 3206577 ▇▁▁▁▁
oip 64709 0.00 196611.16 419586.96 5 8218.25 23165 164760 2453862 ▇▁▁▁▁
hst 58845 0.09 6264.66 210090.46 0 0.00 0 0 15209720 ▇▁▁▁▁

continents data

# NOTE(review): the path given to read_csv() is an empty string here; the
# location of the country lookup file needs to be restored -- confirm.
countries <- read_csv("")

# Keep only the two columns used later: `country_code3` (the join key) and
# `continent_name`.
countries_less <- select(countries, country_code3, continent_name)


# Build the analysis table from the raw data:
#   * drop `oip` and `hst`,
#   * turn the numeric `year` into a Date,
#   * give the origin/destination/others-of-concern columns readable names,
#   * attach a continent to both the origin (`coo_iso`) and the destination
#     (`coa_iso`) country.
# `countries_less` is joined twice, so the second join produces
# `continent_name.x` (origin) and `continent_name.y` (destination), which are
# renamed in the last step.
df <- data_raw |>
  select(!c(oip, hst)) |>
  mutate(year = as_date(parse_date_time(year, "%Y")), .before = 1) |>
  rename(
    origin_country = coo_name,
    destination_country = coa_name,
    others_of_concern = ooc
  ) |>
  left_join(countries_less, by = join_by("coo_iso" == "country_code3")) |>
  left_join(countries_less, by = join_by("coa_iso" == "country_code3")) |>
  rename(
    origin_continent = continent_name.x,
    destination_continent = continent_name.y
  )

df |> glimpse()
Rows: 64,809
Columns: 16
$ year                  <date> 2010-01-01, 2010-01-01, 2010-01-01, 2010-01-01,…
$ origin_country        <chr> "Afghanistan", "Iran (Islamic Rep. of)", "Iraq",…
$ coo                   <chr> "AFG", "IRN", "IRQ", "PAK", "ARE", "CHI", "GAZ",…
$ coo_iso               <chr> "AFG", "IRN", "IRQ", "PAK", "EGY", "CHN", "PSE",…
$ destination_country   <chr> "Afghanistan", "Afghanistan", "Afghanistan", "Af…
$ coa                   <chr> "AFG", "AFG", "AFG", "AFG", "ALB", "ALB", "ALB",…
$ coa_iso               <chr> "AFG", "AFG", "AFG", "AFG", "ALB", "ALB", "ALB",…
$ refugees              <dbl> 0, 30, 6, 6398, 5, 6, 5, 5, 49, 5, 5, 0, 0, 6, 6…
$ asylum_seekers        <dbl> 0, 21, 0, 9, 0, 0, 0, 0, 20, 0, 0, 5, 10, 92, 5,…
$ returned_refugees     <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
$ idps                  <dbl> 351907, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0…
$ returned_idps         <dbl> 3366, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
$ stateless             <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
$ others_of_concern     <dbl> 838250, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0…
$ origin_continent      <chr> "Asia", "Asia", "Asia", "Asia", "Africa", "Asia"…
$ destination_continent <chr> "Asia", "Asia", "Asia", "Asia", "Europe", "Europ…

data to long format

# Sum every double column per year (and per any extra grouping columns passed
# through `...`), then reshape the totals to long format.
#
# .data  A data frame with a `year` column and double measure columns.
# ...    Optional extra grouping columns (unquoted), e.g. origin_continent.
#
# Returns an ungrouped tibble with the grouping columns plus `name` (the
# measure) and `value` (its yearly sum), the default pivot_longer() names.
group_to_long <- function(.data, ...) {
  .data |>
    group_by(year, ...) |>
    summarise(across(where(is.double), \(x) sum(x)), .groups = "drop") |>
    # `year` is a Date, so is.numeric() is FALSE for it; only the summed
    # measure columns are pivoted.
    pivot_longer(cols = where(is.numeric))
}

# Yearly totals in long format: overall, per origin continent, and per
# destination continent.
refugees_long <- df |> group_to_long()
refugees_long_origin <- df |> group_to_long(origin_continent)
refugees_long_destination <- df |> group_to_long(destination_continent)

get colorspace

[1] "#E16A86" "#B88A00" "#50A315" "#00AD9A" "#009ADE" "#C86DD7"

high level analysis


The origin and destination countries give the general direction of refugee movement. Refugee groups are split into different subclasses, such as asylum seekers, internally displaced persons, and stateless persons. Refugees and internally displaced persons also have a "returned" level.


Check which refugee group gives an interesting analysis layer.

global level

# Continuous y scale whose labels are shown in millions (value / 1e6) with an
# "M" suffix and a space as the thousands separator.
scale_y_mil <- function() {
  scale_y_continuous(
    labels = scales::label_number(
      scale = 1 / 1000000,
      big.mark = " ",
      suffix = "M"
    )
  )
}

# Date x scale with a break every four years, labelled with the two-digit
# year.
scale_x_yrs <- function() {
  scale_x_date(date_labels = "%y", date_breaks = "4 years")
}

# Stacked yearly totals on the global level, one fill colour per measure.
# The subtitle reflects the years present in the data (2010 to 2022).
refugees_long |>
  ggplot(aes(year, value, fill = name)) +
  geom_col() +
  scale_y_mil() +
  scale_x_yrs() +
  labs(
    title = "has there been a change in refugees on global level?",
    subtitle = "2010 - 2022"
  ) +
  # NOTE(review): the layer that followed the final `+` is not visible; the
  # same qualitative fill scale as in plot_continents() is assumed -- confirm.
  scale_fill_discrete_qualitative()

continent level

# Stacked column chart of yearly values, faceted into one panel per level of
# `facet_var`.
#
# .data      Long-format data with `year`, `value`, and the columns given in
#            `facet_var` and `.name`.
# .title     Plot title.
# .subtitle  Plot subtitle.
# facet_var  Unquoted column used for the facet columns.
# .name      Unquoted column mapped to the fill colour.
plot_continents <- function(.data, .title = "value", .subtitle = "value2", facet_var = origin_continent, .name = name) {
  .data |>
    ggplot(aes(year, value, fill = {{ .name }})) +
    geom_col() +
    scale_x_yrs() +
    scale_y_mil() +
    labs(
      title = .title,
      subtitle = .subtitle
    ) +
    scale_fill_discrete_qualitative() +
    facet_grid(cols = vars({{ facet_var }}))
}

# Totals by the continent the refugees come from (default facet variable).
plot_continents(
  refugees_long_origin, "How the refugee level change between the continents?",
  "Origin country of refugees"
)

# Totals by the continent the refugees arrive in.
plot_continents(
  refugees_long_destination, "How the refugee level change between the continents?",
  "Destination country of refugees", destination_continent
)

refugees between the continents

There seem to be intra-continent refugees; next I'll check how between-continent and domestic (within-continent) movements differ.

# Flag rows whose origin and destination continents differ; the flag is
# placed as the first column.
df2 <- mutate(
  df,
  between_continents = origin_continent != destination_continent,
  .before = 1
)

# Between-continent movements only, totalled and faceted by destination
# continent.
between_by_destination <- df2 |>
  filter(between_continents) |>
  group_to_long(destination_continent)

plot_continents(
  between_by_destination,
  "what is the destination of Between continents refugees",
  "",
  destination_continent
)

# Between-continent movements only, totalled and faceted by origin continent.
# NOTE(review): the argument after the trailing comma is not visible;
# `origin_continent` (also the default facet variable) is assumed -- confirm.
df2 |>
  filter(between_continents == TRUE) |>
  group_to_long(origin_continent) |>
  plot_continents(
    "where the Between continents refugees originate from?", "",
    origin_continent
  )

Deep dive: South America, between continents

South America shows an interesting change, and I’m not familiar with the continent. I’ll check whether the data reveals anything interesting.

# Rows that originate in South America and end on another continent.
southAmerica <- filter(
  df2,
  origin_continent == "South America" & between_continents == TRUE
)

# Per-origin-country totals over all years. `year` is dropped before the
# summary; every double column is summed into a `<column>_sum` column.
southAmericaSummarized <- southAmerica |>
  select(-year) |>
  group_by(origin_country) |>
  summarise(across(where(is.double), sum, .names = "{.col}_sum"))

# Show the per-country totals.
# NOTE(review): the original right-hand side of this pipe (most likely a
# table formatter) is not visible; print() is used so the statement is
# complete -- confirm.
southAmericaSummarized |>
  print()
origin_country refugees_sum asylum_seekers_sum returned_refugees_sum idps_sum returned_idps_sum stateless_sum others_of_concern_sum
Argentina 3045 6788 0 0 0 0 20
Bolivia (Plurinational State of) 4767 6888 0 0 0 0 0
Brazil 16218 127858 0 0 0 0 3574
Chile 8302 25238 0 0 0 0 0
Colombia 510459 383772 40 0 0 0 45253
Ecuador 15796 176299 0 0 0 0 24525
French Guiana 11 238 0 0 0 0 0
Guyana 6193 4578 0 0 0 0 0
Paraguay 1017 3537 0 0 0 0 0
Peru 32663 64606 0 0 0 0 0
Suriname 250 635 0 0 0 0 0
Uruguay 1028 2470 0 0 0 0 0
Venezuela (Bolivarian Republic of) 577459 1422868 5 0 0 0 230430

Select refugees and asylum seekers, since the other refugee classes are mostly zeros.

# Yearly refugee and asylum-seeker totals per South American origin country:
# one facet per measure, fill colour by country.
south_america_long <- southAmerica |>
  group_to_long(origin_country) |>
  filter(name %in% c("refugees", "asylum_seekers"))

plot_continents(
  south_america_long,
  "how south america refugees have changed?",
  "",
  facet_var = name,
  .name = origin_country
)