Πλοήγηση σε →[Διδασκαλία και μαθήματα] →[Erasmus+ and/or international visits] →[Online Databases and Data Management for Economists]

Eurostat

Managing eurostat data with R

Begin

In this section we will examine Eurostat’s datasets. First, as an example we will compare goverment spending in research and development.

Load the required libraries:

library(eurostat)
library(dplyr)
library(readr)
library(tidyr)
library(lubridate)
library(DT)
library(purrr)
library(broom)
library(ggplot2)
library(ggthemes)
library(ggrepel)

RnD

Here we are going to examine the goverment pending on research and development. The table tipsst10 holds the corresponding data.

We can read the dataset by calling the get_eurostat function and we can store the results to a variable (here named rnd):

rnd <- get_eurostat("tipsst10")

We can examine the column names of the table (data frame) rnd:

names(rnd)

## [1] "sectperf" "unit"     "geo"      "time"     "values"

Or we examine the domain values of its columns:

rnd %>% select(sectperf) %>% distinct()

## # A tibble: 1 x 1
##   sectperf
##   <fct>   
## 1 TOTAL

rnd %>% select(unit) %>% distinct()

## # A tibble: 2 x 1
##   unit   
##   <fct>  
## 1 MIO_NAC
## 2 PC_GDP

We can filter the dataset, thus we can restrict it to specific rows:

rnd %>% 
    filter(unit == 'PC_GDP' & geo == 'EL')

## # A tibble: 18 x 5
##    sectperf unit   geo   time       values
##    <fct>    <fct>  <fct> <date>      <dbl>
##  1 TOTAL    PC_GDP EL    1995-01-01  0.42 
##  2 TOTAL    PC_GDP EL    1997-01-01  0.43 
##  3 TOTAL    PC_GDP EL    1999-01-01  0.570
##  4 TOTAL    PC_GDP EL    2001-01-01  0.56 
##  5 TOTAL    PC_GDP EL    2003-01-01  0.55 
##  6 TOTAL    PC_GDP EL    2004-01-01  0.53 
##  7 TOTAL    PC_GDP EL    2005-01-01  0.580
##  8 TOTAL    PC_GDP EL    2006-01-01  0.56 
##  9 TOTAL    PC_GDP EL    2007-01-01  0.580
## 10 TOTAL    PC_GDP EL    2008-01-01  0.66 
## 11 TOTAL    PC_GDP EL    2009-01-01  0.63 
## 12 TOTAL    PC_GDP EL    2010-01-01  0.6  
## 13 TOTAL    PC_GDP EL    2011-01-01  0.67 
## 14 TOTAL    PC_GDP EL    2012-01-01  0.7  
## 15 TOTAL    PC_GDP EL    2013-01-01  0.81 
## 16 TOTAL    PC_GDP EL    2014-01-01  0.83 
## 17 TOTAL    PC_GDP EL    2015-01-01  0.97 
## 18 TOTAL    PC_GDP EL    2016-01-01  1.01

rnd %>% 
    filter(unit == 'PC_GDP' & geo == 'EL') %>% 
    ggplot(aes(x = time, y = values)) +
    geom_line()

rnd %>% 
    filter(unit == 'PC_GDP' & geo %in% c('EL', 'PT')) %>% 
    ggplot(aes(x = time, y = values, colour = geo)) +
        geom_line(size = 1.2) + 
        theme_economist()

Plot the time series data:

rnd %>% 
    filter(unit == 'PC_GDP' & geo %in% c('EL', 'PT', 'CZ', 'BE')) %>% 
    mutate(label = if_else(time == max(time), as.character(geo), NA_character_)) %>%
    ggplot(aes(x = time, y = values, colour = geo)) +
    geom_line(size = 1.2) + 
    geom_label_repel(aes(label = label), nudge_x = 1, na.rm = TRUE) +
    scale_color_discrete(guide = FALSE) +
    theme_economist() +
    xlab("Time") + ylab("% GDP in RnD") +
    theme(text = element_text(size = 18))

Examine changes betweeen years 2006 and 2016:

rnd %>% 
    filter(unit == 'PC_GDP' & geo %in% c('EL', 'PT', 'ES', 'SE') & (time == '2006-01-01' | time == '2016-01-01')) %>%
    mutate(Year = as.character(year(time))) %>% 
    mutate(label = if_else(time == max(time), as.character(geo), NA_character_)) %>%
    ggplot(aes(x = Year, y = values, fill = Year)) +
    geom_bar(stat = "identity", width = 0.6) +
    facet_wrap(~geo) +
    theme_economist_white() +
    xlab("Time") + ylab("% GDP in RnD") +
    theme(text = element_text(size = 12)) +
    theme(axis.title = element_text(size = 14, face = "bold")) +
    scale_fill_manual("", values = c("2006" = "blue", "2016" = "darkorange")) +
    theme(strip.text.x = element_text(size = 14, face = "bold", colour = "black")) +
    geom_text(aes(label = paste(round(values, 2), "%")),  color="white", size = 6, position = position_stack(vjust = 0.5))

Poverty

We now turn our interest to an EC indicator for 2020 policy: People at risk of poverty or social exclusion. This is the percentage of population that their income is not enough to support a full inclusion in social life. Also, these people they live in poverty or their income is so low that with even small reduction they could enter poverty conditions.

Download the related dataset:

pov <- get_eurostat("t2020_50", time_format = "raw")

and examine the structure and the data:

names(pov)

## [1] "sex"    "age"    "unit"   "geo"    "time"   "values"

str(pov)

## Classes 'tbl_df', 'tbl' and 'data.frame':    1371 obs. of  6 variables:
##  $ sex   : Factor w/ 1 level "T": 1 1 1 1 1 1 1 1 1 1 ...
##  $ age   : Factor w/ 1 level "TOTAL": 1 1 1 1 1 1 1 1 1 1 ...
##  $ unit  : Factor w/ 3 levels "PC","THS_PER",..: 1 1 1 1 1 1 1 2 2 2 ...
##  $ geo   : Factor w/ 39 levels "AT","BE","DK",..: 1 2 3 4 5 6 7 1 2 3 ...
##  $ time  : chr  "2003" "2003" "2003" "2003" ...
##  $ values: num  15.7 23.4 16.9 32.9 24.4 ...

datatable(pov)

Let also see what is inside the variables:

pov %>% select(sex) %>% distinct()

## # A tibble: 1 x 1
##   sex  
##   <fct>
## 1 T

pov %>% select(age) %>% distinct()

## # A tibble: 1 x 1
##   age  
##   <fct>
## 1 TOTAL

pov %>% select(unit) %>% distinct()

## # A tibble: 3 x 1
##   unit    
##   <fct>   
## 1 PC      
## 2 THS_PER 
## 3 THS_CD08

We now compare graphically Greece, Portugal and the average values for EU28 (28 member countries):

pov %>% 
    filter(unit == 'PC' & geo %in% c('EL', 'PT', 'EU28')) %>% 
    mutate(label = if_else(time == max(time), as.character(geo), NA_character_)) %>%
    ggplot(aes(x = as.integer(time), y = values, colour = geo)) +
    geom_line(size = 1.2) +
    geom_point(size = 3) +
    geom_label_repel(aes(label = label), nudge_x = 1, na.rm = TRUE) +
    scale_color_discrete(guide = FALSE) +
    theme_wsj() +
    xlab("Time") + 
    ylab("% in risk of social exclusion") +
    theme(text = element_text(size = 16)) +
    theme(axis.title = element_text(size = 16, face = "bold"))

Unemployment

Now we turn our interest to unemployent. We consider the annual average unemlpoyment.

unemp <- get_eurostat("une_rt_a")

And we can look at the strudture of the data:

names(unemp)

## [1] "age"    "unit"   "sex"    "geo"    "time"   "values"

unemp %>% select(age) %>% distinct()

## # A tibble: 3 x 1
##   age   
##   <fct> 
## 1 TOTAL 
## 2 Y25-74
## 3 Y_LT25

unemp %>% select(unit) %>% distinct()

## # A tibble: 3 x 1
##   unit   
##   <fct>  
## 1 PC_ACT 
## 2 PC_POP 
## 3 THS_PER

unemp %>% select(sex) %>% distinct()

## # A tibble: 3 x 1
##   sex  
##   <fct>
## 1 F    
## 2 M    
## 3 T

unemp %>% select(time) %>% distinct()

## # A tibble: 35 x 1
##    time      
##    <date>    
##  1 2017-01-01
##  2 2016-01-01
##  3 2015-01-01
##  4 2014-01-01
##  5 2013-01-01
##  6 2012-01-01
##  7 2011-01-01
##  8 2010-01-01
##  9 2009-01-01
## 10 2008-01-01
## # … with 25 more rows

Timeto plot unemployment data:

unemp %>% 
    filter(sex == 'T' & age == 'TOTAL' & unit=='PC_ACT' & time>='2003-01-01' & geo %in% c('EL', 'PT')) %>% 
    mutate(label = if_else(time == max(time), as.character(geo), NA_character_)) %>%
    ggplot(aes(x = time, y = values, colour = geo)) +
    geom_line(size = 1.2) +
    geom_point(size = 3) +
    geom_label_repel(aes(label = label), nudge_x = 1, na.rm = TRUE) +
    scale_color_discrete(guide = FALSE) +
    theme_wsj() +
    xlab("Time") + 
    ylab("% unemployment") +
    theme(text = element_text(size = 16)) +
    theme(axis.title = element_text(size = 16, face = "bold"))

Join datasets

We now produce a transformed subset of the original dataset, considering only Greece, Protugal and Spain:

pov2 <- pov %>% 
    filter(unit=='PC' & time>='2004' & geo %in% c('EL', 'PT', 'ES')) %>% 
    select(geo, time, values) %>%
    mutate(time = as.integer(time)) %>% 
    rename(poverty = values)
datatable(pov2)

And we calculate first differences:

pov2_d <- pov2 %>% spread(geo, poverty) %>% 
    mutate(EL = c(NA, diff(EL)), ES = c(NA, diff(ES)), PT = c(NA, diff(PT))) %>% 
    gather(geo, d_poverty, gather_cols=c("EL", "ES", "PT"))
datatable(pov2_d)

Please notice the application of the function gather to reshape the original dataset.

We simiraly transform unemployment:

unemp2 <- unemp %>% 
    filter(sex == 'T' & age == 'TOTAL' & unit == 'PC_ACT' & 
               time >= '2004-01-01' & geo %in% c('EL', 'PT', 'ES')) %>% 
    select(geo, time, values) %>% 
    mutate(time = as.integer(year(time))) %>% 
    rename(unemployment = values) 
datatable(unemp2)

Also with first differences:

unemp2_d <- unemp2 %>% 
    spread(geo, unemployment) %>% 
    mutate(EL = c(NA, diff(EL)), ES = c(NA, diff(ES)), PT = c(NA, diff(PT))) %>% 
    gather(geo, d_unemployment, gather_cols=c("EL", "ES", "PT"))
datatable(unemp2_d)

We can easily join all datasets into one:

pov_unemp <- inner_join(pov2, unemp2, by = c("geo", "time")) %>% 
    inner_join(pov2_d, by=c("geo", "time")) %>% 
    inner_join(unemp2_d, by=c("geo", "time"))  
datatable(pov_unemp)

OLS

It seems that there is a link between unemployment and risk of social exclusion.

pov_unemp %>% 
    ggplot(aes(x = unemployment, y = poverty, shape = geo, colour = geo)) +
    geom_point(size = 4) +
    geom_smooth(method = "lm", fill = NA, size = 1.5) +
    theme_stata() +
    ylim(c(20, 40)) +
    xlab("% unemployment") + 
    ylab("% in risk of social exclusion") +
    theme(legend.title = element_blank()) +
    theme(text = element_text(size = 16)) +
    theme(axis.title = element_text(size = 16, face = "bold"))

lm(poverty ~ unemployment, data = pov_unemp)

## 
## Call:
## lm(formula = poverty ~ unemployment, data = pov_unemp)
## 
## Coefficients:
##  (Intercept)  unemployment  
##      22.2995        0.3663

lm.res <- pov_unemp %>% 
    group_by(geo) %>% 
    nest() %>% 
    mutate(model = map(data, ~lm(poverty ~ unemployment, data = .)))

classical regression

lm(poverty ~ unemployment, data = pov_unemp) %>% summary()

## 
## Call:
## lm(formula = poverty ~ unemployment, data = pov_unemp)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -4.5590 -2.0019 -0.7194  2.7788  4.7181 
## 
## Coefficients:
##              Estimate Std. Error t value Pr(>|t|)    
## (Intercept)  22.29946    1.11281  20.039  < 2e-16 ***
## unemployment  0.36627    0.06786   5.397 3.31e-06 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 2.81 on 40 degrees of freedom
## Multiple R-squared:  0.4214, Adjusted R-squared:  0.4069 
## F-statistic: 29.13 on 1 and 40 DF,  p-value: 3.315e-06

tidy regression:

lm.res <- pov_unemp %>% 
    group_by(geo) %>% 
    nest() %>% 
    mutate(model = map(data, ~lm(poverty ~ unemployment, data = .)))

Results for Greece:

lm.res$model[[1]] %>% tidy()

## # A tibble: 2 x 5
##   term         estimate std.error statistic  p.value
##   <chr>           <dbl>     <dbl>     <dbl>    <dbl>
## 1 (Intercept)    24.7      0.741       33.3 3.41e-13
## 2 unemployment    0.424    0.0404      10.5 2.15e- 7

Results for Spain:

lm.res$model[[2]] %>% tidy()

## # A tibble: 2 x 5
##   term         estimate std.error statistic  p.value
##   <chr>           <dbl>     <dbl>     <dbl>    <dbl>
## 1 (Intercept)    21.6      0.754      28.7  1.99e-12
## 2 unemployment    0.256    0.0411      6.23 4.37e- 5

Results for Portugal:

lm.res$model[[2]] %>% tidy()

## # A tibble: 2 x 5
##   term         estimate std.error statistic  p.value
##   <chr>           <dbl>     <dbl>     <dbl>    <dbl>
## 1 (Intercept)    21.6      0.754      28.7  1.99e-12
## 2 unemployment    0.256    0.0411      6.23 4.37e- 5

And a simple way to combine all results:

lm.1 <- tidy(lm.res$model[[1]]) %>% 
    mutate(country = 'Greece') %>% 
    select(country, term, estimate, std.error, p.value)
lm.2 <- tidy(lm.res$model[[2]]) %>% 
    mutate(country = 'Spain') %>% 
    select(country, term, estimate, std.error, p.value)
lm.3 <- tidy(lm.res$model[[3]]) %>% 
    mutate(country = 'Portugal') %>% 
    select(country, term, estimate, std.error, p.value)
bind_rows(lm.1, lm.2, lm.3)  %>% datatable()

We could also run the regression model on first differences.

pov_unemp %>% 
    ggplot(aes(x = d_unemployment, y = d_poverty, shape = geo, colour = geo)) +
    geom_point(size = 4) +
    geom_smooth(method = "lm", fill = NA, size = 1.5) +
    theme_stata() +
    xlab("change of unemployment") + 
    ylab("change of risk of social exclusion") +
    theme(legend.title = element_blank()) +
    theme(text = element_text(size = 16)) +
    theme(axis.title = element_text(size = 16, face = "bold"))