7 Business demographics

Business demographics in EU.

Libraries we will need:

library(tidyverse)
library(eurostat)
library(scales)
library(leaflet)
library(sf)
library(ggrepel)
library(ggthemes)
library(wesanderson)

7.1 Romania as one country example

7.1.1 Business demographics data at NUTS3 level.

Download the bd_size_r3 dataset which contains business demographics data at NUTS-3 level.

bd_size_r3 <- get_eurostat("bd_size_r3", 
                   stringsAsFactors = FALSE, 
                   time_format = "num")

The dataset contains the following variables:

names(bd_size_r3)

## [1] "indic_sb" "sizeclas" "nace_r2"  "geo"      "time"     "values"

Pay attention and the domain of values that each variable has:

bd_size_r3 %>% distinct(indic_sb)

## # A tibble: 28 × 1
##    indic_sb
##    <chr>   
##  1 V11910  
##  2 V11920  
##  3 V11930  
##  4 V11943  
##  5 V16910  
##  6 V16911  
##  7 V16920  
##  8 V16921  
##  9 V16930  
## 10 V16931  
## # … with 18 more rows

bd_size_r3 %>% distinct(sizeclas)

## # A tibble: 4 × 1
##   sizeclas
##   <chr>   
## 1 0       
## 2 1-9     
## 3 GE10    
## 4 TOTAL

bd_size_r3 %>% distinct(nace_r2)

## # A tibble: 1 × 1
##   nace_r2   
##   <chr>     
## 1 B-S_X_K642

For example, if we need to see the number of enterprises born in the during in 2015 and survived to 2016 in NUTS3 of Romania we need to write:

bd_size_r3 %>% 
  filter(str_sub(geo, 1, 2) == "RO") %>% 
  filter(nchar(geo) == 5) %>% 
  filter(time == 2016) %>% 
  filter(indic_sb == 'V11920')  %>% 
  filter(sizeclas == 'TOTAL')

## # A tibble: 42 × 6
##    indic_sb sizeclas nace_r2    geo    time values
##    <chr>    <chr>    <chr>      <chr> <dbl>  <dbl>
##  1 V11920   TOTAL    B-S_X_K642 RO111  2016   3138
##  2 V11920   TOTAL    B-S_X_K642 RO112  2016   1097
##  3 V11920   TOTAL    B-S_X_K642 RO113  2016   5114
##  4 V11920   TOTAL    B-S_X_K642 RO114  2016   1952
##  5 V11920   TOTAL    B-S_X_K642 RO115  2016   1240
##  6 V11920   TOTAL    B-S_X_K642 RO116  2016   1031
##  7 V11920   TOTAL    B-S_X_K642 RO121  2016   1464
##  8 V11920   TOTAL    B-S_X_K642 RO122  2016   3119
##  9 V11920   TOTAL    B-S_X_K642 RO123  2016    672
## 10 V11920   TOTAL    B-S_X_K642 RO124  2016   1053
## # … with 32 more rows

7.1.2 NUTS3 geospatial data

NUTS3 level geospational data can be retrieved and plotted as follows.

Get the dataset:

SHP3 <- get_eurostat_geospatial(nuts_level = 3,
                                resolution = 10, 
                                year = 2016)

We can restrict the dataset to one country, for example Romania:

SHP3_cntr <- SHP3 %>% 
  filter(CNTR_CODE == "RO")

And we can see the following map containing the administrative borders of NUTS-3 regions of Romania:

leaflet() %>% 
  addPolygons(data = SHP3_cntr)

Or, with little more configuration:

leaflet() %>% 
  addTiles() %>% 
  addPolygons(data = SHP3_cntr,
              weight = 1,
              color = "black",
              fillColor = "blue",
              fillOpacity = 0.2,
              popup = paste(SHP3_cntr$NUTS_ID, SHP3_cntr$NUTS_NAME))

7.1.3 Creating a choropleth map of business demographics in Romania

Now we will focus more on Romania. We can filter geo data to Romania, for example:

SHP3_cntr <- SHP3 %>% 
  filter(CNTR_CODE == 'RO')

However, in order to have the code more flexible, it is always better to have parametrised lie:

CNTR <- 'RO'
SHP3_cntr <- SHP3 %>% 
  filter(CNTR_CODE == CNTR)

Even better, we will use the in operator. This will allow us to easily modify the code in order to have run for more than just one country.

CNTR <- c('RO')
SHP3_cntr <- SHP3 %>% 
  filter(CNTR_CODE %in% CNTR)

Now we have to select a proper subset of the BD dataset. We will first focus on year 2016 data for the specific country (Romania):

bd_size_r3 %>% 
  filter(time == 2016)

## # A tibble: 95,854 × 6
##    indic_sb sizeclas nace_r2    geo    time values
##    <chr>    <chr>    <chr>      <chr> <dbl>  <dbl>
##  1 V11910   0        B-S_X_K642 AT     2016 311599
##  2 V11910   0        B-S_X_K642 AT1    2016 144889
##  3 V11910   0        B-S_X_K642 AT11   2016  10843
##  4 V11910   0        B-S_X_K642 AT111  2016   1299
##  5 V11910   0        B-S_X_K642 AT112  2016   6084
##  6 V11910   0        B-S_X_K642 AT113  2016   3460
##  7 V11910   0        B-S_X_K642 AT12   2016  59411
##  8 V11910   0        B-S_X_K642 AT121  2016   7535
##  9 V11910   0        B-S_X_K642 AT122  2016   8858
## 10 V11910   0        B-S_X_K642 AT123  2016   5164
## # … with 95,844 more rows

Let us consider the GE10 size class of enterprises, thus those that they have at least 10 employees:

bd_size_r3 %>% 
  filter(time == 2016) %>% 
  filter(sizeclas == 'GE10')

## # A tibble: 24,282 × 6
##    indic_sb sizeclas nace_r2    geo    time values
##    <chr>    <chr>    <chr>      <chr> <dbl>  <dbl>
##  1 V11910   GE10     B-S_X_K642 AT     2016  43655
##  2 V11910   GE10     B-S_X_K642 AT1    2016  18029
##  3 V11910   GE10     B-S_X_K642 AT11   2016   1277
##  4 V11910   GE10     B-S_X_K642 AT111  2016    174
##  5 V11910   GE10     B-S_X_K642 AT112  2016    719
##  6 V11910   GE10     B-S_X_K642 AT113  2016    384
##  7 V11910   GE10     B-S_X_K642 AT12   2016   6920
##  8 V11910   GE10     B-S_X_K642 AT121  2016   1059
##  9 V11910   GE10     B-S_X_K642 AT122  2016   1033
## 10 V11910   GE10     B-S_X_K642 AT123  2016    665
## # … with 24,272 more rows

And we will consider now the number of enterprises opened three years before the year of reference and survided until the year of reference. This id the index V11943:

bd_size_r3 %>% 
  filter(time == 2016) %>% 
  filter(sizeclas == 'GE10') %>% 
  filter(indic_sb == 'V11943')

## # A tibble: 912 × 6
##    indic_sb sizeclas nace_r2    geo    time values
##    <chr>    <chr>    <chr>      <chr> <dbl>  <dbl>
##  1 V11943   GE10     B-S_X_K642 AT     2016    480
##  2 V11943   GE10     B-S_X_K642 AT1    2016    203
##  3 V11943   GE10     B-S_X_K642 AT11   2016      8
##  4 V11943   GE10     B-S_X_K642 AT111  2016      0
##  5 V11943   GE10     B-S_X_K642 AT112  2016      5
##  6 V11943   GE10     B-S_X_K642 AT113  2016      3
##  7 V11943   GE10     B-S_X_K642 AT12   2016     54
##  8 V11943   GE10     B-S_X_K642 AT121  2016      1
##  9 V11943   GE10     B-S_X_K642 AT122  2016      7
## 10 V11943   GE10     B-S_X_K642 AT123  2016      6
## # … with 902 more rows

And now the difficult part of the filtering, selecting the NUTS-3 regions of Romania. Why the difficulty? Look at the data, top rows is about Austria. We see values in the geo column like AT, AT1, AT11, AT111, AT112, etc. This is AT for the whole coutnry (NUTS-0), AT1 for the NUTS-1 region, AT11 for the NUTS-2 region and AT111 for the NUTS-3 region, etc. So, in order to filter for only NUTS-3 regions of a country we need a trick similar to:

bd_size_r3 %>% 
  filter(time == 2016) %>% 
  filter(sizeclas == 'GE10') %>% 
  filter(indic_sb == 'V11943') %>% 
  filter(str_sub(geo, 1, 2) %in% CNTR & str_length(geo) == 5)

## # A tibble: 42 × 6
##    indic_sb sizeclas nace_r2    geo    time values
##    <chr>    <chr>    <chr>      <chr> <dbl>  <dbl>
##  1 V11943   GE10     B-S_X_K642 RO111  2016     50
##  2 V11943   GE10     B-S_X_K642 RO112  2016      9
##  3 V11943   GE10     B-S_X_K642 RO113  2016     73
##  4 V11943   GE10     B-S_X_K642 RO114  2016     35
##  5 V11943   GE10     B-S_X_K642 RO115  2016     19
##  6 V11943   GE10     B-S_X_K642 RO116  2016      9
##  7 V11943   GE10     B-S_X_K642 RO121  2016     18
##  8 V11943   GE10     B-S_X_K642 RO122  2016     51
##  9 V11943   GE10     B-S_X_K642 RO123  2016      8
## 10 V11943   GE10     B-S_X_K642 RO124  2016      8
## # … with 32 more rows

which translates to: geo contains 5 characters and the first two are equal to CNTR (RO in this case).

We can save the results in a variable:

BD_cntr <- bd_size_r3 %>% 
  filter(time == 2016) %>% 
  filter(sizeclas == 'GE10') %>% 
  filter(indic_sb == 'V11943') %>% 
  filter(str_sub(geo, 1, 2) %in% CNTR & str_length(geo) == 5)

In order to have the data displayed on a map, we need to join them with the SHP3_cntr data:

inner_join(SHP3_cntr, BD_cntr, by = ("NUTS_ID" = "geo")) %>% 
  select(NUTS_ID, NUTS_NAME, values, geometry)

## Simple feature collection with 42 features and 3 fields
## Geometry type: MULTIPOLYGON
## Dimension:     XY
## Bounding box:  xmin: 20.2643 ymin: 43.62289 xmax: 29.67964 ymax: 48.25975
## Geodetic CRS:  WGS 84
## First 10 features:
##    NUTS_ID NUTS_NAME values                       geometry
## 1    RO126     Sibiu     33 MULTIPOLYGON (((24.94655 46...
## 2    RO211     Bacău     29 MULTIPOLYGON (((27.20622 46...
## 3    RO212  Botoşani      9 MULTIPOLYGON (((27.39117 47...
## 4    RO213      Iaşi     37 MULTIPOLYGON (((28.11152 46...
## 5    RO214     Neamţ      8 MULTIPOLYGON (((26.50779 47...
## 6    RO215   Suceava     21 MULTIPOLYGON (((26.70057 47...
## 7    RO216    Vaslui     14 MULTIPOLYGON (((28.11152 46...
## 8    RO221    Brăila     14 MULTIPOLYGON (((28.03052 45...
## 9    RO222     Buzău     12 MULTIPOLYGON (((27.43473 45...
## 10   RO223 Constanţa     70 MULTIPOLYGON (((28.99409 44...

In leaflet this will work very fine. If we plot in ggplot then we will need an extra modification of the dataset:

inner_join(SHP3_cntr, BD_cntr, by = ("NUTS_ID" = "geo")) %>% 
  select(NUTS_ID, NUTS_NAME, values, geometry) %>% 
  st_as_sf()

## Simple feature collection with 42 features and 3 fields
## Geometry type: MULTIPOLYGON
## Dimension:     XY
## Bounding box:  xmin: 20.2643 ymin: 43.62289 xmax: 29.67964 ymax: 48.25975
## Geodetic CRS:  WGS 84
## First 10 features:
##    NUTS_ID NUTS_NAME values                       geometry
## 1    RO126     Sibiu     33 MULTIPOLYGON (((24.94655 46...
## 2    RO211     Bacău     29 MULTIPOLYGON (((27.20622 46...
## 3    RO212  Botoşani      9 MULTIPOLYGON (((27.39117 47...
## 4    RO213      Iaşi     37 MULTIPOLYGON (((28.11152 46...
## 5    RO214     Neamţ      8 MULTIPOLYGON (((26.50779 47...
## 6    RO215   Suceava     21 MULTIPOLYGON (((26.70057 47...
## 7    RO216    Vaslui     14 MULTIPOLYGON (((28.11152 46...
## 8    RO221    Brăila     14 MULTIPOLYGON (((28.03052 45...
## 9    RO222     Buzău     12 MULTIPOLYGON (((27.43473 45...
## 10   RO223 Constanţa     70 MULTIPOLYGON (((28.99409 44...

So, we can save the joined datasets into a new variable:

DF <- inner_join(SHP3_cntr, BD_cntr, by = ("NUTS_ID" = "geo")) %>% 
  select(NUTS_ID, NUTS_NAME, values, geometry) %>% 
  st_as_sf()

Last step is to create a color palette for the values we want to represent. A good first try is something like this:

myPal  <- colorNumeric(palette = "Spectral", domain = DF$values, na.color = "transparent")

And here is the plot:

leaflet(DF) %>% 
  addPolygons(weight = 1, 
              fillColor = ~myPal(values),
              fillOpacity = 0.7)

One think we can do here to improve the graph is to exclude the capital, Bucurest. Obsiously due to its much bigger economic actvity Bucurest makes everything else to look inactive. So:

DF <- DF %>% 
  mutate(values = ifelse(NUTS_ID != "RO321", values, NA))

And we will re-assign colors:

myPal  <- colorNumeric(palette = "Spectral", domain = DF$values, na.color = "transparent")

Also, we can label the regions with a popup window:

LABELS <- paste(DF$NUTS_NAME, DF$values)

And now we have:

leaflet(DF) %>% 
  addPolygons(weight = 1, 
              fillColor = ~myPal(values),
              fillOpacity = 0.7, 
              popup = LABELS)

And some improvements. We will modify the center (close to Brasov) of the view and the zoom of the map. We will also add a legend about colors and corresponding values. This is definetely needed in all choropleth maps.

leaflet(DF) %>%
  setView(lat = 45.5, lng = 25.5, zoom = 6) %>% 
  addPolygons(weight = 1,
              color = "white",
              fillColor = ~myPal(values),
              fillOpacity = 0.7, 
              label = LABELS) %>% 
  addLegend("topright", pal = myPal, values = ~values,
    title = "",
    opacity = 0.7)

Changing the color palette. We can also use a monochrome palette. For example we color all regions with Green color.

myPal  <- colorNumeric(palette = "Greens", domain = DF$values, na.color = "transparent")
leaflet(DF) %>%
  setView(lat = 45.5, lng = 25.5, zoom = 6) %>% 
  addPolygons(weight = 1,
              color = "white",
              fillColor = ~myPal(values),
              fillOpacity = 0.7, 
              label = LABELS) %>% 
  addLegend("topright", pal = myPal, values = ~values,
    title = "",
    opacity = 0.7)

So, the darker the color the bigger the value.

7.1.4 Adding controls for layers

let’s show a map about percentages of enterprises based on size class.

Step 1:

DB_cntr_s3 <- bd_size_r3 %>% 
  filter(str_sub(geo, 1, 2) %in% CNTR ) %>% 
  filter(str_length(geo) == 5) %>%
  filter(time == 2016) %>%
  filter(indic_sb == 'V11943') %>% 
  pivot_wider(id_cols = c(geo),
              names_from = sizeclas,
              names_prefix = "S_",
              values_from = values,
              ) %>% 
  mutate(p_0 = round(100*S_0/S_TOTAL, 1),
         p_1 = round(100*`S_1-9`/S_TOTAL, 1),
         p_2 = round(100*S_GE10/S_TOTAL, 1)) %>%
  select(geo, p_0, p_1, p_2)

Step 2:

DF <- inner_join(SHP3_cntr, DB_cntr_s3, by = ("NUTS_ID" = "geo")) %>% 
  select(NUTS_ID, NUTS_NAME, p_0, p_1, p_2, geometry)

Step 3:

myPal_0  <- colorNumeric(palette = "Spectral", domain = DF$p_0, na.color = "transparent")
myPal_1  <- colorNumeric(palette = "Spectral", domain = DF$p_1, na.color = "transparent")
myPal_2  <- colorNumeric(palette = "Spectral", domain = DF$p_2, na.color = "transparent")

Step 4:

leaflet(DF) %>% 
  setView(lat = 45.5, lng = 25.5 , zoom = 6) %>%
  addPolygons(weight = 1, 
              color = "white",
              fillOpacity = 0.5, 
              fillColor = ~myPal_0(p_0),
              label = LABELS,
              group = "s = 0") %>% 
  addPolygons(weight = 1, 
              color = "white",
              fillOpacity = 0.5, 
              fillColor = ~myPal_1(p_1),
              label = LABELS,
              group = "s = 1-9") %>%
  addPolygons(weight = 1, 
              color = "white",
              fillOpacity = 0.5, 
              fillColor = ~myPal_2(p_2),
              label = LABELS,
              group = "s >= 10") %>% 
  addLayersControl(baseGroups = c("s = 0", "s = 1-9", "s >= 10"),
                   options = layersControlOptions(collapsed = FALSE))