Gapminder Revisited

Gapminder data frame contains just 6 columns from the larger data in Gapminder World. In this part, we shall join a few dataframes with more data than the ‘gapminder’ package. Specifically, let’s look at data on

library(gapminder)
skim(gapminder)
Data summary
Name gapminder
Number of rows 1704
Number of columns 6
_______________________
Column type frequency:
factor 2
numeric 4
________________________
Group variables None

Variable type: factor

skim_variable n_missing complete_rate ordered n_unique top_counts
country 0 1 FALSE 142 Afg: 12, Alb: 12, Alg: 12, Ang: 12
continent 0 1 FALSE 5 Afr: 624, Asi: 396, Eur: 360, Ame: 300

Variable type: numeric

skim_variable n_missing complete_rate mean sd p0 p25 p50 p75 p100 hist
year 0 1 1.98e+03 1.73e+01 1952.0 1.97e+03 1.98e+03 1.99e+03 2.01e+03 ▇▅▅▅▇
lifeExp 0 1 5.95e+01 1.29e+01 23.6 4.82e+01 6.07e+01 7.08e+01 8.26e+01 ▁▆▇▇▇
pop 0 1 2.96e+07 1.06e+08 60011.0 2.79e+06 7.02e+06 1.96e+07 1.32e+09 ▇▁▁▁▁
gdpPercap 0 1 7.22e+03 9.86e+03 241.2 1.20e+03 3.53e+03 9.33e+03 1.14e+05 ▇▁▁▁▁
# load gapminder HIV data
hiv <- read_csv(here::here("data","adults_with_hiv_percent_age_15_49.csv"))
life_expectancy <- read_csv(here::here("data","life_expectancy_years.csv"))

# get World bank data using wbstats
indicators <- c("SP.DYN.TFRT.IN","SE.PRM.NENR", "SH.DYN.MORT", "NY.GDP.PCAP.KD")


library(wbstats)

worldbank_data <- wb_data(country="countries_only", #countries only- no aggregates like Latin America, Europe, etc.
                          indicator = indicators, 
                          start_date = 1960, 
                          end_date = 2016)

# get a dataframe of information regarding countries, indicators, sources, regions, indicator topics, lending types, income levels,  from the World Bank API 
countries <-  wbstats::wb_cachelist$countries

Left Join makes the most sense among the different types of join operations like outer joins - left, right, and full. This is because a left join operation (regardless of there being a match) preserves the original observations, especially when one looks up additional data from another table. Since, we are working with 3 dataframes, while mapping the year and date column with different start/end time frames, it is essential to preserve the original observations in each dataframe.

# tidying HIV data - hiv and life_expectancy dataframes - using pivot_longer() + remving NA values
hiv1 <- hiv %>%
  pivot_longer(2:34, names_to = "year", values_to = "Percentage_HIV_Cases_Age_15_49") %>% 
  drop_na()
skim(hiv1)
Data summary
Name hiv1
Number of rows 3301
Number of columns 3
_______________________
Column type frequency:
character 2
numeric 1
________________________
Group variables None

Variable type: character

skim_variable n_missing complete_rate min max empty n_unique whitespace
country 0 1 3 24 0 154 0
year 0 1 4 4 0 31 0

Variable type: numeric

skim_variable n_missing complete_rate mean sd p0 p25 p50 p75 p100 hist
Percentage_HIV_Cases_Age_15_49 0 1 1.74 4.09 0.01 0.1 0.3 1.2 26.5 ▇▁▁▁▁
life_expectancy_1 <- life_expectancy %>%
  pivot_longer(2:302, names_to = "year", values_to = "Life_Expectancy") %>% 
  drop_na()
skim(life_expectancy_1)
Data summary
Name life_expectancy_1
Number of rows 55528
Number of columns 3
_______________________
Column type frequency:
character 2
numeric 1
________________________
Group variables None

Variable type: character

skim_variable n_missing complete_rate min max empty n_unique whitespace
country 0 1 3 30 0 187 0
year 0 1 4 4 0 301 0

Variable type: numeric

skim_variable n_missing complete_rate mean sd p0 p25 p50 p75 p100 hist
Life_Expectancy 0 1 53 21.7 1.01 32.3 48.7 74.2 94.8 ▁▇▂▅▅
#Removing NA values in worldbank_data
worldbank_data_1 <- worldbank_data %>%
  drop_na()
skim(worldbank_data_1)
Data summary
Name worldbank_data_1
Number of rows 3715
Number of columns 8
_______________________
Column type frequency:
character 3
numeric 5
________________________
Group variables None

Variable type: character

skim_variable n_missing complete_rate min max empty n_unique whitespace
iso2c 0 1 2 2 0 177 0
iso3c 0 1 3 3 0 177 0
country 0 1 4 30 0 177 0

Variable type: numeric

skim_variable n_missing complete_rate mean sd p0 p25 p50 p75 p100 hist
date 0 1 1998.57 12.65 1970.00 1989.00 2001.0 2009.00 2.02e+03 ▂▃▃▆▇
NY.GDP.PCAP.KD 0 1 12893.37 18110.77 164.46 1514.19 4313.9 17178.37 1.16e+05 ▇▁▁▁▁
SE.PRM.NENR 0 1 85.08 17.09 10.05 81.05 91.7 96.43 1.00e+02 ▁▁▁▂▇
SH.DYN.MORT 0 1 52.96 59.80 2.20 10.90 26.7 76.35 3.70e+02 ▇▂▁▁▁
SP.DYN.TFRT.IN 0 1 3.35 1.82 1.08 1.83 2.7 4.82 8.45e+00 ▇▃▂▂▁
#Left Join of life_expectancy_1 (tidied life_expectancy) and hiv1 (tidied hiv) dataframes 
join_1 <- left_join(life_expectancy_1, hiv1, by = c("country"="country", "year"="year"))
# Converting datatype of column (year) from character type to numeric type
join_1$year = as.numeric(join_1$year)
#Left Join of join_1 (left join of hiv1 and life_expectancy_1 dataframes) and worldbank_data dataframes
join_2 <- left_join(join_1, worldbank_data_1, by = c("country"="country", "year"="date"))
#Left Join of join_1 (left join of hiv1 and life_expectancy_1 dataframes) and worldbank_data dataframes
join_3 <- left_join(join_2, countries, "country"="country") %>%
  drop_na()
join_3
## # A tibble: 1,083 × 25
##    country  year Life_Expectancy Percentage_HIV_Case… iso2c iso3c NY.GDP.PCAP.KD
##    <chr>   <dbl>           <dbl>                <dbl> <chr> <chr>          <dbl>
##  1 Algeria  1990            71.7                 0.06 DZ    DZA            3572.
##  2 Algeria  1991            72.2                 0.06 DZ    DZA            3444.
##  3 Algeria  1992            72.5                 0.06 DZ    DZA            3424.
##  4 Algeria  1993            72.7                 0.06 DZ    DZA            3279.
##  5 Algeria  1994            72.8                 0.06 DZ    DZA            3183.
##  6 Algeria  1995            72.9                 0.06 DZ    DZA            3241.
##  7 Algeria  1996            73.3                 0.06 DZ    DZA            3315.
##  8 Algeria  1997            73.2                 0.06 DZ    DZA            3298.
##  9 Algeria  1999            73.9                 0.06 DZ    DZA            3474.
## 10 Algeria  2000            74                   0.06 DZ    DZA            3558.
## # … with 1,073 more rows, and 18 more variables: SE.PRM.NENR <dbl>,
## #   SH.DYN.MORT <dbl>, SP.DYN.TFRT.IN <dbl>, capital_city <chr>,
## #   longitude <dbl>, latitude <dbl>, region_iso3c <chr>, region_iso2c <chr>,
## #   region <chr>, admin_region_iso3c <chr>, admin_region_iso2c <chr>,
## #   admin_region <chr>, income_level_iso3c <chr>, income_level_iso2c <chr>,
## #   income_level <chr>, lending_type_iso3c <chr>, lending_type_iso2c <chr>,
## #   lending_type <chr>

What is the relationship between HIV prevalence and life expectancy?

Based on the scatterplot with a smoothing line, it seems like region plays a role in the relationship between HIV prevalence and life expectancy. HIV Prevalence is depicted through the percentage of HIV cases in the age group 15-49 years. In the case of Sub-Saharan Africa, there is a definite pattern of a negative slope which implies that as HIV prevalence increases, there is a decrease in Life Expectancy. In Latin America too, there is a negative slope implying an inverse relationship between HIV prevalence and Life Expectancy. However, there is more variability in the plot for Sub-Saharan Africa as compared to Latin America.

#Scatterplot for Life Expectancy vs. HIV prevalence 
ggplot(join_3, aes(x = Percentage_HIV_Cases_Age_15_49, y = Life_Expectancy)) +
    geom_point(size = 0.3) +
    geom_smooth(method="lm") +
    facet_wrap(~ region) +
    labs(title = "Relationship between HIV Prevalence and Life Expectancy", 
         x = "HIV Prevalence",
         y = "Life Expectancy")

What is the relationship between fertility rate and GDP per capita?

Based on the scatterplot with a smoothing line, it seems like facetting by region solidifies the belief that all regions depict an inverse relationship between fertility rate and GDP per capita. In the case of Sub-Saharan Africa in particular, as the fertility rate increases, the GDP per capita decreases. The inverse relationship between the two variables demonstrates the connection between fertility choices and economic considerations. In general, developing or low-income countries tend to have higher levels of fertility than their developed counterparts for several reasons such as infant mortality, lack of access to contraceptives, generally lower levels of female education, etc.

ggplot(join_3, aes(x = SP.DYN.TFRT.IN, y = NY.GDP.PCAP.KD)) +
    geom_point(size = 0.5) +
    geom_smooth(method="lm") +
    facet_wrap(~ region) +
    labs(title = "Relationship between Fertility Rate and GDP per capita", 
         x = "Fertility Rate",
         y = "GDP per capita")+
  NULL

Which regions have the most observations with missing HIV data?

Region ‘Sub-Saharan Africa’ has the most observations with missing HIV data. This is followed by Europe & Central Asia with the 2nd most observations with missing HIV data.

#Tidying hiv dataframe
tidy_hiv <- hiv %>% 
  pivot_longer(cols=2:34, names_to="year", values_to = "Percentage_HIV_Cases_Age_15_49")
#Left joining tidy_hiv and countries dataframes
joined_hiv_countries <- left_join(tidy_hiv, countries, "country"= "country")
joined_hiv_countries
## # A tibble: 5,082 × 20
##    country     year  Percentage_HIV_… iso3c iso2c capital_city longitude latitude
##    <chr>       <chr>            <dbl> <chr> <chr> <chr>            <dbl>    <dbl>
##  1 Afghanistan 1979                NA AFG   AF    Kabul             69.2     34.5
##  2 Afghanistan 1980                NA AFG   AF    Kabul             69.2     34.5
##  3 Afghanistan 1981                NA AFG   AF    Kabul             69.2     34.5
##  4 Afghanistan 1982                NA AFG   AF    Kabul             69.2     34.5
##  5 Afghanistan 1983                NA AFG   AF    Kabul             69.2     34.5
##  6 Afghanistan 1984                NA AFG   AF    Kabul             69.2     34.5
##  7 Afghanistan 1985                NA AFG   AF    Kabul             69.2     34.5
##  8 Afghanistan 1986                NA AFG   AF    Kabul             69.2     34.5
##  9 Afghanistan 1987                NA AFG   AF    Kabul             69.2     34.5
## 10 Afghanistan 1988                NA AFG   AF    Kabul             69.2     34.5
## # … with 5,072 more rows, and 12 more variables: region_iso3c <chr>,
## #   region_iso2c <chr>, region <chr>, admin_region_iso3c <chr>,
## #   admin_region_iso2c <chr>, admin_region <chr>, income_level_iso3c <chr>,
## #   income_level_iso2c <chr>, income_level <chr>, lending_type_iso3c <chr>,
## #   lending_type_iso2c <chr>, lending_type <chr>
# Determining NA values in joined_hiv_countries dataframe
joined_hiv_countries %>%
  filter(!is.na(region)) %>%
  group_by(region) %>%
  summarise(missing_hiv_values=sum(is.na(Percentage_HIV_Cases_Age_15_49))) %>%
  mutate(
    region=fct_reorder(region,-missing_hiv_values)) %>%
# Plotting Bar Chart of Region Specific Missing HIV Data in Descending Order
  ggplot(aes(x=missing_hiv_values,y=region))+
  geom_col(fill = "coral")+
  labs(title="Bar Chart of Region Specific Missing HIV Data in Descending Order",
    x= "Missing NA values in HIV data",
    y= "Region"
  )

How has mortality rate for under 5 changed by region? In each region, the top 5 countries that have seen the greatest improvement, as well as those 5 countries where mortality rates have had the least improvement or even deterioration, have been determined.

#Minimum Year and Maximum Year determination
mortality <- join_3 %>% 
  group_by(country) %>%
  summarise(minimum_year=min(year), maximum_year=max(year))
mortality
## # A tibble: 92 × 3
##    country    minimum_year maximum_year
##    <chr>             <dbl>        <dbl>
##  1 Algeria            1990         2008
##  2 Angola             1998         2011
##  3 Argentina          1991         2011
##  4 Armenia            2002         2011
##  5 Azerbaijan         1991         2011
##  6 Bangladesh         1990         2010
##  7 Belarus            1995         2011
##  8 Belize             1999         2011
##  9 Benin              1984         2011
## 10 Bhutan             1998         2011
## # … with 82 more rows
# Dataframe with original mortality rates
join_5 <- left_join(join_3, mortality, "country"="country") %>% #Left Joining join_3 and mortality dataframes
select(country, year, minimum_year, maximum_year, SH.DYN.MORT, region) %>% # Selecting required columns
mutate(
  original_mortality = ifelse(year == minimum_year, SH.DYN.MORT, 0)) %>% #Determining original mortality rates
select(!year) %>%
filter(!original_mortality == 0)%>%
select(!SH.DYN.MORT)

join_5
## # A tibble: 92 × 5
##    country    minimum_year maximum_year region                     original_mortal…
##    <chr>             <dbl>        <dbl> <chr>                                 <dbl>
##  1 Algeria            1990         2008 Middle East & North Africa             49.5
##  2 Angola             1998         2011 Sub-Saharan Africa                    214. 
##  3 Argentina          1991         2011 Latin America & Caribbean              28.1
##  4 Armenia            2002         2011 Europe & Central Asia                  27.8
##  5 Azerbaijan         1991         2011 Europe & Central Asia                  95.3
##  6 Bangladesh         1990         2010 South Asia                            144. 
##  7 Belarus            1995         2011 Europe & Central Asia                  15.6
##  8 Belize             1999         2011 Latin America & Caribbean              24.4
##  9 Benin              1984         2011 Sub-Saharan Africa                    199. 
## 10 Bhutan             1998         2011 South Asia                             86.1
## # … with 82 more rows
# Dataframe with final mortality rates
join_6 <- left_join(join_3, mortality, "country"="country") %>% #Left Joining join_3 and mortality dataframes
select(country, year, minimum_year, maximum_year, SH.DYN.MORT, region) %>% # Selecting required columns
mutate(
  final_mortality = ifelse(year == maximum_year, SH.DYN.MORT, 0)) %>%  #Determining final mortality rates
select(!year) %>%
filter(!final_mortality == 0)%>%
select(!SH.DYN.MORT)

join_6
## # A tibble: 92 × 5
##    country    minimum_year maximum_year region                     final_mortality
##    <chr>             <dbl>        <dbl> <chr>                                <dbl>
##  1 Algeria            1990         2008 Middle East & North Africa            29.5
##  2 Angola             1998         2011 Sub-Saharan Africa                   112. 
##  3 Argentina          1991         2011 Latin America & Caribbean             13.9
##  4 Armenia            2002         2011 Europe & Central Asia                 17.6
##  5 Azerbaijan         1991         2011 Europe & Central Asia                 35  
##  6 Bangladesh         1990         2010 South Asia                            48.7
##  7 Belarus            1995         2011 Europe & Central Asia                  5.1
##  8 Belize             1999         2011 Latin America & Caribbean             18.3
##  9 Benin              1984         2011 Sub-Saharan Africa                   109. 
## 10 Bhutan             1998         2011 South Asia                            39.9
## # … with 82 more rows
# Joining aforementioned join_5 and join_6 dataframes along with Improvement in Mortality Rates calculation
join_7 <- left_join(join_5, join_6, "country"="country") %>%  #Left Joining join_5 and join_6
  mutate(
    mortality_improvement = (final_mortality - original_mortality)/original_mortality) %>%
  arrange(desc(mortality_improvement))

join_7
## # A tibble: 92 × 7
##    country   minimum_year maximum_year region   original_mortal… final_mortality
##    <chr>            <dbl>        <dbl> <chr>               <dbl>           <dbl>
##  1 South Af…         1991         2005 Sub-Sah…             56              79.1
##  2 Gabon             1997         1997 Sub-Sah…             87.2            87.2
##  3 Haiti             1997         1997 Latin A…            115.            115. 
##  4 South Su…         2011         2011 Sub-Sah…            102.            102. 
##  5 Sudan             2011         2011 Sub-Sah…             73.4            73.4
##  6 Lesotho           1984         2011 Sub-Sah…            101.             96.1
##  7 Zimbabwe          1998         2003 Sub-Sah…             95.6            90.6
##  8 Sao Tome…         2009         2010 Sub-Sah…             47.9            45  
##  9 Uzbekist…         2007         2008 Europe …             41.3            38.5
## 10 Ethiopia          2009         2011 Sub-Sah…             86.7            77.6
## # … with 82 more rows, and 1 more variable: mortality_improvement <dbl>
# Determining Change in Mortality Rate for Under 5 in each region
mortality_improvement_by_region <- join_7 %>%
  group_by(region) %>%
  summarise(mean_mortality_improvement_rate = 100 * mean(mortality_improvement))

mortality_improvement_by_region
## # A tibble: 6 × 2
##   region                     mean_mortality_improvement_rate
##   <chr>                                                <dbl>
## 1 East Asia & Pacific                                  -40.0
## 2 Europe & Central Asia                                -48.1
## 3 Latin America & Caribbean                            -48.9
## 4 Middle East & North Africa                           -56.5
## 5 South Asia                                           -48.0
## 6 Sub-Saharan Africa                                   -32.0
# Plot for Region-Specific Change in Mortality Rate for Under 5
ggplot(mortality_improvement_by_region, aes(x = mean_mortality_improvement_rate, y = fct_reorder(region, -mean_mortality_improvement_rate))) +
  geom_col(fill = "steelblue")+
  labs(
    title= "Region-Specific Change in Mortality Rate for Under 5",
    x= "Change in Mortality Rate for Under 5",
    y="Region"
  )

#Sub-Saharan Africa
join_7 %>% #The Top 5 countries with the greatest improvement in mortality rates
  filter(region == "Sub-Saharan Africa") %>%
  slice_max(order_by = mortality_improvement, n = 5)
## # A tibble: 5 × 7
##   country      minimum_year maximum_year region original_mortal… final_mortality
##   <chr>               <dbl>        <dbl> <chr>             <dbl>           <dbl>
## 1 South Africa         1991         2005 Sub-S…             56              79.1
## 2 Gabon                1997         1997 Sub-S…             87.2            87.2
## 3 South Sudan          2011         2011 Sub-S…            102.            102. 
## 4 Sudan                2011         2011 Sub-S…             73.4            73.4
## 5 Lesotho              1984         2011 Sub-S…            101.             96.1
## # … with 1 more variable: mortality_improvement <dbl>
join_7 %>% #The Top 5 countries with the least improvement in mortality rates
  filter(region == "Sub-Saharan Africa") %>%
  slice_max(order_by = -mortality_improvement, n = 5)
## # A tibble: 5 × 7
##   country    minimum_year maximum_year region   original_mortal… final_mortality
##   <chr>             <dbl>        <dbl> <chr>               <dbl>           <dbl>
## 1 Senegal            1981         2011 Sub-Sah…             198.            63  
## 2 Niger              1990         2011 Sub-Sah…             329.           115. 
## 3 Eritrea            1992         2011 Sub-Sah…             138.            53.2
## 4 Mozambique         1990         2011 Sub-Sah…             243.           100. 
## 5 Tanzania           1990         2010 Sub-Sah…             165.            71.9
## # … with 1 more variable: mortality_improvement <dbl>
#South Asia
join_7 %>% #The Top 5 countries with the greatest improvement in mortality rates
  filter(region == "South Asia") %>%
  slice_max(order_by = mortality_improvement, n = 5)
## # A tibble: 5 × 7
##   country   minimum_year maximum_year region     original_mortal… final_mortality
##   <chr>            <dbl>        <dbl> <chr>                 <dbl>           <dbl>
## 1 Pakistan          2002         2011 South Asia            102.             85  
## 2 Sri Lanka         2001         2011 South Asia             16              11.2
## 3 Nepal             1999         2011 South Asia             85.8            44.2
## 4 India             1990         2009 South Asia            126.             61.4
## 5 Bhutan            1998         2011 South Asia             86.1            39.9
## # … with 1 more variable: mortality_improvement <dbl>
join_7 %>% #The Top 5 countries with the least improvement in mortality rates
  filter(region == "South Asia") %>%
  slice_max(order_by = -mortality_improvement, n = 5)
## # A tibble: 5 × 7
##   country    minimum_year maximum_year region     original_mortal… final_mortality
##   <chr>             <dbl>        <dbl> <chr>                 <dbl>           <dbl>
## 1 Maldives           1997         2008 South Asia             52.8            16  
## 2 Bangladesh         1990         2010 South Asia            144.             48.7
## 3 Bhutan             1998         2011 South Asia             86.1            39.9
## 4 India              1990         2009 South Asia            126.             61.4
## 5 Nepal              1999         2011 South Asia             85.8            44.2
## # … with 1 more variable: mortality_improvement <dbl>
#Latin America & Caribbean
join_7 %>% #The Top 5 countries with the greatest improvement in mortality rates
  filter(region == "Latin America & Caribbean") %>%
  slice_max(order_by = mortality_improvement, n = 5)
## # A tibble: 5 × 7
##   country            minimum_year maximum_year region original_mortal… final_mortality
##   <chr>                     <dbl>        <dbl> <chr>             <dbl>           <dbl>
## 1 Haiti                      1997         1997 Latin…            115.            115. 
## 2 Suriname                   2005         2011 Latin…             26.5            22.5
## 3 Dominican Republic         1999         2011 Latin…             42              33.7
## 4 Belize                     1999         2011 Latin…             24.4            18.3
## 5 Costa Rica                 1990         2011 Latin…             16.9            10.2
## # … with 1 more variable: mortality_improvement <dbl>
join_7 %>% #The Top 5 countries with the least improvement in mortality rates
  filter(region == "Latin America & Caribbean") %>%
  slice_max(order_by = -mortality_improvement, n = 5)
## # A tibble: 5 × 7
##   country   minimum_year maximum_year region    original_mortal… final_mortality
##   <chr>            <dbl>        <dbl> <chr>                <dbl>           <dbl>
## 1 Ecuador           1979         2011 Latin Am…             96.1            17.5
## 2 Honduras          1979         2008 Latin Am…             99.3            25.5
## 3 Peru              1993         2011 Latin Am…             67.1            18.9
## 4 Nicaragua         1986         2010 Latin Am…             78.7            23.9
## 5 Colombia          1984         2011 Latin Am…             45.1            17.8
## # … with 1 more variable: mortality_improvement <dbl>
#Europe & Central Asia
join_7 %>% #The Top 5 countries with the greatest improvement in mortality rates
  filter(region == "Europe & Central Asia") %>%
  slice_max(order_by = mortality_improvement, n = 5)
## # A tibble: 5 × 7
##   country    minimum_year maximum_year region   original_mortal… final_mortality
##   <chr>             <dbl>        <dbl> <chr>               <dbl>           <dbl>
## 1 Uzbekistan         2007         2008 Europe …             41.3            38.5
## 2 Serbia             2005         2011 Europe …              8.9             7.4
## 3 Ukraine            2002         2011 Europe …             16.7            11.2
## 4 Armenia            2002         2011 Europe …             27.8            17.6
## 5 Bulgaria           1996         2011 Europe …             19.2            10.4
## # … with 1 more variable: mortality_improvement <dbl>
join_7 %>% #The Top 5 countries with the least improvement in mortality rates
  filter(region == "Europe & Central Asia") %>%
  slice_max(order_by = -mortality_improvement, n = 5)
## # A tibble: 5 × 7
##   country    minimum_year maximum_year region   original_mortal… final_mortality
##   <chr>             <dbl>        <dbl> <chr>               <dbl>           <dbl>
## 1 Turkey             1990         2011 Europe …             73.9            17  
## 2 Georgia            1995         2011 Europe …             45.2            13  
## 3 Belarus            1995         2011 Europe …             15.6             5.1
## 4 Azerbaijan         1991         2011 Europe …             95.3            35  
## 5 Kazakhstan         2000         2011 Europe …             42.2            18.3
## # … with 1 more variable: mortality_improvement <dbl>
#Middle East & North Africa
join_7 %>% #The Top 5 countries with the greatest improvement in mortality rates
  filter(region == "Middle East & North Africa") %>%
  slice_max(order_by = mortality_improvement, n = 5)
## # A tibble: 3 × 7
##   country minimum_year maximum_year region      original_mortal… final_mortality
##   <chr>          <dbl>        <dbl> <chr>                  <dbl>           <dbl>
## 1 Algeria         1990         2008 Middle Eas…             49.5            29.5
## 2 Morocco         1990         2011 Middle Eas…             79.1            30.4
## 3 Tunisia         1990         2011 Middle Eas…             55.3            18  
## # … with 1 more variable: mortality_improvement <dbl>
join_7 %>% #The Top 5 countries with the least improvement in mortality rates
  filter(region == "Middle East & North Africa") %>%
  slice_max(order_by = -mortality_improvement, n = 5)
## # A tibble: 3 × 7
##   country minimum_year maximum_year region      original_mortal… final_mortality
##   <chr>          <dbl>        <dbl> <chr>                  <dbl>           <dbl>
## 1 Tunisia         1990         2011 Middle Eas…             55.3            18  
## 2 Morocco         1990         2011 Middle Eas…             79.1            30.4
## 3 Algeria         1990         2008 Middle Eas…             49.5            29.5
## # … with 1 more variable: mortality_improvement <dbl>
#East Asia & Pacific
join_7 %>% #The Top 5 countries with the greatest improvement in mortality rates
  filter(region == "East Asia & Pacific") %>%
  slice_max(order_by = mortality_improvement, n = 5)
## # A tibble: 5 × 7
##   country     minimum_year maximum_year region  original_mortal… final_mortality
##   <chr>              <dbl>        <dbl> <chr>              <dbl>           <dbl>
## 1 Fiji                1992         2011 East A…             26.9            23.6
## 2 Thailand            2006         2009 East A…             16.4            14.2
## 3 Myanmar             2000         2010 East A…             89              63.4
## 4 Vietnam             1998         2011 East A…             32.8            22.6
## 5 Philippines         1992         2009 East A…             50.4            32.1
## # … with 1 more variable: mortality_improvement <dbl>
join_7 %>% #The Top 5 countries with the least improvement in mortality rates
  filter(region == "East Asia & Pacific") %>%
  slice_max(order_by = -mortality_improvement, n = 5)
## # A tibble: 5 × 7
##   country     minimum_year maximum_year region  original_mortal… final_mortality
##   <chr>              <dbl>        <dbl> <chr>              <dbl>           <dbl>
## 1 Mongolia            1995         2011 East A…             88.1            27.9
## 2 Cambodia            1997         2011 East A…            120.             40.6
## 3 Indonesia           1990         2011 East A…             84              32.6
## 4 Malaysia            1994         2011 East A…             13.9             8  
## 5 Philippines         1992         2009 East A…             50.4            32.1
## # … with 1 more variable: mortality_improvement <dbl>

Is there a relationship between primary school enrollment and fertility rate?

Yes, there seems to be a strong inverse relationship between primary school enrollment and fertility rate. We would desire to see a distribution of values like in Europe&Central Asia, where there is a very high percentage of people completing primary education and the higher this percentage, the lower the mortality rate. We can observe a similar pattern for Latin America & Caribbean. In South Asia, the countries have very diverse percentages of population having completed primary education, as does Sub-Saharan Africa. In the latter and in East Asia&Pacific, we also observe an inverse trend: we seem to find an increased mortality rate for countries with higher percentage of population having completed primary education.

ggplot(join_3, aes(x = SE.PRM.NENR, y = SP.DYN.TFRT.IN)) +
    geom_point() +
    geom_smooth(method="lm") +
    labs(title = "Relationship between Primary School Enrollment and Fertility Rate", 
         x = "Primary School Enrollment",
         y = "Fertility Rate")

# GDP components over time and among countries!

UN_GDP_data  <-  read_excel(here::here("data", "Download-GDPconstant-USD-countries.xls"), # Excel filename
                sheet="Download-GDPconstant-USD-countr", # Sheet name
                skip=2) # Number of rows to skip
# Tidying data - Making data long and expressing all figures in billions
tidy_GDP_data_1 <- UN_GDP_data %>%
  pivot_longer(4:51, names_to = "year", values_to = "indicator_data") %>%
  mutate(indicator_data = indicator_data / 1000000000)
# Renaming indicators into shorter counterparts
tidy_GDP_data_1 <- tidy_GDP_data_1
tidy_GDP_data_1$IndicatorName[tidy_GDP_data_1$IndicatorName == "Exports of goods and services"] <-  "Exports"
tidy_GDP_data_1$IndicatorName[tidy_GDP_data_1$IndicatorName == "General government final consumption expenditure"] <-  "Government expenditure" 
tidy_GDP_data_1$IndicatorName[tidy_GDP_data_1$IndicatorName == "Household consumption expenditure (including Non-profit institutions serving households)"] <-  "Household expenditure" 
tidy_GDP_data_1$IndicatorName[tidy_GDP_data_1$IndicatorName == "Imports of goods and services"] <-  "Imports"

glimpse(tidy_GDP_data_1)
## Rows: 176,880
## Columns: 5
## $ CountryID      <dbl> 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4…
## $ Country        <chr> "Afghanistan", "Afghanistan", "Afghanistan", "Afghanist…
## $ IndicatorName  <chr> "Final consumption expenditure", "Final consumption exp…
## $ year           <chr> "1970", "1971", "1972", "1973", "1974", "1975", "1976",…
## $ indicator_data <dbl> 5.56, 5.33, 5.20, 5.75, 6.15, 6.32, 6.37, 6.90, 7.09, 6…
# Let us compare GDP components for these 3 countries
country_list <- c("United States","India", "Germany")
# Required Indicators names for graph
list_indicator_name = c("Gross capital formation","Exports", "Government expenditure", "Household expenditure", "Imports")
tidy_GDP_data_1 %>%
  filter(Country%in% country_list) %>% #Filtering by aforementioned country_list
  filter(IndicatorName %in% list_indicator_name ) %>% #Filtering by list_indicator_name
  group_by(IndicatorName) %>%  
  # Line plot for GDP over time graph
  ggplot(aes(x = year, y = indicator_data, color = IndicatorName, group = IndicatorName)) +
  geom_line(aes(x = year, y = indicator_data, color = IndicatorName)) +
  facet_wrap(~ Country) +
  theme_bw() +
  theme(legend.position="right") + 
   scale_x_discrete(breaks = seq(1970, 2017, 10)) +
   scale_color_discrete("Components of GDP", breaks = c("Gross capital formation","Exports", "Government expenditure", "Household expenditure", "Imports")) + 
  labs(title = "GDP Components over time", subtitle = "In constant 2010 USD", x = "", y = "Billion US$") +
  theme(plot.title = element_text(face="bold")) 

Secondly, GDP is the sum of Household Expenditure (Consumption C), Gross Capital Formation (business investment I), Government Expenditure (G) and Net Exports (exports - imports). Even though there is an indicator Gross Domestic Product (GDP) in your dataframe, let’s calculate it given its components discussed above.

#converting to tidy wide format 
tidy_GDP_data_2 <- tidy_GDP_data_1%>%
  pivot_wider(
    names_from = IndicatorName, 
    values_from = indicator_data)
glimpse(tidy_GDP_data_2)
## Rows: 10,560
## Columns: 20
## $ CountryID                                                                            <dbl> …
## $ Country                                                                              <chr> …
## $ year                                                                                 <chr> …
## $ `Final consumption expenditure`                                                      <dbl> …
## $ `Household expenditure`                                                              <dbl> …
## $ `Government expenditure`                                                             <dbl> …
## $ `Gross capital formation`                                                            <dbl> …
## $ `Gross fixed capital formation (including Acquisitions less disposals of valuables)` <dbl> …
## $ Exports                                                                              <dbl> …
## $ Imports                                                                              <dbl> …
## $ `Gross Domestic Product (GDP)`                                                       <dbl> …
## $ `Agriculture, hunting, forestry, fishing (ISIC A-B)`                                 <dbl> …
## $ `Mining, Manufacturing, Utilities (ISIC C-E)`                                        <dbl> …
## $ `Manufacturing (ISIC D)`                                                             <dbl> …
## $ `Construction (ISIC F)`                                                              <dbl> …
## $ `Wholesale, retail trade, restaurants and hotels (ISIC G-H)`                         <dbl> …
## $ `Transport, storage and communication (ISIC I)`                                      <dbl> …
## $ `Other Activities (ISIC J-P)`                                                        <dbl> …
## $ `Total Value Added`                                                                  <dbl> …
## $ `Changes in inventories`                                                             <dbl> …
# Using GDP formula to calculate GDP using GDP components - the sum of Household Expenditure (Consumption *C*), Gross Capital Formation (business investment *I*), Government Expenditure (G) and Net Exports (exports - imports)
tidy_GDP_data_2 <- tidy_GDP_data_2 %>%
  mutate(calculated_GDP = tidy_GDP_data_2$`Household expenditure`+ tidy_GDP_data_2$`Gross capital formation`+ tidy_GDP_data_2$`Government expenditure`+ tidy_GDP_data_2$`Exports`- tidy_GDP_data_2$`Imports`)
#Calculating % difference between calculated GDP and UN_GDP_data GDP Figures
tidy_GDP_data_2 <- tidy_GDP_data_2 %>%
  mutate(percentage_difference_calculatedGDP_columnGDP = (((calculated_GDP / tidy_GDP_data_2$`Gross Domestic Product (GDP)`)-1) * 100))

tidy_GDP_data_2
## # A tibble: 10,560 × 22
##    CountryID Country     year  `Final consumpt… `Household expe… `Government exp…
##        <dbl> <chr>       <chr>            <dbl>            <dbl>            <dbl>
##  1         4 Afghanistan 1970              5.56             5.07            0.372
##  2         4 Afghanistan 1971              5.33             4.84            0.382
##  3         4 Afghanistan 1972              5.20             4.70            0.402
##  4         4 Afghanistan 1973              5.75             5.21            0.421
##  5         4 Afghanistan 1974              6.15             5.59            0.431
##  6         4 Afghanistan 1975              6.32             5.65            0.598
##  7         4 Afghanistan 1976              6.37             5.68            0.627
##  8         4 Afghanistan 1977              6.90             6.15            0.676
##  9         4 Afghanistan 1978              7.09             6.30            0.725
## 10         4 Afghanistan 1979              6.92             6.15            0.708
## # … with 10,550 more rows, and 16 more variables:
## #   Gross capital formation <dbl>,
## #   Gross fixed capital formation (including Acquisitions less disposals of valuables) <dbl>,
## #   Exports <dbl>, Imports <dbl>, Gross Domestic Product (GDP) <dbl>,
## #   Agriculture, hunting, forestry, fishing (ISIC A-B) <dbl>,
## #   Mining, Manufacturing, Utilities (ISIC C-E) <dbl>,
## #   Manufacturing (ISIC D) <dbl>, Construction (ISIC F) <dbl>, …
# Plotting a graph, utilizing Percentage Difference of Calculated GDP and UN_GDP_data GDP Figures to compare this difference across Germany, India, and United States
tidy_GDP_data_2 %>%
  filter(Country %in% country_list) %>%
  ggplot(aes(x=year, y=percentage_difference_calculatedGDP_columnGDP)) +
  geom_line(group = 1, color = "black", size = 0.8) + geom_line(group = 1,color = "black",y=0, size = 0.8) +
  facet_wrap(~ Country) +
  theme_bw() +
  scale_x_discrete(breaks = seq(1970, 2017, 10)) +
  geom_ribbon(aes(ymin = 0, ymax = pmin(0, percentage_difference_calculatedGDP_columnGDP), group=1), alpha=0.8, fill = "blue") +
  geom_ribbon(aes(ymin = percentage_difference_calculatedGDP_columnGDP, ymax = pmin(0, percentage_difference_calculatedGDP_columnGDP), group=1), alpha=0.8, fill = "yellow") +
  labs(title = "Percentage Difference of Calculated GDP and UN_GDP_data GDP Figures",
       subtitle = "Yellow: Calculated GDP > UN_GDP_data GDP 
Blue: Calculated GDP < UN_GDP_data GDP",
         x = "Time period (1970-2017)", 
         y = "Percentage Difference", 
        )+
  NULL

What is the % difference between calculated GDP and the GDP figure included in the dataframe? In the case of India, the period 1970-1990, the graph depicts a yellow region wherein calculated GDP is higher than the UN_GDP_data GDP figures. However, post 1990 (excluding 2007 and 2010), the graph depicts a blue region wherein calculated GDP is lower than the UN_GDP_data GDP figures. In 2007, the % difference between calculated GDP and the GDP figure is 2.1309% and in 2010, the % difference between calculated GDP and the GDP figure is 1.0660%

In the case of Germany, the graph depicts a yellow region throughout from 1970 to 2017 wherein calculated GDP is higher than the UN_GDP_data GDP figures.The % difference between calculated GDP and the GDP figure was maximum in 1972 at 3.56%. In the United States, the graph depicts a mostly yellow region i.e.calculated GDP is higher than the UN_GDP_data GDP figures, excluding the time period from 2005-2008, 2010, and 2016-2017 which depicted a blue region i.e. calculated GDP is lower than the UN_GDP_data GDP figures.