The Grammar of Data Wrangling

October 1, 2024

filter()
select()
mutate()

Downloading V-Dem Data


The vdem object from the vdemdata package contains the full V-Dem dataset. Try running this code chunk. What do you see in democracy?


library(vdemdata) # load the V-Dem package

# vdem is a dataset that ships with vdemdata, not a function, so it is
# referenced without parentheses (the later chunks pipe it directly: vdem |> ...)
democracy <- vdem # store the full V-Dem dataset in democracy
02:00

filter()

  • Run this code. What do you see?
  • Try changing the year
  • For one year, use == instead of >=
  • Or try <= and see what happens


# Start from the full V-Dem data and keep only 1990 and later
democracy <- vdem |>
  filter(year >= 1990) # drops every row whose year is before 1990

# Take a quick look at the filtered data
glimpse(democracy)
02:00

= versus ==


  • = is used to assign values to variables, just like <-
  • == is used to test if two values are equal to each other
  • So filter(year == 1990) will give you just the observations for 1990

>= and <=

  • >= is used to test if a value is greater than or equal to another value
  • <= is used to test if a value is less than or equal to another value
  • So filter(year >= 1990) will give you the observations for 1990 and later
  • And filter(year <= 1990) will give you the observations for 1990 and earlier

select()

  • Run this code. What do you see?
  • Now try v2x_libdem instead of v2x_polyarchy
  • Choose more from the codebook
# Keep four columns from the V-Dem data, renaming three of them.
# Inside select(), the pattern is new_name = old_name.
democracy <- vdem |>
  select(
    country = country_name,    # country_name becomes country
    vdem_ctry_id = country_id, # country_id becomes vdem_ctry_id
    year,                      # kept under its original name
    polyarchy = v2x_polyarchy  # v2x_polyarchy becomes polyarchy
  )

# Preview the result
glimpse(democracy)
02:00

mutate()

  • Modify the code to create a new variable that is three times the value of polyarchy
  • How about polyarchy squared?
# Build a 2015 subset, then add a column derived from polyarchy
democracy <- vdem |>
  # keep only observations from 2015
  filter(year == 2015) |>
  # select (and rename) variables; inside select(), new_name = old_name
  select(
    country = country_name,
    vdem_ctry_id = country_id,
    year,
    polyarchy = v2x_polyarchy
  ) |>
  # create polyarchy_dbl, equal to two times polyarchy
  mutate(polyarchy_dbl = polyarchy * 2)

# Preview the result
glimpse(democracy)
02:00

Some Common Arithmetic Operators


  • + addition
  • - subtraction
  • * multiplication
  • / division
  • ^ exponentiation (also **)

vdemdata Example


# Load packages
library(vdemdata) # to download V-Dem data
library(dplyr)

# Prepare the data
democracy <- vdem |>
  # filter: keep only the year 2015
  filter(year == 2015) |>
  # select (and rename) variables; the name before the = sign is the new
  # name and the name after the = sign is the old name
  select(
    country = country_name,
    vdem_ctry_id = country_id,
    year,
    polyarchy = v2x_polyarchy,
    gdp_pc = e_gdppc,
    region = e_regionpol_6C
  ) |>
  # recode: replace the numeric region codes with region names
  mutate(
    region = case_match(
      region,
      1 ~ "Eastern Europe",
      2 ~ "Latin America",
      3 ~ "Middle East",
      4 ~ "Africa",
      5 ~ "The West",
      6 ~ "Asia"
    )
  )

# View the data
glimpse(democracy)


Use filter() to select years…


# Prepare the data
democracy <- vdem |>
  filter(year == 2015) |> # keep only the rows where year is 2015
  select(
    country = country_name,
    vdem_ctry_id = country_id,
    year,
    polyarchy = v2x_polyarchy,
    gdp_pc = e_gdppc,
    region = e_regionpol_6C
  ) |>
  mutate(
    region = case_match(
      region,
      1 ~ "Eastern Europe",
      2 ~ "Latin America",
      3 ~ "Middle East",
      4 ~ "Africa",
      5 ~ "The West",
      6 ~ "Asia"
    )
  )


Use select() to choose variables…


# Prepare the data
democracy <- vdem |>
  filter(year == 2015) |>
  # select (and rename) these variables:
  # the name before the = sign is the new name,
  # the name after the = sign is the old name
  select(
    country = country_name,
    vdem_ctry_id = country_id,
    year,
    polyarchy = v2x_polyarchy,
    gdp_pc = e_gdppc,
    region = e_regionpol_6C
  ) |>
  mutate(
    region = case_match(
      region,
      1 ~ "Eastern Europe",
      2 ~ "Latin America",
      3 ~ "Middle East",
      4 ~ "Africa",
      5 ~ "The West",
      6 ~ "Asia"
    )
  )


Use mutate() with case_match() to Recode Region…


# Prepare the data
democracy <- vdem |>
  filter(year == 2015) |>
  select(
    country = country_name,
    vdem_ctry_id = country_id,
    year,
    polyarchy = v2x_polyarchy,
    gdp_pc = e_gdppc,
    region = e_regionpol_6C
  ) |>
  mutate(
    # replace the numeric codes in region with region names:
    # the number on the left of the ~ is the V-Dem region code,
    # the text on the right of the ~ is the region name that replaces it
    region = case_match(
      region,
      1 ~ "Eastern Europe",
      2 ~ "Latin America",
      3 ~ "Middle East",
      4 ~ "Africa",
      5 ~ "The West",
      6 ~ "Asia"
    )
  )

Your Turn!


Have a look at the V-Dem codebook


  • filter the data for the year 2010
  • select a different democracy indicator
  • use a different region variable (e.g., e_regionpol_7C)

group_by()
summarize()
arrange()

Group, Summarize and Arrange


  • group_by(), summarize(), arrange()
  • A very common sequence in data science:
    • Take an average or some other statistic for a group
    • Rank from high to low values of summary value

Example


# group_by(), summarize() and arrange()
# The result is printed rather than saved; assign it with <- to keep it.
# NOTE(review): libdem and gender are not created by the earlier chunks
# shown in these slides -- confirm that democracy contains them.
democracy |>
  # group the data by region
  group_by(region) |>
  # summarize the following variables (by region), removing NAs first
  summarize(
    polyarchy_mean = mean(polyarchy, na.rm = TRUE), # mean of polyarchy
    libdem_median = median(libdem, na.rm = TRUE),   # median of libdem
    gender = sd(gender, na.rm = TRUE),              # std. dev. of gender
    gdp_pc = min(gdp_pc, na.rm = TRUE)              # minimum of gdp_pc
  ) |>
  # arrange in descending order by mean polyarchy score
  arrange(desc(polyarchy_mean))


Use across() to Apply Same Function to Multiple Columns


# Compute the regional mean of several columns in one step with across().
# na.rm = TRUE is supplied inside an anonymous function because passing extra
# arguments through across()'s ... is deprecated as of dplyr 1.1.0.
dem_women |>
  group_by(region) |>
  summarize(
    across(
      c(polyarchy, libdem, women_rep, flfp), # apply to these columns
      \(x) mean(x, na.rm = TRUE),            # take the mean after removing NAs
      .names = "mean_{col}"                  # prefix each new name with mean_
    )
  ) |>
  arrange(desc(mean_polyarchy)) # sort from highest to lowest mean polyarchy

Try it Yourself

Now try grouping by country instead of region and filter for years >= 2000.

  1. What is the median value of polyarchy for Sweden?
  2. What is the max value of libdem for New Zealand?
  3. What is the standard deviation of gender for Norway?
  4. What is the min of gdp_pc for Germany?

Try using across() to calculate the mean of polyarchy, libdem, gender and gdp_pc for each country.