dplyr

Preparation

library(tidyverse)
library(nycflights13)

Caution
dplyr overwrites some functions in base R.
When base version is needed, stats::filter() and stats::lag() are employed

nycflights13

flights
## # A tibble: 336,776 x 19
##     year month   day dep_time sched_dep_time dep_delay arr_time sched_arr_time arr_delay carrier flight tailnum origin dest  air_time distance  hour
##    <int> <int> <int>    <int>          <int>     <dbl>    <int>          <int>     <dbl> <chr>    <int> <chr>   <chr>  <chr>    <dbl>    <dbl> <dbl>
##  1  2013     1     1      517            515         2      830            819        11 UA        1545 N14228  EWR    IAH        227     1400     5
##  2  2013     1     1      533            529         4      850            830        20 UA        1714 N24211  LGA    IAH        227     1416     5
##  3  2013     1     1      542            540         2      923            850        33 AA        1141 N619AA  JFK    MIA        160     1089     5
##  4  2013     1     1      544            545        -1     1004           1022       -18 B6         725 N804JB  JFK    BQN        183     1576     5
##  5  2013     1     1      554            600        -6      812            837       -25 DL         461 N668DN  LGA    ATL        116      762     6
##  6  2013     1     1      554            558        -4      740            728        12 UA        1696 N39463  EWR    ORD        150      719     5
##  7  2013     1     1      555            600        -5      913            854        19 B6         507 N516JB  EWR    FLL        158     1065     6
##  8  2013     1     1      557            600        -3      709            723       -14 EV        5708 N829AS  LGA    IAD         53      229     6
##  9  2013     1     1      557            600        -3      838            846        -8 B6          79 N593JB  JFK    MCO        140      944     6
## 10  2013     1     1      558            600        -2      753            745         8 AA         301 N3ALAA  LGA    ORD        138      733     6
## # ... with 336,766 more rows, and 2 more variables: minute <dbl>, time_hour <dttm>

types of values

  • int: integers
  • dbl: doubles (real numbers)
  • chr: character vectors (strings)
  • dttm: date-time
  • lgl: lagical (TRUE or FALSE)
  • fctr: factors (categorical variables with fixed values)
  • date: dates

Basic (dplyr)

  • filter(): pick observations (row) by their values
  • select(): pick variables (column) by their names
  • arrange(): reorder the rows (observations)
  • mutate(): create new variables with existing variables
  • summarise(): collapse many values down to a single summary
  • group_by(): change the scope of each function

(i) filter()

jan1 <- filter(flights, month == 1, day == 1)
dec25 <- filter(flights, month == 12, day == 25)
nov_dec <- filter(flights, month == 11 | month == 12)

jan1
dec25
nov_dec
## # A tibble: 842 x 19
##     year month   day dep_time sched_dep_time dep_delay arr_time sched_arr_time arr_delay carrier flight tailnum origin dest  air_time distance  hour
##    <int> <int> <int>    <int>          <int>     <dbl>    <int>          <int>     <dbl> <chr>    <int> <chr>   <chr>  <chr>    <dbl>    <dbl> <dbl>
##  1  2013     1     1      517            515         2      830            819        11 UA        1545 N14228  EWR    IAH        227     1400     5
##  2  2013     1     1      533            529         4      850            830        20 UA        1714 N24211  LGA    IAH        227     1416     5
##  3  2013     1     1      542            540         2      923            850        33 AA        1141 N619AA  JFK    MIA        160     1089     5
##  4  2013     1     1      544            545        -1     1004           1022       -18 B6         725 N804JB  JFK    BQN        183     1576     5
##  5  2013     1     1      554            600        -6      812            837       -25 DL         461 N668DN  LGA    ATL        116      762     6
##  6  2013     1     1      554            558        -4      740            728        12 UA        1696 N39463  EWR    ORD        150      719     5
##  7  2013     1     1      555            600        -5      913            854        19 B6         507 N516JB  EWR    FLL        158     1065     6
##  8  2013     1     1      557            600        -3      709            723       -14 EV        5708 N829AS  LGA    IAD         53      229     6
##  9  2013     1     1      557            600        -3      838            846        -8 B6          79 N593JB  JFK    MCO        140      944     6
## 10  2013     1     1      558            600        -2      753            745         8 AA         301 N3ALAA  LGA    ORD        138      733     6
## # ... with 832 more rows, and 2 more variables: minute <dbl>, time_hour <dttm>
## # A tibble: 719 x 19
##     year month   day dep_time sched_dep_time dep_delay arr_time sched_arr_time arr_delay carrier flight tailnum origin dest  air_time distance  hour
##    <int> <int> <int>    <int>          <int>     <dbl>    <int>          <int>     <dbl> <chr>    <int> <chr>   <chr>  <chr>    <dbl>    <dbl> <dbl>
##  1  2013    12    25      456            500        -4      649            651        -2 US        1895 N156UW  EWR    CLT         98      529     5
##  2  2013    12    25      524            515         9      805            814        -9 UA        1016 N32404  EWR    IAH        203     1400     5
##  3  2013    12    25      542            540         2      832            850       -18 AA        2243 N5EBAA  JFK    MIA        146     1089     5
##  4  2013    12    25      546            550        -4     1022           1027        -5 B6         939 N665JB  JFK    BQN        191     1576     5
##  5  2013    12    25      556            600        -4      730            745       -15 AA         301 N3JLAA  LGA    ORD        123      733     6
##  6  2013    12    25      557            600        -3      743            752        -9 DL         731 N369NB  LGA    DTW         88      502     6
##  7  2013    12    25      557            600        -3      818            831       -13 DL         904 N397DA  LGA    ATL        118      762     6
##  8  2013    12    25      559            600        -1      855            856        -1 B6         371 N608JB  LGA    FLL        147     1076     6
##  9  2013    12    25      559            600        -1      849            855        -6 B6         605 N536JB  EWR    FLL        149     1065     6
## 10  2013    12    25      600            600         0      850            846         4 B6         583 N746JB  JFK    MCO        137      944     6
## # ... with 709 more rows, and 2 more variables: minute <dbl>, time_hour <dttm>
## # A tibble: 55,403 x 19
##     year month   day dep_time sched_dep_time dep_delay arr_time sched_arr_time arr_delay carrier flight tailnum origin dest  air_time distance  hour
##    <int> <int> <int>    <int>          <int>     <dbl>    <int>          <int>     <dbl> <chr>    <int> <chr>   <chr>  <chr>    <dbl>    <dbl> <dbl>
##  1  2013    11     1        5           2359         6      352            345         7 B6         745 N568JB  JFK    PSE        205     1617    23
##  2  2013    11     1       35           2250       105      123           2356        87 B6        1816 N353JB  JFK    SYR         36      209    22
##  3  2013    11     1      455            500        -5      641            651       -10 US        1895 N192UW  EWR    CLT         88      529     5
##  4  2013    11     1      539            545        -6      856            827        29 UA        1714 N38727  LGA    IAH        229     1416     5
##  5  2013    11     1      542            545        -3      831            855       -24 AA        2243 N5CLAA  JFK    MIA        147     1089     5
##  6  2013    11     1      549            600       -11      912            923       -11 UA         303 N595UA  JFK    SFO        359     2586     6
##  7  2013    11     1      550            600       -10      705            659         6 US        2167 N748UW  LGA    DCA         57      214     6
##  8  2013    11     1      554            600        -6      659            701        -2 US        2134 N742PS  LGA    BOS         40      184     6
##  9  2013    11     1      554            600        -6      826            827        -1 DL         563 N912DE  LGA    ATL        126      762     6
## 10  2013    11     1      554            600        -6      749            751        -2 DL         731 N315NB  LGA    DTW         93      502     6
## # ... with 55,393 more rows, and 2 more variables: minute <dbl>, time_hour <dttm>

(ii) arrange()

arrange(flights, month, day)
arrange(flights, desc(month), desc(day))
## # A tibble: 336,776 x 19
##     year month   day dep_time sched_dep_time dep_delay arr_time sched_arr_time arr_delay carrier flight tailnum origin dest  air_time distance  hour
##    <int> <int> <int>    <int>          <int>     <dbl>    <int>          <int>     <dbl> <chr>    <int> <chr>   <chr>  <chr>    <dbl>    <dbl> <dbl>
##  1  2013     1     1      517            515         2      830            819        11 UA        1545 N14228  EWR    IAH        227     1400     5
##  2  2013     1     1      533            529         4      850            830        20 UA        1714 N24211  LGA    IAH        227     1416     5
##  3  2013     1     1      542            540         2      923            850        33 AA        1141 N619AA  JFK    MIA        160     1089     5
##  4  2013     1     1      544            545        -1     1004           1022       -18 B6         725 N804JB  JFK    BQN        183     1576     5
##  5  2013     1     1      554            600        -6      812            837       -25 DL         461 N668DN  LGA    ATL        116      762     6
##  6  2013     1     1      554            558        -4      740            728        12 UA        1696 N39463  EWR    ORD        150      719     5
##  7  2013     1     1      555            600        -5      913            854        19 B6         507 N516JB  EWR    FLL        158     1065     6
##  8  2013     1     1      557            600        -3      709            723       -14 EV        5708 N829AS  LGA    IAD         53      229     6
##  9  2013     1     1      557            600        -3      838            846        -8 B6          79 N593JB  JFK    MCO        140      944     6
## 10  2013     1     1      558            600        -2      753            745         8 AA         301 N3ALAA  LGA    ORD        138      733     6
## # ... with 336,766 more rows, and 2 more variables: minute <dbl>, time_hour <dttm>
## # A tibble: 336,776 x 19
##     year month   day dep_time sched_dep_time dep_delay arr_time sched_arr_time arr_delay carrier flight tailnum origin dest  air_time distance  hour
##    <int> <int> <int>    <int>          <int>     <dbl>    <int>          <int>     <dbl> <chr>    <int> <chr>   <chr>  <chr>    <dbl>    <dbl> <dbl>
##  1  2013    12    31       13           2359        14      439            437         2 B6         839 N566JB  JFK    BQN        189     1576    23
##  2  2013    12    31       18           2359        19      449            444         5 DL         412 N713TW  JFK    SJU        192     1598    23
##  3  2013    12    31       26           2245       101      129           2353        96 B6         108 N374JB  JFK    PWM         50      273    22
##  4  2013    12    31      459            500        -1      655            651         4 US        1895 N557UW  EWR    CLT         95      529     5
##  5  2013    12    31      514            515        -1      814            812         2 UA         700 N470UA  EWR    IAH        223     1400     5
##  6  2013    12    31      549            551        -2      925            900        25 UA         274 N577UA  EWR    LAX        346     2454     5
##  7  2013    12    31      550            600       -10      725            745       -20 AA         301 N3CXAA  LGA    ORD        127      733     6
##  8  2013    12    31      552            600        -8      811            826       -15 EV        3825 N14916  EWR    IND        118      645     6
##  9  2013    12    31      553            600        -7      741            754       -13 DL         731 N333NB  LGA    DTW         86      502     6
## 10  2013    12    31      554            550         4     1024           1027        -3 B6         939 N552JB  JFK    BQN        195     1576     5
## # ... with 336,766 more rows, and 2 more variables: minute <dbl>, time_hour <dttm>
  • desc(): reorder in descending order

(iii) select()

select(flights, year, month, day)
select(flights, year:day)
select(flights, -(year:day))
select(flights, YEAR = year, everything())
## # A tibble: 336,776 x 3
##     year month   day
##    <int> <int> <int>
##  1  2013     1     1
##  2  2013     1     1
##  3  2013     1     1
##  4  2013     1     1
##  5  2013     1     1
##  6  2013     1     1
##  7  2013     1     1
##  8  2013     1     1
##  9  2013     1     1
## 10  2013     1     1
## # ... with 336,766 more rows
## # A tibble: 336,776 x 3
##     year month   day
##    <int> <int> <int>
##  1  2013     1     1
##  2  2013     1     1
##  3  2013     1     1
##  4  2013     1     1
##  5  2013     1     1
##  6  2013     1     1
##  7  2013     1     1
##  8  2013     1     1
##  9  2013     1     1
## 10  2013     1     1
## # ... with 336,766 more rows
## # A tibble: 336,776 x 16
##    dep_time sched_dep_time dep_delay arr_time sched_arr_time arr_delay carrier flight tailnum origin dest  air_time distance  hour minute
##       <int>          <int>     <dbl>    <int>          <int>     <dbl> <chr>    <int> <chr>   <chr>  <chr>    <dbl>    <dbl> <dbl>  <dbl>
##  1      517            515         2      830            819        11 UA        1545 N14228  EWR    IAH        227     1400     5     15
##  2      533            529         4      850            830        20 UA        1714 N24211  LGA    IAH        227     1416     5     29
##  3      542            540         2      923            850        33 AA        1141 N619AA  JFK    MIA        160     1089     5     40
##  4      544            545        -1     1004           1022       -18 B6         725 N804JB  JFK    BQN        183     1576     5     45
##  5      554            600        -6      812            837       -25 DL         461 N668DN  LGA    ATL        116      762     6      0
##  6      554            558        -4      740            728        12 UA        1696 N39463  EWR    ORD        150      719     5     58
##  7      555            600        -5      913            854        19 B6         507 N516JB  EWR    FLL        158     1065     6      0
##  8      557            600        -3      709            723       -14 EV        5708 N829AS  LGA    IAD         53      229     6      0
##  9      557            600        -3      838            846        -8 B6          79 N593JB  JFK    MCO        140      944     6      0
## 10      558            600        -2      753            745         8 AA         301 N3ALAA  LGA    ORD        138      733     6      0
## # ... with 336,766 more rows, and 1 more variable: time_hour <dttm>
## # A tibble: 336,776 x 19
##     YEAR month   day dep_time sched_dep_time dep_delay arr_time sched_arr_time arr_delay carrier flight tailnum origin dest  air_time distance  hour
##    <int> <int> <int>    <int>          <int>     <dbl>    <int>          <int>     <dbl> <chr>    <int> <chr>   <chr>  <chr>    <dbl>    <dbl> <dbl>
##  1  2013     1     1      517            515         2      830            819        11 UA        1545 N14228  EWR    IAH        227     1400     5
##  2  2013     1     1      533            529         4      850            830        20 UA        1714 N24211  LGA    IAH        227     1416     5
##  3  2013     1     1      542            540         2      923            850        33 AA        1141 N619AA  JFK    MIA        160     1089     5
##  4  2013     1     1      544            545        -1     1004           1022       -18 B6         725 N804JB  JFK    BQN        183     1576     5
##  5  2013     1     1      554            600        -6      812            837       -25 DL         461 N668DN  LGA    ATL        116      762     6
##  6  2013     1     1      554            558        -4      740            728        12 UA        1696 N39463  EWR    ORD        150      719     5
##  7  2013     1     1      555            600        -5      913            854        19 B6         507 N516JB  EWR    FLL        158     1065     6
##  8  2013     1     1      557            600        -3      709            723       -14 EV        5708 N829AS  LGA    IAD         53      229     6
##  9  2013     1     1      557            600        -3      838            846        -8 B6          79 N593JB  JFK    MCO        140      944     6
## 10  2013     1     1      558            600        -2      753            745         8 AA         301 N3ALAA  LGA    ORD        138      733     6
## # ... with 336,766 more rows, and 2 more variables: minute <dbl>, time_hour <dttm>
  • A:Z: select column A to Z
  • -A: exclude column A
  • old_col_name = new_col_name: rename a column
  • everything(): select all columns
  • starts_with("abc"): matches names that begin with abc
  • ends_with("xyz"): matches names that end with xyz
  • contains("ijk"): matches names that contain ijk
  • matches("(.)\\1"): selects variables that match a regular expression
  • num_range("x", 1:3): matches x1, x2 and x3

(iv) mutate()

flights_sml <- select(flights, 
  year:day, 
  ends_with("delay"), 
  distance, 
  air_time
)

a. mutate() and transmute()

  • mutate(): add new variables at the last
  • transmute(): extract only new variables
mutate(flights_sml,
  gain = dep_delay - arr_delay,
  hours = air_time / 60,
  gain_per_hour = gain / hours
)

transmute(flights,
  gain = dep_delay - arr_delay,
  hours = air_time / 60,
  gain_per_hour = gain / hours
)
## # A tibble: 336,776 x 10
##     year month   day dep_delay arr_delay distance air_time  gain hours gain_per_hour
##    <int> <int> <int>     <dbl>     <dbl>    <dbl>    <dbl> <dbl> <dbl>         <dbl>
##  1  2013     1     1         2        11     1400      227    -9 3.78          -2.38
##  2  2013     1     1         4        20     1416      227   -16 3.78          -4.23
##  3  2013     1     1         2        33     1089      160   -31 2.67         -11.6 
##  4  2013     1     1        -1       -18     1576      183    17 3.05           5.57
##  5  2013     1     1        -6       -25      762      116    19 1.93           9.83
##  6  2013     1     1        -4        12      719      150   -16 2.5           -6.4 
##  7  2013     1     1        -5        19     1065      158   -24 2.63          -9.11
##  8  2013     1     1        -3       -14      229       53    11 0.883         12.5 
##  9  2013     1     1        -3        -8      944      140     5 2.33           2.14
## 10  2013     1     1        -2         8      733      138   -10 2.3           -4.35
## # ... with 336,766 more rows
## # A tibble: 336,776 x 3
##     gain hours gain_per_hour
##    <dbl> <dbl>         <dbl>
##  1    -9 3.78          -2.38
##  2   -16 3.78          -4.23
##  3   -31 2.67         -11.6 
##  4    17 3.05           5.57
##  5    19 1.93           9.83
##  6   -16 2.5           -6.4 
##  7   -24 2.63          -9.11
##  8    11 0.883         12.5 
##  9     5 2.33           2.14
## 10   -10 2.3           -4.35
## # ... with 336,766 more rows

b. %/% (integer division) and %% (remainder)

transmute(flights,
  dep_time,
  hour = dep_time %/% 100,
  minute = dep_time %% 100
)
## # A tibble: 336,776 x 3
##    dep_time  hour minute
##       <int> <dbl>  <dbl>
##  1      517     5     17
##  2      533     5     33
##  3      542     5     42
##  4      544     5     44
##  5      554     5     54
##  6      554     5     54
##  7      555     5     55
##  8      557     5     57
##  9      557     5     57
## 10      558     5     58
## # ... with 336,766 more rows

(v) summarise()

summarise() should be used with group_by()

summarise(flights, delay = mean(dep_delay, na.rm = TRUE))

by_day <- group_by(flights, year, month, day)
summarise(by_day, delay = mean(dep_delay, na.rm = TRUE))
## `summarise()` regrouping output by 'year', 'month' (override with `.groups` argument)
## # A tibble: 1 x 1
##   delay
##   <dbl>
## 1  12.6
## # A tibble: 365 x 4
## # Groups:   year, month [12]
##     year month   day delay
##    <int> <int> <int> <dbl>
##  1  2013     1     1 11.5 
##  2  2013     1     2 13.9 
##  3  2013     1     3 11.0 
##  4  2013     1     4  8.95
##  5  2013     1     5  5.73
##  6  2013     1     6  7.15
##  7  2013     1     7  5.42
##  8  2013     1     8  2.55
##  9  2013     1     9  2.28
## 10  2013     1    10  2.84
## # ... with 355 more rows

a. group_by() %>% summarise() %>% filter()

delays <- flights %>%
  group_by(dest) %>%
  summarise(
    count = n(),
    dist = mean(distance, na.rm = TRUE),
    delay = mean(arr_delay, na.rm = TRUE)
  ) %>%
  filter(count > 20, dest != "HNL")
## `summarise()` ungrouping output (override with `.groups` argument)
ggplot(data = delays, mapping = aes(x = dist, y = delay)) +
  geom_point(aes(size = count), alpha = 1/3) +
  geom_smooth(se = FALSE)
## `geom_smooth()` using method = 'loess' and formula 'y ~ x'

b. na.rm =: drop NA

flights %>% 
  group_by(year, month, day) %>% 
  summarise(mean = mean(dep_delay))
## `summarise()` regrouping output by 'year', 'month' (override with `.groups` argument)
## # A tibble: 365 x 4
## # Groups:   year, month [12]
##     year month   day  mean
##    <int> <int> <int> <dbl>
##  1  2013     1     1    NA
##  2  2013     1     2    NA
##  3  2013     1     3    NA
##  4  2013     1     4    NA
##  5  2013     1     5    NA
##  6  2013     1     6    NA
##  7  2013     1     7    NA
##  8  2013     1     8    NA
##  9  2013     1     9    NA
## 10  2013     1    10    NA
## # ... with 355 more rows
flights %>% 
  group_by(year, month, day) %>% 
  summarise(mean = mean(dep_delay, na.rm = TRUE))
## `summarise()` regrouping output by 'year', 'month' (override with `.groups` argument)
## # A tibble: 365 x 4
## # Groups:   year, month [12]
##     year month   day  mean
##    <int> <int> <int> <dbl>
##  1  2013     1     1 11.5 
##  2  2013     1     2 13.9 
##  3  2013     1     3 11.0 
##  4  2013     1     4  8.95
##  5  2013     1     5  5.73
##  6  2013     1     6  7.15
##  7  2013     1     7  5.42
##  8  2013     1     8  2.55
##  9  2013     1     9  2.28
## 10  2013     1    10  2.84
## # ... with 355 more rows

Remove canceled flights (arr_delay = NA)

not_cancelled <- flights %>% 
  filter(!is.na(dep_delay), !is.na(arr_delay))

not_cancelled %>% 
  group_by(year, month, day) %>% 
  summarise(mean = mean(dep_delay))
## `summarise()` regrouping output by 'year', 'month' (override with `.groups` argument)
## # A tibble: 365 x 4
## # Groups:   year, month [12]
##     year month   day  mean
##    <int> <int> <int> <dbl>
##  1  2013     1     1 11.4 
##  2  2013     1     2 13.7 
##  3  2013     1     3 10.9 
##  4  2013     1     4  8.97
##  5  2013     1     5  5.73
##  6  2013     1     6  7.15
##  7  2013     1     7  5.42
##  8  2013     1     8  2.56
##  9  2013     1     9  2.30
## 10  2013     1    10  2.84
## # ... with 355 more rows

c. count

ex1: flight
(i) ggplot2::geom_freqploy()

delays <- not_cancelled %>% 
  group_by(tailnum) %>% 
  summarise(
    delay = mean(arr_delay)
  )
## `summarise()` ungrouping output (override with `.groups` argument)
ggplot(data = delays, mapping = aes(x = delay)) + 
  geom_freqpoly(binwidth = 10)

  1. n(): count
delays <- not_cancelled %>% 
  group_by(tailnum) %>% 
  summarise(
    delay = mean(arr_delay),
    n = n()
  )
## `summarise()` ungrouping output (override with `.groups` argument)
ggplot(data = delays, mapping = aes(x = n, y = delay)) + 
  geom_point(alpha = 1/10) + 
  coord_flip()

  1. filter(): n > 25
delays %>% 
  filter(n > 25) %>% 
  ggplot(mapping = aes(x = n, y = delay)) + 
    geom_point(alpha = 1/10) +
    coord_flip()

ex2: batting

batting <- as_tibble(Lahman::Batting)

batters <- batting %>% 
  group_by(playerID) %>% 
  summarise(
    ba = sum(H, na.rm = TRUE) / sum(AB, na.rm = TRUE),
    ab = sum(AB, na.rm = TRUE)
  )
## `summarise()` ungrouping output (override with `.groups` argument)
batters %>% 
  filter(ab > 100) %>% 
  ggplot(mapping = aes(x = ab, y = ba)) +
    geom_point() + 
    geom_smooth(se = FALSE)
## `geom_smooth()` using method = 'gam' and formula 'y ~ s(x, bs = "cs")'

batters %>% 
  arrange(desc(ba))
## # A tibble: 19,689 x 3
##    playerID     ba    ab
##    <chr>     <dbl> <int>
##  1 abramge01     1     1
##  2 alanirj01     1     1
##  3 alberan01     1     1
##  4 banisje01     1     1
##  5 bartocl01     1     1
##  6 bassdo01      1     1
##  7 birasst01     1     2
##  8 bruneju01     1     1
##  9 burnscb01     1     1
## 10 cammaer01     1     1
## # ... with 19,679 more rows

d. Summary functions

not_cancelled %>% 
  group_by(year, month, day) %>% 
  summarise(
    avg_delay = mean(arr_delay),
    avg_delay_posi = mean(arr_delay[arr_delay > 0]) # the average positive delay
  )
## `summarise()` regrouping output by 'year', 'month' (override with `.groups` argument)
## # A tibble: 365 x 5
## # Groups:   year, month [12]
##     year month   day avg_delay avg_delay_posi
##    <int> <int> <int>     <dbl>          <dbl>
##  1  2013     1     1    12.7             32.5
##  2  2013     1     2    12.7             32.0
##  3  2013     1     3     5.73            27.7
##  4  2013     1     4    -1.93            28.3
##  5  2013     1     5    -1.53            22.6
##  6  2013     1     6     4.24            24.4
##  7  2013     1     7    -4.95            27.8
##  8  2013     1     8    -3.23            20.8
##  9  2013     1     9    -0.264           25.6
## 10  2013     1    10    -5.90            27.3
## # ... with 355 more rows
  • min(x): minimum
  • quantile(x, 0.25): 25%
  • median(x): half
  • max(x): maximum
  • first(x): first
  • nth(x, N): Nth number
  • last(x): last
  • n(): total number
  • sum(!is.na(x)): total number excluding NA
  • n_distinct(chr): count distinct values chr
  • wt = X: count a weighted value by X
  • sum(x > N): sum of x which are greater than N

tibble

library(tidyverse)

Basic

Read a data frame as tibble

as_tibble(iris)
## # A tibble: 150 x 5
##    Sepal.Length Sepal.Width Petal.Length Petal.Width Species
##           <dbl>       <dbl>        <dbl>       <dbl> <fct>  
##  1          5.1         3.5          1.4         0.2 setosa 
##  2          4.9         3            1.4         0.2 setosa 
##  3          4.7         3.2          1.3         0.2 setosa 
##  4          4.6         3.1          1.5         0.2 setosa 
##  5          5           3.6          1.4         0.2 setosa 
##  6          5.4         3.9          1.7         0.4 setosa 
##  7          4.6         3.4          1.4         0.3 setosa 
##  8          5           3.4          1.5         0.2 setosa 
##  9          4.4         2.9          1.4         0.2 setosa 
## 10          4.9         3.1          1.5         0.1 setosa 
## # ... with 140 more rows

a. Create a tibble with tibble()

tibble(
  x = 1:5, 
  y = 1, 
  z = x ^ 2 + y
)
## # A tibble: 5 x 3
##       x     y     z
##   <int> <dbl> <dbl>
## 1     1     1     2
## 2     2     1     5
## 3     3     1    10
## 4     4     1    17
## 5     5     1    26

b. Create a tibble with tribble()

tb <- tribble(
  ~x, ~y, ~z,
  #--|--|----
  "a", 2, 3.6,
  "b", 1, 8.5
)
tb
## # A tibble: 2 x 3
##   x         y     z
##   <chr> <dbl> <dbl>
## 1 a         2   3.6
## 2 b         1   8.5

c. Convert tibble to data.frame

df <- as.data.frame(tb)
class(tb)
## [1] "tbl_df"     "tbl"        "data.frame"
class(df)
## [1] "data.frame"

Tibble vs data.frame

a. Printing

tibble(
  a = lubridate::now() + runif(1e3) * 86316,
  b = lubridate::today() + runif(1e3) * 30,
  c = 1:1e3,
  d = runif(1e3),
  e = sample(letters, 1e3, replace = TRUE)
)
## # A tibble: 1,000 x 5
##    a                   b              c      d e    
##    <dttm>              <date>     <int>  <dbl> <chr>
##  1 2020-07-10 01:10:28 2020-07-22     1 0.944  a    
##  2 2020-07-09 21:06:13 2020-07-26     2 0.244  z    
##  3 2020-07-10 01:40:29 2020-08-05     3 0.253  o    
##  4 2020-07-10 04:53:31 2020-07-21     4 0.185  g    
##  5 2020-07-09 18:08:12 2020-07-12     5 0.314  u    
##  6 2020-07-10 16:07:23 2020-07-29     6 0.0186 q    
##  7 2020-07-10 05:31:50 2020-07-11     7 0.538  i    
##  8 2020-07-09 23:41:48 2020-07-20     8 0.103  x    
##  9 2020-07-10 07:37:19 2020-08-06     9 0.411  f    
## 10 2020-07-10 04:42:10 2020-07-14    10 0.539  h    
## # ... with 990 more rows

b. Subsetting

tb <- tibble(
  x = runif(5),
  y = rnorm(5)
)
tb

# xtract by name
tb$x
tb[["x"]]

# Extract by position
tb[[1]]

# Extract with a pipe
tb %>% .$x
tb %>% .[["x"]]
tb %>% .[[1]]
## # A tibble: 5 x 2
##       x       y
##   <dbl>   <dbl>
## 1 0.977  0.701 
## 2 0.426  0.793 
## 3 0.777 -0.0242
## 4 0.341  1.61  
## 5 0.595  1.24  
## [1] 0.9773 0.4256 0.7765 0.3409 0.5950
## [1] 0.9773 0.4256 0.7765 0.3409 0.5950
## [1] 0.9773 0.4256 0.7765 0.3409 0.5950
## [1] 0.9773 0.4256 0.7765 0.3409 0.5950
## [1] 0.9773 0.4256 0.7765 0.3409 0.5950
## [1] 0.9773 0.4256 0.7765 0.3409 0.5950

tidyr

Basic

library(tidyverse)
table1
table2
table3
## # A tibble: 6 x 4
##   country      year  cases population
##   <chr>       <int>  <int>      <int>
## 1 Afghanistan  1999    745   19987071
## 2 Afghanistan  2000   2666   20595360
## 3 Brazil       1999  37737  172006362
## 4 Brazil       2000  80488  174504898
## 5 China        1999 212258 1272915272
## 6 China        2000 213766 1280428583
## # A tibble: 12 x 4
##    country      year type            count
##    <chr>       <int> <chr>           <int>
##  1 Afghanistan  1999 cases             745
##  2 Afghanistan  1999 population   19987071
##  3 Afghanistan  2000 cases            2666
##  4 Afghanistan  2000 population   20595360
##  5 Brazil       1999 cases           37737
##  6 Brazil       1999 population  172006362
##  7 Brazil       2000 cases           80488
##  8 Brazil       2000 population  174504898
##  9 China        1999 cases          212258
## 10 China        1999 population 1272915272
## 11 China        2000 cases          213766
## 12 China        2000 population 1280428583
## # A tibble: 6 x 3
##   country      year rate             
## * <chr>       <int> <chr>            
## 1 Afghanistan  1999 745/19987071     
## 2 Afghanistan  2000 2666/20595360    
## 3 Brazil       1999 37737/172006362  
## 4 Brazil       2000 80488/174504898  
## 5 China        1999 212258/1272915272
## 6 China        2000 213766/1280428583

# Compute rate per 10,000
table1 %>% 
  mutate(rate = cases / population * 10000)

# Compute cases per year
table1 %>% 
  count(year, wt = cases)

# Visualize changes over time
library(ggplot2)
ggplot(table1, aes(year, cases)) + 
  geom_line(aes(group = country), colour = "grey50") + 
  geom_point(aes(colour = country))

## # A tibble: 6 x 5
##   country      year  cases population  rate
##   <chr>       <int>  <int>      <int> <dbl>
## 1 Afghanistan  1999    745   19987071 0.373
## 2 Afghanistan  2000   2666   20595360 1.29 
## 3 Brazil       1999  37737  172006362 2.19 
## 4 Brazil       2000  80488  174504898 4.61 
## 5 China        1999 212258 1272915272 1.67 
## 6 China        2000 213766 1280428583 1.67 
## # A tibble: 2 x 2
##    year      n
##   <int>  <int>
## 1  1999 250740
## 2  2000 296920

Pivot

(i) pivot_longer()

e.g. year is variable, not observation

table4a
table4b
## # A tibble: 3 x 3
##   country     `1999` `2000`
## * <chr>        <int>  <int>
## 1 Afghanistan    745   2666
## 2 Brazil       37737  80488
## 3 China       212258 213766
## # A tibble: 3 x 3
##   country         `1999`     `2000`
## * <chr>            <int>      <int>
## 1 Afghanistan   19987071   20595360
## 2 Brazil       172006362  174504898
## 3 China       1272915272 1280428583

Make longer table4a

table4a %>%
  pivot_longer(c("1999", "2000"), names_to = "year", values_to = "cases")
## # A tibble: 6 x 3
##   country     year   cases
##   <chr>       <chr>  <int>
## 1 Afghanistan 1999     745
## 2 Afghanistan 2000    2666
## 3 Brazil      1999   37737
## 4 Brazil      2000   80488
## 5 China       1999  212258
## 6 China       2000  213766

Make longer table4b

table4b %>% 
  pivot_longer(c("1999", "2000"), names_to = "year", values_to = "population")
## # A tibble: 6 x 3
##   country     year  population
##   <chr>       <chr>      <int>
## 1 Afghanistan 1999    19987071
## 2 Afghanistan 2000    20595360
## 3 Brazil      1999   172006362
## 4 Brazil      2000   174504898
## 5 China       1999  1272915272
## 6 China       2000  1280428583

Join table4a and table4b

tidy4a <- table4a %>% 
  pivot_longer(c("1999", "2000"), names_to = "year", values_to = "cases")
tidy4b <- table4b %>% 
  pivot_longer(c("1999", "2000"), names_to = "year", values_to = "population")
left_join(tidy4a, tidy4b)
## Joining, by = c("country", "year")
## # A tibble: 6 x 4
##   country     year   cases population
##   <chr>       <chr>  <int>      <int>
## 1 Afghanistan 1999     745   19987071
## 2 Afghanistan 2000    2666   20595360
## 3 Brazil      1999   37737  172006362
## 4 Brazil      2000   80488  174504898
## 5 China       1999  212258 1272915272
## 6 China       2000  213766 1280428583

(ii) pivot_wider()

e.g. type is observation, not variable

table2
## # A tibble: 12 x 4
##    country      year type            count
##    <chr>       <int> <chr>           <int>
##  1 Afghanistan  1999 cases             745
##  2 Afghanistan  1999 population   19987071
##  3 Afghanistan  2000 cases            2666
##  4 Afghanistan  2000 population   20595360
##  5 Brazil       1999 cases           37737
##  6 Brazil       1999 population  172006362
##  7 Brazil       2000 cases           80488
##  8 Brazil       2000 population  174504898
##  9 China        1999 cases          212258
## 10 China        1999 population 1272915272
## 11 China        2000 cases          213766
## 12 China        2000 population 1280428583

Wider table2

table2 %>%
    pivot_wider(names_from = type, values_from = count)
## # A tibble: 6 x 4
##   country      year  cases population
##   <chr>       <int>  <int>      <int>
## 1 Afghanistan  1999    745   19987071
## 2 Afghanistan  2000   2666   20595360
## 3 Brazil       1999  37737  172006362
## 4 Brazil       2000  80488  174504898
## 5 China        1999 212258 1272915272
## 6 China        2000 213766 1280428583

separate and unite

(i) separate()

e.g. rate has two variables

table3
## # A tibble: 6 x 3
##   country      year rate             
## * <chr>       <int> <chr>            
## 1 Afghanistan  1999 745/19987071     
## 2 Afghanistan  2000 2666/20595360    
## 3 Brazil       1999 37737/172006362  
## 4 Brazil       2000 80488/174504898  
## 5 China        1999 212258/1272915272
## 6 China        2000 213766/1280428583
table3 %>% 
  separate(rate, into = c("cases", "population"), sep = "/", convert = TRUE)

table3 %>% 
  separate(year, into = c("century", "year"), sep = 2, convert = TRUE)
## # A tibble: 6 x 4
##   country      year  cases population
##   <chr>       <int>  <int>      <int>
## 1 Afghanistan  1999    745   19987071
## 2 Afghanistan  2000   2666   20595360
## 3 Brazil       1999  37737  172006362
## 4 Brazil       2000  80488  174504898
## 5 China        1999 212258 1272915272
## 6 China        2000 213766 1280428583
## # A tibble: 6 x 4
##   country     century  year rate             
##   <chr>         <int> <int> <chr>            
## 1 Afghanistan      19    99 745/19987071     
## 2 Afghanistan      20     0 2666/20595360    
## 3 Brazil           19    99 37737/172006362  
## 4 Brazil           20     0 80488/174504898  
## 5 China            19    99 212258/1272915272
## 6 China            20     0 213766/1280428583
  • sep =: specify the place to separate
  • convert =: if TRUE, try and convert to better types

(ii) unite()

e.g. century and year can be united

table5
## # A tibble: 6 x 4
##   country     century year  rate             
## * <chr>       <chr>   <chr> <chr>            
## 1 Afghanistan 19      99    745/19987071     
## 2 Afghanistan 20      00    2666/20595360    
## 3 Brazil      19      99    37737/172006362  
## 4 Brazil      20      00    80488/174504898  
## 5 China       19      99    212258/1272915272
## 6 China       20      00    213766/1280428583
table5 %>%
  unite(new, century, year, sep = "")
## # A tibble: 6 x 3
##   country     new   rate             
##   <chr>       <chr> <chr>            
## 1 Afghanistan 1999  745/19987071     
## 2 Afghanistan 2000  2666/20595360    
## 3 Brazil      1999  37737/172006362  
## 4 Brazil      2000  80488/174504898  
## 5 China       1999  212258/1272915272
## 6 China       2000  213766/1280428583

Missing values

two types of missing values

  • explicitly: i.e. flagged with NA (the presence of an absence)
  • implicitly: i.e. simply not present in the data (the absence of a presence)
stocks <- tibble(
  year   = c(2015, 2015, 2015, 2015, 2016, 2016, 2016),
  qtr    = c(   1,    2,    3,    4,    2,    3,    4),
  return = c(1.88, 0.59, 0.35,   NA, 0.92, 0.17, 2.66)
)

stocks %>% 
  pivot_wider(names_from = year, values_from = return)
## # A tibble: 4 x 3
##     qtr `2015` `2016`
##   <dbl>  <dbl>  <dbl>
## 1     1   1.88  NA   
## 2     2   0.59   0.92
## 3     3   0.35   0.17
## 4     4  NA      2.66

(i) longer with value_drop_na

stocks %>% 
  pivot_wider(names_from = year, values_from = return) %>% 
  pivot_longer(
    cols = c(`2015`, `2016`), 
    names_to = "year", 
    values_to = "return", 
    values_drop_na = TRUE
  )
## # A tibble: 6 x 3
##     qtr year  return
##   <dbl> <chr>  <dbl>
## 1     1 2015    1.88
## 2     2 2015    0.59
## 3     2 2016    0.92
## 4     3 2015    0.35
## 5     3 2016    0.17
## 6     4 2016    2.66
  • values_drop_na =: if TRUE, NA is dropped

(ii) complete()

stocks %>% 
  complete(year, qtr)
## # A tibble: 8 x 3
##    year   qtr return
##   <dbl> <dbl>  <dbl>
## 1  2015     1   1.88
## 2  2015     2   0.59
## 3  2015     3   0.35
## 4  2015     4  NA   
## 5  2016     1  NA   
## 6  2016     2   0.92
## 7  2016     3   0.17
## 8  2016     4   2.66

(iii)fill() to replace NA by the most recent non-missing value

treatment <- tribble(
  ~ person,           ~ treatment, ~response,
  "Derrick Whitmore", 1,           7,
  NA,                 2,           10,
  NA,                 3,           9,
  "Katherine Burke",  1,           4
)

treatment %>%
  fill(person)
## # A tibble: 4 x 3
##   person           treatment response
##   <chr>                <dbl>    <dbl>
## 1 Derrick Whitmore         1        7
## 2 Derrick Whitmore         2       10
## 3 Derrick Whitmore         3        9
## 4 Katherine Burke          1        4

Case study (using WHO data)

who
## # A tibble: 7,240 x 60
##    country iso2  iso3   year new_sp_m014 new_sp_m1524 new_sp_m2534 new_sp_m3544 new_sp_m4554 new_sp_m5564 new_sp_m65 new_sp_f014 new_sp_f1524
##    <chr>   <chr> <chr> <int>       <int>        <int>        <int>        <int>        <int>        <int>      <int>       <int>        <int>
##  1 Afghan~ AF    AFG    1980          NA           NA           NA           NA           NA           NA         NA          NA           NA
##  2 Afghan~ AF    AFG    1981          NA           NA           NA           NA           NA           NA         NA          NA           NA
##  3 Afghan~ AF    AFG    1982          NA           NA           NA           NA           NA           NA         NA          NA           NA
##  4 Afghan~ AF    AFG    1983          NA           NA           NA           NA           NA           NA         NA          NA           NA
##  5 Afghan~ AF    AFG    1984          NA           NA           NA           NA           NA           NA         NA          NA           NA
##  6 Afghan~ AF    AFG    1985          NA           NA           NA           NA           NA           NA         NA          NA           NA
##  7 Afghan~ AF    AFG    1986          NA           NA           NA           NA           NA           NA         NA          NA           NA
##  8 Afghan~ AF    AFG    1987          NA           NA           NA           NA           NA           NA         NA          NA           NA
##  9 Afghan~ AF    AFG    1988          NA           NA           NA           NA           NA           NA         NA          NA           NA
## 10 Afghan~ AF    AFG    1989          NA           NA           NA           NA           NA           NA         NA          NA           NA
## # ... with 7,230 more rows, and 47 more variables: new_sp_f2534 <int>, new_sp_f3544 <int>, new_sp_f4554 <int>, new_sp_f5564 <int>, new_sp_f65 <int>,
## #   new_sn_m014 <int>, new_sn_m1524 <int>, new_sn_m2534 <int>, new_sn_m3544 <int>, new_sn_m4554 <int>, new_sn_m5564 <int>, new_sn_m65 <int>,
## #   new_sn_f014 <int>, new_sn_f1524 <int>, new_sn_f2534 <int>, new_sn_f3544 <int>, new_sn_f4554 <int>, new_sn_f5564 <int>, new_sn_f65 <int>,
## #   new_ep_m014 <int>, new_ep_m1524 <int>, new_ep_m2534 <int>, new_ep_m3544 <int>, new_ep_m4554 <int>, new_ep_m5564 <int>, new_ep_m65 <int>,
## #   new_ep_f014 <int>, new_ep_f1524 <int>, new_ep_f2534 <int>, new_ep_f3544 <int>, new_ep_f4554 <int>, new_ep_f5564 <int>, new_ep_f65 <int>,
## #   newrel_m014 <int>, newrel_m1524 <int>, newrel_m2534 <int>, newrel_m3544 <int>, newrel_m4554 <int>, newrel_m5564 <int>, newrel_m65 <int>,
## #   newrel_f014 <int>, newrel_f1524 <int>, newrel_f2534 <int>, newrel_f3544 <int>, newrel_f4554 <int>, newrel_f5564 <int>, newrel_f65 <int>

a. pivot_longer()

who1 <- who %>%
  pivot_longer(
    new_sp_m014:newrel_f65,
    names_to = "key",
    values_to = "cases",
    values_drop_na = TRUE
  )
who1
## # A tibble: 76,046 x 6
##    country     iso2  iso3   year key          cases
##    <chr>       <chr> <chr> <int> <chr>        <int>
##  1 Afghanistan AF    AFG    1997 new_sp_m014      0
##  2 Afghanistan AF    AFG    1997 new_sp_m1524    10
##  3 Afghanistan AF    AFG    1997 new_sp_m2534     6
##  4 Afghanistan AF    AFG    1997 new_sp_m3544     3
##  5 Afghanistan AF    AFG    1997 new_sp_m4554     5
##  6 Afghanistan AF    AFG    1997 new_sp_m5564     2
##  7 Afghanistan AF    AFG    1997 new_sp_m65       0
##  8 Afghanistan AF    AFG    1997 new_sp_f014      5
##  9 Afghanistan AF    AFG    1997 new_sp_f1524    38
## 10 Afghanistan AF    AFG    1997 new_sp_f2534    36
## # ... with 76,036 more rows

b. count(): count the number of each ovservation

who1 %>% 
  count(key)
## # A tibble: 56 x 2
##    key              n
##    <chr>        <int>
##  1 new_ep_f014   1032
##  2 new_ep_f1524  1021
##  3 new_ep_f2534  1021
##  4 new_ep_f3544  1021
##  5 new_ep_f4554  1017
##  6 new_ep_f5564  1017
##  7 new_ep_f65    1014
##  8 new_ep_m014   1038
##  9 new_ep_m1524  1026
## 10 new_ep_m2534  1020
## # ... with 46 more rows

c. tidy key colunm

  • mutate(): newrel -> new and rel
  • separate(): key -> type, sex_age
  • separate(): sex_age -> sex, age
  • select(): drop some variables (new, iso2, iso3)

  1. mutate(): newrel -> new and rel
who2 <- who1 %>%
  mutate(names_from = stringr::str_replace(key, "newrel", "new_rel"))
who2
## # A tibble: 76,046 x 7
##    country     iso2  iso3   year key          cases names_from  
##    <chr>       <chr> <chr> <int> <chr>        <int> <chr>       
##  1 Afghanistan AF    AFG    1997 new_sp_m014      0 new_sp_m014 
##  2 Afghanistan AF    AFG    1997 new_sp_m1524    10 new_sp_m1524
##  3 Afghanistan AF    AFG    1997 new_sp_m2534     6 new_sp_m2534
##  4 Afghanistan AF    AFG    1997 new_sp_m3544     3 new_sp_m3544
##  5 Afghanistan AF    AFG    1997 new_sp_m4554     5 new_sp_m4554
##  6 Afghanistan AF    AFG    1997 new_sp_m5564     2 new_sp_m5564
##  7 Afghanistan AF    AFG    1997 new_sp_m65       0 new_sp_m65  
##  8 Afghanistan AF    AFG    1997 new_sp_f014      5 new_sp_f014 
##  9 Afghanistan AF    AFG    1997 new_sp_f1524    38 new_sp_f1524
## 10 Afghanistan AF    AFG    1997 new_sp_f2534    36 new_sp_f2534
## # ... with 76,036 more rows
  1. separate(): key -> type, sex_age
who3 <- who2 %>%
  separate(key, c("new", "type", "sexage"), sep = "_")
## Warning: Expected 3 pieces. Missing pieces filled with `NA` in 2580 rows [243, 244, 679, 680, 681, 682, 683, 684, 685, 686, 687, 688, 689, 690, 691, 692,
## 903, 904, 905, 906, ...].
who3
## # A tibble: 76,046 x 9
##    country     iso2  iso3   year new   type  sexage cases names_from  
##    <chr>       <chr> <chr> <int> <chr> <chr> <chr>  <int> <chr>       
##  1 Afghanistan AF    AFG    1997 new   sp    m014       0 new_sp_m014 
##  2 Afghanistan AF    AFG    1997 new   sp    m1524     10 new_sp_m1524
##  3 Afghanistan AF    AFG    1997 new   sp    m2534      6 new_sp_m2534
##  4 Afghanistan AF    AFG    1997 new   sp    m3544      3 new_sp_m3544
##  5 Afghanistan AF    AFG    1997 new   sp    m4554      5 new_sp_m4554
##  6 Afghanistan AF    AFG    1997 new   sp    m5564      2 new_sp_m5564
##  7 Afghanistan AF    AFG    1997 new   sp    m65        0 new_sp_m65  
##  8 Afghanistan AF    AFG    1997 new   sp    f014       5 new_sp_f014 
##  9 Afghanistan AF    AFG    1997 new   sp    f1524     38 new_sp_f1524
## 10 Afghanistan AF    AFG    1997 new   sp    f2534     36 new_sp_f2534
## # ... with 76,036 more rows
  1. separate(): sex_age -> sex, age
who4 <- who3 %>%
  separate(sexage, c("sex", "age"), sep = 1)
who4
## # A tibble: 76,046 x 10
##    country     iso2  iso3   year new   type  sex   age   cases names_from  
##    <chr>       <chr> <chr> <int> <chr> <chr> <chr> <chr> <int> <chr>       
##  1 Afghanistan AF    AFG    1997 new   sp    m     014       0 new_sp_m014 
##  2 Afghanistan AF    AFG    1997 new   sp    m     1524     10 new_sp_m1524
##  3 Afghanistan AF    AFG    1997 new   sp    m     2534      6 new_sp_m2534
##  4 Afghanistan AF    AFG    1997 new   sp    m     3544      3 new_sp_m3544
##  5 Afghanistan AF    AFG    1997 new   sp    m     4554      5 new_sp_m4554
##  6 Afghanistan AF    AFG    1997 new   sp    m     5564      2 new_sp_m5564
##  7 Afghanistan AF    AFG    1997 new   sp    m     65        0 new_sp_m65  
##  8 Afghanistan AF    AFG    1997 new   sp    f     014       5 new_sp_f014 
##  9 Afghanistan AF    AFG    1997 new   sp    f     1524     38 new_sp_f1524
## 10 Afghanistan AF    AFG    1997 new   sp    f     2534     36 new_sp_f2534
## # ... with 76,036 more rows
  1. select(): drop new, iso2, and iso3
who5  <- who4 %>%
  select(-new, -iso2, -iso3)
who5
## # A tibble: 76,046 x 7
##    country      year type  sex   age   cases names_from  
##    <chr>       <int> <chr> <chr> <chr> <int> <chr>       
##  1 Afghanistan  1997 sp    m     014       0 new_sp_m014 
##  2 Afghanistan  1997 sp    m     1524     10 new_sp_m1524
##  3 Afghanistan  1997 sp    m     2534      6 new_sp_m2534
##  4 Afghanistan  1997 sp    m     3544      3 new_sp_m3544
##  5 Afghanistan  1997 sp    m     4554      5 new_sp_m4554
##  6 Afghanistan  1997 sp    m     5564      2 new_sp_m5564
##  7 Afghanistan  1997 sp    m     65        0 new_sp_m65  
##  8 Afghanistan  1997 sp    f     014       5 new_sp_f014 
##  9 Afghanistan  1997 sp    f     1524     38 new_sp_f1524
## 10 Afghanistan  1997 sp    f     2534     36 new_sp_f2534
## # ... with 76,036 more rows

d. with one code

who %>%
  pivot_longer(
    cols = new_sp_m014:newrel_f65, 
    names_to = "key", 
    values_to = "cases", 
    values_drop_na = TRUE
  ) %>% 
  mutate(
    key = stringr::str_replace(key, "newrel", "new_rel")
  ) %>%
  separate(key, c("new", "var", "sexage")) %>% 
  separate(sexage, c("sex", "age"), sep = 1) %>%
  select(-new, -iso2, -iso3)
## # A tibble: 76,046 x 6
##    country      year var   sex   age   cases
##    <chr>       <int> <chr> <chr> <chr> <int>
##  1 Afghanistan  1997 sp    m     014       0
##  2 Afghanistan  1997 sp    m     1524     10
##  3 Afghanistan  1997 sp    m     2534      6
##  4 Afghanistan  1997 sp    m     3544      3
##  5 Afghanistan  1997 sp    m     4554      5
##  6 Afghanistan  1997 sp    m     5564      2
##  7 Afghanistan  1997 sp    m     65        0
##  8 Afghanistan  1997 sp    f     014       5
##  9 Afghanistan  1997 sp    f     1524     38
## 10 Afghanistan  1997 sp    f     2534     36
## # ... with 76,036 more rows

Relational Data

library(tidyverse)
library(nycflights13)

mutating joins: add new variables to a data frame from matching observations in another.
filtering joins: filter observations from a data frame based on whether they match an observation in the other table
set operations: treat observations as if they were set elements

five dataset

flights
airlines
airports
planes
weather
## # A tibble: 336,776 x 19
##     year month   day dep_time sched_dep_time dep_delay arr_time sched_arr_time arr_delay carrier flight tailnum origin dest  air_time distance  hour
##    <int> <int> <int>    <int>          <int>     <dbl>    <int>          <int>     <dbl> <chr>    <int> <chr>   <chr>  <chr>    <dbl>    <dbl> <dbl>
##  1  2013     1     1      517            515         2      830            819        11 UA        1545 N14228  EWR    IAH        227     1400     5
##  2  2013     1     1      533            529         4      850            830        20 UA        1714 N24211  LGA    IAH        227     1416     5
##  3  2013     1     1      542            540         2      923            850        33 AA        1141 N619AA  JFK    MIA        160     1089     5
##  4  2013     1     1      544            545        -1     1004           1022       -18 B6         725 N804JB  JFK    BQN        183     1576     5
##  5  2013     1     1      554            600        -6      812            837       -25 DL         461 N668DN  LGA    ATL        116      762     6
##  6  2013     1     1      554            558        -4      740            728        12 UA        1696 N39463  EWR    ORD        150      719     5
##  7  2013     1     1      555            600        -5      913            854        19 B6         507 N516JB  EWR    FLL        158     1065     6
##  8  2013     1     1      557            600        -3      709            723       -14 EV        5708 N829AS  LGA    IAD         53      229     6
##  9  2013     1     1      557            600        -3      838            846        -8 B6          79 N593JB  JFK    MCO        140      944     6
## 10  2013     1     1      558            600        -2      753            745         8 AA         301 N3ALAA  LGA    ORD        138      733     6
## # ... with 336,766 more rows, and 2 more variables: minute <dbl>, time_hour <dttm>
## # A tibble: 16 x 2
##    carrier name                       
##    <chr>   <chr>                      
##  1 9E      Endeavor Air Inc.          
##  2 AA      American Airlines Inc.     
##  3 AS      Alaska Airlines Inc.       
##  4 B6      JetBlue Airways            
##  5 DL      Delta Air Lines Inc.       
##  6 EV      ExpressJet Airlines Inc.   
##  7 F9      Frontier Airlines Inc.     
##  8 FL      AirTran Airways Corporation
##  9 HA      Hawaiian Airlines Inc.     
## 10 MQ      Envoy Air                  
## 11 OO      SkyWest Airlines Inc.      
## 12 UA      United Air Lines Inc.      
## 13 US      US Airways Inc.            
## 14 VX      Virgin America             
## 15 WN      Southwest Airlines Co.     
## 16 YV      Mesa Airlines Inc.         
## # A tibble: 1,458 x 8
##    faa   name                             lat    lon   alt    tz dst   tzone              
##    <chr> <chr>                          <dbl>  <dbl> <dbl> <dbl> <chr> <chr>              
##  1 04G   Lansdowne Airport               41.1  -80.6  1044    -5 A     America/New_York   
##  2 06A   Moton Field Municipal Airport   32.5  -85.7   264    -6 A     America/Chicago    
##  3 06C   Schaumburg Regional             42.0  -88.1   801    -6 A     America/Chicago    
##  4 06N   Randall Airport                 41.4  -74.4   523    -5 A     America/New_York   
##  5 09J   Jekyll Island Airport           31.1  -81.4    11    -5 A     America/New_York   
##  6 0A9   Elizabethton Municipal Airport  36.4  -82.2  1593    -5 A     America/New_York   
##  7 0G6   Williams County Airport         41.5  -84.5   730    -5 A     America/New_York   
##  8 0G7   Finger Lakes Regional Airport   42.9  -76.8   492    -5 A     America/New_York   
##  9 0P2   Shoestring Aviation Airfield    39.8  -76.6  1000    -5 U     America/New_York   
## 10 0S9   Jefferson County Intl           48.1 -123.    108    -8 A     America/Los_Angeles
## # ... with 1,448 more rows
## # A tibble: 3,322 x 9
##    tailnum  year type                    manufacturer     model     engines seats speed engine   
##    <chr>   <int> <chr>                   <chr>            <chr>       <int> <int> <int> <chr>    
##  1 N10156   2004 Fixed wing multi engine EMBRAER          EMB-145XR       2    55    NA Turbo-fan
##  2 N102UW   1998 Fixed wing multi engine AIRBUS INDUSTRIE A320-214        2   182    NA Turbo-fan
##  3 N103US   1999 Fixed wing multi engine AIRBUS INDUSTRIE A320-214        2   182    NA Turbo-fan
##  4 N104UW   1999 Fixed wing multi engine AIRBUS INDUSTRIE A320-214        2   182    NA Turbo-fan
##  5 N10575   2002 Fixed wing multi engine EMBRAER          EMB-145LR       2    55    NA Turbo-fan
##  6 N105UW   1999 Fixed wing multi engine AIRBUS INDUSTRIE A320-214        2   182    NA Turbo-fan
##  7 N107US   1999 Fixed wing multi engine AIRBUS INDUSTRIE A320-214        2   182    NA Turbo-fan
##  8 N108UW   1999 Fixed wing multi engine AIRBUS INDUSTRIE A320-214        2   182    NA Turbo-fan
##  9 N109UW   1999 Fixed wing multi engine AIRBUS INDUSTRIE A320-214        2   182    NA Turbo-fan
## 10 N110UW   1999 Fixed wing multi engine AIRBUS INDUSTRIE A320-214        2   182    NA Turbo-fan
## # ... with 3,312 more rows
## # A tibble: 26,115 x 15
##    origin  year month   day  hour  temp  dewp humid wind_dir wind_speed wind_gust precip pressure visib time_hour          
##    <chr>  <int> <int> <int> <int> <dbl> <dbl> <dbl>    <dbl>      <dbl>     <dbl>  <dbl>    <dbl> <dbl> <dttm>             
##  1 EWR     2013     1     1     1  39.0  26.1  59.4      270      10.4         NA      0    1012     10 2013-01-01 01:00:00
##  2 EWR     2013     1     1     2  39.0  27.0  61.6      250       8.06        NA      0    1012.    10 2013-01-01 02:00:00
##  3 EWR     2013     1     1     3  39.0  28.0  64.4      240      11.5         NA      0    1012.    10 2013-01-01 03:00:00
##  4 EWR     2013     1     1     4  39.9  28.0  62.2      250      12.7         NA      0    1012.    10 2013-01-01 04:00:00
##  5 EWR     2013     1     1     5  39.0  28.0  64.4      260      12.7         NA      0    1012.    10 2013-01-01 05:00:00
##  6 EWR     2013     1     1     6  37.9  28.0  67.2      240      11.5         NA      0    1012.    10 2013-01-01 06:00:00
##  7 EWR     2013     1     1     7  39.0  28.0  64.4      240      15.0         NA      0    1012.    10 2013-01-01 07:00:00
##  8 EWR     2013     1     1     8  39.9  28.0  62.2      250      10.4         NA      0    1012.    10 2013-01-01 08:00:00
##  9 EWR     2013     1     1     9  39.9  28.0  62.2      260      15.0         NA      0    1013.    10 2013-01-01 09:00:00
## 10 EWR     2013     1     1    10  41    28.0  59.6      260      13.8         NA      0    1012.    10 2013-01-01 10:00:00
## # ... with 26,105 more rows

Keys

planes %>% 
  count(tailnum) %>% 
  filter(n > 1)
## # A tibble: 0 x 2
## # ... with 2 variables: tailnum <chr>, n <int>
weather %>% 
  count(year, month, day, hour, origin) %>% 
  filter(n > 1)
## # A tibble: 3 x 6
##    year month   day  hour origin     n
##   <int> <int> <int> <int> <chr>  <int>
## 1  2013    11     3     1 EWR        2
## 2  2013    11     3     1 JFK        2
## 3  2013    11     3     1 LGA        2

(i) Mutating joins

basic
- inner_join(x, y, by = "key")
- left_join(x, y, by = "key")
- right_join(x, y, by = "key")
- full_join(x, y, by = "key")
define keys
- by = "key"
- by = c("key_x" = "key_y")

a. inner and outer join

x <- tribble(
  ~key, ~val_x,
     1, "x1",
     2, "x2",
     3, "x3"
)
y <- tribble(
  ~key, ~val_y,
     1, "y1",
     2, "y2",
     4, "y3"
)

x %>%
  inner_join(y, by = "key")

x %>%
  left_join(y, by = "key")

x %>%
  right_join(y, by = "key")

x %>%
  full_join(y, by = "key")
## # A tibble: 2 x 3
##     key val_x val_y
##   <dbl> <chr> <chr>
## 1     1 x1    y1   
## 2     2 x2    y2   
## # A tibble: 3 x 3
##     key val_x val_y
##   <dbl> <chr> <chr>
## 1     1 x1    y1   
## 2     2 x2    y2   
## 3     3 x3    <NA> 
## # A tibble: 3 x 3
##     key val_x val_y
##   <dbl> <chr> <chr>
## 1     1 x1    y1   
## 2     2 x2    y2   
## 3     4 <NA>  y3   
## # A tibble: 4 x 3
##     key val_x val_y
##   <dbl> <chr> <chr>
## 1     1 x1    y1   
## 2     2 x2    y2   
## 3     3 x3    <NA> 
## 4     4 <NA>  y3


b. duplicate keys

one-side

x <- tribble(
  ~key_x, ~val_x,
     1, "x1",
     2, "x2",
     2, "x3",
     1, "x4"
)
y <- tribble(
  ~key_y, ~val_y,
     1, "y1",
     2, "y2"
)

left_join(x, y, by = c("key_x" = "key_y"))
## # A tibble: 4 x 3
##   key_x val_x val_y
##   <dbl> <chr> <chr>
## 1     1 x1    y1   
## 2     2 x2    y2   
## 3     2 x3    y2   
## 4     1 x4    y1

both-side

x <- tribble(
  ~key_x, ~val_x,
     1, "x1",
     2, "x2",
     2, "x3",
     3, "x4"
)
y <- tribble(
  ~key_y, ~val_y,
     1, "y1",
     2, "y2",
     2, "y3",
     3, "y4"
)

left_join(x, y, by = c("key_x" = "key_y"))
## # A tibble: 6 x 3
##   key_x val_x val_y
##   <dbl> <chr> <chr>
## 1     1 x1    y1   
## 2     2 x2    y2   
## 3     2 x2    y3   
## 4     2 x3    y2   
## 5     2 x3    y3   
## 6     3 x4    y4

(ii) Filtering joins

  • semi_join(x, y): keeps all observations in x that have a match in y
  • anti_join(x, y): drops all observations in x that have a match in y (check if foreign keys match primary keys)
x <- tribble(
  ~key, ~val_x,
     1, "x1",
     2, "x2",
     3, "x3"
)
y <- tribble(
  ~key, ~val_y,
     1, "y1",
     2, "y2",
     4, "y3"
)

x %>%
  semi_join(y, by = "key")

x %>%
  anti_join(y, by = "key")
## # A tibble: 2 x 2
##     key val_x
##   <dbl> <chr>
## 1     1 x1   
## 2     2 x2   
## # A tibble: 1 x 2
##     key val_x
##   <dbl> <chr>
## 1     3 x3

(iii) Set operations

  • intersect(x, y): return only observations in both x and y
  • union(x, y): return unique observations in x and y
  • setdiff(x, y): return observations in x, but not in y

dataset

df1 <- tribble(
  ~x, ~y,
   1,  1,
   2,  1
)

df2 <- tribble(
  ~x, ~y,
   1,  1,
   1,  2
)

a. intersect()

intersect(df1, df2)
## # A tibble: 1 x 2
##       x     y
##   <dbl> <dbl>
## 1     1     1

b. union()

union(df1, df2)
## # A tibble: 3 x 2
##       x     y
##   <dbl> <dbl>
## 1     1     1
## 2     2     1
## 3     1     2

c. setdiff()

setdiff(df1, df2)
## # A tibble: 1 x 2
##       x     y
##   <dbl> <dbl>
## 1     2     1
setdiff(df2, df1)
## # A tibble: 1 x 2
##       x     y
##   <dbl> <dbl>
## 1     1     2