dplyr
library(tidyverse)
library(nycflights13)
Caution
dplyr
overwrites some functions in base R.
When base version is needed, stats::filter()
and stats::lag()
are employed
nycflights13
flights
## # A tibble: 336,776 x 19
## year month day dep_time sched_dep_time dep_delay arr_time sched_arr_time arr_delay carrier flight tailnum origin dest air_time distance hour
## <int> <int> <int> <int> <int> <dbl> <int> <int> <dbl> <chr> <int> <chr> <chr> <chr> <dbl> <dbl> <dbl>
## 1 2013 1 1 517 515 2 830 819 11 UA 1545 N14228 EWR IAH 227 1400 5
## 2 2013 1 1 533 529 4 850 830 20 UA 1714 N24211 LGA IAH 227 1416 5
## 3 2013 1 1 542 540 2 923 850 33 AA 1141 N619AA JFK MIA 160 1089 5
## 4 2013 1 1 544 545 -1 1004 1022 -18 B6 725 N804JB JFK BQN 183 1576 5
## 5 2013 1 1 554 600 -6 812 837 -25 DL 461 N668DN LGA ATL 116 762 6
## 6 2013 1 1 554 558 -4 740 728 12 UA 1696 N39463 EWR ORD 150 719 5
## 7 2013 1 1 555 600 -5 913 854 19 B6 507 N516JB EWR FLL 158 1065 6
## 8 2013 1 1 557 600 -3 709 723 -14 EV 5708 N829AS LGA IAD 53 229 6
## 9 2013 1 1 557 600 -3 838 846 -8 B6 79 N593JB JFK MCO 140 944 6
## 10 2013 1 1 558 600 -2 753 745 8 AA 301 N3ALAA LGA ORD 138 733 6
## # ... with 336,766 more rows, and 2 more variables: minute <dbl>, time_hour <dttm>
int
: integersdbl
: doubles (real numbers)chr
: character vectors (strings)dttm
: date-timelgl
: lagical (TRUE
or FALSE
)fctr
: factors (categorical variables with fixed values)date
: datesdplyr
)filter()
: pick observations (row) by their valuesselect()
: pick variables (column) by their namesarrange()
: reorder the rows (observations)mutate()
: create new variables with existing variablessummarise()
: collapse many values down to a single summarygroup_by()
: change the scope of each functionfilter()
jan1 <- filter(flights, month == 1, day == 1)
dec25 <- filter(flights, month == 12, day == 25)
nov_dec <- filter(flights, month == 11 | month == 12)
jan1
dec25
nov_dec
## # A tibble: 842 x 19
## year month day dep_time sched_dep_time dep_delay arr_time sched_arr_time arr_delay carrier flight tailnum origin dest air_time distance hour
## <int> <int> <int> <int> <int> <dbl> <int> <int> <dbl> <chr> <int> <chr> <chr> <chr> <dbl> <dbl> <dbl>
## 1 2013 1 1 517 515 2 830 819 11 UA 1545 N14228 EWR IAH 227 1400 5
## 2 2013 1 1 533 529 4 850 830 20 UA 1714 N24211 LGA IAH 227 1416 5
## 3 2013 1 1 542 540 2 923 850 33 AA 1141 N619AA JFK MIA 160 1089 5
## 4 2013 1 1 544 545 -1 1004 1022 -18 B6 725 N804JB JFK BQN 183 1576 5
## 5 2013 1 1 554 600 -6 812 837 -25 DL 461 N668DN LGA ATL 116 762 6
## 6 2013 1 1 554 558 -4 740 728 12 UA 1696 N39463 EWR ORD 150 719 5
## 7 2013 1 1 555 600 -5 913 854 19 B6 507 N516JB EWR FLL 158 1065 6
## 8 2013 1 1 557 600 -3 709 723 -14 EV 5708 N829AS LGA IAD 53 229 6
## 9 2013 1 1 557 600 -3 838 846 -8 B6 79 N593JB JFK MCO 140 944 6
## 10 2013 1 1 558 600 -2 753 745 8 AA 301 N3ALAA LGA ORD 138 733 6
## # ... with 832 more rows, and 2 more variables: minute <dbl>, time_hour <dttm>
## # A tibble: 719 x 19
## year month day dep_time sched_dep_time dep_delay arr_time sched_arr_time arr_delay carrier flight tailnum origin dest air_time distance hour
## <int> <int> <int> <int> <int> <dbl> <int> <int> <dbl> <chr> <int> <chr> <chr> <chr> <dbl> <dbl> <dbl>
## 1 2013 12 25 456 500 -4 649 651 -2 US 1895 N156UW EWR CLT 98 529 5
## 2 2013 12 25 524 515 9 805 814 -9 UA 1016 N32404 EWR IAH 203 1400 5
## 3 2013 12 25 542 540 2 832 850 -18 AA 2243 N5EBAA JFK MIA 146 1089 5
## 4 2013 12 25 546 550 -4 1022 1027 -5 B6 939 N665JB JFK BQN 191 1576 5
## 5 2013 12 25 556 600 -4 730 745 -15 AA 301 N3JLAA LGA ORD 123 733 6
## 6 2013 12 25 557 600 -3 743 752 -9 DL 731 N369NB LGA DTW 88 502 6
## 7 2013 12 25 557 600 -3 818 831 -13 DL 904 N397DA LGA ATL 118 762 6
## 8 2013 12 25 559 600 -1 855 856 -1 B6 371 N608JB LGA FLL 147 1076 6
## 9 2013 12 25 559 600 -1 849 855 -6 B6 605 N536JB EWR FLL 149 1065 6
## 10 2013 12 25 600 600 0 850 846 4 B6 583 N746JB JFK MCO 137 944 6
## # ... with 709 more rows, and 2 more variables: minute <dbl>, time_hour <dttm>
## # A tibble: 55,403 x 19
## year month day dep_time sched_dep_time dep_delay arr_time sched_arr_time arr_delay carrier flight tailnum origin dest air_time distance hour
## <int> <int> <int> <int> <int> <dbl> <int> <int> <dbl> <chr> <int> <chr> <chr> <chr> <dbl> <dbl> <dbl>
## 1 2013 11 1 5 2359 6 352 345 7 B6 745 N568JB JFK PSE 205 1617 23
## 2 2013 11 1 35 2250 105 123 2356 87 B6 1816 N353JB JFK SYR 36 209 22
## 3 2013 11 1 455 500 -5 641 651 -10 US 1895 N192UW EWR CLT 88 529 5
## 4 2013 11 1 539 545 -6 856 827 29 UA 1714 N38727 LGA IAH 229 1416 5
## 5 2013 11 1 542 545 -3 831 855 -24 AA 2243 N5CLAA JFK MIA 147 1089 5
## 6 2013 11 1 549 600 -11 912 923 -11 UA 303 N595UA JFK SFO 359 2586 6
## 7 2013 11 1 550 600 -10 705 659 6 US 2167 N748UW LGA DCA 57 214 6
## 8 2013 11 1 554 600 -6 659 701 -2 US 2134 N742PS LGA BOS 40 184 6
## 9 2013 11 1 554 600 -6 826 827 -1 DL 563 N912DE LGA ATL 126 762 6
## 10 2013 11 1 554 600 -6 749 751 -2 DL 731 N315NB LGA DTW 93 502 6
## # ... with 55,393 more rows, and 2 more variables: minute <dbl>, time_hour <dttm>
arrange()
arrange(flights, month, day)
arrange(flights, desc(month), desc(day))
## # A tibble: 336,776 x 19
## year month day dep_time sched_dep_time dep_delay arr_time sched_arr_time arr_delay carrier flight tailnum origin dest air_time distance hour
## <int> <int> <int> <int> <int> <dbl> <int> <int> <dbl> <chr> <int> <chr> <chr> <chr> <dbl> <dbl> <dbl>
## 1 2013 1 1 517 515 2 830 819 11 UA 1545 N14228 EWR IAH 227 1400 5
## 2 2013 1 1 533 529 4 850 830 20 UA 1714 N24211 LGA IAH 227 1416 5
## 3 2013 1 1 542 540 2 923 850 33 AA 1141 N619AA JFK MIA 160 1089 5
## 4 2013 1 1 544 545 -1 1004 1022 -18 B6 725 N804JB JFK BQN 183 1576 5
## 5 2013 1 1 554 600 -6 812 837 -25 DL 461 N668DN LGA ATL 116 762 6
## 6 2013 1 1 554 558 -4 740 728 12 UA 1696 N39463 EWR ORD 150 719 5
## 7 2013 1 1 555 600 -5 913 854 19 B6 507 N516JB EWR FLL 158 1065 6
## 8 2013 1 1 557 600 -3 709 723 -14 EV 5708 N829AS LGA IAD 53 229 6
## 9 2013 1 1 557 600 -3 838 846 -8 B6 79 N593JB JFK MCO 140 944 6
## 10 2013 1 1 558 600 -2 753 745 8 AA 301 N3ALAA LGA ORD 138 733 6
## # ... with 336,766 more rows, and 2 more variables: minute <dbl>, time_hour <dttm>
## # A tibble: 336,776 x 19
## year month day dep_time sched_dep_time dep_delay arr_time sched_arr_time arr_delay carrier flight tailnum origin dest air_time distance hour
## <int> <int> <int> <int> <int> <dbl> <int> <int> <dbl> <chr> <int> <chr> <chr> <chr> <dbl> <dbl> <dbl>
## 1 2013 12 31 13 2359 14 439 437 2 B6 839 N566JB JFK BQN 189 1576 23
## 2 2013 12 31 18 2359 19 449 444 5 DL 412 N713TW JFK SJU 192 1598 23
## 3 2013 12 31 26 2245 101 129 2353 96 B6 108 N374JB JFK PWM 50 273 22
## 4 2013 12 31 459 500 -1 655 651 4 US 1895 N557UW EWR CLT 95 529 5
## 5 2013 12 31 514 515 -1 814 812 2 UA 700 N470UA EWR IAH 223 1400 5
## 6 2013 12 31 549 551 -2 925 900 25 UA 274 N577UA EWR LAX 346 2454 5
## 7 2013 12 31 550 600 -10 725 745 -20 AA 301 N3CXAA LGA ORD 127 733 6
## 8 2013 12 31 552 600 -8 811 826 -15 EV 3825 N14916 EWR IND 118 645 6
## 9 2013 12 31 553 600 -7 741 754 -13 DL 731 N333NB LGA DTW 86 502 6
## 10 2013 12 31 554 550 4 1024 1027 -3 B6 939 N552JB JFK BQN 195 1576 5
## # ... with 336,766 more rows, and 2 more variables: minute <dbl>, time_hour <dttm>
desc()
: reorder in descending orderselect()
select(flights, year, month, day)
select(flights, year:day)
select(flights, -(year:day))
select(flights, YEAR = year, everything())
## # A tibble: 336,776 x 3
## year month day
## <int> <int> <int>
## 1 2013 1 1
## 2 2013 1 1
## 3 2013 1 1
## 4 2013 1 1
## 5 2013 1 1
## 6 2013 1 1
## 7 2013 1 1
## 8 2013 1 1
## 9 2013 1 1
## 10 2013 1 1
## # ... with 336,766 more rows
## # A tibble: 336,776 x 3
## year month day
## <int> <int> <int>
## 1 2013 1 1
## 2 2013 1 1
## 3 2013 1 1
## 4 2013 1 1
## 5 2013 1 1
## 6 2013 1 1
## 7 2013 1 1
## 8 2013 1 1
## 9 2013 1 1
## 10 2013 1 1
## # ... with 336,766 more rows
## # A tibble: 336,776 x 16
## dep_time sched_dep_time dep_delay arr_time sched_arr_time arr_delay carrier flight tailnum origin dest air_time distance hour minute
## <int> <int> <dbl> <int> <int> <dbl> <chr> <int> <chr> <chr> <chr> <dbl> <dbl> <dbl> <dbl>
## 1 517 515 2 830 819 11 UA 1545 N14228 EWR IAH 227 1400 5 15
## 2 533 529 4 850 830 20 UA 1714 N24211 LGA IAH 227 1416 5 29
## 3 542 540 2 923 850 33 AA 1141 N619AA JFK MIA 160 1089 5 40
## 4 544 545 -1 1004 1022 -18 B6 725 N804JB JFK BQN 183 1576 5 45
## 5 554 600 -6 812 837 -25 DL 461 N668DN LGA ATL 116 762 6 0
## 6 554 558 -4 740 728 12 UA 1696 N39463 EWR ORD 150 719 5 58
## 7 555 600 -5 913 854 19 B6 507 N516JB EWR FLL 158 1065 6 0
## 8 557 600 -3 709 723 -14 EV 5708 N829AS LGA IAD 53 229 6 0
## 9 557 600 -3 838 846 -8 B6 79 N593JB JFK MCO 140 944 6 0
## 10 558 600 -2 753 745 8 AA 301 N3ALAA LGA ORD 138 733 6 0
## # ... with 336,766 more rows, and 1 more variable: time_hour <dttm>
## # A tibble: 336,776 x 19
## YEAR month day dep_time sched_dep_time dep_delay arr_time sched_arr_time arr_delay carrier flight tailnum origin dest air_time distance hour
## <int> <int> <int> <int> <int> <dbl> <int> <int> <dbl> <chr> <int> <chr> <chr> <chr> <dbl> <dbl> <dbl>
## 1 2013 1 1 517 515 2 830 819 11 UA 1545 N14228 EWR IAH 227 1400 5
## 2 2013 1 1 533 529 4 850 830 20 UA 1714 N24211 LGA IAH 227 1416 5
## 3 2013 1 1 542 540 2 923 850 33 AA 1141 N619AA JFK MIA 160 1089 5
## 4 2013 1 1 544 545 -1 1004 1022 -18 B6 725 N804JB JFK BQN 183 1576 5
## 5 2013 1 1 554 600 -6 812 837 -25 DL 461 N668DN LGA ATL 116 762 6
## 6 2013 1 1 554 558 -4 740 728 12 UA 1696 N39463 EWR ORD 150 719 5
## 7 2013 1 1 555 600 -5 913 854 19 B6 507 N516JB EWR FLL 158 1065 6
## 8 2013 1 1 557 600 -3 709 723 -14 EV 5708 N829AS LGA IAD 53 229 6
## 9 2013 1 1 557 600 -3 838 846 -8 B6 79 N593JB JFK MCO 140 944 6
## 10 2013 1 1 558 600 -2 753 745 8 AA 301 N3ALAA LGA ORD 138 733 6
## # ... with 336,766 more rows, and 2 more variables: minute <dbl>, time_hour <dttm>
A:Z
: select column A to Z-A
: exclude column Aold_col_name = new_col_name
: rename a columneverything()
: select all columnsstarts_with("abc")
: matches names that begin with abcends_with("xyz")
: matches names that end with xyzcontains("ijk")
: matches names that contain ijkmatches("(.)\\1")
: selects variables that match a regular expressionnum_range("x", 1:3)
: matches x1, x2 and x3mutate()
flights_sml <- select(flights,
year:day,
ends_with("delay"),
distance,
air_time
)
mutate()
and transmute()
mutate()
: add new variables at the lasttransmute()
: extract only new variablesmutate(flights_sml,
gain = dep_delay - arr_delay,
hours = air_time / 60,
gain_per_hour = gain / hours
)
transmute(flights,
gain = dep_delay - arr_delay,
hours = air_time / 60,
gain_per_hour = gain / hours
)
## # A tibble: 336,776 x 10
## year month day dep_delay arr_delay distance air_time gain hours gain_per_hour
## <int> <int> <int> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 2013 1 1 2 11 1400 227 -9 3.78 -2.38
## 2 2013 1 1 4 20 1416 227 -16 3.78 -4.23
## 3 2013 1 1 2 33 1089 160 -31 2.67 -11.6
## 4 2013 1 1 -1 -18 1576 183 17 3.05 5.57
## 5 2013 1 1 -6 -25 762 116 19 1.93 9.83
## 6 2013 1 1 -4 12 719 150 -16 2.5 -6.4
## 7 2013 1 1 -5 19 1065 158 -24 2.63 -9.11
## 8 2013 1 1 -3 -14 229 53 11 0.883 12.5
## 9 2013 1 1 -3 -8 944 140 5 2.33 2.14
## 10 2013 1 1 -2 8 733 138 -10 2.3 -4.35
## # ... with 336,766 more rows
## # A tibble: 336,776 x 3
## gain hours gain_per_hour
## <dbl> <dbl> <dbl>
## 1 -9 3.78 -2.38
## 2 -16 3.78 -4.23
## 3 -31 2.67 -11.6
## 4 17 3.05 5.57
## 5 19 1.93 9.83
## 6 -16 2.5 -6.4
## 7 -24 2.63 -9.11
## 8 11 0.883 12.5
## 9 5 2.33 2.14
## 10 -10 2.3 -4.35
## # ... with 336,766 more rows
%/%
(integer division) and %%
(remainder)transmute(flights,
dep_time,
hour = dep_time %/% 100,
minute = dep_time %% 100
)
## # A tibble: 336,776 x 3
## dep_time hour minute
## <int> <dbl> <dbl>
## 1 517 5 17
## 2 533 5 33
## 3 542 5 42
## 4 544 5 44
## 5 554 5 54
## 6 554 5 54
## 7 555 5 55
## 8 557 5 57
## 9 557 5 57
## 10 558 5 58
## # ... with 336,766 more rows
summarise()
summarise()
should be used with group_by()
summarise(flights, delay = mean(dep_delay, na.rm = TRUE))
by_day <- group_by(flights, year, month, day)
summarise(by_day, delay = mean(dep_delay, na.rm = TRUE))
## `summarise()` regrouping output by 'year', 'month' (override with `.groups` argument)
## # A tibble: 1 x 1
## delay
## <dbl>
## 1 12.6
## # A tibble: 365 x 4
## # Groups: year, month [12]
## year month day delay
## <int> <int> <int> <dbl>
## 1 2013 1 1 11.5
## 2 2013 1 2 13.9
## 3 2013 1 3 11.0
## 4 2013 1 4 8.95
## 5 2013 1 5 5.73
## 6 2013 1 6 7.15
## 7 2013 1 7 5.42
## 8 2013 1 8 2.55
## 9 2013 1 9 2.28
## 10 2013 1 10 2.84
## # ... with 355 more rows
group_by()
%>% summarise()
%>% filter()
delays <- flights %>%
group_by(dest) %>%
summarise(
count = n(),
dist = mean(distance, na.rm = TRUE),
delay = mean(arr_delay, na.rm = TRUE)
) %>%
filter(count > 20, dest != "HNL")
## `summarise()` ungrouping output (override with `.groups` argument)
ggplot(data = delays, mapping = aes(x = dist, y = delay)) +
geom_point(aes(size = count), alpha = 1/3) +
geom_smooth(se = FALSE)
## `geom_smooth()` using method = 'loess' and formula 'y ~ x'
na.rm =
: drop NA
flights %>%
group_by(year, month, day) %>%
summarise(mean = mean(dep_delay))
## `summarise()` regrouping output by 'year', 'month' (override with `.groups` argument)
## # A tibble: 365 x 4
## # Groups: year, month [12]
## year month day mean
## <int> <int> <int> <dbl>
## 1 2013 1 1 NA
## 2 2013 1 2 NA
## 3 2013 1 3 NA
## 4 2013 1 4 NA
## 5 2013 1 5 NA
## 6 2013 1 6 NA
## 7 2013 1 7 NA
## 8 2013 1 8 NA
## 9 2013 1 9 NA
## 10 2013 1 10 NA
## # ... with 355 more rows
flights %>%
group_by(year, month, day) %>%
summarise(mean = mean(dep_delay, na.rm = TRUE))
## `summarise()` regrouping output by 'year', 'month' (override with `.groups` argument)
## # A tibble: 365 x 4
## # Groups: year, month [12]
## year month day mean
## <int> <int> <int> <dbl>
## 1 2013 1 1 11.5
## 2 2013 1 2 13.9
## 3 2013 1 3 11.0
## 4 2013 1 4 8.95
## 5 2013 1 5 5.73
## 6 2013 1 6 7.15
## 7 2013 1 7 5.42
## 8 2013 1 8 2.55
## 9 2013 1 9 2.28
## 10 2013 1 10 2.84
## # ... with 355 more rows
Remove canceled flights (arr_delay
= NA
)
not_cancelled <- flights %>%
filter(!is.na(dep_delay), !is.na(arr_delay))
not_cancelled %>%
group_by(year, month, day) %>%
summarise(mean = mean(dep_delay))
## `summarise()` regrouping output by 'year', 'month' (override with `.groups` argument)
## # A tibble: 365 x 4
## # Groups: year, month [12]
## year month day mean
## <int> <int> <int> <dbl>
## 1 2013 1 1 11.4
## 2 2013 1 2 13.7
## 3 2013 1 3 10.9
## 4 2013 1 4 8.97
## 5 2013 1 5 5.73
## 6 2013 1 6 7.15
## 7 2013 1 7 5.42
## 8 2013 1 8 2.56
## 9 2013 1 9 2.30
## 10 2013 1 10 2.84
## # ... with 355 more rows
ex1: flight
(i) ggplot2::geom_freqploy()
delays <- not_cancelled %>%
group_by(tailnum) %>%
summarise(
delay = mean(arr_delay)
)
## `summarise()` ungrouping output (override with `.groups` argument)
ggplot(data = delays, mapping = aes(x = delay)) +
geom_freqpoly(binwidth = 10)
n()
: countdelays <- not_cancelled %>%
group_by(tailnum) %>%
summarise(
delay = mean(arr_delay),
n = n()
)
## `summarise()` ungrouping output (override with `.groups` argument)
ggplot(data = delays, mapping = aes(x = n, y = delay)) +
geom_point(alpha = 1/10) +
coord_flip()
filter()
: n > 25
delays %>%
filter(n > 25) %>%
ggplot(mapping = aes(x = n, y = delay)) +
geom_point(alpha = 1/10) +
coord_flip()
ex2: batting
batting <- as_tibble(Lahman::Batting)
batters <- batting %>%
group_by(playerID) %>%
summarise(
ba = sum(H, na.rm = TRUE) / sum(AB, na.rm = TRUE),
ab = sum(AB, na.rm = TRUE)
)
## `summarise()` ungrouping output (override with `.groups` argument)
batters %>%
filter(ab > 100) %>%
ggplot(mapping = aes(x = ab, y = ba)) +
geom_point() +
geom_smooth(se = FALSE)
## `geom_smooth()` using method = 'gam' and formula 'y ~ s(x, bs = "cs")'
batters %>%
arrange(desc(ba))
## # A tibble: 19,689 x 3
## playerID ba ab
## <chr> <dbl> <int>
## 1 abramge01 1 1
## 2 alanirj01 1 1
## 3 alberan01 1 1
## 4 banisje01 1 1
## 5 bartocl01 1 1
## 6 bassdo01 1 1
## 7 birasst01 1 2
## 8 bruneju01 1 1
## 9 burnscb01 1 1
## 10 cammaer01 1 1
## # ... with 19,679 more rows
not_cancelled %>%
group_by(year, month, day) %>%
summarise(
avg_delay = mean(arr_delay),
avg_delay_posi = mean(arr_delay[arr_delay > 0]) # the average positive delay
)
## `summarise()` regrouping output by 'year', 'month' (override with `.groups` argument)
## # A tibble: 365 x 5
## # Groups: year, month [12]
## year month day avg_delay avg_delay_posi
## <int> <int> <int> <dbl> <dbl>
## 1 2013 1 1 12.7 32.5
## 2 2013 1 2 12.7 32.0
## 3 2013 1 3 5.73 27.7
## 4 2013 1 4 -1.93 28.3
## 5 2013 1 5 -1.53 22.6
## 6 2013 1 6 4.24 24.4
## 7 2013 1 7 -4.95 27.8
## 8 2013 1 8 -3.23 20.8
## 9 2013 1 9 -0.264 25.6
## 10 2013 1 10 -5.90 27.3
## # ... with 355 more rows
min(x)
: minimumquantile(x, 0.25)
: 25%median(x)
: halfmax(x)
: maximumfirst(x)
: firstnth(x, N)
: Nth numberlast(x)
: lastn()
: total numbersum(!is.na(x))
: total number excluding NA
n_distinct(chr)
: count distinct values chrwt = X
: count a weighted value by Xsum(x > N)
: sum of x which are greater than Ntibble
library(tidyverse)
tibble
as_tibble(iris)
## # A tibble: 150 x 5
## Sepal.Length Sepal.Width Petal.Length Petal.Width Species
## <dbl> <dbl> <dbl> <dbl> <fct>
## 1 5.1 3.5 1.4 0.2 setosa
## 2 4.9 3 1.4 0.2 setosa
## 3 4.7 3.2 1.3 0.2 setosa
## 4 4.6 3.1 1.5 0.2 setosa
## 5 5 3.6 1.4 0.2 setosa
## 6 5.4 3.9 1.7 0.4 setosa
## 7 4.6 3.4 1.4 0.3 setosa
## 8 5 3.4 1.5 0.2 setosa
## 9 4.4 2.9 1.4 0.2 setosa
## 10 4.9 3.1 1.5 0.1 setosa
## # ... with 140 more rows
tibble()
tibble(
x = 1:5,
y = 1,
z = x ^ 2 + y
)
## # A tibble: 5 x 3
## x y z
## <int> <dbl> <dbl>
## 1 1 1 2
## 2 2 1 5
## 3 3 1 10
## 4 4 1 17
## 5 5 1 26
tribble()
tb <- tribble(
~x, ~y, ~z,
#--|--|----
"a", 2, 3.6,
"b", 1, 8.5
)
tb
## # A tibble: 2 x 3
## x y z
## <chr> <dbl> <dbl>
## 1 a 2 3.6
## 2 b 1 8.5
df <- as.data.frame(tb)
class(tb)
## [1] "tbl_df" "tbl" "data.frame"
class(df)
## [1] "data.frame"
tibble(
a = lubridate::now() + runif(1e3) * 86316,
b = lubridate::today() + runif(1e3) * 30,
c = 1:1e3,
d = runif(1e3),
e = sample(letters, 1e3, replace = TRUE)
)
## # A tibble: 1,000 x 5
## a b c d e
## <dttm> <date> <int> <dbl> <chr>
## 1 2020-07-10 01:10:28 2020-07-22 1 0.944 a
## 2 2020-07-09 21:06:13 2020-07-26 2 0.244 z
## 3 2020-07-10 01:40:29 2020-08-05 3 0.253 o
## 4 2020-07-10 04:53:31 2020-07-21 4 0.185 g
## 5 2020-07-09 18:08:12 2020-07-12 5 0.314 u
## 6 2020-07-10 16:07:23 2020-07-29 6 0.0186 q
## 7 2020-07-10 05:31:50 2020-07-11 7 0.538 i
## 8 2020-07-09 23:41:48 2020-07-20 8 0.103 x
## 9 2020-07-10 07:37:19 2020-08-06 9 0.411 f
## 10 2020-07-10 04:42:10 2020-07-14 10 0.539 h
## # ... with 990 more rows
tb <- tibble(
x = runif(5),
y = rnorm(5)
)
tb
# xtract by name
tb$x
tb[["x"]]
# Extract by position
tb[[1]]
# Extract with a pipe
tb %>% .$x
tb %>% .[["x"]]
tb %>% .[[1]]
## # A tibble: 5 x 2
## x y
## <dbl> <dbl>
## 1 0.977 0.701
## 2 0.426 0.793
## 3 0.777 -0.0242
## 4 0.341 1.61
## 5 0.595 1.24
## [1] 0.9773 0.4256 0.7765 0.3409 0.5950
## [1] 0.9773 0.4256 0.7765 0.3409 0.5950
## [1] 0.9773 0.4256 0.7765 0.3409 0.5950
## [1] 0.9773 0.4256 0.7765 0.3409 0.5950
## [1] 0.9773 0.4256 0.7765 0.3409 0.5950
## [1] 0.9773 0.4256 0.7765 0.3409 0.5950
tidyr
library(tidyverse)
table1
table2
table3
## # A tibble: 6 x 4
## country year cases population
## <chr> <int> <int> <int>
## 1 Afghanistan 1999 745 19987071
## 2 Afghanistan 2000 2666 20595360
## 3 Brazil 1999 37737 172006362
## 4 Brazil 2000 80488 174504898
## 5 China 1999 212258 1272915272
## 6 China 2000 213766 1280428583
## # A tibble: 12 x 4
## country year type count
## <chr> <int> <chr> <int>
## 1 Afghanistan 1999 cases 745
## 2 Afghanistan 1999 population 19987071
## 3 Afghanistan 2000 cases 2666
## 4 Afghanistan 2000 population 20595360
## 5 Brazil 1999 cases 37737
## 6 Brazil 1999 population 172006362
## 7 Brazil 2000 cases 80488
## 8 Brazil 2000 population 174504898
## 9 China 1999 cases 212258
## 10 China 1999 population 1272915272
## 11 China 2000 cases 213766
## 12 China 2000 population 1280428583
## # A tibble: 6 x 3
## country year rate
## * <chr> <int> <chr>
## 1 Afghanistan 1999 745/19987071
## 2 Afghanistan 2000 2666/20595360
## 3 Brazil 1999 37737/172006362
## 4 Brazil 2000 80488/174504898
## 5 China 1999 212258/1272915272
## 6 China 2000 213766/1280428583
# Compute rate per 10,000
table1 %>%
mutate(rate = cases / population * 10000)
# Compute cases per year
table1 %>%
count(year, wt = cases)
# Visualize changes over time
library(ggplot2)
ggplot(table1, aes(year, cases)) +
geom_line(aes(group = country), colour = "grey50") +
geom_point(aes(colour = country))
## # A tibble: 6 x 5
## country year cases population rate
## <chr> <int> <int> <int> <dbl>
## 1 Afghanistan 1999 745 19987071 0.373
## 2 Afghanistan 2000 2666 20595360 1.29
## 3 Brazil 1999 37737 172006362 2.19
## 4 Brazil 2000 80488 174504898 4.61
## 5 China 1999 212258 1272915272 1.67
## 6 China 2000 213766 1280428583 1.67
## # A tibble: 2 x 2
## year n
## <int> <int>
## 1 1999 250740
## 2 2000 296920
pivot_longer()
e.g. year
is variable, not observation
table4a
table4b
## # A tibble: 3 x 3
## country `1999` `2000`
## * <chr> <int> <int>
## 1 Afghanistan 745 2666
## 2 Brazil 37737 80488
## 3 China 212258 213766
## # A tibble: 3 x 3
## country `1999` `2000`
## * <chr> <int> <int>
## 1 Afghanistan 19987071 20595360
## 2 Brazil 172006362 174504898
## 3 China 1272915272 1280428583
Make longer table4a
table4a %>%
pivot_longer(c("1999", "2000"), names_to = "year", values_to = "cases")
## # A tibble: 6 x 3
## country year cases
## <chr> <chr> <int>
## 1 Afghanistan 1999 745
## 2 Afghanistan 2000 2666
## 3 Brazil 1999 37737
## 4 Brazil 2000 80488
## 5 China 1999 212258
## 6 China 2000 213766
Make longer table4b
table4b %>%
pivot_longer(c("1999", "2000"), names_to = "year", values_to = "population")
## # A tibble: 6 x 3
## country year population
## <chr> <chr> <int>
## 1 Afghanistan 1999 19987071
## 2 Afghanistan 2000 20595360
## 3 Brazil 1999 172006362
## 4 Brazil 2000 174504898
## 5 China 1999 1272915272
## 6 China 2000 1280428583
Join table4a and table4b
tidy4a <- table4a %>%
pivot_longer(c("1999", "2000"), names_to = "year", values_to = "cases")
tidy4b <- table4b %>%
pivot_longer(c("1999", "2000"), names_to = "year", values_to = "population")
left_join(tidy4a, tidy4b)
## Joining, by = c("country", "year")
## # A tibble: 6 x 4
## country year cases population
## <chr> <chr> <int> <int>
## 1 Afghanistan 1999 745 19987071
## 2 Afghanistan 2000 2666 20595360
## 3 Brazil 1999 37737 172006362
## 4 Brazil 2000 80488 174504898
## 5 China 1999 212258 1272915272
## 6 China 2000 213766 1280428583
pivot_wider()
e.g. type
is observation, not variable
table2
## # A tibble: 12 x 4
## country year type count
## <chr> <int> <chr> <int>
## 1 Afghanistan 1999 cases 745
## 2 Afghanistan 1999 population 19987071
## 3 Afghanistan 2000 cases 2666
## 4 Afghanistan 2000 population 20595360
## 5 Brazil 1999 cases 37737
## 6 Brazil 1999 population 172006362
## 7 Brazil 2000 cases 80488
## 8 Brazil 2000 population 174504898
## 9 China 1999 cases 212258
## 10 China 1999 population 1272915272
## 11 China 2000 cases 213766
## 12 China 2000 population 1280428583
Wider table2
table2 %>%
pivot_wider(names_from = type, values_from = count)
## # A tibble: 6 x 4
## country year cases population
## <chr> <int> <int> <int>
## 1 Afghanistan 1999 745 19987071
## 2 Afghanistan 2000 2666 20595360
## 3 Brazil 1999 37737 172006362
## 4 Brazil 2000 80488 174504898
## 5 China 1999 212258 1272915272
## 6 China 2000 213766 1280428583
separate()
e.g. rate
has two variables
table3
## # A tibble: 6 x 3
## country year rate
## * <chr> <int> <chr>
## 1 Afghanistan 1999 745/19987071
## 2 Afghanistan 2000 2666/20595360
## 3 Brazil 1999 37737/172006362
## 4 Brazil 2000 80488/174504898
## 5 China 1999 212258/1272915272
## 6 China 2000 213766/1280428583
table3 %>%
separate(rate, into = c("cases", "population"), sep = "/", convert = TRUE)
table3 %>%
separate(year, into = c("century", "year"), sep = 2, convert = TRUE)
## # A tibble: 6 x 4
## country year cases population
## <chr> <int> <int> <int>
## 1 Afghanistan 1999 745 19987071
## 2 Afghanistan 2000 2666 20595360
## 3 Brazil 1999 37737 172006362
## 4 Brazil 2000 80488 174504898
## 5 China 1999 212258 1272915272
## 6 China 2000 213766 1280428583
## # A tibble: 6 x 4
## country century year rate
## <chr> <int> <int> <chr>
## 1 Afghanistan 19 99 745/19987071
## 2 Afghanistan 20 0 2666/20595360
## 3 Brazil 19 99 37737/172006362
## 4 Brazil 20 0 80488/174504898
## 5 China 19 99 212258/1272915272
## 6 China 20 0 213766/1280428583
sep =
: specify the place to separateconvert =
: if TRUE, try and convert to better typesunite()
e.g. century
and year
can be united
table5
## # A tibble: 6 x 4
## country century year rate
## * <chr> <chr> <chr> <chr>
## 1 Afghanistan 19 99 745/19987071
## 2 Afghanistan 20 00 2666/20595360
## 3 Brazil 19 99 37737/172006362
## 4 Brazil 20 00 80488/174504898
## 5 China 19 99 212258/1272915272
## 6 China 20 00 213766/1280428583
table5 %>%
unite(new, century, year, sep = "")
## # A tibble: 6 x 3
## country new rate
## <chr> <chr> <chr>
## 1 Afghanistan 1999 745/19987071
## 2 Afghanistan 2000 2666/20595360
## 3 Brazil 1999 37737/172006362
## 4 Brazil 2000 80488/174504898
## 5 China 1999 212258/1272915272
## 6 China 2000 213766/1280428583
two types of missing values
NA
(the presence of an absence)stocks <- tibble(
year = c(2015, 2015, 2015, 2015, 2016, 2016, 2016),
qtr = c( 1, 2, 3, 4, 2, 3, 4),
return = c(1.88, 0.59, 0.35, NA, 0.92, 0.17, 2.66)
)
stocks %>%
pivot_wider(names_from = year, values_from = return)
## # A tibble: 4 x 3
## qtr `2015` `2016`
## <dbl> <dbl> <dbl>
## 1 1 1.88 NA
## 2 2 0.59 0.92
## 3 3 0.35 0.17
## 4 4 NA 2.66
value_drop_na
stocks %>%
pivot_wider(names_from = year, values_from = return) %>%
pivot_longer(
cols = c(`2015`, `2016`),
names_to = "year",
values_to = "return",
values_drop_na = TRUE
)
## # A tibble: 6 x 3
## qtr year return
## <dbl> <chr> <dbl>
## 1 1 2015 1.88
## 2 2 2015 0.59
## 3 2 2016 0.92
## 4 3 2015 0.35
## 5 3 2016 0.17
## 6 4 2016 2.66
values_drop_na =
: if TRUE, NA
is droppedcomplete()
stocks %>%
complete(year, qtr)
## # A tibble: 8 x 3
## year qtr return
## <dbl> <dbl> <dbl>
## 1 2015 1 1.88
## 2 2015 2 0.59
## 3 2015 3 0.35
## 4 2015 4 NA
## 5 2016 1 NA
## 6 2016 2 0.92
## 7 2016 3 0.17
## 8 2016 4 2.66
fill()
to replace NA by the most recent non-missing valuetreatment <- tribble(
~ person, ~ treatment, ~response,
"Derrick Whitmore", 1, 7,
NA, 2, 10,
NA, 3, 9,
"Katherine Burke", 1, 4
)
treatment %>%
fill(person)
## # A tibble: 4 x 3
## person treatment response
## <chr> <dbl> <dbl>
## 1 Derrick Whitmore 1 7
## 2 Derrick Whitmore 2 10
## 3 Derrick Whitmore 3 9
## 4 Katherine Burke 1 4
who
## # A tibble: 7,240 x 60
## country iso2 iso3 year new_sp_m014 new_sp_m1524 new_sp_m2534 new_sp_m3544 new_sp_m4554 new_sp_m5564 new_sp_m65 new_sp_f014 new_sp_f1524
## <chr> <chr> <chr> <int> <int> <int> <int> <int> <int> <int> <int> <int> <int>
## 1 Afghan~ AF AFG 1980 NA NA NA NA NA NA NA NA NA
## 2 Afghan~ AF AFG 1981 NA NA NA NA NA NA NA NA NA
## 3 Afghan~ AF AFG 1982 NA NA NA NA NA NA NA NA NA
## 4 Afghan~ AF AFG 1983 NA NA NA NA NA NA NA NA NA
## 5 Afghan~ AF AFG 1984 NA NA NA NA NA NA NA NA NA
## 6 Afghan~ AF AFG 1985 NA NA NA NA NA NA NA NA NA
## 7 Afghan~ AF AFG 1986 NA NA NA NA NA NA NA NA NA
## 8 Afghan~ AF AFG 1987 NA NA NA NA NA NA NA NA NA
## 9 Afghan~ AF AFG 1988 NA NA NA NA NA NA NA NA NA
## 10 Afghan~ AF AFG 1989 NA NA NA NA NA NA NA NA NA
## # ... with 7,230 more rows, and 47 more variables: new_sp_f2534 <int>, new_sp_f3544 <int>, new_sp_f4554 <int>, new_sp_f5564 <int>, new_sp_f65 <int>,
## # new_sn_m014 <int>, new_sn_m1524 <int>, new_sn_m2534 <int>, new_sn_m3544 <int>, new_sn_m4554 <int>, new_sn_m5564 <int>, new_sn_m65 <int>,
## # new_sn_f014 <int>, new_sn_f1524 <int>, new_sn_f2534 <int>, new_sn_f3544 <int>, new_sn_f4554 <int>, new_sn_f5564 <int>, new_sn_f65 <int>,
## # new_ep_m014 <int>, new_ep_m1524 <int>, new_ep_m2534 <int>, new_ep_m3544 <int>, new_ep_m4554 <int>, new_ep_m5564 <int>, new_ep_m65 <int>,
## # new_ep_f014 <int>, new_ep_f1524 <int>, new_ep_f2534 <int>, new_ep_f3544 <int>, new_ep_f4554 <int>, new_ep_f5564 <int>, new_ep_f65 <int>,
## # newrel_m014 <int>, newrel_m1524 <int>, newrel_m2534 <int>, newrel_m3544 <int>, newrel_m4554 <int>, newrel_m5564 <int>, newrel_m65 <int>,
## # newrel_f014 <int>, newrel_f1524 <int>, newrel_f2534 <int>, newrel_f3544 <int>, newrel_f4554 <int>, newrel_f5564 <int>, newrel_f65 <int>
pivot_longer()
who1 <- who %>%
pivot_longer(
new_sp_m014:newrel_f65,
names_to = "key",
values_to = "cases",
values_drop_na = TRUE
)
who1
## # A tibble: 76,046 x 6
## country iso2 iso3 year key cases
## <chr> <chr> <chr> <int> <chr> <int>
## 1 Afghanistan AF AFG 1997 new_sp_m014 0
## 2 Afghanistan AF AFG 1997 new_sp_m1524 10
## 3 Afghanistan AF AFG 1997 new_sp_m2534 6
## 4 Afghanistan AF AFG 1997 new_sp_m3544 3
## 5 Afghanistan AF AFG 1997 new_sp_m4554 5
## 6 Afghanistan AF AFG 1997 new_sp_m5564 2
## 7 Afghanistan AF AFG 1997 new_sp_m65 0
## 8 Afghanistan AF AFG 1997 new_sp_f014 5
## 9 Afghanistan AF AFG 1997 new_sp_f1524 38
## 10 Afghanistan AF AFG 1997 new_sp_f2534 36
## # ... with 76,036 more rows
count()
: count the number of each ovservationwho1 %>%
count(key)
## # A tibble: 56 x 2
## key n
## <chr> <int>
## 1 new_ep_f014 1032
## 2 new_ep_f1524 1021
## 3 new_ep_f2534 1021
## 4 new_ep_f3544 1021
## 5 new_ep_f4554 1017
## 6 new_ep_f5564 1017
## 7 new_ep_f65 1014
## 8 new_ep_m014 1038
## 9 new_ep_m1524 1026
## 10 new_ep_m2534 1020
## # ... with 46 more rows
mutate()
: newrel -> new and relseparate()
: key -> type, sex_ageseparate()
: sex_age -> sex, ageselect()
: drop some variables (new, iso2, iso3)mutate()
: newrel -> new and relwho2 <- who1 %>%
mutate(names_from = stringr::str_replace(key, "newrel", "new_rel"))
who2
## # A tibble: 76,046 x 7
## country iso2 iso3 year key cases names_from
## <chr> <chr> <chr> <int> <chr> <int> <chr>
## 1 Afghanistan AF AFG 1997 new_sp_m014 0 new_sp_m014
## 2 Afghanistan AF AFG 1997 new_sp_m1524 10 new_sp_m1524
## 3 Afghanistan AF AFG 1997 new_sp_m2534 6 new_sp_m2534
## 4 Afghanistan AF AFG 1997 new_sp_m3544 3 new_sp_m3544
## 5 Afghanistan AF AFG 1997 new_sp_m4554 5 new_sp_m4554
## 6 Afghanistan AF AFG 1997 new_sp_m5564 2 new_sp_m5564
## 7 Afghanistan AF AFG 1997 new_sp_m65 0 new_sp_m65
## 8 Afghanistan AF AFG 1997 new_sp_f014 5 new_sp_f014
## 9 Afghanistan AF AFG 1997 new_sp_f1524 38 new_sp_f1524
## 10 Afghanistan AF AFG 1997 new_sp_f2534 36 new_sp_f2534
## # ... with 76,036 more rows
separate()
: key -> type, sex_agewho3 <- who2 %>%
separate(key, c("new", "type", "sexage"), sep = "_")
## Warning: Expected 3 pieces. Missing pieces filled with `NA` in 2580 rows [243, 244, 679, 680, 681, 682, 683, 684, 685, 686, 687, 688, 689, 690, 691, 692,
## 903, 904, 905, 906, ...].
who3
## # A tibble: 76,046 x 9
## country iso2 iso3 year new type sexage cases names_from
## <chr> <chr> <chr> <int> <chr> <chr> <chr> <int> <chr>
## 1 Afghanistan AF AFG 1997 new sp m014 0 new_sp_m014
## 2 Afghanistan AF AFG 1997 new sp m1524 10 new_sp_m1524
## 3 Afghanistan AF AFG 1997 new sp m2534 6 new_sp_m2534
## 4 Afghanistan AF AFG 1997 new sp m3544 3 new_sp_m3544
## 5 Afghanistan AF AFG 1997 new sp m4554 5 new_sp_m4554
## 6 Afghanistan AF AFG 1997 new sp m5564 2 new_sp_m5564
## 7 Afghanistan AF AFG 1997 new sp m65 0 new_sp_m65
## 8 Afghanistan AF AFG 1997 new sp f014 5 new_sp_f014
## 9 Afghanistan AF AFG 1997 new sp f1524 38 new_sp_f1524
## 10 Afghanistan AF AFG 1997 new sp f2534 36 new_sp_f2534
## # ... with 76,036 more rows
separate()
: sex_age -> sex, agewho4 <- who3 %>%
separate(sexage, c("sex", "age"), sep = 1)
who4
## # A tibble: 76,046 x 10
## country iso2 iso3 year new type sex age cases names_from
## <chr> <chr> <chr> <int> <chr> <chr> <chr> <chr> <int> <chr>
## 1 Afghanistan AF AFG 1997 new sp m 014 0 new_sp_m014
## 2 Afghanistan AF AFG 1997 new sp m 1524 10 new_sp_m1524
## 3 Afghanistan AF AFG 1997 new sp m 2534 6 new_sp_m2534
## 4 Afghanistan AF AFG 1997 new sp m 3544 3 new_sp_m3544
## 5 Afghanistan AF AFG 1997 new sp m 4554 5 new_sp_m4554
## 6 Afghanistan AF AFG 1997 new sp m 5564 2 new_sp_m5564
## 7 Afghanistan AF AFG 1997 new sp m 65 0 new_sp_m65
## 8 Afghanistan AF AFG 1997 new sp f 014 5 new_sp_f014
## 9 Afghanistan AF AFG 1997 new sp f 1524 38 new_sp_f1524
## 10 Afghanistan AF AFG 1997 new sp f 2534 36 new_sp_f2534
## # ... with 76,036 more rows
select()
: drop new, iso2, and iso3who5 <- who4 %>%
select(-new, -iso2, -iso3)
who5
## # A tibble: 76,046 x 7
## country year type sex age cases names_from
## <chr> <int> <chr> <chr> <chr> <int> <chr>
## 1 Afghanistan 1997 sp m 014 0 new_sp_m014
## 2 Afghanistan 1997 sp m 1524 10 new_sp_m1524
## 3 Afghanistan 1997 sp m 2534 6 new_sp_m2534
## 4 Afghanistan 1997 sp m 3544 3 new_sp_m3544
## 5 Afghanistan 1997 sp m 4554 5 new_sp_m4554
## 6 Afghanistan 1997 sp m 5564 2 new_sp_m5564
## 7 Afghanistan 1997 sp m 65 0 new_sp_m65
## 8 Afghanistan 1997 sp f 014 5 new_sp_f014
## 9 Afghanistan 1997 sp f 1524 38 new_sp_f1524
## 10 Afghanistan 1997 sp f 2534 36 new_sp_f2534
## # ... with 76,036 more rows
who %>%
pivot_longer(
cols = new_sp_m014:newrel_f65,
names_to = "key",
values_to = "cases",
values_drop_na = TRUE
) %>%
mutate(
key = stringr::str_replace(key, "newrel", "new_rel")
) %>%
separate(key, c("new", "var", "sexage")) %>%
separate(sexage, c("sex", "age"), sep = 1) %>%
select(-new, -iso2, -iso3)
## # A tibble: 76,046 x 6
## country year var sex age cases
## <chr> <int> <chr> <chr> <chr> <int>
## 1 Afghanistan 1997 sp m 014 0
## 2 Afghanistan 1997 sp m 1524 10
## 3 Afghanistan 1997 sp m 2534 6
## 4 Afghanistan 1997 sp m 3544 3
## 5 Afghanistan 1997 sp m 4554 5
## 6 Afghanistan 1997 sp m 5564 2
## 7 Afghanistan 1997 sp m 65 0
## 8 Afghanistan 1997 sp f 014 5
## 9 Afghanistan 1997 sp f 1524 38
## 10 Afghanistan 1997 sp f 2534 36
## # ... with 76,036 more rows
library(tidyverse)
library(nycflights13)
mutating joins: add new variables to a data frame from matching observations in another.
filtering joins: filter observations from a data frame based on whether they match an observation in the other table
set operations: treat observations as if they were set elements
flights
airlines
airports
planes
weather
## # A tibble: 336,776 x 19
## year month day dep_time sched_dep_time dep_delay arr_time sched_arr_time arr_delay carrier flight tailnum origin dest air_time distance hour
## <int> <int> <int> <int> <int> <dbl> <int> <int> <dbl> <chr> <int> <chr> <chr> <chr> <dbl> <dbl> <dbl>
## 1 2013 1 1 517 515 2 830 819 11 UA 1545 N14228 EWR IAH 227 1400 5
## 2 2013 1 1 533 529 4 850 830 20 UA 1714 N24211 LGA IAH 227 1416 5
## 3 2013 1 1 542 540 2 923 850 33 AA 1141 N619AA JFK MIA 160 1089 5
## 4 2013 1 1 544 545 -1 1004 1022 -18 B6 725 N804JB JFK BQN 183 1576 5
## 5 2013 1 1 554 600 -6 812 837 -25 DL 461 N668DN LGA ATL 116 762 6
## 6 2013 1 1 554 558 -4 740 728 12 UA 1696 N39463 EWR ORD 150 719 5
## 7 2013 1 1 555 600 -5 913 854 19 B6 507 N516JB EWR FLL 158 1065 6
## 8 2013 1 1 557 600 -3 709 723 -14 EV 5708 N829AS LGA IAD 53 229 6
## 9 2013 1 1 557 600 -3 838 846 -8 B6 79 N593JB JFK MCO 140 944 6
## 10 2013 1 1 558 600 -2 753 745 8 AA 301 N3ALAA LGA ORD 138 733 6
## # ... with 336,766 more rows, and 2 more variables: minute <dbl>, time_hour <dttm>
## # A tibble: 16 x 2
## carrier name
## <chr> <chr>
## 1 9E Endeavor Air Inc.
## 2 AA American Airlines Inc.
## 3 AS Alaska Airlines Inc.
## 4 B6 JetBlue Airways
## 5 DL Delta Air Lines Inc.
## 6 EV ExpressJet Airlines Inc.
## 7 F9 Frontier Airlines Inc.
## 8 FL AirTran Airways Corporation
## 9 HA Hawaiian Airlines Inc.
## 10 MQ Envoy Air
## 11 OO SkyWest Airlines Inc.
## 12 UA United Air Lines Inc.
## 13 US US Airways Inc.
## 14 VX Virgin America
## 15 WN Southwest Airlines Co.
## 16 YV Mesa Airlines Inc.
## # A tibble: 1,458 x 8
## faa name lat lon alt tz dst tzone
## <chr> <chr> <dbl> <dbl> <dbl> <dbl> <chr> <chr>
## 1 04G Lansdowne Airport 41.1 -80.6 1044 -5 A America/New_York
## 2 06A Moton Field Municipal Airport 32.5 -85.7 264 -6 A America/Chicago
## 3 06C Schaumburg Regional 42.0 -88.1 801 -6 A America/Chicago
## 4 06N Randall Airport 41.4 -74.4 523 -5 A America/New_York
## 5 09J Jekyll Island Airport 31.1 -81.4 11 -5 A America/New_York
## 6 0A9 Elizabethton Municipal Airport 36.4 -82.2 1593 -5 A America/New_York
## 7 0G6 Williams County Airport 41.5 -84.5 730 -5 A America/New_York
## 8 0G7 Finger Lakes Regional Airport 42.9 -76.8 492 -5 A America/New_York
## 9 0P2 Shoestring Aviation Airfield 39.8 -76.6 1000 -5 U America/New_York
## 10 0S9 Jefferson County Intl 48.1 -123. 108 -8 A America/Los_Angeles
## # ... with 1,448 more rows
## # A tibble: 3,322 x 9
## tailnum year type manufacturer model engines seats speed engine
## <chr> <int> <chr> <chr> <chr> <int> <int> <int> <chr>
## 1 N10156 2004 Fixed wing multi engine EMBRAER EMB-145XR 2 55 NA Turbo-fan
## 2 N102UW 1998 Fixed wing multi engine AIRBUS INDUSTRIE A320-214 2 182 NA Turbo-fan
## 3 N103US 1999 Fixed wing multi engine AIRBUS INDUSTRIE A320-214 2 182 NA Turbo-fan
## 4 N104UW 1999 Fixed wing multi engine AIRBUS INDUSTRIE A320-214 2 182 NA Turbo-fan
## 5 N10575 2002 Fixed wing multi engine EMBRAER EMB-145LR 2 55 NA Turbo-fan
## 6 N105UW 1999 Fixed wing multi engine AIRBUS INDUSTRIE A320-214 2 182 NA Turbo-fan
## 7 N107US 1999 Fixed wing multi engine AIRBUS INDUSTRIE A320-214 2 182 NA Turbo-fan
## 8 N108UW 1999 Fixed wing multi engine AIRBUS INDUSTRIE A320-214 2 182 NA Turbo-fan
## 9 N109UW 1999 Fixed wing multi engine AIRBUS INDUSTRIE A320-214 2 182 NA Turbo-fan
## 10 N110UW 1999 Fixed wing multi engine AIRBUS INDUSTRIE A320-214 2 182 NA Turbo-fan
## # ... with 3,312 more rows
## # A tibble: 26,115 x 15
## origin year month day hour temp dewp humid wind_dir wind_speed wind_gust precip pressure visib time_hour
## <chr> <int> <int> <int> <int> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dttm>
## 1 EWR 2013 1 1 1 39.0 26.1 59.4 270 10.4 NA 0 1012 10 2013-01-01 01:00:00
## 2 EWR 2013 1 1 2 39.0 27.0 61.6 250 8.06 NA 0 1012. 10 2013-01-01 02:00:00
## 3 EWR 2013 1 1 3 39.0 28.0 64.4 240 11.5 NA 0 1012. 10 2013-01-01 03:00:00
## 4 EWR 2013 1 1 4 39.9 28.0 62.2 250 12.7 NA 0 1012. 10 2013-01-01 04:00:00
## 5 EWR 2013 1 1 5 39.0 28.0 64.4 260 12.7 NA 0 1012. 10 2013-01-01 05:00:00
## 6 EWR 2013 1 1 6 37.9 28.0 67.2 240 11.5 NA 0 1012. 10 2013-01-01 06:00:00
## 7 EWR 2013 1 1 7 39.0 28.0 64.4 240 15.0 NA 0 1012. 10 2013-01-01 07:00:00
## 8 EWR 2013 1 1 8 39.9 28.0 62.2 250 10.4 NA 0 1012. 10 2013-01-01 08:00:00
## 9 EWR 2013 1 1 9 39.9 28.0 62.2 260 15.0 NA 0 1013. 10 2013-01-01 09:00:00
## 10 EWR 2013 1 1 10 41 28.0 59.6 260 13.8 NA 0 1012. 10 2013-01-01 10:00:00
## # ... with 26,105 more rows
planes %>%
count(tailnum) %>%
filter(n > 1)
## # A tibble: 0 x 2
## # ... with 2 variables: tailnum <chr>, n <int>
weather %>%
count(year, month, day, hour, origin) %>%
filter(n > 1)
## # A tibble: 3 x 6
## year month day hour origin n
## <int> <int> <int> <int> <chr> <int>
## 1 2013 11 3 1 EWR 2
## 2 2013 11 3 1 JFK 2
## 3 2013 11 3 1 LGA 2
basic
- inner_join(x, y, by = "key")
- left_join(x, y, by = "key")
- right_join(x, y, by = "key")
- full_join(x, y, by = "key")
define keys
- by = "key"
- by = c("key_x" = "key_y")
x <- tribble(
~key, ~val_x,
1, "x1",
2, "x2",
3, "x3"
)
y <- tribble(
~key, ~val_y,
1, "y1",
2, "y2",
4, "y3"
)
x %>%
inner_join(y, by = "key")
x %>%
left_join(y, by = "key")
x %>%
right_join(y, by = "key")
x %>%
full_join(y, by = "key")
## # A tibble: 2 x 3
## key val_x val_y
## <dbl> <chr> <chr>
## 1 1 x1 y1
## 2 2 x2 y2
## # A tibble: 3 x 3
## key val_x val_y
## <dbl> <chr> <chr>
## 1 1 x1 y1
## 2 2 x2 y2
## 3 3 x3 <NA>
## # A tibble: 3 x 3
## key val_x val_y
## <dbl> <chr> <chr>
## 1 1 x1 y1
## 2 2 x2 y2
## 3 4 <NA> y3
## # A tibble: 4 x 3
## key val_x val_y
## <dbl> <chr> <chr>
## 1 1 x1 y1
## 2 2 x2 y2
## 3 3 x3 <NA>
## 4 4 <NA> y3
one-side
x <- tribble(
~key_x, ~val_x,
1, "x1",
2, "x2",
2, "x3",
1, "x4"
)
y <- tribble(
~key_y, ~val_y,
1, "y1",
2, "y2"
)
left_join(x, y, by = c("key_x" = "key_y"))
## # A tibble: 4 x 3
## key_x val_x val_y
## <dbl> <chr> <chr>
## 1 1 x1 y1
## 2 2 x2 y2
## 3 2 x3 y2
## 4 1 x4 y1
both-side
x <- tribble(
~key_x, ~val_x,
1, "x1",
2, "x2",
2, "x3",
3, "x4"
)
y <- tribble(
~key_y, ~val_y,
1, "y1",
2, "y2",
2, "y3",
3, "y4"
)
left_join(x, y, by = c("key_x" = "key_y"))
## # A tibble: 6 x 3
## key_x val_x val_y
## <dbl> <chr> <chr>
## 1 1 x1 y1
## 2 2 x2 y2
## 3 2 x2 y3
## 4 2 x3 y2
## 5 2 x3 y3
## 6 3 x4 y4
semi_join(x, y)
: keeps all observations in x that have a match in yanti_join(x, y)
: drops all observations in x that have a match in y (check if foreign keys match primary keys)x <- tribble(
~key, ~val_x,
1, "x1",
2, "x2",
3, "x3"
)
y <- tribble(
~key, ~val_y,
1, "y1",
2, "y2",
4, "y3"
)
x %>%
semi_join(y, by = "key")
x %>%
anti_join(y, by = "key")
## # A tibble: 2 x 2
## key val_x
## <dbl> <chr>
## 1 1 x1
## 2 2 x2
## # A tibble: 1 x 2
## key val_x
## <dbl> <chr>
## 1 3 x3
intersect(x, y)
: return only observations in both x
and y
union(x, y)
: return unique observations in x
and y
setdiff(x, y)
: return observations in x
, but not in y
df1 <- tribble(
~x, ~y,
1, 1,
2, 1
)
df2 <- tribble(
~x, ~y,
1, 1,
1, 2
)
intersect()
intersect(df1, df2)
## # A tibble: 1 x 2
## x y
## <dbl> <dbl>
## 1 1 1
union()
union(df1, df2)
## # A tibble: 3 x 2
## x y
## <dbl> <dbl>
## 1 1 1
## 2 2 1
## 3 1 2
setdiff()
setdiff(df1, df2)
## # A tibble: 1 x 2
## x y
## <dbl> <dbl>
## 1 2 1
setdiff(df2, df1)
## # A tibble: 1 x 2
## x y
## <dbl> <dbl>
## 1 1 2