# Data ingestion and manipulation
# Path of the decompressed 2008 flights data. Every reader below loads this
# same file, so it is defined once instead of being repeated as a literal.
flights_csv <- "flights.csv"

# Download and decompress the data only when it is not already on disk.
if (!file.exists(flights_csv)) {
  flights_bz2 <- paste0(flights_csv, ".bz2")
  download.file(
    "http://stat-computing.org/dataexpo/2009/2008.csv.bz2",
    flights_bz2,
    mode = "wb" # the archive is binary; never transfer it in text mode
  )
  R.utils::bunzip2(flights_bz2, flights_csv)
  # Make sure the compressed archive does not linger after extraction.
  unlink(flights_bz2, force = TRUE)
}

# Time a full read with readr. The assignment happens inside system.time()
# so that `flights_readr` is still available afterwards.
library(readr)
tr <- system.time(
  flights_readr <- read_csv(flights_csv)
)
# > Parsed with column specification:
# > cols(
# > .default = col_double(),
# > UniqueCarrier = col_character(),
# > TailNum = col_character(),
# > Origin = col_character(),
# > Dest = col_character(),
# > CancellationCode = col_character()
# > )
# > See spec(...) for full column specifications.
# Third element of a system.time() result is the elapsed (wall-clock) time.
tr[[3]]
# > [1] 21.748

# Time a full read with data.table.
library(data.table)
tdt <- system.time(
  flights_dt <- fread(flights_csv)
)
tdt[[3]]
# > [1] 3.717

# Time a read with vroom. The package has to be attached before vroom() can
# be called, and `altrep` is the current name of the former `altrep_opts`
# argument.
library(vroom)
tva <- system.time(
  flights_vroom_altrep <- vroom(flights_csv, altrep = TRUE)
)
# > Observations: 7,009,728
# > Variables: 29
# > chr [ 5]: UniqueCarrier, TailNum, Origin, Dest, CancellationCode
# > dbl [24]: Year, Month, DayofMonth, DayOfWeek, DepTime, CRSDepTime, ArrTime, CRSArrTim...
# >
# > Call `spec()` for a copy-pastable column specification
# > Specify the column types with `col_types` to quiet this message
tva[[3]]
# > [1] 1.996
library(tidyverse)

# One-row summary of the elapsed (wall-clock) seconds each package needed to
# read the file; one column per package.
comparison <- tibble(
  readr = tr[["elapsed"]],
  data.table = tdt[["elapsed"]],
  vroom = tva[["elapsed"]]
)
comparison
# > # A tibble: 1 x 3
# > readr data.table vroom
# > <dbl> <dbl> <dbl>
# > 1 21.7 3.72 2.00
# Bar chart of the read timings: one bar per package, flipped so the bars
# run horizontally, each labelled with its time rounded to whole seconds.
# The table is first reshaped to long form (columns `key` and `value`).
comparison %>%
  pivot_longer(everything(), names_to = "key", values_to = "value") %>%
  ggplot(aes(key, value, fill = key)) +
  geom_col() +
  # A constant white fill overrides the per-package fill for the labels.
  geom_label(aes(label = paste0(round(value), " secs")), fill = "white") +
  coord_flip() +
  labs(title = "File read times", x = "", y = "") +
  theme_minimal() +
  # The labels already carry the values, so legend and axis text are dropped.
  theme(legend.position = "none", axis.text.x = element_blank())
# Mean arrival delay for each month. Missing ArrDelay values are dropped
# from the mean via na.rm = TRUE.
flights_readr %>%
  group_by(Month) %>%
  summarise(avg_delay = mean(ArrDelay, na.rm = TRUE))
# > # A tibble: 12 x 2
# > Month avg_delay
# > <dbl> <dbl>
# > 1 1 10.2
# > 2 2 13.1
# > 3 3 11.2
# > 4 4 6.81
# > 5 5 5.98
# > 6 6 13.3
# > 7 7 9.98
# > 8 8 6.91
# > 9 9 0.698
# > 10 10 0.415
# > 11 11 2.02
# > 12 12 16.7
# Time the same per-month mean of ArrDelay on each of the three objects
# loaded earlier.

# dplyr on the readr tibble.
mr <- system.time(
  flights_readr %>%
    group_by(Month) %>%
    summarise(avg_delay = mean(ArrDelay, na.rm = TRUE))
)

# dplyr on the vroom tibble.
mva <- system.time(
  flights_vroom_altrep %>%
    group_by(Month) %>%
    summarise(avg_delay = mean(ArrDelay, na.rm = TRUE))
)

# data.table syntax: rows with a missing ArrDelay are filtered out in `i`
# before the mean is taken per Month, instead of passing na.rm = TRUE.
mdt <- system.time(
  flights_dt[!is.na(ArrDelay), .(avg_delay = mean(ArrDelay)), Month]
)
# One-row summary of the elapsed (wall-clock) seconds each package needed
# for the per-month aggregation; one column per package.
comp <- tibble(
  readr = mr[["elapsed"]],
  data.table = mdt[["elapsed"]],
  vroom = mva[["elapsed"]]
)
comp
# > # A tibble: 1 x 3
# > readr data.table vroom
# > <dbl> <dbl> <dbl>
# > 1 0.232 0.212 0.536
# Bar chart of the aggregation timings: one bar per package, flipped so the
# bars run horizontally, each labelled with its time rounded to two decimals.
# The table is first reshaped to long form (columns `key` and `value`).
comp %>%
  pivot_longer(everything(), names_to = "key", values_to = "value") %>%
  ggplot(aes(key, value, fill = key)) +
  geom_col() +
  # A constant white fill overrides the per-package fill for the labels.
  geom_label(aes(label = paste0(round(value, 2), " secs")), fill = "white") +
  coord_flip() +
  labs(title = "Data manipulation times", x = "", y = "") +
  theme_minimal() +
  # The labels already carry the values, so legend and axis text are dropped.
  theme(legend.position = "none", axis.text.x = element_blank())