Skip to content

Commit

Permalink
Add code for #292
Browse files Browse the repository at this point in the history
  • Loading branch information
Robinlovelace committed Oct 25, 2020
1 parent 397f108 commit 529ab68
Show file tree
Hide file tree
Showing 2 changed files with 55 additions and 59 deletions.
1 change: 0 additions & 1 deletion 05-input-output.Rmd
Original file line number Diff line number Diff line change
Expand Up @@ -33,7 +33,6 @@ We also use the **WDI** package to illustrate accessing online data sets
library("WDI")
```


## Top 5 tips for efficient data I/O

1. If possible, keep the names of local files downloaded from the internet or copied onto your computer unchanged. This will help you trace the provenance of the data in the future.
Expand Down
113 changes: 55 additions & 58 deletions code/05-io_f1.R
Original file line number Diff line number Diff line change
@@ -1,39 +1,43 @@
source("code/initialise.R")
library(ggplot2)
library(data.table)
library(readr)
library("ggplot2")
library("data.table")
library("readr")
library("vroom")
library("arrow")
library("microbenchmark")

# # Start corresponds to 0.1 MB
# rows = 10^(seq(3.445,6, length.out = 50))
# cols = 2 * 10^(0:2)
# res = NULL
# for(i in seq_along(rows)) {
# for(k in 1:80) {
# for(j in seq_along(cols)) {
# no_of_rows = floor(rows[i]/(10^(j-1)))
# m = matrix(runif(no_of_rows * cols[j]), nrow = no_of_rows, ncol = cols[j])
# fname = tempfile()
# write.csv(m, file = fname, row.names = FALSE)
# mb = microbenchmark(times = 10,
# base_default = read.csv(fname),
# readr_default = read_csv(fname),
# fread_default = fread(fname)
# )
#
#
# tab = tapply(mb$time/1000, mb$expr, mean)
# res_tmp = data.frame(exp = names(tab),
# time = as.vector(tab), rows = no_of_rows,
# cols = cols[j])
# unlink(fname)
# res = rbind(res, res_tmp)
# save(res, file="04_tmp.RData")
# }
# }
# message(i)
# }
#save(res, file="extdata/04-f3.RData")
# Benchmark five CSV readers (base, readr, data.table, vroom, arrow) on random
# numeric matrices written to temporary CSV files. For every row setting the
# file is reshaped to each column count in `cols` while keeping the number of
# cells (and hence the approximate file size) constant.
# Start corresponds to 0.1 MB
rows = 10^(seq(3.445, 4, length.out = 5))
cols = 2 * 5^(0:2)
res = NULL
for(i in seq_along(rows)) {
  # k only repeats the whole measurement; its value is not used in the body
  for(k in 1:3) {
    for(j in seq_along(cols)) {
      # Scale the row count by the actual column ratio so rows * cols stays
      # constant across `cols`. The previous divisor, 10^(j-1), only matched
      # the old column grid 2 * 10^(0:2) and shrank the files for 2 * 5^(0:2).
      no_of_rows = floor(rows[i] * cols[1] / cols[j])
      m = matrix(runif(no_of_rows * cols[j]), nrow = no_of_rows, ncol = cols[j])
      fname = tempfile()
      write.csv(m, file = fname, row.names = FALSE)
      mb = microbenchmark(times = 10,
                          base_default = read.csv(fname),
                          readr_default = read_csv(fname),
                          fread_default = fread(fname),
                          vroom_default = vroom(fname, delim = ","),
                          arrow_default = arrow::read_csv_arrow(fname)
      )

      # Mean time per reader; microbenchmark records nanoseconds, so /1000
      # gives microseconds
      tab = tapply(mb$time/1000, mb$expr, mean)
      res_tmp = data.frame(exp = names(tab),
                           time = as.vector(tab), rows = no_of_rows,
                           cols = cols[j])
      unlink(fname)
      # res is grown and checkpointed on every iteration so that partial
      # results survive an interrupted run
      res = rbind(res, res_tmp)
      save(res, file="04_tmp.RData")
    }
  }
  message(i)
}
# NOTE(review): the plotting code that follows loads "extdata/05-f1.RData",
# not the file written here - confirm which path is intended.
save(res, file="extdata/04-f3.RData")
# Read the stored benchmark results (presumably this provides `res` - the
# contents of the file are not visible here), then average the timings over
# repeats for every (cols, rows, reader) combination.
load("extdata/05-f1.RData")
res = aggregate(time ~ cols + rows + exp, data = res, FUN = mean)
# File size estimate in MB from the cell count, using 18 bytes per cell
res$MB = (res$cols * res$rows * 18) / 1e6 ## Approximate
Expand All @@ -43,32 +47,32 @@ for(i in res$cells){
sel = res$cells == i
res$Time[sel] = res$time[sel] / min(res$time[sel])
}
# Human-readable reader name for each benchmark expression.
# Labels are derived from the levels actually present rather than from a
# fixed-length vector: factor() errors when the number of labels differs from
# the number of levels, which happens as soon as a reader is added or removed.
exp_names = levels(factor(res$exp))
type_labels = sub("_default$", "", exp_names)
# fread() is data.table's reader, so show the package name in the legend
type_labels[type_labels == "fread"] = "data.table"
res$type = factor(res$exp, levels = exp_names, labels = type_labels)

library("ggplot2")
# Build the relative-read-time figure: one facet per column count, one line
# per reader, file size on a log10 x axis.
res$facet_cols = paste("No of columns:", res$cols)
res = res[res$MB >= 0.1,]

# scale_colour_manual() errors when it receives fewer colours than there are
# reader types. Keep the three palette colours from get_col() and top up with
# base-R HCL colours when more readers are present.
line_cols = c(get_col(2), get_col(3), get_col(4))
n_types = length(unique(res$type))
if (n_types > length(line_cols)) {
  line_cols = c(line_cols,
                grDevices::hcl.colors(n_types - length(line_cols), "Dark 3"))
}

g = ggplot(res, aes(MB, Time)) +
  geom_line(aes(colour = type, linetype=type), size=1) +
  facet_grid(~ facet_cols) +
  scale_x_continuous(limits=c(min(res$MB),36), expand = c(0, 0), trans="log10") +
  theme(panel.grid.major.y = element_line(colour = "gray90"),
        panel.grid.minor = element_line(colour = NA),
        panel.grid.major.x = element_line(colour = NA),
        plot.title = element_text(size = 12,
                                  face = "bold", hjust = 1, vjust = 0),
        panel.background = element_rect(fill = NA),
        legend.background = element_rect(fill = NA),
        legend.position = c(0.95, 0.92),
        axis.ticks.x = element_line(linetype = "blank"),
        axis.ticks.y = element_line(linetype = "blank"),
        legend.text = element_text(size = 11),
        legend.key = element_rect(fill = NA)) +
  ylab("Relative time") + xlab("File size (MB)") +
  scale_colour_manual(values = line_cols) +
  scale_y_continuous(limits=c(0,15), expand = c(0, 0))
# Hide both legends; "none" replaces the deprecated FALSE
g1 = g + theme(strip.background = element_rect(fill = "white"),
               strip.text = element_text( hjust = 0.95, face="bold")) +
  guides(colour = "none", linetype = "none")


Expand All @@ -82,10 +86,3 @@ labels = tibble::frame_data(
# Overlay the reader names as text, coloured like their lines, using the
# `labels` data frame built just above, then draw the finished figure.
label_layer = geom_text(data = labels, mapping = aes(colour = type, label = type))
g2 = g1 + label_layer
print(g2)








0 comments on commit 529ab68

Please sign in to comment.