# deduped

`deduped` contains one main function, `deduped()`, which speeds up slow, vectorized functions by only performing computations on the unique values of the input and expanding the results at the end.
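
Conceptually, for an atomic vector, `deduped(f)` behaves like the sketch below. This is a minimal illustration of the idea, not the package's actual implementation, and `deduped_sketch` is a name invented here:

``` r
# Sketch: compute f() once per unique value, then expand the results
# back onto the full-length input via match().
deduped_sketch <- function(f) {
  function(x) {
    ux <- unique(x)      # each distinct value appears exactly once
    f(ux)[match(x, ux)]  # index back out to the original length
  }
}

deduped_sketch(toupper)(c("a", "b", "a"))
#> [1] "A" "B" "A"
```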
One particular use case of `deduped()` that I come across a lot is when using `basename()` and `dirname()` on the `file_path` column after reading multiple CSVs (e.g. with `readr::read_csv(..., id = "file_path")`). `basename()` and `dirname()` are surprisingly slow (especially on Windows), and most of the column is duplicated.
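
In that situation the entire fix is a one-line wrap. A quick sketch with a hypothetical `paths` vector standing in for a real `file_path` column (the full workflow is shown in the `file_path` example at the end):

``` r
library(deduped)

# Hypothetical, heavily duplicated vector of file paths.
paths <- rep(c("/data/run1/results.csv", "/data/run2/results.csv"), 50000)

file_names <- deduped(basename)(paths)  # basename() runs once per unique path
```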
## Installation

You can install the released version of deduped from CRAN with:

``` r
install.packages("deduped")
```
And the development version from GitHub:

``` r
if (!requireNamespace("remotes")) install.packages("remotes")
remotes::install_github("orgadish/deduped")
```
## Examples

``` r
library(deduped)

set.seed(0)

# A slow, vectorized stand-in for real work: ~1 ms of sleep per element.
slow_func <- function(ii) {
  for (i in ii) {
    Sys.sleep(0.001)
  }
}
```
``` r
# deduped()
unique_vec <- sample(LETTERS, 10)
unique_vec
#> [1] "N" "Y" "D" "G" "A" "B" "K" "Z" "R" "V"

duplicated_vec <- sample(rep(unique_vec, 100))
length(duplicated_vec)
#> [1] 1000

system.time({
  x1 <- deduped(slow_func)(duplicated_vec)
})
#>    user  system elapsed 
#>   0.097   0.015   0.134

system.time({
  x2 <- slow_func(duplicated_vec)
})
#>    user  system elapsed 
#>   0.032   0.013   1.197

all.equal(x1, x2)
#> [1] TRUE
```
``` r
# deduped() can be combined with lapply() or purrr::map().
unique_list <- lapply(1:5, function(j) sample(LETTERS, j, replace = TRUE))
str(unique_list)
#> List of 5
#>  $ : chr "M"
#>  $ : chr [1:2] "P" "Y"
#>  $ : chr [1:3] "D" "E" "L"
#>  $ : chr [1:4] "B" "I" "J" "N"
#>  $ : chr [1:5] "W" "T" "F" "E" ...

# Create a list with significant duplication.
duplicated_list <- sample(rep(unique_list, 100))
length(duplicated_list)
#> [1] 500

system.time({
  y1 <- deduped(lapply)(duplicated_list, slow_func)
})
#>    user  system elapsed 
#>   0.001   0.000   0.018

system.time({
  y2 <- lapply(duplicated_list, slow_func)
})
#>    user  system elapsed 
#>   0.025   0.016   1.756

all.equal(y1, y2)
#> [1] TRUE
```
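
The comment above also mentions `purrr::map()`, which wraps the same way as `lapply()`. A sketch, assuming `duplicated_list`, `slow_func`, and `y1` from the previous chunk and that purrr is installed:

``` r
# purrr::map() deduplicated the same way as lapply() above.
z1 <- deduped(purrr::map)(duplicated_list, slow_func)
all.equal(z1, y1)
```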
## `file_path` Example

``` r
# Create multiple CSVs to read
tf <- tempfile()
dir.create(tf)

# Duplicate mtcars 10,000x and write 1 CSV for each value of `am`
duplicated_mtcars <- dplyr::slice(mtcars, rep(1:nrow(mtcars), 10000))
invisible(sapply(
  dplyr::group_split(duplicated_mtcars, am),
  function(k) {
    file_name <- paste0("mtcars_", unique(k$am), ".csv")
    readr::write_csv(k, file.path(tf, file_name))
  }
))

duplicated_mtcars_from_files <- readr::read_csv(
  list.files(tf, full.names = TRUE),
  id = "file_path",
  show_col_types = FALSE
)

dplyr::count(duplicated_mtcars_from_files, basename(file_path))
#> # A tibble: 2 × 2
#>   `basename(file_path)`      n
#>   <chr>                  <int>
#> 1 mtcars_0.csv          190000
#> 2 mtcars_1.csv          130000

system.time({
  df1 <- dplyr::mutate(
    duplicated_mtcars_from_files,
    file_name = basename(file_path)
  )
})
#>    user  system elapsed 
#>   0.104   0.000   0.104

system.time({
  df2 <- dplyr::mutate(
    duplicated_mtcars_from_files,
    file_name = deduped(basename)(file_path)
  )
})
#>    user  system elapsed 
#>   0.010   0.002   0.013

all.equal(df1, df2)
#> [1] TRUE

unlink(tf)
```
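
`dirname()`, mentioned at the top, benefits from the exact same wrap. A sketch, reusing the tibble from above (the column is already in memory, so this works even after `unlink(tf)`):

``` r
# dirname() computed once per unique path, then expanded.
dirs <- deduped(dirname)(duplicated_mtcars_from_files$file_path)
```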