rudimentale (microbenchmark::microbenchmark
) analisi comparativa mostra una piuttosto significativo speed-up utilizzando:
library(tidyr)
library(magrittr)
df <- data.frame(x, y, z)
df %>%
complete(x, y, z) %>%
unite("combo", x, y, z, sep = "_")
Un po 'più lento, ma forse più semplice e la variante vectorized l'apply
tecnica:
df <- expand.grid(x, y, z)
df$combo <- paste(df$Var1, df$Var1, df$Var3, sep = "_")
Qualcuno dovrebbe carillon con un approccio data.table
...
Benchmarking: Piccolo griglia (256 elementi)
set.seed(21034)
x <- sample(letters, 4, TRUE)
y <- sample(letters, 4, TRUE)
z <- sample(letters, 4, TRUE)
a <- sample(letters, 4, TRUE)
library(data.table)
library(microbenchmark)
library(magrittr)
library(tidyr)
microbenchmark(times = 25L,
DT1 = CJ(x, y, z, a)[ , paste(V1, V2, V3, V4, sep = "_")],
DT2 = CJ(x, y, z, a)[ , do.call(paste, c(.SD, sep = "_"))],
app1 = do.call(paste, c(expand.grid(x, y, z, a), sep = "_")),
app2 = paste((df <- expand.grid(x, y, z, a))$Var1,
df$Var2, df$Var3, sep = "_"),
magg_outer = outer(x, y, paste, sep = "_") %>%
outer(z, paste, sep = "_") %>%
outer(a, paste, sep = "_") %>% as.vector,
magg_tidy = data.frame(x, y, z, a) %>%
complete(x, y, z, a) %>%
unite("combo", x, y, z, a, sep = "_"),
interaction = levels(interaction(x, y, z, a, sep = "_")),
original = apply(expand.grid(x, y, z, a), 1, paste, collapse = "_"),
rep = paste(rep(x, each = (ny <- length(y)) * (nz <- length(z)) *
(na <- length(a))),
rep(rep(y, each = nz * na), (nx <- length(x))),
rep(rep(z, each = na), nx * ny), sep = "_"),
Reduce = Reduce(function(x, y) paste(rep(x, each = length(y)),
rep(y, length(x)), sep = "_"),
list(x, y, z, a)))
# Unit: microseconds
# expr min lq mean median uq max neval cld
# DT1 529.578 576.6400 624.00002 589.8270 604.9845 5449.287 1000 d
# DT2 561.028 606.4220 639.94659 620.4335 636.2735 5484.514 1000 d
# app1 201.043 225.4475 240.36960 233.4795 243.7090 4244.687 1000 b
# app2 196.692 225.6130 244.33543 234.0455 243.7925 4110.605 1000 b
# magg_outer 164.352 194.1395 205.30300 204.4220 211.1990 456.122 1000 b
# magg_tidy 1872.228 2038.1560 2150.98234 2067.8770 2126.1025 21891.884 1000 f
# interaction 254.885 295.1935 313.54392 306.6680 316.8095 4196.465 1000 c
# original 852.018 935.4960 976.24388 954.5115 972.5550 4973.724 1000 e
# rep 50.737 54.1515 60.22671 55.3660 56.9220 3823.655 1000 a
# Reduce 58.395 65.3860 68.46049 66.8920 68.5640 158.184 1000 a
Benchmarking: griglia di grandi dimensioni (1.000.000 elementi)
set.seed(21034)
x <- sprintf("%03d", sample(100))
y <- sprintf("%03d", sample(100))
z <- sprintf("%02d", sample(10))
a <- sprintf("%02d", sample(10))
library(data.table)
library(microbenchmark)
library(magrittr)
library(tidyr)
microbenchmark(times = 25L,
DT1 = CJ(x, y, z, a)[ , paste(V1, V2, V3, V4, sep = "_")],
DT2 = CJ(x, y, z, a)[ , do.call(paste, c(.SD, sep = "_"))],
app1 = do.call(paste, c(expand.grid(x, y, z, a), sep = "_")),
app2 = paste((df <- expand.grid(x, y, z, a))$Var1,
df$Var2, df$Var3, sep = "_"),
magg_outer = outer(x, y, paste, sep = "_") %>%
outer(z, paste, sep = "_") %>%
outer(a, paste, sep = "_") %>% as.vector,
magg_tidy = data.frame(x, y, z, a) %>%
complete(x, y, z, a) %>%
unite("combo", x, y, z, a, sep = "_"),
interaction = levels(interaction(x, y, z, a, sep = "_")),
original = apply(expand.grid(x, y, z, a), 1, paste, collapse = "_"),
rep = paste(rep(x, each = (ny <- length(y)) * (nz <- length(z)) *
(na <- length(a))),
rep(rep(y, each = nz * na), (nx <- length(x))),
rep(rep(z, each = na), nx * ny), sep = "_"),
Reduce = Reduce(function(x, y) paste(rep(x, each = length(y)),
rep(y, length(x)), sep = "_"),
list(x, y, z, a)))
# Unit: milliseconds
# expr min lq mean median uq max neval cld
# DT1 360.6528 467.8408 517.4579 520.1484 549.1756 861.1567 25 ab
# DT2 355.0438 504.9642 572.0732 551.9106 615.6621 927.3210 25 b
# app1 727.4513 766.3053 926.1888 910.3998 957.7610 1690.1540 25 c
# app2 472.5724 567.1121 633.5304 600.3779 634.3158 1135.7535 25 b
# magg_outer 384.0112 475.5070 600.6317 525.8936 676.7134 927.6736 25 b
# magg_tidy 520.6428 602.5028 695.5500 680.8821 748.8746 1180.1107 25 bc
# interaction 353.7317 481.4732 531.0035 518.7084 585.0872 693.5171 25 ab
# original 4965.1156 5358.8704 5914.3560 5780.6609 6074.7470 9024.6476 25 d
# rep 206.0964 236.5811 273.1093 252.8179 285.0910 455.1776 25 a
# Reduce 322.0695 390.2595 446.3948 424.9185 508.5235 621.1878 25 ab
Avere 'Reduce' si potrebbe estendere la funzione binaria a più argomenti:' Ridurre (function (x, y) Pasta (rep (x, ciascuna = lunghezza (y)), rep (y, length (x)), sep = "_"), lista (x, y, z)) '. Questo, inoltre, potrebbe salvare la ri-concatenazione degli stessi elementi più di una volta e potrebbe essere efficace in alcuni casi. –
@alexis_laz oh wow, in realtà ho appena avuto la stessa funzione esaminando le altre risposte. Si prega di postare una risposta perché è molto più veloce di qualsiasi altra cosa, oltre a sillabare manualmente la sequenza 'rep' a mano. – MichaelChirico