My goal is to sum, for each row, all values in the columns whose names start with the prefix `skill_` in a `data.table`. I would prefer a solution using `data.table`, but I am not picky.
我的目标是在data.table中对以前缀skill_开头的列中的所有值求和。我更喜欢使用data.table的解决方案,但我不挑剔。
My solution up to now:
我的解决方案到目前为止:
# Attempt so far: tag every row with an index, then, for each index, subset
# the table to that single row and sum its skill_ columns. This performs one
# data.table subset per row.
> require(data.table)
> DT <- data.table(x=1:4, skill_a=c(0,1,0,0), skill_b=c(0,1,1,0), skill_c=c(0,1,1,1))
> DT[, row_idx := 1:nrow(DT)]
> DT[, count_skills :=
sapply(1:nrow(DT),
function(id) sum(DT[row_idx == id,
grepl("skill_", names(DT)), with=FALSE]))]
> DT
x skill_a skill_b skill_c row_idx count_skills
1: 1 0 0 0 1 0
2: 2 1 1 1 2 3
3: 3 0 1 1 3 2
4: 4 0 0 1 4 1
But this becomes very slow when DT is very large. Is there a more efficient way to do this?
但是当DT非常大时,这变得非常慢。有没有更有效的方法来做到这一点?
4 个解决方案
#1
13
A question about efficiency and performance always deserves benchmarks...
关于效率和性能的问题总是值得基准......
The size of your data is important as growth rate makes a huge difference...
您的数据大小非常重要,因为增长率会产生巨大差异......
Relative Benchmark Timings between 2^4 and 2^24.
Sizes along floor( 2^logb(10^( seq( 4, 24, .5 ) ), 10 ) )
相对基准时间在2 ^ 4和2 ^ 24之间。沿地板的大小(2 ^ logb(10 ^(seq(4,24,.5)),10))
Excerpt of benchmarks at 1 million rows...
100万行的基准测试摘录......
## Unit: milliseconds
## expr min lq median uq max neval
## dplyr.sol(DT) 21.803 50.260 51.765 52.45 73.30 100
## rowSums.sol(DT) 20.759 50.224 51.418 52.56 96.28 100
## SDCols.sol(DT) 7.250 8.916 37.699 38.50 52.69 100
## eval.sol(DT) 6.883 7.007 7.916 9.45 50.91 100
`eval.sol` is an answer that takes advantage of data.table's handling of expressions; see the source below...
eval.sol是一个利用data.table处理表达式的答案,在下面的源代码中......
library(compiler)
library(data.table)
suppressMessages(library(dplyr))
library(microbenchmark)
# Build a benchmark data.table with 4 * reps rows.
#
# reps: number of times the 4-row skill pattern is repeated.
# Returns a data.table with an id column `x` (1 .. 4 * reps) and three
# numeric 0/1 columns: skill_a, skill_b and skill_c.
buildDT <- function(reps) {
  n_rows <- reps * 4
  pattern_a <- c(0, 1, 0, 0)
  pattern_b <- c(0, 1, 1, 0)
  pattern_c <- c(0, 1, 1, 1)
  data.table(
    x = seq_len(n_rows),
    skill_a = rep(pattern_a, times = reps),
    skill_b = rep(pattern_b, times = reps),
    skill_c = rep(pattern_c, times = reps)
  )
}
# Baseline from the question: sum the skill_ columns one row at a time.
#
# This is intentionally kept as the slow, row-by-row reference
# implementation; it exists only to validate the faster solutions.
#
# DT: a data.table with one or more columns whose names start with "skill_".
# Side effect: adds the columns row_idx and count_skills to DT by reference.
# Returns DT (invisibly, as `:=` does), so callers can read $count_skills.
OP.sol <- function(DT) {
  # seq_len() is safe for a zero-row table, where 1:nrow(DT) would be c(1, 0).
  DT[, row_idx := seq_len(nrow(DT))]

  # Anchor the pattern so only names that *start* with "skill_" are selected,
  # and resolve the column names once rather than once per row.
  skill_cols <- grep("^skill_", names(DT), value = TRUE)

  # vapply() guarantees a numeric vector even when DT has no rows.
  per_row <- vapply(
    DT$row_idx,
    function(id) sum(DT[row_idx == id, skill_cols, with = FALSE]),
    numeric(1)
  )
  DT[, count_skills := per_row]
}
# dplyr solution: keep the columns whose names start with "skill_" and sum
# across each row.
#
# DT: a table with numeric skill_ columns.
# Returns the vector of per-row sums produced by rowSums().
dplyr.sol <- function(DT) {
  # `%.%` was deprecated and later removed from dplyr; `%>%` is its
  # replacement.
  DT %>%
    select(starts_with("skill_")) %>%
    rowSums()
}
# data.table solution: restrict .SD to the skill_ columns and add them
# together element-wise with Reduce().
#
# DT: a data.table with numeric columns whose names start with "skill_".
# Returns the vector of per-row sums.
SDCols.sol <- function(DT) {
  # Anchored pattern: match the prefix only, not "skill_" anywhere in a name.
  skill_cols <- grep("^skill_", names(DT), value = TRUE)
  DT[, Reduce(`+`, .SD), .SDcols = skill_cols]
}
# Base-R style solution: subset the skill_ columns by position and let
# rowSums() do the summation.
#
# DT: a data.table with numeric columns whose names start with "skill_".
# Returns the vector of per-row sums.
rowSums.sol <- function(DT) {
  # Anchored pattern: match the prefix only, not "skill_" anywhere in a name.
  skill_idx <- grep("^skill_", names(DT))
  rowSums(DT[, skill_idx, with = FALSE])
}
# Expression-based solution: build the call `skill_a + skill_b + ...` and
# evaluate it in j, so data.table adds the columns directly.
#
# DT: a data.table with numeric columns whose names start with "skill_".
# Returns the vector of per-row sums (all zeros when no column matches).
eval.sol <- function(DT) {
  skill_cols <- grep("^skill_", colnames(DT), value = TRUE)

  # With no matching columns there is nothing to add; the sum of an empty
  # set of columns is 0 for every row.
  if (length(skill_cols) == 0L) {
    return(numeric(nrow(DT)))
  }

  # Build the call from symbols instead of parsing pasted text, so column
  # names that are not syntactically valid R identifiers still work.
  cmd <- Reduce(
    function(lhs, rhs) call("+", lhs, rhs),
    lapply(skill_cols, as.name)
  )
  DT[, eval(cmd)]
}
# Sanity check on a tiny 4-row table: every solution must return exactly the
# same vector as the original row-by-row approach.
# Note: OP.sol() adds row_idx and count_skills to DT by reference; neither
# name contains "skill_", so the column selection in the other solutions is
# not affected by those extra columns.
DT <- buildDT(1)
identical(OP.sol(DT)$count_skills, dplyr.sol(DT))
## [1] TRUE
identical(OP.sol(DT)$count_skills, rowSums.sol(DT))
## [1] TRUE
identical(OP.sol(DT)$count_skills, SDCols.sol(DT))
## [1] TRUE
identical(OP.sol(DT)$count_skills, eval.sol(DT))
## [1] TRUE
# Benchmark 1: 10,000 rows (2500 repetitions of the 4-row pattern).
# OP.sol() is left out of every benchmark below.
DT<-buildDT(2500)
nrow(DT)
## [1] 10000
microbenchmark( # OP.sol(DT), forget this method.
dplyr.sol(DT),
rowSums.sol(DT),
SDCols.sol(DT),
eval.sol(DT),
times=100)
## Unit: microseconds
## expr min lq median uq max neval
## dplyr.sol(DT) 760.1 809.0 848.2 951.5 2276 100
## rowSums.sol(DT) 580.5 605.3 627.6 745.7 28481 100
## SDCols.sol(DT) 559.8 610.5 638.8 694.0 2016 100
## eval.sol(DT) 636.4 677.7 692.4 740.5 2021 100
# Benchmark 2: 100,000 rows.
DT<-buildDT(25000)
nrow(DT)
## [1] 100000
microbenchmark( # OP.sol(DT), forget this method.
dplyr.sol(DT),
rowSums.sol(DT),
SDCols.sol(DT),
eval.sol(DT),
times=100)
## Unit: milliseconds
## expr min lq median uq max neval
## dplyr.sol(DT) 2.668 3.744 4.045 4.573 33.87 100
## rowSums.sol(DT) 2.455 3.339 3.756 4.235 34.19 100
## SDCols.sol(DT) 1.253 1.401 2.179 2.392 31.72 100
## eval.sol(DT) 1.294 1.427 2.116 2.484 32.02 100
# Benchmark 3: 1,000,000 rows.
DT<-buildDT(250000)
nrow(DT)
## [1] 1000000
microbenchmark( # OP.sol(DT), forget this method.
dplyr.sol(DT),
rowSums.sol(DT),
SDCols.sol(DT),
eval.sol(DT),
times=100)
## Unit: milliseconds
## expr min lq median uq max neval
## dplyr.sol(DT) 21.803 50.260 51.765 52.45 73.30 100
## rowSums.sol(DT) 20.759 50.224 51.418 52.56 96.28 100
## SDCols.sol(DT) 7.250 8.916 37.699 38.50 52.69 100
## eval.sol(DT) 6.883 7.007 7.916 9.45 50.91 100
# Confirm the four fast solutions still agree on the 1,000,000-row table.
identical(dplyr.sol(DT), rowSums.sol(DT))
## [1] TRUE
identical(dplyr.sol(DT), SDCols.sol(DT))
## [1] TRUE
identical(dplyr.sol(DT), eval.sol(DT))
## [1] TRUE
#2
9
Why not use `rowSums`? It is generally efficient:
为什么不使用rowSums,它通常很有效:
# Select only the columns whose names start with "skill_" (anchored pattern)
# and sum across each row.
rowSums(DT[, grep("^skill_", names(DT)), with = FALSE])
#3
8
Here is a dplyr solution:
这是一个dplyr解决方案:
library(dplyr)
# Append a `count` column holding the per-row sum of the skill_ columns.
mutate(
  DT,
  count = rowSums(select(DT, starts_with("skill_")))
)
#4
7
Solution using `data.table` and `.SDcols`:
使用data.table和.SDcols的解决方案。
# library() fails loudly if data.table is missing; require() only returns
# FALSE and lets the script continue into confusing errors.
library(data.table)

DT <- data.table(
  x = 1:4,
  skill_a = c(0, 1, 0, 0),
  skill_b = c(0, 1, 1, 0),
  skill_c = c(0, 1, 1, 1)
)
# seq_len() is safe for an empty table, where 1:nrow(DT) would be c(1, 0).
DT[, row_idx := seq_len(nrow(DT))]
# Restrict .SD to the columns whose names start with "skill_" (anchored
# pattern) and add them together element-wise; the result is assigned by
# reference as count_skills.
DT[, count_skills := Reduce(`+`, .SD),
   .SDcols = grep("^skill_", names(DT), value = TRUE)]
DT
#1
13
A question about efficiency and performance always deserves benchmarks...
关于效率和性能的问题总是值得基准......
The size of your data is important as growth rate makes a huge difference...
您的数据大小非常重要,因为增长率会产生巨大差异......
Relative Benchmark Timings between 2^4 and 2^24.
Sizes along floor( 2^logb(10^( seq( 4, 24, .5 ) ), 10 ) )
相对基准时间在2 ^ 4和2 ^ 24之间。沿地板的大小(2 ^ logb(10 ^(seq(4,24,.5)),10))
Excerpt of benchmarks at 1 million rows...
100万行的基准测试摘录......
## Unit: milliseconds
## expr min lq median uq max neval
## dplyr.sol(DT) 21.803 50.260 51.765 52.45 73.30 100
## rowSums.sol(DT) 20.759 50.224 51.418 52.56 96.28 100
## SDCols.sol(DT) 7.250 8.916 37.699 38.50 52.69 100
## eval.sol(DT) 6.883 7.007 7.916 9.45 50.91 100
`eval.sol` is an answer that takes advantage of data.table's handling of expressions; see the source below...
eval.sol是一个利用data.table处理表达式的答案,在下面的源代码中......
library(compiler)
library(data.table)
suppressMessages(library(dplyr))
library(microbenchmark)
# Build a benchmark data.table with 4 * reps rows.
#
# reps: number of times the 4-row skill pattern is repeated.
# Returns a data.table with an id column `x` (1 .. 4 * reps) and three
# numeric 0/1 columns: skill_a, skill_b and skill_c.
buildDT <- function(reps) {
  n_rows <- reps * 4
  pattern_a <- c(0, 1, 0, 0)
  pattern_b <- c(0, 1, 1, 0)
  pattern_c <- c(0, 1, 1, 1)
  data.table(
    x = seq_len(n_rows),
    skill_a = rep(pattern_a, times = reps),
    skill_b = rep(pattern_b, times = reps),
    skill_c = rep(pattern_c, times = reps)
  )
}
# Baseline from the question: sum the skill_ columns one row at a time.
#
# This is intentionally kept as the slow, row-by-row reference
# implementation; it exists only to validate the faster solutions.
#
# DT: a data.table with one or more columns whose names start with "skill_".
# Side effect: adds the columns row_idx and count_skills to DT by reference.
# Returns DT (invisibly, as `:=` does), so callers can read $count_skills.
OP.sol <- function(DT) {
  # seq_len() is safe for a zero-row table, where 1:nrow(DT) would be c(1, 0).
  DT[, row_idx := seq_len(nrow(DT))]

  # Anchor the pattern so only names that *start* with "skill_" are selected,
  # and resolve the column names once rather than once per row.
  skill_cols <- grep("^skill_", names(DT), value = TRUE)

  # vapply() guarantees a numeric vector even when DT has no rows.
  per_row <- vapply(
    DT$row_idx,
    function(id) sum(DT[row_idx == id, skill_cols, with = FALSE]),
    numeric(1)
  )
  DT[, count_skills := per_row]
}
# dplyr solution: keep the columns whose names start with "skill_" and sum
# across each row.
#
# DT: a table with numeric skill_ columns.
# Returns the vector of per-row sums produced by rowSums().
dplyr.sol <- function(DT) {
  # `%.%` was deprecated and later removed from dplyr; `%>%` is its
  # replacement.
  DT %>%
    select(starts_with("skill_")) %>%
    rowSums()
}
# data.table solution: restrict .SD to the skill_ columns and add them
# together element-wise with Reduce().
#
# DT: a data.table with numeric columns whose names start with "skill_".
# Returns the vector of per-row sums.
SDCols.sol <- function(DT) {
  # Anchored pattern: match the prefix only, not "skill_" anywhere in a name.
  skill_cols <- grep("^skill_", names(DT), value = TRUE)
  DT[, Reduce(`+`, .SD), .SDcols = skill_cols]
}
# Base-R style solution: subset the skill_ columns by position and let
# rowSums() do the summation.
#
# DT: a data.table with numeric columns whose names start with "skill_".
# Returns the vector of per-row sums.
rowSums.sol <- function(DT) {
  # Anchored pattern: match the prefix only, not "skill_" anywhere in a name.
  skill_idx <- grep("^skill_", names(DT))
  rowSums(DT[, skill_idx, with = FALSE])
}
# Expression-based solution: build the call `skill_a + skill_b + ...` and
# evaluate it in j, so data.table adds the columns directly.
#
# DT: a data.table with numeric columns whose names start with "skill_".
# Returns the vector of per-row sums (all zeros when no column matches).
eval.sol <- function(DT) {
  skill_cols <- grep("^skill_", colnames(DT), value = TRUE)

  # With no matching columns there is nothing to add; the sum of an empty
  # set of columns is 0 for every row.
  if (length(skill_cols) == 0L) {
    return(numeric(nrow(DT)))
  }

  # Build the call from symbols instead of parsing pasted text, so column
  # names that are not syntactically valid R identifiers still work.
  cmd <- Reduce(
    function(lhs, rhs) call("+", lhs, rhs),
    lapply(skill_cols, as.name)
  )
  DT[, eval(cmd)]
}
# Sanity check on a tiny 4-row table: every solution must return exactly the
# same vector as the original row-by-row approach.
# Note: OP.sol() adds row_idx and count_skills to DT by reference; neither
# name contains "skill_", so the column selection in the other solutions is
# not affected by those extra columns.
DT <- buildDT(1)
identical(OP.sol(DT)$count_skills, dplyr.sol(DT))
## [1] TRUE
identical(OP.sol(DT)$count_skills, rowSums.sol(DT))
## [1] TRUE
identical(OP.sol(DT)$count_skills, SDCols.sol(DT))
## [1] TRUE
identical(OP.sol(DT)$count_skills, eval.sol(DT))
## [1] TRUE
# Benchmark 1: 10,000 rows (2500 repetitions of the 4-row pattern).
# OP.sol() is left out of every benchmark below.
DT<-buildDT(2500)
nrow(DT)
## [1] 10000
microbenchmark( # OP.sol(DT), forget this method.
dplyr.sol(DT),
rowSums.sol(DT),
SDCols.sol(DT),
eval.sol(DT),
times=100)
## Unit: microseconds
## expr min lq median uq max neval
## dplyr.sol(DT) 760.1 809.0 848.2 951.5 2276 100
## rowSums.sol(DT) 580.5 605.3 627.6 745.7 28481 100
## SDCols.sol(DT) 559.8 610.5 638.8 694.0 2016 100
## eval.sol(DT) 636.4 677.7 692.4 740.5 2021 100
# Benchmark 2: 100,000 rows.
DT<-buildDT(25000)
nrow(DT)
## [1] 100000
microbenchmark( # OP.sol(DT), forget this method.
dplyr.sol(DT),
rowSums.sol(DT),
SDCols.sol(DT),
eval.sol(DT),
times=100)
## Unit: milliseconds
## expr min lq median uq max neval
## dplyr.sol(DT) 2.668 3.744 4.045 4.573 33.87 100
## rowSums.sol(DT) 2.455 3.339 3.756 4.235 34.19 100
## SDCols.sol(DT) 1.253 1.401 2.179 2.392 31.72 100
## eval.sol(DT) 1.294 1.427 2.116 2.484 32.02 100
# Benchmark 3: 1,000,000 rows.
DT<-buildDT(250000)
nrow(DT)
## [1] 1000000
microbenchmark( # OP.sol(DT), forget this method.
dplyr.sol(DT),
rowSums.sol(DT),
SDCols.sol(DT),
eval.sol(DT),
times=100)
## Unit: milliseconds
## expr min lq median uq max neval
## dplyr.sol(DT) 21.803 50.260 51.765 52.45 73.30 100
## rowSums.sol(DT) 20.759 50.224 51.418 52.56 96.28 100
## SDCols.sol(DT) 7.250 8.916 37.699 38.50 52.69 100
## eval.sol(DT) 6.883 7.007 7.916 9.45 50.91 100
# Confirm the four fast solutions still agree on the 1,000,000-row table.
identical(dplyr.sol(DT), rowSums.sol(DT))
## [1] TRUE
identical(dplyr.sol(DT), SDCols.sol(DT))
## [1] TRUE
identical(dplyr.sol(DT), eval.sol(DT))
## [1] TRUE
#2
9
Why not use `rowSums`? It is generally efficient:
为什么不使用rowSums,它通常很有效:
# Select only the columns whose names start with "skill_" (anchored pattern)
# and sum across each row.
rowSums(DT[, grep("^skill_", names(DT)), with = FALSE])
#3
8
Here is a dplyr solution:
这是一个dplyr解决方案:
library(dplyr)
# Append a `count` column holding the per-row sum of the skill_ columns.
mutate(
  DT,
  count = rowSums(select(DT, starts_with("skill_")))
)
#4
7
Solution using `data.table` and `.SDcols`:
使用data.table和.SDcols的解决方案。
# library() fails loudly if data.table is missing; require() only returns
# FALSE and lets the script continue into confusing errors.
library(data.table)

DT <- data.table(
  x = 1:4,
  skill_a = c(0, 1, 0, 0),
  skill_b = c(0, 1, 1, 0),
  skill_c = c(0, 1, 1, 1)
)
# seq_len() is safe for an empty table, where 1:nrow(DT) would be c(1, 0).
DT[, row_idx := seq_len(nrow(DT))]
# Restrict .SD to the columns whose names start with "skill_" (anchored
# pattern) and add them together element-wise; the result is assigned by
# reference as count_skills.
DT[, count_skills := Reduce(`+`, .SD),
   .SDcols = grep("^skill_", names(DT), value = TRUE)]
DT