如何修改代码以提高处理速度

时间:2021-08-03 03:57:02

I have to run a similar code across columns in a large matrix.

我必须在大矩阵中的列之间运行类似的代码。

# Reproducible example: for every element of my_vector, draw one uniform
# threshold and sum the entries of my_vector that lie strictly below it.
set.seed(1)

my_vector <- runif(10000)

# Preallocate the result as a double vector instead of growing it from NULL.
my_sums <- numeric(length(my_vector))

# seq_along() is safe for an empty vector, where 1:length(x) would yield c(1, 0).
for (l in seq_along(my_vector)) {
  # A fresh threshold is drawn on every iteration, after my_vector was drawn,
  # so the random-number stream is consumed in the same order as before.
  current_result <- my_vector[my_vector < runif(1)]
  my_sums[l] <- sum(current_result)
}

head(my_sums)
# [1]   21.45613 2248.31463 2650.46104   62.82708   11.11391   86.21950

system.time() results:

system.time() 的结果:

   user  system elapsed 
   1.14    0.00    1.14

Any ideas on how to improve performance?

关于如何提高性能,有什么想法吗?

5 个解决方案

#1


19  

Matt Dowle's excellent data.table approach in base R

用基础 R 实现 Matt Dowle 出色的 data.table 方法

# Vectorised base-R version: sort once, take running totals, then look up for
# each threshold how many sorted values lie below it.
system.time({
  set.seed(1)
  my_vector <- runif(10000)
  # All thresholds at once; same draws as calling runif(1) 10000 times.
  x <- runif(10000)
  sorted <- sort(my_vector)
  # left.open = TRUE counts values strictly below each threshold, matching the
  # `my_vector < threshold` test of the loop version even when values tie.
  # The + 1L shifts the count onto the zero-padded cumulative sums below.
  ind <- findInterval(x, sorted, left.open = TRUE) + 1L
  # Leading 0 is the sum when no value lies below the threshold.
  my_sums <- c(0, cumsum(sorted))[ind]
})

#   user  system elapsed 
#      0       0       0 

head(my_sums)
#[1]   21.45613 2248.31463 2650.46104   62.82708   11.11391   86.21950

#2


14  

# library() fails immediately if data.table is not installed; require() would
# only warn and return FALSE, deferring the failure to the first data.table call.
library(data.table)

# Keyed rolling join: sort the values, store their running total, and for each
# random threshold roll back to the last value at or below it.
system.time({
  set.seed(1)
  my_vector <- runif(10000)
  DT <- data.table(my_vector)
  setkey(DT, my_vector)
  # The new column is named `cumsum`; the right-hand side still calls the
  # base function of the same name.
  DT[, cumsum := cumsum(my_vector)]
  my_sums <- DT[.(runif(10000)), cumsum, roll = TRUE]
  # Thresholds below every value have nothing to roll from and come back NA.
  my_sums[is.na(my_sums)] <- 0
})

head(my_sums)
# [1]   21.45613 2248.31463 2650.46104   62.82708   11.11391   86.21950

#   user  system elapsed 
#  0.004   0.000   0.004

#3


1  

What about sapply?

sapply怎么样?

# For each element of my_vector, draw one uniform threshold and return the sum
# of the entries of my_vector strictly below it.
# vapply() pins the result to a double vector (sapply() would return list()
# for empty input). The sum is returned directly: assigning into my_sums
# inside the function only modified a local copy and never reached the
# caller's vector.
temp <- vapply(
  seq_along(my_vector),
  function(l) sum(my_vector[my_vector < runif(1)]),
  numeric(1)
)

Does this give some performance improvement?

这能带来一些性能提升吗?

#4


1  

Edit: the addition of sort() cuts my time down to 0.74. The time it takes to sort my_vector is trivial on this example, but may be costly on larger/ different data.

编辑:添加sort()会将我的时间减少到0.74。在这个例子中,对my_vector进行排序所需的时间微不足道,但在较大/不同的数据上可能会很昂贵。

# Same computation as the question, with the thresholds drawn up front and
# my_vector sorted before the loop.
set.seed(1)

my_vector <- runif(10000)
n <- runif(10000)
# Preallocate as double: the sums are doubles, so an integer placeholder such
# as 1:10000 would be coerced on the first assignment.
my_sums <- numeric(length(my_vector))
system.time(my_vector <- sort(my_vector))

#user  system elapsed 
# 0       0       0 
# my_vector is now sorted.


system.time(
  # seq_along() is safe for an empty vector, unlike 1:length(my_vector).
  for (l in seq_along(my_vector)) {
    my_sums[l] <- sum(my_vector[my_vector < n[l]])
  }
)

# user  system elapsed 
# 0.73    0.00    0.74 

head(my_sums)
# [1]   21.4561 2248.3146 2650.4610   62.8271   11.1139   86.2195

#5


1  

Since you want to apply the same function across columns in a large matrix, I would suggest this:

由于您希望在大矩阵中的列之间应用相同的函数,我建议如下:

# Build a three-column table of uniform draws, key it, and append a running
# total for every original column.
dt <- data.table(
  my_vector1 = runif(1e6),
  my_vector2 = runif(1e6),
  my_vector3 = runif(1e6)
)

# Names of the cumulative-sum columns, derived before any column is added.
cols <- paste0(names(dt), "_csum")

# Called without column names; the printed output below shows the rows
# ordered by my_vector1 afterwards.
setkey(dt)

# .SD holds the original columns; each one gets its cumulative sum.
dt[, (cols) := lapply(.SD, cumsum)]


> head(dt)
#>      my_vector1 my_vector2 my_vector3 my_vector1_csum my_vector2_csum my_vector3_csum
#> 1: 7.664785e-07 0.47817820  0.9008552    7.664785e-07       0.4781782       0.9008552
#> 2: 8.875504e-07 0.24142375  0.9849384    1.654029e-06       0.7196019       1.8857936
#> 3: 1.326203e-06 0.48592786  0.3791094    2.980232e-06       1.2055298       2.2649030
#> 4: 2.730172e-06 0.76847160  0.5732031    5.710404e-06       1.9740014       2.8381061
#> 5: 4.655216e-06 0.01094117  0.5120915    1.036562e-05       1.9849426       3.3501976

Additionally, the library profvis is really helpful in identifying the time and memory consumption of each line in your code. Example here.

此外,库profvis非常有助于识别代码中每行的时间和内存消耗。这里的例子。

#1


19  

Matt Dowle's excellent data.table approach in base R

用基础 R 实现 Matt Dowle 出色的 data.table 方法

# Vectorised base-R version: sort once, take running totals, then look up for
# each threshold how many sorted values lie below it.
system.time({
  set.seed(1)
  my_vector <- runif(10000)
  # All thresholds at once; same draws as calling runif(1) 10000 times.
  x <- runif(10000)
  sorted <- sort(my_vector)
  # left.open = TRUE counts values strictly below each threshold, matching the
  # `my_vector < threshold` test of the loop version even when values tie.
  # The + 1L shifts the count onto the zero-padded cumulative sums below.
  ind <- findInterval(x, sorted, left.open = TRUE) + 1L
  # Leading 0 is the sum when no value lies below the threshold.
  my_sums <- c(0, cumsum(sorted))[ind]
})

#   user  system elapsed 
#      0       0       0 

head(my_sums)
#[1]   21.45613 2248.31463 2650.46104   62.82708   11.11391   86.21950

#2


14  

# library() fails immediately if data.table is not installed; require() would
# only warn and return FALSE, deferring the failure to the first data.table call.
library(data.table)

# Keyed rolling join: sort the values, store their running total, and for each
# random threshold roll back to the last value at or below it.
system.time({
  set.seed(1)
  my_vector <- runif(10000)
  DT <- data.table(my_vector)
  setkey(DT, my_vector)
  # The new column is named `cumsum`; the right-hand side still calls the
  # base function of the same name.
  DT[, cumsum := cumsum(my_vector)]
  my_sums <- DT[.(runif(10000)), cumsum, roll = TRUE]
  # Thresholds below every value have nothing to roll from and come back NA.
  my_sums[is.na(my_sums)] <- 0
})

head(my_sums)
# [1]   21.45613 2248.31463 2650.46104   62.82708   11.11391   86.21950

#   user  system elapsed 
#  0.004   0.000   0.004

#3


1  

What about sapply?

sapply怎么样?

# For each element of my_vector, draw one uniform threshold and return the sum
# of the entries of my_vector strictly below it.
# vapply() pins the result to a double vector (sapply() would return list()
# for empty input). The sum is returned directly: assigning into my_sums
# inside the function only modified a local copy and never reached the
# caller's vector.
temp <- vapply(
  seq_along(my_vector),
  function(l) sum(my_vector[my_vector < runif(1)]),
  numeric(1)
)

Does this give some performance improvement?

这能带来一些性能提升吗?

#4


1  

Edit: the addition of sort() cuts my time down to 0.74. The time it takes to sort my_vector is trivial on this example, but may be costly on larger/ different data.

编辑:添加sort()会将我的时间减少到0.74。在这个例子中,对my_vector进行排序所需的时间微不足道,但在较大/不同的数据上可能会很昂贵。

# Same computation as the question, with the thresholds drawn up front and
# my_vector sorted before the loop.
set.seed(1)

my_vector <- runif(10000)
n <- runif(10000)
# Preallocate as double: the sums are doubles, so an integer placeholder such
# as 1:10000 would be coerced on the first assignment.
my_sums <- numeric(length(my_vector))
system.time(my_vector <- sort(my_vector))

#user  system elapsed 
# 0       0       0 
# my_vector is now sorted.


system.time(
  # seq_along() is safe for an empty vector, unlike 1:length(my_vector).
  for (l in seq_along(my_vector)) {
    my_sums[l] <- sum(my_vector[my_vector < n[l]])
  }
)

# user  system elapsed 
# 0.73    0.00    0.74 

head(my_sums)
# [1]   21.4561 2248.3146 2650.4610   62.8271   11.1139   86.2195

#5


1  

Since you want to apply the same function across columns in a large matrix, I would suggest this:

由于您希望在大矩阵中的列之间应用相同的函数,我建议如下:

# Build a three-column table of uniform draws, key it, and append a running
# total for every original column.
dt <- data.table(
  my_vector1 = runif(1e6),
  my_vector2 = runif(1e6),
  my_vector3 = runif(1e6)
)

# Names of the cumulative-sum columns, derived before any column is added.
cols <- paste0(names(dt), "_csum")

# Called without column names; the printed output below shows the rows
# ordered by my_vector1 afterwards.
setkey(dt)

# .SD holds the original columns; each one gets its cumulative sum.
dt[, (cols) := lapply(.SD, cumsum)]


> head(dt)
#>      my_vector1 my_vector2 my_vector3 my_vector1_csum my_vector2_csum my_vector3_csum
#> 1: 7.664785e-07 0.47817820  0.9008552    7.664785e-07       0.4781782       0.9008552
#> 2: 8.875504e-07 0.24142375  0.9849384    1.654029e-06       0.7196019       1.8857936
#> 3: 1.326203e-06 0.48592786  0.3791094    2.980232e-06       1.2055298       2.2649030
#> 4: 2.730172e-06 0.76847160  0.5732031    5.710404e-06       1.9740014       2.8381061
#> 5: 4.655216e-06 0.01094117  0.5120915    1.036562e-05       1.9849426       3.3501976

Additionally, the library profvis is really helpful in identifying the time and memory consumption of each line in your code. Example here.

此外,库profvis非常有助于识别代码中每行的时间和内存消耗。这里的例子。