多步/嵌套分组方案中的特定于组的计算

时间:2022-01-01 12:48:40

I guess I'm trying to compute cumulative sums on certain data segments/groups where the grouping needs to happen in multiple cycles and in that sense, is somewhat nested (sorry, I don't know how better to describe it).

我想我正在尝试计算某些数据段/组的累积总和,其中分组需要在多个周期中发生,并且在这个意义上,有点嵌套(抱歉,我不知道如何更好地描述它)。

I'm trying to get from this

我想要摆脱这个

struc <- structure(list(X1 = c(1, 0, 0, 0, 0, 0, 0, 1, 0, 1, 1, 0, 1, 
  0, 1, 0, 0, 0, 0), X2 = c(0, 1, 0, 0, 1, 0, 0, 0, 1, 0, 0, 1, 
  0, 1, 0, 1, 0, 1, 0), X3 = c(0, 0, 1, 1, 0, 1, 1, 0, 0, 0, 0, 
  0, 0, 0, 0, 0, 1, 0, 1)), .Names = c("X1", "X2", "X3"), row.names = c(NA, 
  -19L), class = "data.frame")
> struc
   X1 X2 X3
1   1  0  0
2   0  1  0
3   0  0  1
4   0  0  1
5   0  1  0
6   0  0  1
7   0  0  1
8   1  0  0
9   0  1  0
10  1  0  0
11  1  0  0
12  0  1  0
13  1  0  0
14  0  1  0
15  1  0  0
16  0  1  0
17  0  0  1
18  0  1  0
19  0  0  1

to this

对此

> data.frame(
  x1 = cumsum(struc[,1]),
  x2 = c(0, rep(1, 3), rep(2, 3), 0, 1, rep(0, 2), 1, 0, 1, 0, rep(1, 2), rep(2, 2)),
  x3 = c(rep(0, 2), 1, 2, 0, 1, 2, rep(0, 9), 1, 0, 1) 
)
   x1 x2 x3
1   1  0  0
2   1  1  0
3   1  1  1
4   1  1  2
5   1  2  0
6   1  2  1
7   1  2  2
8   2  0  0
9   2  1  0
10  3  0  0
11  4  0  0
12  4  1  0
13  5  0  0
14  5  1  0
15  6  0  0
16  6  1  0
17  6  1  1
18  6  2  0
19  6  2  1

EDIT

Background

背景

I'm using the info in struc to come up with a fast way to compute sort of an index matrix that tells me about the structure of arbitrary deeply nested lists and that I could use for <- and [[ operations on those lists:

我正在使用struc中的信息来提出一种快速计算索引矩阵的方法,该矩阵告诉我任意深度嵌套列表的结构,并且我可以用于< - 和[[对这些列表的操作:

x <- list(
  x1 = list(x11 = list(x111 = 1, x112 = 1), x12 = list(x121 = 1, x122 = 1)),
  b = list(c = 2),
  c = 3,
  d = list(1:4),
  list(1:5),
  list(list(1:2), list(3:4))
)

getStructure <- function(x) {
  struc <- capture.output(str(x))
  struc <- unlist(strsplit(struc, split = "\n"))
  tops <- str_count(struc, "\\s\\$\\s")
  subs <- str_count(struc, "((\\.\\.)(\\s|\\$))")
  ## Clean //
  idx_out <- which(tops == 0 & subs == 0)
  if (length(idx_out)) {
    tops <- tops[-idx_out]
    subs <- subs[-idx_out]
    struc <- struc[-idx_out]
  }
  ## Levels //
  subs_2 <- lapply(0:subs[which.max(subs)], function(ii) {
    out <- subs == ii
    out[out] <- 1
    out
  })
  names(subs_2) <- 1:length(subs_2)
  data.frame(subs_2)
}

The best I could come up with so far is to loop over the columns in a certain way, but it feels like I'm not using the power of clever matrix algebra or the power of dplyr or things like filter() and similar "split-and-combine" approaches:

到目前为止我能想出的最好的方法是以某种方式循环使用列,但感觉就像我没有使用聪明的矩阵代数或dplyr的强大功能或类似filter()和类似的“分裂”并结合“方法:

getIndexMatrix <- function(x) {
  for (ii in 1:ncol(x)) {
    if (ii == 1) {
      x[,ii] <- cumsum(x[,ii])
    } else {
      f <- factor(apply(x[,1:(ii-1), drop = FALSE], 1, paste, collapse = "-"))
      spl <- split(x, f = f)
      out <- lapply(spl, function(ii) {
        apply(ii, 2, cumsum)
      })
      x[,ii] <- do.call("rbind", out)[,ii]  
    }
    NULL
  }
  x
}
(indexes <- getIndexMatrix(struc))
   X1 X2 X3
1   1  0  0
2   1  1  0
3   1  1  1
4   1  1  2
5   1  2  0
6   1  2  1
7   1  2  2
8   2  0  0
9   2  1  0
10  3  0  0
11  4  0  0
12  4  1  0
13  5  0  0
14  5  1  0
15  6  0  0
16  6  1  0
17  6  1  1
18  6  2  0
19  6  2  1

This could for example be used to retrieve the "leaf" values of a nested list.

例如,这可以用于检索嵌套列表的“叶子”值。

Auxiliary function:

辅助功能:

getExtendedIndexMatrix <- function(x) {
  x <- getIndexMatrix(x)
  substituteZeros <- function(x) {
    x[!x] <- NA
    x
  }
  tmp <- as.data.frame(apply(x, 2, substituteZeros))
  spl <- split(tmp, f = tmp[,1])
  tmp <- lapply(spl, function(ii) {
    info <- lapply(1:nrow(ii), function(ii2) {
      data.frame(
        scope = length(na.omit(as.numeric(ii[ii2,,drop = TRUE]))),
        index = paste0("[[", paste(na.omit(as.numeric(ii[ii2,,drop = TRUE])), 
          collapse = "]][["), "]]")
      )
    })
    info <- do.call("rbind", info)
    leaf <- rep(FALSE, nrow(info))
    leaf[which(info$scope == which.max(info$scope))] <- TRUE
    data.frame(ii, leaf = leaf, info)
  })
  do.call("rbind", tmp)
}
> getExtendedIndexMatrix(struc)
     X1 X2 X3  leaf scope           index
1.1   1 NA NA FALSE     1           [[1]]
1.2   1  1 NA FALSE     2      [[1]][[1]]
1.3   1  1  1  TRUE     3 [[1]][[1]][[1]]
1.4   1  1  2  TRUE     3 [[1]][[1]][[2]]
1.5   1  2 NA FALSE     2      [[1]][[2]]
1.6   1  2  1  TRUE     3 [[1]][[2]][[1]]
1.7   1  2  2  TRUE     3 [[1]][[2]][[2]]
2.8   2 NA NA FALSE     1           [[2]]
2.9   2  1 NA  TRUE     2      [[2]][[1]]
3     3 NA NA  TRUE     1           [[3]]
4.11  4 NA NA FALSE     1           [[4]]
4.12  4  1 NA  TRUE     2      [[4]][[1]]
5.13  5 NA NA FALSE     1           [[5]]
5.14  5  1 NA  TRUE     2      [[5]][[1]]
6.15  6 NA NA FALSE     1           [[6]]
6.16  6  1 NA FALSE     2      [[6]][[1]]
6.17  6  1  1  TRUE     3 [[6]][[1]][[1]]
6.18  6  2 NA FALSE     2      [[6]][[2]]
6.19  6  2  1  TRUE     3 [[6]][[2]][[1]]

Actual leaf values:

实际叶值:

getLeafValues <- function(x) {
  indexes <- getExtendedIndexMatrix(getStructure(x))
  expr <- parse(text = paste0("x", indexes[which(indexes$leaf), "index"]))
  lapply(expr, eval)
}
getLeafValues(x)

1 个解决方案

#1


1  

Not entirely sure I understand what you are trying to accomplish, but here is a data.table solution that produces the same result as your example.

我不完全确定我理解您要完成的任务,但这里有一个data.table解决方案,可以产生与您的示例相同的结果。

library(data.table)
setDT(struc)
struc[,x1:=cumsum(X1)]
struc[,x2:=cumsum(X2),by=x1]
struc[,x3:=cumsum(X3),by=with(rle(x2),rep(1:length(lengths),lengths))]
struc
#    X1 X2 X3 x1 x2 x3
#  1:  1  0  0  1  0  0
#  2:  0  1  0  1  1  0
#  3:  0  0  1  1  1  1
#  4:  0  0  1  1  1  2
#  5:  0  1  0  1  2  0
#  6:  0  0  1  1  2  1
#  7:  0  0  1  1  2  2
#  8:  1  0  0  2  0  0
#  9:  0  1  0  2  1  0
# 10:  1  0  0  3  0  0
# 11:  1  0  0  4  0  0
# 12:  0  1  0  4  1  0
# 13:  1  0  0  5  0  0
# 14:  0  1  0  5  1  0
# 15:  1  0  0  6  0  0
# 16:  0  1  0  6  1  0
# 17:  0  0  1  6  1  1
# 18:  0  1  0  6  2  0
# 19:  0  0  1  6  2  1

#1


1  

Not entirely sure I understand what you are trying to accomplish, but here is a data.table solution that produces the same result as your example.

我不完全确定我理解您要完成的任务,但这里有一个data.table解决方案,可以产生与您的示例相同的结果。

library(data.table)
setDT(struc)
struc[,x1:=cumsum(X1)]
struc[,x2:=cumsum(X2),by=x1]
struc[,x3:=cumsum(X3),by=with(rle(x2),rep(1:length(lengths),lengths))]
struc
#    X1 X2 X3 x1 x2 x3
#  1:  1  0  0  1  0  0
#  2:  0  1  0  1  1  0
#  3:  0  0  1  1  1  1
#  4:  0  0  1  1  1  2
#  5:  0  1  0  1  2  0
#  6:  0  0  1  1  2  1
#  7:  0  0  1  1  2  2
#  8:  1  0  0  2  0  0
#  9:  0  1  0  2  1  0
# 10:  1  0  0  3  0  0
# 11:  1  0  0  4  0  0
# 12:  0  1  0  4  1  0
# 13:  1  0  0  5  0  0
# 14:  0  1  0  5  1  0
# 15:  1  0  0  6  0  0
# 16:  0  1  0  6  1  0
# 17:  0  0  1  6  1  1
# 18:  0  1  0  6  2  0
# 19:  0  0  1  6  2  1