I guess I'm trying to compute cumulative sums on certain data segments/groups where the grouping needs to happen in multiple cycles and in that sense, is somewhat nested (sorry, I don't know how better to describe it).
我想我正在尝试计算某些数据段/组的累积总和,其中分组需要在多个周期中发生,并且在这个意义上,有点嵌套(抱歉,我不知道如何更好地描述它)。
I'm trying to get from this
我想要摆脱这个
struc <- structure(list(X1 = c(1, 0, 0, 0, 0, 0, 0, 1, 0, 1, 1, 0, 1,
0, 1, 0, 0, 0, 0), X2 = c(0, 1, 0, 0, 1, 0, 0, 0, 1, 0, 0, 1,
0, 1, 0, 1, 0, 1, 0), X3 = c(0, 0, 1, 1, 0, 1, 1, 0, 0, 0, 0,
0, 0, 0, 0, 0, 1, 0, 1)), .Names = c("X1", "X2", "X3"), row.names = c(NA,
-19L), class = "data.frame")
> struc
X1 X2 X3
1 1 0 0
2 0 1 0
3 0 0 1
4 0 0 1
5 0 1 0
6 0 0 1
7 0 0 1
8 1 0 0
9 0 1 0
10 1 0 0
11 1 0 0
12 0 1 0
13 1 0 0
14 0 1 0
15 1 0 0
16 0 1 0
17 0 0 1
18 0 1 0
19 0 0 1
to this
对此
> data.frame(
x1 = cumsum(struc[,1]),
x2 = c(0, rep(1, 3), rep(2, 3), 0, 1, rep(0, 2), 1, 0, 1, 0, rep(1, 2), rep(2, 2)),
x3 = c(rep(0, 2), 1, 2, 0, 1, 2, rep(0, 9), 1, 0, 1)
)
x1 x2 x3
1 1 0 0
2 1 1 0
3 1 1 1
4 1 1 2
5 1 2 0
6 1 2 1
7 1 2 2
8 2 0 0
9 2 1 0
10 3 0 0
11 4 0 0
12 4 1 0
13 5 0 0
14 5 1 0
15 6 0 0
16 6 1 0
17 6 1 1
18 6 2 0
19 6 2 1
EDIT
Background
背景
I'm using the info in struc
to come up with a fast way to compute sort of an index matrix that tells me about the structure of arbitrary deeply nested lists and that I could use for <-
and [[
operations on those lists:
我正在使用struc中的信息来提出一种快速计算索引矩阵的方法,该矩阵告诉我任意深度嵌套列表的结构,并且我可以用于< - 和[[对这些列表的操作:
x <- list(
x1 = list(x11 = list(x111 = 1, x112 = 1), x12 = list(x121 = 1, x122 = 1)),
b = list(c = 2),
c = 3,
d = list(1:4),
list(1:5),
list(list(1:2), list(3:4))
)
getStructure <- function(x) {
struc <- capture.output(str(x))
struc <- unlist(strsplit(struc, split = "\n"))
tops <- str_count(struc, "\\s\\$\\s")
subs <- str_count(struc, "((\\.\\.)(\\s|\\$))")
## Clean //
idx_out <- which(tops == 0 & subs == 0)
if (length(idx_out)) {
tops <- tops[-idx_out]
subs <- subs[-idx_out]
struc <- struc[-idx_out]
}
## Levels //
subs_2 <- lapply(0:subs[which.max(subs)], function(ii) {
out <- subs == ii
out[out] <- 1
out
})
names(subs_2) <- 1:length(subs_2)
data.frame(subs_2)
}
The best I could come up with so far is to loop over the columns in a certain way, but it feels like I'm not using the power of clever matrix algebra or the power of dplyr
or things like filter()
and similar "split-and-combine" approaches:
到目前为止我能想出的最好的方法是以某种方式循环使用列,但感觉就像我没有使用聪明的矩阵代数或dplyr的强大功能或类似filter()和类似的“分裂”并结合“方法:
getIndexMatrix <- function(x) {
for (ii in 1:ncol(x)) {
if (ii == 1) {
x[,ii] <- cumsum(x[,ii])
} else {
f <- factor(apply(x[,1:(ii-1), drop = FALSE], 1, paste, collapse = "-"))
spl <- split(x, f = f)
out <- lapply(spl, function(ii) {
apply(ii, 2, cumsum)
})
x[,ii] <- do.call("rbind", out)[,ii]
}
NULL
}
x
}
(indexes <- getIndexMatrix(struc))
X1 X2 X3
1 1 0 0
2 1 1 0
3 1 1 1
4 1 1 2
5 1 2 0
6 1 2 1
7 1 2 2
8 2 0 0
9 2 1 0
10 3 0 0
11 4 0 0
12 4 1 0
13 5 0 0
14 5 1 0
15 6 0 0
16 6 1 0
17 6 1 1
18 6 2 0
19 6 2 1
This could for example be used to retrieve the "leaf" values of a nested list.
例如,这可以用于检索嵌套列表的“叶子”值。
Auxiliary function:
辅助功能:
getExtendedIndexMatrix <- function(x) {
x <- getIndexMatrix(x)
substituteZeros <- function(x) {
x[!x] <- NA
x
}
tmp <- as.data.frame(apply(x, 2, substituteZeros))
spl <- split(tmp, f = tmp[,1])
tmp <- lapply(spl, function(ii) {
info <- lapply(1:nrow(ii), function(ii2) {
data.frame(
scope = length(na.omit(as.numeric(ii[ii2,,drop = TRUE]))),
index = paste0("[[", paste(na.omit(as.numeric(ii[ii2,,drop = TRUE])),
collapse = "]][["), "]]")
)
})
info <- do.call("rbind", info)
leaf <- rep(FALSE, nrow(info))
leaf[which(info$scope == which.max(info$scope))] <- TRUE
data.frame(ii, leaf = leaf, info)
})
do.call("rbind", tmp)
}
> getExtendedIndexMatrix(struc)
X1 X2 X3 leaf scope index
1.1 1 NA NA FALSE 1 [[1]]
1.2 1 1 NA FALSE 2 [[1]][[1]]
1.3 1 1 1 TRUE 3 [[1]][[1]][[1]]
1.4 1 1 2 TRUE 3 [[1]][[1]][[2]]
1.5 1 2 NA FALSE 2 [[1]][[2]]
1.6 1 2 1 TRUE 3 [[1]][[2]][[1]]
1.7 1 2 2 TRUE 3 [[1]][[2]][[2]]
2.8 2 NA NA FALSE 1 [[2]]
2.9 2 1 NA TRUE 2 [[2]][[1]]
3 3 NA NA TRUE 1 [[3]]
4.11 4 NA NA FALSE 1 [[4]]
4.12 4 1 NA TRUE 2 [[4]][[1]]
5.13 5 NA NA FALSE 1 [[5]]
5.14 5 1 NA TRUE 2 [[5]][[1]]
6.15 6 NA NA FALSE 1 [[6]]
6.16 6 1 NA FALSE 2 [[6]][[1]]
6.17 6 1 1 TRUE 3 [[6]][[1]][[1]]
6.18 6 2 NA FALSE 2 [[6]][[2]]
6.19 6 2 1 TRUE 3 [[6]][[2]][[1]]
Actual leaf values:
实际叶值:
getLeafValues <- function(x) {
indexes <- getExtendedIndexMatrix(getStructure(x))
expr <- parse(text = paste0("x", indexes[which(indexes$leaf), "index"]))
lapply(expr, eval)
}
getLeafValues(x)
1 个解决方案
#1
1
Not entirely sure I understand what you are trying to accomplish, but here is a data.table solution that produces the same result as your example.
我不完全确定我理解您要完成的任务,但这里有一个data.table解决方案,可以产生与您的示例相同的结果。
library(data.table)
setDT(struc)
struc[,x1:=cumsum(X1)]
struc[,x2:=cumsum(X2),by=x1]
struc[,x3:=cumsum(X3),by=with(rle(x2),rep(1:length(lengths),lengths))]
struc
# X1 X2 X3 x1 x2 x3
# 1: 1 0 0 1 0 0
# 2: 0 1 0 1 1 0
# 3: 0 0 1 1 1 1
# 4: 0 0 1 1 1 2
# 5: 0 1 0 1 2 0
# 6: 0 0 1 1 2 1
# 7: 0 0 1 1 2 2
# 8: 1 0 0 2 0 0
# 9: 0 1 0 2 1 0
# 10: 1 0 0 3 0 0
# 11: 1 0 0 4 0 0
# 12: 0 1 0 4 1 0
# 13: 1 0 0 5 0 0
# 14: 0 1 0 5 1 0
# 15: 1 0 0 6 0 0
# 16: 0 1 0 6 1 0
# 17: 0 0 1 6 1 1
# 18: 0 1 0 6 2 0
# 19: 0 0 1 6 2 1
#1
1
Not entirely sure I understand what you are trying to accomplish, but here is a data.table solution that produces the same result as your example.
我不完全确定我理解您要完成的任务,但这里有一个data.table解决方案,可以产生与您的示例相同的结果。
library(data.table)
setDT(struc)
struc[,x1:=cumsum(X1)]
struc[,x2:=cumsum(X2),by=x1]
struc[,x3:=cumsum(X3),by=with(rle(x2),rep(1:length(lengths),lengths))]
struc
# X1 X2 X3 x1 x2 x3
# 1: 1 0 0 1 0 0
# 2: 0 1 0 1 1 0
# 3: 0 0 1 1 1 1
# 4: 0 0 1 1 1 2
# 5: 0 1 0 1 2 0
# 6: 0 0 1 1 2 1
# 7: 0 0 1 1 2 2
# 8: 1 0 0 2 0 0
# 9: 0 1 0 2 1 0
# 10: 1 0 0 3 0 0
# 11: 1 0 0 4 0 0
# 12: 0 1 0 4 1 0
# 13: 1 0 0 5 0 0
# 14: 0 1 0 5 1 0
# 15: 1 0 0 6 0 0
# 16: 0 1 0 6 1 0
# 17: 0 0 1 6 1 1
# 18: 0 1 0 6 2 0
# 19: 0 0 1 6 2 1