I have a data frame that looks like the following:
我有一个数据框,如下所示:
system Id initial final
665 9 16001 6070 6071
683 10 16001 6100 6101
696 11 16001 6101 6113
712 10 16971 6150 6151
715 11 16971 6151 6163
4966 7 4118 10238 10242
5031 9 4118 10260 10278
5088 10 4118 10279 10304
5115 11 4118 10305 10317
structure(list(system = c(9L, 10L, 11L, 10L, 11L, 7L, 9L, 10L,
11L), Id = c(16001L, 16001L, 16001L, 16971L, 16971L, 4118L, 4118L,
4118L, 4118L), initial = c(6070, 6100, 6101, 6150, 6151, 10238,
10260, 10279, 10305), final = c(6071, 6101, 6113, 6151, 6163,
10242, 10278, 10304, 10317)), .Names = c("system", "Id", "initial",
"final"), row.names = c(665L, 683L, 696L, 712L, 715L, 4966L,
5031L, 5088L, 5115L), class = "data.frame")
I would like to get a new data frame with the following structure:
我想得到一个具有如下结构的新数据框:
Id system length initial final
1 16001 9,10,11 3 6070 6113
2 16971 10,11 2 6150 6163
3 4118 7 1 10238 10242
4 4118 9,10,11 3 10260 10317
structure(list(Id = c(16001L, 16971L, 4118L, 4118L), system = structure(c(3L,
1L, 2L, 3L), .Label = c("10,11", "7", "9,10,11"), class = "factor"),
length = c(3L, 2L, 1L, 3L), initial = c(6070L, 6150L, 10238L,
10260L), final = c(6113, 6163, 10242, 10317)), .Names = c("Id",
"system", "length", "initial", "final"), class = "data.frame", row.names = c(NA,
-4L))
Rows should be grouped by Id and, within each Id, into runs of consecutive rows whose "system" values increase by exactly one from row to row. For each group I would also like to get the "system" values involved and how many of them there are (the "length" column). Finally, I need a column with the first "initial" and a column with the last "final" of each group.
分组依据是 Id,以及相邻行之间“system”字段的差值等于 1。另外,我想得到每个分组中包含哪些“system”值,以及它们的个数(“length”列)。最后,还需要一列该分组中第一个“initial”和一列最后一个“final”。
Is it possible to do that in R? Thanks.
在 R 中可以做到这一点吗?谢谢。
2 个解决方案
#1
3
You could use data.table. Convert the "data.frame" to a "data.table" (setDT), then create a grouping variable "indx": take the difference of adjacent elements of "system" (diff(system)), test whether it differs from 1, and take the cumsum of the resulting logical vector. Finally, use "Id" and "indx" as grouping variables to get the statistics.
您可以使用 data.table。将“data.frame”转换为“data.table”(setDT),然后创建分组变量“indx”:对“system”的相邻元素求差(diff(system)),判断差值是否不等于 1,再对得到的逻辑向量求累加和(cumsum)。最后使用“Id”和“indx”作为分组变量来计算统计信息。
library(data.table)
# Collapse every run of consecutive `system` values into a single row.
# `indx` is a run counter computed over the whole table: it increases by one
# each time `system` does not step up by exactly 1 from the previous row.
# Grouping on both `Id` and `indx` also separates runs belonging to
# different Ids. `indx` is only a helper key and is removed at the end.
# Note: setDT() converts `df` to a data.table by reference.
setDT(df)[
  ,
  .(
    system = toString(system),
    length = .N,
    initial = head(initial, 1L),
    final = tail(final, 1L)
  ),
  by = .(Id, indx = cumsum(c(TRUE, diff(system) != 1)))
][
  ,
  indx := NULL
][]
# Id system length initial final
#1: 16001 9, 10, 11 3 6070 6113
#2: 16971 10, 11 2 6150 6163
#3: 4118 7 1 10238 10242
#4: 4118 9, 10, 11 3 10260 10317
Or based on @jazzurro's comment about using first/last
functions from dplyr
,
或者根据 @jazzurro 的评论,使用 dplyr 中的 first/last 函数:
library(dplyr)
# Collapse every run of consecutive `system` values into a single row.
# `indx` is a run counter: it increases by one each time `system` does not
# step up by exactly 1 from the previous row. Grouping on `indx` and `Id`
# together also separates runs belonging to different Ids.
df %>%
  group_by(indx = cumsum(c(TRUE, diff(system) != 1)), Id) %>%
  summarise(
    system = toString(system),
    length = n(),
    initial = first(initial),
    final = last(final)
  ) %>%
  # summarise() only peels off the last grouping level, so the result is
  # still grouped by `indx`; ungroup before dropping that helper column so
  # the output has exactly Id, system, length, initial, final.
  ungroup() %>%
  select(-indx)
#2
1
A solution without data.table, using plyr instead:
不使用 data.table、而是使用 plyr 的解决方案:
library(plyr)
# Summarise the rows of a single Id: one output row per run of consecutive
# `system` values (each value exactly 1 greater than the previous row's).
#
# subdf: data frame for one Id with columns system, Id, initial, final,
#        in the order in which runs should be detected.
# Returns a data frame with one row per run: `.id` (run number within the
# Id, added by ldply), the comma-separated `system` values, `Id`, the run
# `length`, the first `initial` and the last `final`.
func <- function(subdf) {
  # Run counter: starts at 1 and increases whenever the step from the
  # previous row is not exactly 1. A TRUE/FALSE key would allow at most two
  # groups and would merge separate runs (e.g. system = 7, 8, 10, 11).
  run_id <- cumsum(c(TRUE, diff(subdf$system) != 1))
  ldply(split(subdf, run_id), function(u) {
    data.frame(
      system = paste(u$system, collapse = ","),
      Id = unique(u$Id),
      length = nrow(u),
      initial = head(u, 1)$initial,
      final = tail(u, 1)$final
    )
  })
}
ldply(split(df, df$Id), func)
#   .id  system    Id length initial final
# 1   1       7  4118      1   10238 10242
# 2   2 9,10,11  4118      3   10260 10317
# 3   1 9,10,11 16001      3    6070  6113
# 4   1   10,11 16971      2    6150  6163
#1
3
You could use data.table. Convert the "data.frame" to a "data.table" (setDT), then create a grouping variable "indx": take the difference of adjacent elements of "system" (diff(system)), test whether it differs from 1, and take the cumsum of the resulting logical vector. Finally, use "Id" and "indx" as grouping variables to get the statistics.
您可以使用 data.table。将“data.frame”转换为“data.table”(setDT),然后创建分组变量“indx”:对“system”的相邻元素求差(diff(system)),判断差值是否不等于 1,再对得到的逻辑向量求累加和(cumsum)。最后使用“Id”和“indx”作为分组变量来计算统计信息。
library(data.table)
# Collapse every run of consecutive `system` values into a single row.
# `indx` is a run counter computed over the whole table: it increases by one
# each time `system` does not step up by exactly 1 from the previous row.
# Grouping on both `Id` and `indx` also separates runs belonging to
# different Ids. `indx` is only a helper key and is removed at the end.
# Note: setDT() converts `df` to a data.table by reference.
setDT(df)[
  ,
  .(
    system = toString(system),
    length = .N,
    initial = head(initial, 1L),
    final = tail(final, 1L)
  ),
  by = .(Id, indx = cumsum(c(TRUE, diff(system) != 1)))
][
  ,
  indx := NULL
][]
# Id system length initial final
#1: 16001 9, 10, 11 3 6070 6113
#2: 16971 10, 11 2 6150 6163
#3: 4118 7 1 10238 10242
#4: 4118 9, 10, 11 3 10260 10317
Or based on @jazzurro's comment about using first/last
functions from dplyr
,
或者根据 @jazzurro 的评论,使用 dplyr 中的 first/last 函数:
library(dplyr)
# Collapse every run of consecutive `system` values into a single row.
# `indx` is a run counter: it increases by one each time `system` does not
# step up by exactly 1 from the previous row. Grouping on `indx` and `Id`
# together also separates runs belonging to different Ids.
df %>%
  group_by(indx = cumsum(c(TRUE, diff(system) != 1)), Id) %>%
  summarise(
    system = toString(system),
    length = n(),
    initial = first(initial),
    final = last(final)
  ) %>%
  # summarise() only peels off the last grouping level, so the result is
  # still grouped by `indx`; ungroup before dropping that helper column so
  # the output has exactly Id, system, length, initial, final.
  ungroup() %>%
  select(-indx)
#2
1
A solution without data.table, using plyr instead:
不使用 data.table、而是使用 plyr 的解决方案:
library(plyr)
# Summarise the rows of a single Id: one output row per run of consecutive
# `system` values (each value exactly 1 greater than the previous row's).
#
# subdf: data frame for one Id with columns system, Id, initial, final,
#        in the order in which runs should be detected.
# Returns a data frame with one row per run: `.id` (run number within the
# Id, added by ldply), the comma-separated `system` values, `Id`, the run
# `length`, the first `initial` and the last `final`.
func <- function(subdf) {
  # Run counter: starts at 1 and increases whenever the step from the
  # previous row is not exactly 1. A TRUE/FALSE key would allow at most two
  # groups and would merge separate runs (e.g. system = 7, 8, 10, 11).
  run_id <- cumsum(c(TRUE, diff(subdf$system) != 1))
  ldply(split(subdf, run_id), function(u) {
    data.frame(
      system = paste(u$system, collapse = ","),
      Id = unique(u$Id),
      length = nrow(u),
      initial = head(u, 1)$initial,
      final = tail(u, 1)$final
    )
  })
}
ldply(split(df, df$Id), func)
#   .id  system    Id length initial final
# 1   1       7  4118      1   10238 10242
# 2   2 9,10,11  4118      3   10260 10317
# 3   1 9,10,11 16001      3    6070  6113
# 4   1   10,11 16971      2    6150  6163