I have a bit of an unclear question, so I hope I can explain this properly. I am using R. I know for loops can be slow in R, but for me it would be ok to use a for loop in this case.
我有一个不太清楚的问题,希望我能解释清楚。我使用R,我知道在R中for循环是很慢的,但是对于我来说,在这个例子中使用for循环是可以的。
I have a dataframe like this:
我有一个这样的dataframe:
id_A id_B id_C calc_A calc_B calc_C
1 x,z d g,f 1 1 5
2 x,y,z d,e f 1 2 8
3 y,z d,e g 6 7 1
I also have a vector with the names c('A', 'B', 'C', etc.)
What I want to do is to count for every row, how many id
’s have a calc
<= 2. id_A
is linked to calc_A
, etc.
我还有一个名为c的向量(' a ', 'B', ' c '等等)我要做的是对每一行进行计数,有多少id有一个calc <= 2。id_A链接到calc_A,等等。
For example, for the first row A and B have calc
values <= 2, together A and B have 3 id
's. So the output will be something like this:
例如,对于第一行A和B有calc值<= 2,A和B有3个id。输出是这样的
count
1 3
2 5
3 1
3 个解决方案
#1
3
It's a bit messy, but this should do the trick (for data.frame d
):
这有点乱,但这应该可以达到目的(对于d帧来说):
# store indices of calc columns and id columns
calc.cols <- grep('^calc', names(d))
id.cols <- grep('^id', names(d))
sapply(split(d, seq_len(nrow(d))), function(x) {
length(unique(unlist(strsplit(paste(x[, id.cols][which(x[, calc.cols] <= 2)],
collapse=','), ','))))
})
# 1 2 3
# 3 5 1
#2
1
Assuming that the ID
columns and the calc
columns are in the same order
假设ID列和calc列的顺序相同
library(stringr)
indx <- sapply(df[,1:3], str_count, ",")+1
indx[df[,4:6] >2] <- NA
df$count <- rowSums(indx,na.rm=TRUE)
df
# id_A id_B id_C calc_A calc_B calc_C count
#1 x,z d g,f 1 1 5 3
#2 x,y,z d,e f 1 2 8 5
#3 y,z d,e g 6 7 1 1
Update
Suppose, your dataset is not in the same order
假设您的数据集不是按相同的顺序排列的
set.seed(42)
df1 <- df[,sample(6)]
library(gtools)
df2 <-df1[,mixedorder(names(df1))]
# calc_A calc_B calc_C id_A id_B id_C
#1 1 1 5 x,z d g,f
#2 1 2 8 x,y,z d,e f
#3 6 7 1 y,z d,e g
id1 <- grep("^id", colnames(df2))
calc1 <- grep("^calc", colnames(df2))
indx1 <-sapply(df2[, id1], str_count, ",")+1
indx1[df2[, calc1] >2] <- NA
df1$count <- rowSums(indx1, na.rm=TRUE)
df1
# calc_C calc_B id_B id_C calc_A id_A count
#1 5 1 d g,f 1 x,z 3
#2 8 2 d,e f 1 x,y,z 5
#3 1 7 d,e g 6 y,z 1
data
df <- structure(list(id_A = c("x,z", "x,y,z", "y,z"), id_B = c("d",
"d,e", "d,e"), id_C = c("g,f", "f", "g"), calc_A = c(1L, 1L,
6L), calc_B = c(1L, 2L, 7L), calc_C = c(5L, 8L, 1L)), .Names = c("id_A",
"id_B", "id_C", "calc_A", "calc_B", "calc_C"), class = "data.frame", row.names = c("1",
"2", "3"))
#3
0
I don't know if this is less messy than jbaums solution but here is another option :
我不知道这是否比jbaums解决方案简单,但这里有另一个选择:
mydf<-data.frame(id_A=c("x,y","x,y,z","y,z"),id_B=c("d","d,e","d,e"),id_C=c("g,f","f","g"),
calc_A=c(1,1,6),calc_B=c(1,2,7),calc_C=c(5,8,1),stringsAsFactors=F)
mydf$count<-apply(mydf,1,function(rg,namesrg){
rg_calc<-rg[grep("calc",namesrg)]
rg_ids<-rg[grep("id",namesrg)]
idsinf2<-which(as.numeric( rg_calc)<=2)
ttids<-unlist(sapply(rg_ids[gsub("calc","id",names(rg_calc[idsinf2]))],function(id){strsplit(id,",")[[1]]}))
return(length(ttids))
},colnames(mydf))
> mydf
id_A id_B id_C calc_A calc_B calc_C count
1 x,y d g,f 1 1 5 3
2 x,y,z d,e f 1 2 8 5
3 y,z d,e g 6 7 1 1
#1
3
It's a bit messy, but this should do the trick (for data.frame d
):
这有点乱,但这应该可以达到目的(对于d帧来说):
# store indices of calc columns and id columns
calc.cols <- grep('^calc', names(d))
id.cols <- grep('^id', names(d))
sapply(split(d, seq_len(nrow(d))), function(x) {
length(unique(unlist(strsplit(paste(x[, id.cols][which(x[, calc.cols] <= 2)],
collapse=','), ','))))
})
# 1 2 3
# 3 5 1
#2
1
Assuming that the ID
columns and the calc
columns are in the same order
假设ID列和calc列的顺序相同
library(stringr)
indx <- sapply(df[,1:3], str_count, ",")+1
indx[df[,4:6] >2] <- NA
df$count <- rowSums(indx,na.rm=TRUE)
df
# id_A id_B id_C calc_A calc_B calc_C count
#1 x,z d g,f 1 1 5 3
#2 x,y,z d,e f 1 2 8 5
#3 y,z d,e g 6 7 1 1
Update
Suppose, your dataset is not in the same order
假设您的数据集不是按相同的顺序排列的
set.seed(42)
df1 <- df[,sample(6)]
library(gtools)
df2 <-df1[,mixedorder(names(df1))]
# calc_A calc_B calc_C id_A id_B id_C
#1 1 1 5 x,z d g,f
#2 1 2 8 x,y,z d,e f
#3 6 7 1 y,z d,e g
id1 <- grep("^id", colnames(df2))
calc1 <- grep("^calc", colnames(df2))
indx1 <-sapply(df2[, id1], str_count, ",")+1
indx1[df2[, calc1] >2] <- NA
df1$count <- rowSums(indx1, na.rm=TRUE)
df1
# calc_C calc_B id_B id_C calc_A id_A count
#1 5 1 d g,f 1 x,z 3
#2 8 2 d,e f 1 x,y,z 5
#3 1 7 d,e g 6 y,z 1
data
df <- structure(list(id_A = c("x,z", "x,y,z", "y,z"), id_B = c("d",
"d,e", "d,e"), id_C = c("g,f", "f", "g"), calc_A = c(1L, 1L,
6L), calc_B = c(1L, 2L, 7L), calc_C = c(5L, 8L, 1L)), .Names = c("id_A",
"id_B", "id_C", "calc_A", "calc_B", "calc_C"), class = "data.frame", row.names = c("1",
"2", "3"))
#3
0
I don't know if this is less messy than jbaums solution but here is another option :
我不知道这是否比jbaums解决方案简单,但这里有另一个选择:
mydf<-data.frame(id_A=c("x,y","x,y,z","y,z"),id_B=c("d","d,e","d,e"),id_C=c("g,f","f","g"),
calc_A=c(1,1,6),calc_B=c(1,2,7),calc_C=c(5,8,1),stringsAsFactors=F)
mydf$count<-apply(mydf,1,function(rg,namesrg){
rg_calc<-rg[grep("calc",namesrg)]
rg_ids<-rg[grep("id",namesrg)]
idsinf2<-which(as.numeric( rg_calc)<=2)
ttids<-unlist(sapply(rg_ids[gsub("calc","id",names(rg_calc[idsinf2]))],function(id){strsplit(id,",")[[1]]}))
return(length(ttids))
},colnames(mydf))
> mydf
id_A id_B id_C calc_A calc_B calc_C count
1 x,y d g,f 1 1 5 3
2 x,y,z d,e f 1 2 8 5
3 y,z d,e g 6 7 1 1