在R中合并两个data.frame,保留所有匹配的行

时间:2022-03-09 17:03:41

I'm struggling to merge two data.frame with na values occuring in one or the other df.

我正在努力将两个data.frame与在一个或另一个df中出现的na值合并。

sampleA <- structure(list(Nom_xp = "A1MRJ", Rep = 1L, GB05 = 102L, GB05.1 = 102L, 
    GB18 = 177L, GB18.1 = 177L, GB06 = 240L, GB06.1 = 240L, GB27 = 169L, 
    GB27.1 = 169L, GB24 = 240L, GB24.1 = 242L, GB28 = NA_integer_, 
    GB28.1 = NA_integer_, GB15 = 142L, GB15.1 = 144L, GB02 = 197L, 
    GB02.1 = 197L, GB10 = 126L, GB10.1 = 134L, GB14 = 181L, GB14.1 = 193L), .Names = c("Nom_xp", 
"Rep", "GB05", "GB05.1", "GB18", "GB18.1", "GB06", "GB06.1", 
"GB27", "GB27.1", "GB24", "GB24.1", "GB28", "GB28.1", "GB15", 
"GB15.1", "GB02", "GB02.1", "GB10", "GB10.1", "GB14", "GB14.1"
), row.names = 32L, class = "data.frame")


sampleB <- structure(list(Nom_xp = "A1MRJ", Rep = 2L, GB05 = NA, GB05.1 = NA, 
    GB18 = 177L, GB18.1 = 177L, GB06 = 240L, GB06.1 = 240L, GB27 = 169L, 
    GB27.1 = 169L, GB24 = 240L, GB24.1 = 242L, GB28 = 390L, GB28.1 = 390L, 
    GB15 = 142L, GB15.1 = 144L, GB02 = 197L, GB02.1 = 197L, GB10 = 126L, 
    GB10.1 = 134L, GB14 = 181L, GB14.1 = 193L), .Names = c("Nom_xp", 
"Rep", "GB05", "GB05.1", "GB18", "GB18.1", "GB06", "GB06.1", 
"GB27", "GB27.1", "GB24", "GB24.1", "GB28", "GB28.1", "GB15", 
"GB15.1", "GB02", "GB02.1", "GB10", "GB10.1", "GB14", "GB14.1"
), row.names = 33L, class = "data.frame")

Output needed, as a data.frame. Only one line every for matching "Nom_xp", so the NA get's replaced by the values in either A or B, if the value exist in one or the other DF.

需要输出,作为data.frame。每个匹配“Nom_xp”只有一行,因此如果值存在于一个或另一个DF中,则NA将被A或B中的值替换。

Nom_xp  GB05  GB05  GB18  GB18  GB06  GB06  GB27  GB27  GB24  GB24  GB28    GB28    GB15  GB15  GB02  GB02  GB10  GB10  GB14  GB14
A1MRJ   102 102 177 177 240 240 169 169 240 242 390 390 142 144 197 197 126 134 181 193

I would've thought that :

我会这么想的:

output <- merge(A,B,by="Nom_xp",all.x=T,all.y=T)

or

output <- join(A,B,by="Nom_xp",match="all")

would give me what I need, but no luck so far... What am I missing ? Actual data.frame has more than one row.

会给我我需要的东西,但到目前为止没有运气......我错过了什么?实际data.frame有多行。

2 个解决方案

#1


1  

Do you have just one row? Then, wouldn't this be sufficient? You can get the result in sampleB as:

你有一排吗?那么,这还不够吗?您可以在sampleB中获得结果:

sampleB[, is.na(sampleB)] <- sampleA[, is.na(sampleB)]

No, apply, join and merge are not necessary here, I think. Not tested, but this would work.

不,我认为不需要申请,加入和合并。未经测试,但这可行。

sampleB[is.na(sampleB)] <- sampleA[is.na(sampleB)]

#2


0  

Not entierly sure on how your whole data set looks like but I assume you could have several samples with the same "Nom_xp" and not only 2? And that you probably have all your data in a big dataframe or such?

不是很确定你的整个数据集是怎么样的,但我想你可能有几个样本具有相同的“Nom_xp”,而不仅仅是2?而且您可能将所有数据都放在大数据帧中?

If so, maybe this code could be a good start (maybe someone can help out and re-write this much much more efficient?). Anyhow:

如果是这样,也许这段代码可能是一个好的开始(也许有人可以提供帮助并重新编写更高效的代码?)。无论如何:

sampleA <- structure(list(Nom_xp = "A1MRJ", Rep = 1L, GB05 = 102L, GB05.1 = 102L, 
                          GB18 = 177L, GB18.1 = 177L, GB06 = 240L, GB06.1 = 240L, GB27 = 169L, 
                          GB27.1 = 169L, GB24 = 240L, GB24.1 = 242L, GB28 = NA_integer_, 
                          GB28.1 = NA_integer_, GB15 = 142L, GB15.1 = 144L, GB02 = 197L, 
                          GB02.1 = 197L, GB10 = 126L, GB10.1 = 134L, GB14 = 181L, GB14.1 = 193L), .Names = c("Nom_xp", "Rep", "GB05", "GB05.1", "GB18", "GB18.1", "GB06", "GB06.1","GB27", "GB27.1", "GB24", "GB24.1", "GB28", "GB28.1", "GB15","GB15.1", "GB02", "GB02.1", "GB10", "GB10.1", "GB14", "GB14.1"), row.names = 32L, class = "data.frame")

sampleB <- structure(list(Nom_xp = "A1MRJ", Rep = 2L, GB05 = NA, GB05.1 = NA, 
                          GB18 = 177L, GB18.1 = 177L, GB06 = 240L, GB06.1 = 240L, GB27 = 169L, 
                          GB27.1 = 169L, GB24 = 240L, GB24.1 = 242L, GB28 = 390L, GB28.1 = 390L, 
                          GB15 = 142L, GB15.1 = 144L, GB02 = 197L, GB02.1 = 197L, GB10 = 126L, 
                          GB10.1 = 134L, GB14 = 181L, GB14.1 = 193L), .Names = c("Nom_xp","Rep", "GB05", "GB05.1", "GB18", "GB18.1", "GB06", "GB06.1", "GB27", "GB27.1", "GB24", "GB24.1", "GB28", "GB28.1", "GB15", "GB15.1", "GB02", "GB02.1", "GB10", "GB10.1", "GB14", "GB14.1"  ), row.names = 33L, class = "data.frame")

sampleC <- structure(list(Nom_xp = "ASDF", Rep = 2L, GB05 = NA, GB05.1 = NA, 
                          GB18 = 177L, GB18.1 = 177L, GB06 = 240L, GB06.1 = 240L, GB27 = 12349L, 
                          GB27.1 = 3, GB24 = 234112, GB24.1 = 242L, GB28 = 234, GB28.1 = 390L, 
                          GB15 = NA, GB15.1 = 144L, GB02 = 197L, GB02.1 = 197L, GB10 = 126L, 
                          GB10.1 = 134L, GB14 = NA, GB14.1 = 193L), .Names = c("Nom_xp", "Rep", "GB05", "GB05.1", "GB18", "GB18.1", "GB06", "GB06.1", "GB27", "GB27.1", "GB24", "GB24.1", "GB28", "GB28.1", "GB15", "GB15.1", "GB02", "GB02.1", "GB10", "GB10.1", "GB14", "GB14.1"), row.names = 34L, class = "data.frame")

sampleD <- structure(list(Nom_xp = "ASDF", Rep = 2L, GB05 = 214, GB05.1 = 34, 
                          GB18 = 177L, GB18.1 = 177L, GB06 = 240L, GB06.1 = 240L, GB27 = 169L, 
                          GB27.1 = 3, GB24 = NA, GB24.1 = 242L, GB28 = 234, GB28.1 = 390L, 
                          GB15 = 56, GB15.1 = 144L, GB02 = 197L, GB02.1 = 197L, GB10 = 15466L, 
                          GB10.1 = 134L, GB14 = 34, GB14.1 = 193L), .Names = c("Nom_xp", "Rep", "GB05", "GB05.1", "GB18", "GB18.1", "GB06", "GB06.1", "GB27", "GB27.1", "GB24", "GB24.1", "GB28", "GB28.1", "GB15", "GB15.1", "GB02", "GB02.1", "GB10", "GB10.1", "GB14", "GB14.1"), row.names = 35L, class = "data.frame")

cdat<-rbind(sampleA,sampleB,sampleC,sampleD) #simulating your data set (?)
dcols<-dim(cdat)[2]

mat<-matrix(nrow=length(unique(cdat$Nom_xp)),ncol=dcols)
colnames(mat)<-colnames(cdat)
for (j in 1:length(unique(cdat$Nom_xp))) 
{
  g<-grep(unique(cdat$Nom_xp)[j],cdat$Nom_xp)   #Get the Nom_xp rows that match
  mat[j,1]<-cdat[g[1],1]                        #Fill in the "Nom_xp"
  mat[j,2]<-paste(g,collapse=" ")               #Fill in the "rep"
  mat[j,3:dcols]<-apply(cdat[g,3:dcols],2,      #Calculate a mean for each column
   function(x){as.numeric(mean(x,na.rm=T))})          
}

#1


1  

Do you have just one row? Then, wouldn't this be sufficient? You can get the result in sampleB as:

你有一排吗?那么,这还不够吗?您可以在sampleB中获得结果:

sampleB[, is.na(sampleB)] <- sampleA[, is.na(sampleB)]

No, apply, join and merge are not necessary here, I think. Not tested, but this would work.

不,我认为不需要申请,加入和合并。未经测试,但这可行。

sampleB[is.na(sampleB)] <- sampleA[is.na(sampleB)]

#2


0  

Not entierly sure on how your whole data set looks like but I assume you could have several samples with the same "Nom_xp" and not only 2? And that you probably have all your data in a big dataframe or such?

不是很确定你的整个数据集是怎么样的,但我想你可能有几个样本具有相同的“Nom_xp”,而不仅仅是2?而且您可能将所有数据都放在大数据帧中?

If so, maybe this code could be a good start (maybe someone can help out and re-write this much much more efficient?). Anyhow:

如果是这样,也许这段代码可能是一个好的开始(也许有人可以提供帮助并重新编写更高效的代码?)。无论如何:

sampleA <- structure(list(Nom_xp = "A1MRJ", Rep = 1L, GB05 = 102L, GB05.1 = 102L, 
                          GB18 = 177L, GB18.1 = 177L, GB06 = 240L, GB06.1 = 240L, GB27 = 169L, 
                          GB27.1 = 169L, GB24 = 240L, GB24.1 = 242L, GB28 = NA_integer_, 
                          GB28.1 = NA_integer_, GB15 = 142L, GB15.1 = 144L, GB02 = 197L, 
                          GB02.1 = 197L, GB10 = 126L, GB10.1 = 134L, GB14 = 181L, GB14.1 = 193L), .Names = c("Nom_xp", "Rep", "GB05", "GB05.1", "GB18", "GB18.1", "GB06", "GB06.1","GB27", "GB27.1", "GB24", "GB24.1", "GB28", "GB28.1", "GB15","GB15.1", "GB02", "GB02.1", "GB10", "GB10.1", "GB14", "GB14.1"), row.names = 32L, class = "data.frame")

sampleB <- structure(list(Nom_xp = "A1MRJ", Rep = 2L, GB05 = NA, GB05.1 = NA, 
                          GB18 = 177L, GB18.1 = 177L, GB06 = 240L, GB06.1 = 240L, GB27 = 169L, 
                          GB27.1 = 169L, GB24 = 240L, GB24.1 = 242L, GB28 = 390L, GB28.1 = 390L, 
                          GB15 = 142L, GB15.1 = 144L, GB02 = 197L, GB02.1 = 197L, GB10 = 126L, 
                          GB10.1 = 134L, GB14 = 181L, GB14.1 = 193L), .Names = c("Nom_xp","Rep", "GB05", "GB05.1", "GB18", "GB18.1", "GB06", "GB06.1", "GB27", "GB27.1", "GB24", "GB24.1", "GB28", "GB28.1", "GB15", "GB15.1", "GB02", "GB02.1", "GB10", "GB10.1", "GB14", "GB14.1"  ), row.names = 33L, class = "data.frame")

sampleC <- structure(list(Nom_xp = "ASDF", Rep = 2L, GB05 = NA, GB05.1 = NA, 
                          GB18 = 177L, GB18.1 = 177L, GB06 = 240L, GB06.1 = 240L, GB27 = 12349L, 
                          GB27.1 = 3, GB24 = 234112, GB24.1 = 242L, GB28 = 234, GB28.1 = 390L, 
                          GB15 = NA, GB15.1 = 144L, GB02 = 197L, GB02.1 = 197L, GB10 = 126L, 
                          GB10.1 = 134L, GB14 = NA, GB14.1 = 193L), .Names = c("Nom_xp", "Rep", "GB05", "GB05.1", "GB18", "GB18.1", "GB06", "GB06.1", "GB27", "GB27.1", "GB24", "GB24.1", "GB28", "GB28.1", "GB15", "GB15.1", "GB02", "GB02.1", "GB10", "GB10.1", "GB14", "GB14.1"), row.names = 34L, class = "data.frame")

sampleD <- structure(list(Nom_xp = "ASDF", Rep = 2L, GB05 = 214, GB05.1 = 34, 
                          GB18 = 177L, GB18.1 = 177L, GB06 = 240L, GB06.1 = 240L, GB27 = 169L, 
                          GB27.1 = 3, GB24 = NA, GB24.1 = 242L, GB28 = 234, GB28.1 = 390L, 
                          GB15 = 56, GB15.1 = 144L, GB02 = 197L, GB02.1 = 197L, GB10 = 15466L, 
                          GB10.1 = 134L, GB14 = 34, GB14.1 = 193L), .Names = c("Nom_xp", "Rep", "GB05", "GB05.1", "GB18", "GB18.1", "GB06", "GB06.1", "GB27", "GB27.1", "GB24", "GB24.1", "GB28", "GB28.1", "GB15", "GB15.1", "GB02", "GB02.1", "GB10", "GB10.1", "GB14", "GB14.1"), row.names = 35L, class = "data.frame")

cdat<-rbind(sampleA,sampleB,sampleC,sampleD) #simulating your data set (?)
dcols<-dim(cdat)[2]

mat<-matrix(nrow=length(unique(cdat$Nom_xp)),ncol=dcols)
colnames(mat)<-colnames(cdat)
for (j in 1:length(unique(cdat$Nom_xp))) 
{
  g<-grep(unique(cdat$Nom_xp)[j],cdat$Nom_xp)   #Get the Nom_xp rows that match
  mat[j,1]<-cdat[g[1],1]                        #Fill in the "Nom_xp"
  mat[j,2]<-paste(g,collapse=" ")               #Fill in the "rep"
  mat[j,3:dcols]<-apply(cdat[g,3:dcols],2,      #Calculate a mean for each column
   function(x){as.numeric(mean(x,na.rm=T))})          
}