I'm working with a large matrix, M, which contains sample by gene data. Certain elements contain multiple concatenated entries, but the vectors have been converted to characters:
我正在处理一个大矩阵M,其中包含"样本×基因"的数据。某些元素包含多个拼接在一起的条目,但这些向量已被转换为字符串:
geneA gene2
sample1 NA NA
sample2 "c(\"siteX\", \"siteY\")" "0"
sample3 "siteZ" "0"
So when I call unique(M[,'geneA']) I get:
因此,当我调用unique(M[,'geneA'])时,我得到:
NA "c(\"siteX\", \"siteY\")" "siteZ"
Is there any way to 'de-characterize' the matrix so that I can obtain all of the unique values for geneA when I run my code? Or would it be better to focus instead on extracting and manipulating the elements of interest using regular expressions?
有没有办法将矩阵"去字符化"(de-characterize),以便在运行代码时可以获得geneA的所有唯一值?还是说,更好的做法是使用正则表达式来提取和处理感兴趣的元素?
Thanks in advance!
提前致谢!
edit:
编辑:
> dput(tmp)
structure(c(NA, "c(\"siteX\", \"siteY\")",
"siteZ", NA, "0", "0"), .Dim = c(3L, 2L), .Dimnames = list(
c("sample1", "sample2", "sample3"), c("geneA",
"gene2")))
1 个解决方案
#1
1
If you just want the unique geneA values:
如果您只想要geneA的唯一值:
# Rebuild the example matrix from the question. The geneA column holds
# character strings, one of which is a deparsed vector:
# "c(\"siteX\", \"siteY\")".
df <- structure(
  c(NA, "c(\"siteX\", \"siteY\")", "siteZ", NA, "0", "0"),
  .Dim = c(3L, 2L),
  .Dimnames = list(c("sample1", "sample2", "sample3"), c("geneA", "gene2"))
)
df <- data.frame(df, stringsAsFactors = FALSE)
df$geneA <- as.character(df$geneA)

# Split every cell on commas, then keep only the text found between double
# quotes in each piece. Pieces without quotes (e.g. "siteZ") and NA cells pass
# through sub() unchanged.
# lapply() + unlist() always yields a flat character vector. The previous
# nested sapply() could simplify to a matrix when all cells split into the
# same number of pieces, which made unique() compare rows instead of values.
pieces <- strsplit(df$geneA, ",")
geneA <- unlist(
  lapply(pieces, function(piece) {
    sub(".*\"(\\w+)\".*", "\\1", piece, perl = TRUE)
  }),
  use.names = FALSE
)
unique(geneA)
Nicer ordering and cleaning into a data frame with removal of NA geneA's:
更好地排序并整理为数据框,同时删除geneA为NA的行:
# Same example matrix as in the question, but with distinguishable gene2
# values ("g1", "g2") so the effect of the expansion is visible.
df <- structure(
  c(NA, "c(\"siteX\", \"siteY\")", "siteZ", NA, "g1", "g2"),
  .Dim = c(3L, 2L),
  .Dimnames = list(c("sample1", "sample2", "sample3"), c("geneA", "gene2"))
)
df <- data.frame(df, stringsAsFactors = FALSE)
df$geneA <- as.character(df$geneA)

# library() stops with an error if plyr is missing; require() would only
# return FALSE and let the script fail later at the ddply() call.
library(plyr)

# Process the rows one distinct geneA value at a time. For each group, emit
# one output row per site contained in the geneA string, paired with the
# group's first gene2 value. Groups whose geneA is NA are dropped.
ddply(df, "geneA", function(x) {
  # x is a data frame, so is.na(x) returns one logical per cell. Using that
  # as an `if` condition is an error since R 4.2.0, so test the grouping
  # value itself instead.
  if (is.na(x$geneA[1])) {
    return(NULL)
  }
  # Split on commas and keep the text between double quotes in each piece;
  # a plain value such as "siteZ" has no quotes and is kept as is.
  geneAs <- sub(
    ".*\"(\\w+)\".*", "\\1",
    strsplit(x$geneA[1], ",")[[1]],
    perl = TRUE
  )
  data.frame("geneA" = geneAs, "gene2" = rep(x$gene2[1], length(geneAs)))
})
#1
1
If you just want the unique geneA values:
如果您只想要geneA的唯一值:
# Rebuild the example matrix from the question. The geneA column holds
# character strings, one of which is a deparsed vector:
# "c(\"siteX\", \"siteY\")".
df <- structure(
  c(NA, "c(\"siteX\", \"siteY\")", "siteZ", NA, "0", "0"),
  .Dim = c(3L, 2L),
  .Dimnames = list(c("sample1", "sample2", "sample3"), c("geneA", "gene2"))
)
df <- data.frame(df, stringsAsFactors = FALSE)
df$geneA <- as.character(df$geneA)

# Split every cell on commas, then keep only the text found between double
# quotes in each piece. Pieces without quotes (e.g. "siteZ") and NA cells pass
# through sub() unchanged.
# lapply() + unlist() always yields a flat character vector. The previous
# nested sapply() could simplify to a matrix when all cells split into the
# same number of pieces, which made unique() compare rows instead of values.
pieces <- strsplit(df$geneA, ",")
geneA <- unlist(
  lapply(pieces, function(piece) {
    sub(".*\"(\\w+)\".*", "\\1", piece, perl = TRUE)
  }),
  use.names = FALSE
)
unique(geneA)
Nicer ordering and cleaning into a data frame with removal of NA geneA's:
更好地排序并整理为数据框,同时删除geneA为NA的行:
# Same example matrix as in the question, but with distinguishable gene2
# values ("g1", "g2") so the effect of the expansion is visible.
df <- structure(
  c(NA, "c(\"siteX\", \"siteY\")", "siteZ", NA, "g1", "g2"),
  .Dim = c(3L, 2L),
  .Dimnames = list(c("sample1", "sample2", "sample3"), c("geneA", "gene2"))
)
df <- data.frame(df, stringsAsFactors = FALSE)
df$geneA <- as.character(df$geneA)

# library() stops with an error if plyr is missing; require() would only
# return FALSE and let the script fail later at the ddply() call.
library(plyr)

# Process the rows one distinct geneA value at a time. For each group, emit
# one output row per site contained in the geneA string, paired with the
# group's first gene2 value. Groups whose geneA is NA are dropped.
ddply(df, "geneA", function(x) {
  # x is a data frame, so is.na(x) returns one logical per cell. Using that
  # as an `if` condition is an error since R 4.2.0, so test the grouping
  # value itself instead.
  if (is.na(x$geneA[1])) {
    return(NULL)
  }
  # Split on commas and keep the text between double quotes in each piece;
  # a plain value such as "siteZ" has no quotes and is kept as is.
  geneAs <- sub(
    ".*\"(\\w+)\".*", "\\1",
    strsplit(x$geneA[1], ",")[[1]],
    perl = TRUE
  )
  data.frame("geneA" = geneAs, "gene2" = rep(x$gene2[1], length(geneAs)))
})