使用ddply排除某些列中的重复值

I have a data frame with the following structure:

我有一个具有以下结构的数据框：

> dftest
       element seqnames     start       end  width strand tx_id   tx_name
1            1    chr19  58858172  58864865   6694      - 36769 NM_130786
2           10     chr8  18248755  18258723   9969      + 16614 NM_000015
3          100    chr20  43248163  43280376  32214      - 37719 NM_000022
4         1000    chr18  25530930  25757445 226516      - 33839 NM_001792
5        10000     chr1 243651535 244006584 355050      -  4182 NM_181690
6        10000     chr1 243663021 244006584 343564      -  4183 NM_005465
1316 100302285    chr12  12264886  12264967     82      + 24050 NR_036052
1317 100302285    chr12   9392066   9392147     82      - 25034 NR_036052
1318 100302285     chr2 232578024 232578105     82      +  5491 NR_036052
1319 100302285     chr5 118310281 118310362     82      + 11128 NR_036052

As an intermediate step I am trying to get rid of the elements, such as "100302285", that are present more than once, but with different "seqnames". Element "10000" would be kept because all "seqnames" are the same. Elements that are present only once are also kept. This is the desired output:

作为一个中间步骤，我想去掉那些出现不止一次且“seqnames”各不相同的元素，例如“100302285”。元素“10000”将被保留，因为它的所有“seqnames”都相同。仅出现一次的元素也会被保留。这是所需的输出：

> dftest
       element seqnames     start       end  width strand tx_id   tx_name
1            1    chr19  58858172  58864865   6694      - 36769 NM_130786
2           10     chr8  18248755  18258723   9969      + 16614 NM_000015
3          100    chr20  43248163  43280376  32214      - 37719 NM_000022
4         1000    chr18  25530930  25757445 226516      - 33839 NM_001792
5        10000     chr1 243651535 244006584 355050      -  4182 NM_181690
6        10000     chr1 243663021 244006584 343564      -  4183 NM_005465

So far I've played with ddply and custom function to include duplicates:

到目前为止，我已经使用ddply和自定义函数来包含重复项：

# Return the rows of `df` whose `seqnames` value has already been seen in an
# earlier row, i.e. every occurrence of a value except its first one.
subChr <- function(df) {
  repeated <- duplicated(df$seqnames)
  df[repeated, ]
}

# Apply subChr() to each `element` group of the test data frame `dftest`.
ddply(dftest, .(element), subChr)

But the result is far from what I intended - silly me, it could not have been that simple:

但结果与预期相差甚远——我真傻，事情不可能这么简单：

    element seqnames     start       end  width strand tx_id   tx_name
1     10000     chr1 243663021 244006584 343564      -  4183 NM_005465
2 100302285    chr12   9392066   9392147     82      - 25034 NR_036052

Since this is a step before another ddply, I would be happy with an alternative solution that does this:

由于这是另一个ddply之前的一步，我很乐意使用另一种解决方案：

# Collapse each `element` group of `dftest` to a single row: the first
# seqnames and strand of the group, the smallest start and the largest end.
ddply(dftest, .(element), summarize,
      chromosome = seqnames[1], gene_start = min(start),
      gene_end = max(end), strand = strand[1])

    element chromosome gene_start  gene_end strand
1         1      chr19   58858172  58864865      -
2        10       chr8   18248755  18258723      +
3       100      chr20   43248163  43280376      -
4      1000      chr18   25530930  25757445      -
5     10000       chr1  243651535 244006584      -
6 100302285      chr12    9392066 232578105      +

but summarizes element "100302285" for each "seqnames":

但是为每个“seqnames”总结了元素“100302285”：

 element chromosome gene_start  gene_end strand
1         1      chr19   58858172  58864865      -
2        10       chr8   18248755  18258723      +
3       100      chr20   43248163  43280376      -
4      1000      chr18   25530930  25757445      -
5     10000       chr1  243651535 244006584      -
6 100302285      chr12    9392066  12264967      +
7 100302285       chr2  232578024 232578105      +
8 100302285       chr5  118310281 118310362      +

Basically summarizing by .element and .seqname, if that makes sense. I have been searching for an answer for some time now but have not made much progress.

基本上通过.element和.seqname进行总结，如果这有意义的话。我一直在寻找一个答案，但没有取得多大进展。

Test data:

测试数据：

# Test data (dput-style literal): a data.frame with 10 rows and 8 columns
# (element, seqnames, start, end, width, strand, tx_id, tx_name).
# `element`, `seqnames`, `strand` and `tx_name` are character vectors;
# `start`, `end`, `width` and `tx_id` are integer vectors.
# Element "10000" has two rows on the same seqnames ("chr1"); element
# "100302285" has four rows spread over "chr12", "chr2" and "chr5".
dftest <- structure(list(element = c("1", "10", "100", "1000", "10000", 
"10000", "100302285", "100302285", "100302285", "100302285"), 
    seqnames = c("chr19", "chr8", "chr20", "chr18", "chr1", "chr1", 
    "chr12", "chr12", "chr2", "chr5"), start = c(58858172L, 18248755L, 
    43248163L, 25530930L, 243651535L, 243663021L, 12264886L, 
    9392066L, 232578024L, 118310281L), end = c(58864865L, 18258723L, 
    43280376L, 25757445L, 244006584L, 244006584L, 12264967L, 
    9392147L, 232578105L, 118310362L), width = c(6694L, 9969L, 
    32214L, 226516L, 355050L, 343564L, 82L, 82L, 82L, 82L), strand = c("-", 
    "+", "-", "-", "-", "-", "+", "-", "+", "+"), tx_id = c(36769L, 
    16614L, 37719L, 33839L, 4182L, 4183L, 24050L, 25034L, 5491L, 
    11128L), tx_name = c("NM_130786", "NM_000015", "NM_000022", 
    "NM_001792", "NM_181690", "NM_005465", "NR_036052", "NR_036052", 
    "NR_036052", "NR_036052")), .Names = c("element", "seqnames", 
"start", "end", "width", "strand", "tx_id", "tx_name"), class = "data.frame", row.names = c(1L, 
2L, 3L, 4L, 5L, 6L, 7L, 8L, 9L, 10L))

1 个解决方案

#1

Answering your first question: If you like, here's a data.table solution:

回答你的第一个问题：如果你愿意，这是一个data.table解决方案：

# library() stops with an error if data.table is missing, whereas require()
# would only warn and let the following lines fail later.
library(data.table)
dt <- data.table(dftest, key = "element")
# Keep all rows of an `element` group only when the group has exactly one
# distinct seqnames value. unique() is used instead of table() because
# table() on a factor counts every level, including levels unused in the group.
dt.out <- dt[, .SD[length(unique(seqnames)) == 1L], by = c("element")]
> dt.out

#    element seqnames     start       end  width strand tx_id   tx_name
# 1:       1    chr19  58858172  58864865   6694      - 36769 NM_130786
# 2:      10     chr8  18248755  18258723   9969      + 16614 NM_000015
# 3:     100    chr20  43248163  43280376  32214      - 37719 NM_000022
# 4:    1000    chr18  25530930  25757445 226516      - 33839 NM_001792
# 5:   10000     chr1 243651535 244006584 355050      -  4182 NM_181690
# 6:   10000     chr1 243663021 244006584 343564      -  4183 NM_005465

And if you prefer the plyr solution:

如果你更喜欢plyr解决方案：

# library() stops with an error if plyr is missing, whereas require() would
# only warn and let the following lines fail later.
library(plyr)
# For each `element` group, return the whole group when it has exactly one
# distinct seqnames value, otherwise return NULL so ddply drops the group.
# unique() is used instead of table() because table() on a factor counts
# every level, including levels unused in the group.
out <- ddply(dftest, .(element), function(x) {
    if (length(unique(x$seqnames)) == 1L) {
        x
    } else {
        NULL
    }
})

#   element seqnames     start       end  width strand tx_id   tx_name
# 1       1    chr19  58858172  58864865   6694      - 36769 NM_130786
# 2      10     chr8  18248755  18258723   9969      + 16614 NM_000015
# 3     100    chr20  43248163  43280376  32214      - 37719 NM_000022
# 4    1000    chr18  25530930  25757445 226516      - 33839 NM_001792
# 5   10000     chr1 243651535 244006584 355050      -  4182 NM_181690
# 6   10000     chr1 243663021 244006584 343564      -  4183 NM_005465

Edit: For your second question, basically, in addition to the old solution, you just want to return the first row when your first condition is not satisfied.

编辑：对于第二个问题，基本上，除了旧解决方案之外，您只想在第一个条件不满足时返回第一行。

plyr solution: (without summarise)

plyr解决方案：（不使用summarise）

# For each `element` group, return the whole group when it has exactly one
# distinct seqnames value; otherwise return only the group's first row.
# unique() is used instead of table() because table() on a factor counts
# every level, including levels unused in the group.
out <- ddply(dftest, .(element), function(x) {
    if (length(unique(x$seqnames)) == 1L) {
        x
    } else {
        x[1, ]
    }
})

> out
#     element seqnames     start       end  width strand tx_id   tx_name
# 1         1    chr19  58858172  58864865   6694      - 36769 NM_130786
# 2        10     chr8  18248755  18258723   9969      + 16614 NM_000015
# 3       100    chr20  43248163  43280376  32214      - 37719 NM_000022
# 4      1000    chr18  25530930  25757445 226516      - 33839 NM_001792
# 5     10000     chr1 243651535 244006584 355050      -  4182 NM_181690
# 6     10000     chr1 243663021 244006584 343564      -  4183 NM_005465
# 7 100302285    chr12  12264886  12264967     82      + 24050 NR_036052

data.table solution.

data.table解决方案。

dt <- data.table(dftest, key = "element")
# Per `element` group: keep every row when the group has exactly one distinct
# seqnames value, otherwise keep only the first row. The result is assigned to
# `dt.out` so that printing `dt.out` shows this table rather than an earlier
# one. unique() is used instead of table() because table() on a factor counts
# every level, including levels unused in the group.
dt.out <- dt[,
             .SD[if (length(unique(seqnames)) == 1L) seq_len(.N) else 1L],
             by = element]

> dt.out
#      element seqnames     start       end  width strand tx_id   tx_name
# 1:         1    chr19  58858172  58864865   6694      - 36769 NM_130786
# 2:        10     chr8  18248755  18258723   9969      + 16614 NM_000015
# 3:       100    chr20  43248163  43280376  32214      - 37719 NM_000022
# 4:      1000    chr18  25530930  25757445 226516      - 33839 NM_001792
# 5:     10000     chr1 243651535 244006584 355050      -  4182 NM_181690
# 6:     10000     chr1 243663021 244006584 343564      -  4183 NM_005465
# 7: 100302285    chr12  12264886  12264967     82      + 24050 NR_036052

#1