以一种很好的方式自动在ggplot2直方图上排序x轴

时间:2022-05-04 14:56:47

I have a dataset like this (but with hundreds of samples):

我有这样的数据集(但有数百个样本):

# Example data in long format: one row per (sample, class) pair, 12 samples
# with 13 classes each (156 rows). Level "j" of `class` is declared but never
# used. The object carries tibble classes but is built with base R only.
data <- structure(
  list(
    sample = rep(
      c("C001", "C002", "C003", "C004", "C007", "C009",
        "C011", "C012", "C014", "C015", "C016", "C018"),
      each = 13L
    ),
    # One row of counts per sample, in class order a-i, k-n.
    count = c(
      # C001
      0L, 130L, 0L, 10L, 0L, 20L, 568L, 23L, 6L, 77L, 616L, 230734L, 177L,
      # C002
      10L, 6396L, 0L, 5747L, 0L, 208L, 115189L, 13130L, 1L, 38L, 200L, 2604L,
      3104L,
      # C003
      0L, 95476L, 0L, 3591L, 0L, 7L, 26359L, 83L, 5L, 1L, 1521L, 36004L, 9779L,
      # C004
      12L, 852L, 0L, 13L, 5L, 329L, 152053L, 288L, 2L, 0L, 0L, 530L, 1023L,
      # C007
      57L, 84L, 98060L, 122L, 0L, 8552L, 668L, 209L, 7L, 0L, 155L, 10159L,
      4934L,
      # C009
      15L, 47L, 83L, 1L, 0L, 54L, 462L, 89L, 43L, 0L, 127476L, 2614L, 3659L,
      # C011
      12L, 1L, 1L, 1061L, 0L, 84199L, 845L, 898L, 0L, 29L, 10L, 63L, 1834L,
      # C012
      87L, 36L, 7L, 407L, 20167L, 39969L, 1429L, 51072L, 0L, 0L, 27L, 9560L,
      3643L,
      # C014
      2899L, 10L, 0L, 380L, 0L, 82L, 1543L, 55L, 765L, 25172L, 29791L, 39805L,
      922L,
      # C015
      6L, 843L, 5L, 110L, 0L, 174L, 134582L, 575L, 15L, 65L, 37L, 19240L, 830L,
      # C016
      1L, 1L, 0L, 0L, 0L, 63L, 156446L, 22L, 1L, 15L, 76L, 9710L, 793L,
      # C018
      128L, 4L, 1L, 2L, 0L, 1904L, 199L, 98779L, 0L, 0L, 11436L, 91L, 1813L
    ),
    class = factor(
      rep(letters[c(1:9, 11:14)], times = 12L),
      levels = letters[1:14]
    )
  ),
  row.names = c(NA, -156L),
  class = c("tbl_df", "tbl", "data.frame")
)

And I want to plot a histogram of this data:

我想绘制这些数据的直方图:

library(tidyverse)
# Stacked bar chart of the class composition of each sample;
# position = "fill" rescales every bar to a total height of 1.
ggplot(data, aes(x = sample, y = count, fill = class)) +
  geom_col(color = "black", position = "fill")

以一种很好的方式自动在ggplot2直方图上排序x轴

But as you can see, the bars are not well ordered, and it's not easy to compare different samples.

但是正如你所看到的,这些条形没有很好地排序,因此不容易比较不同的样本。

So I reordered it by hand to make it more "beautiful" (in some ways).

所以我手动对它重新排序,使其(在某些方面)更“美观”。

# Hand-picked sample order: the factor levels decide the left-to-right order
# of the bars on the x axis.
data[["sample"]] <- factor(
  data[["sample"]],
  levels = c(
    "C001", "C014", "C009", "C018", "C012", "C004",
    "C016", "C002", "C015", "C011", "C003", "C007"
  )
)

# Same proportional stacked bar chart, now drawn in the manual order.
ggplot(data, aes(x = sample, y = count, fill = class)) +
  geom_col(color = "black", position = "fill")

以一种很好的方式自动在ggplot2直方图上排序x轴

It's probably not the best order but it's easier to compare proportions between similar samples.

它可能不是最好的顺序,但比较类似样本之间的比例更容易。

At the end, I want to make plots like these (with facet_grid) but let's start from the beginning.

最后,我想制作这样的图(使用facet_grid),但让我们从头开始。

以一种很好的方式自动在ggplot2直方图上排序x轴 Source

资源

1 个解决方案

#1


3  

There is no clear best way to do this. The first thing you have to do is define some sort of dissimilarity measure between the samples. One minus the correlation seems like one (of many) possible candidates. Then you can look at how to order the results based on the similarity measure. Hierarchical clustering gives you a possible order.

没有明确的最佳方法来做到这一点。您要做的第一件事是在样本之间定义某种不相似性度量。1 减去相关系数似乎是(众多)可能的候选方案之一。然后,您可以研究如何根据该相似性度量对结果进行排序。层次聚类可以为您提供一种可能的顺序。

In the following code I used that your sample data was ordered and complete. Otherwise you may have to adjust.

在下面的代码中,我假设您的示例数据是有序且完整的。否则您可能需要进行调整。

# Order the samples automatically so that samples with similar class
# compositions sit next to each other on the x axis.

# Unique samples, in order of first appearance.
samples <- unique(data$sample)

# Class x sample matrix of counts. Rows are matched by class label rather than
# by row position, so the input need not be sorted or complete: a missing
# (sample, class) pair counts as 0 and class levels that never occur are
# dropped (an all-zero row would otherwise distort every correlation).
counts <- xtabs(count ~ class + sample, data = data, drop.unused.levels = TRUE)
counts <- unclass(counts)[, as.character(samples), drop = FALSE]

## Dissimilarity measure: one minus the Pearson correlation between the count
## profiles of every pair of samples, from a single cor() call on the matrix.
dm <- 1 - cor(counts)

# Single linkage clustering; hc$order is the left-to-right leaf order of the
# resulting dendrogram.
hc <- hclust(as.dist(dm), method = "single")

# Reorder the factor levels to follow the dendrogram's leaf order.
data$sample <- factor(data$sample, levels = as.character(samples)[hc$order])

# Plot: proportional stacked bars, drawn in the clustered order.
ggplot(data = data, aes(x = sample)) +
  geom_col(aes(y = count, fill = class), color = "black",
           position = "fill")

以一种很好的方式自动在ggplot2直方图上排序x轴

#1


3  

There is no clear best way to do this. The first thing you have to do is define some sort of dissimilarity measure between the samples. One minus the correlation seems like one (of many) possible candidates. Then you can look at how to order the results based on the similarity measure. Hierarchical clustering gives you a possible order.

没有明确的最佳方法来做到这一点。您要做的第一件事是在样本之间定义某种不相似性度量。1 减去相关系数似乎是(众多)可能的候选方案之一。然后,您可以研究如何根据该相似性度量对结果进行排序。层次聚类可以为您提供一种可能的顺序。

In the following code I used that your sample data was ordered and complete. Otherwise you may have to adjust.

在下面的代码中,我假设您的示例数据是有序且完整的。否则您可能需要进行调整。

# Order the samples automatically so that samples with similar class
# compositions sit next to each other on the x axis.

# Unique samples, in order of first appearance.
samples <- unique(data$sample)

# Class x sample matrix of counts. Rows are matched by class label rather than
# by row position, so the input need not be sorted or complete: a missing
# (sample, class) pair counts as 0 and class levels that never occur are
# dropped (an all-zero row would otherwise distort every correlation).
counts <- xtabs(count ~ class + sample, data = data, drop.unused.levels = TRUE)
counts <- unclass(counts)[, as.character(samples), drop = FALSE]

## Dissimilarity measure: one minus the Pearson correlation between the count
## profiles of every pair of samples, from a single cor() call on the matrix.
dm <- 1 - cor(counts)

# Single linkage clustering; hc$order is the left-to-right leaf order of the
# resulting dendrogram.
hc <- hclust(as.dist(dm), method = "single")

# Reorder the factor levels to follow the dendrogram's leaf order.
data$sample <- factor(data$sample, levels = as.character(samples)[hc$order])

# Plot: proportional stacked bars, drawn in the clustered order.
ggplot(data = data, aes(x = sample)) +
  geom_col(aes(y = count, fill = class), color = "black",
           position = "fill")

以一种很好的方式自动在ggplot2直方图上排序x轴