大数据 | 数据挖掘 | R语言 R绘图Session#2 - Bar Plot

时间:2021-02-28 08:56:59
####Bar Plot
#There’s an important distinction you should be aware of when making bar graphs: 
#sometimes the bar heights represent counts of cases in the data set, 
#and sometimes they represent values in the data set.

# Bar heights taken from values in the data: x is a factor (one bar per level)
# and y is the value drawn as the height of each bar.
> install.packages("gcookbook")
> library(gcookbook)  # example data sets (pg_mean is assumed to come from this package)
> library(ggplot2)    # ggplot(), geom_bar() and qplot() come from ggplot2, so attach it first
> ggplot(pg_mean, aes(x=group, y=weight)) + geom_bar(stat="identity")
# Equivalent qplot() call. qplot() no longer accepts stat= (ggplot2 >= 2.0.0),
# so use geom="col", which draws bars whose heights are the y values
# (geom_col() is available in ggplot2 >= 2.2.0).
> qplot(group, weight, data=pg_mean, geom="col")

# Add a fill colour and a black outline
> ggplot(pg_mean, aes(x=group, y=weight)) +
    geom_bar(stat="identity", fill="lightblue", colour="black")
	
## Grouped bars: a categorical variable on X and a continuous variable on Y.
## To split each X category by a second categorical variable Z, map Z to fill
## and use position="dodge" (without dodging, the result is a stacked bar).
## Bars with the same X but different Z are then grouped together, as in the example below.
> ggplot(cabbage_exp, aes(x=Date, y=Weight, fill=Cultivar)) +
    geom_bar(position="dodge", stat="identity")
> ce <- cabbage_exp[1:5, ]   # Copy rows 1-5, i.e. the data without its last row (assumes cabbage_exp has 6 rows)
# The bar for the dropped row is missing from the plot
> ggplot(ce, aes(x=Date, y=Weight, fill=Cultivar)) +
    geom_bar(position="dodge", colour="black", stat="identity") +
    scale_fill_brewer(palette="Pastel1")

## A bar graph of counts: only x is mapped, and geom_bar() computes the bar heights
> ggplot(diamonds, aes(x=cut)) + geom_bar()
# Equivalent to geom_bar(stat="count") in ggplot2 >= 2.0.0 (stat="bin" in older releases)

# If x is categorical/discrete, the result is a bar chart of counts.
# If x is continuous, ggplot2 < 2.0.0 binned it like geom_histogram();
# newer releases count each distinct x value instead, so use geom_histogram() for binned counts.
> ggplot(diamonds, aes(x=carat)) + geom_bar()

## Colour: map a variable to fill and choose the fill colours manually
# Keep only the rows of uspopchange whose Change has a rank above 40
> upc <- subset(uspopchange, rank(Change)>40)
> ggplot(upc, aes(x=Abb, y=Change, fill=Region)) + geom_bar(position="dodge", stat="identity")
# Same data with x ordered by Change via reorder(Abb, Change), black outlines,
# two manual fill colours, and the x axis labelled "State"
> ggplot(upc, aes(x=reorder(Abb, Change), y=Change, fill=Region)) +
    geom_bar(stat="identity", colour="black") +
    scale_fill_manual(values=c("#669933", "#FFCC66")) +
    xlab("State")
# Reorder x
# NOTE(review): this call is identical to the previous one; one of the two is redundant
> ggplot(upc, aes(x=reorder(Abb, Change), y=Change, fill=Region)) +
    geom_bar(stat="identity", colour="black") +
    scale_fill_manual(values=c("#669933", "#FFCC66")) +
    xlab("State")

# Different colours for negative and positive values

# Keep the rows of climate with Source "Berkeley" and Year from 1900 onwards
> csub <- subset(climate, Source=="Berkeley" & Year >= 1900)
> csub$pos <- csub$Anomaly10y >= 0 # logical column (TRUE when Anomaly10y >= 0) used for fill
> ggplot(csub, aes(x=Year, y=Anomaly10y, fill=pos)) +
  geom_bar(stat="identity", position="identity")
# position="identity" prevents a warning message about stacking not being well defined for negative numbers

## Remove the legend (guide="none"), set the fill colours manually, and give
## the bars a black outline of size 0.8.
## guide=FALSE has been deprecated since ggplot2 3.3.4; guide="none" hides the
## legend in the same way.
> ggplot(csub, aes(x=Year, y=Anomaly10y, fill=pos)) +
    geom_bar(stat="identity", position="identity", colour="black", size=0.8) +
    scale_fill_manual(values=c("#CCEEFF", "#FFDDDD"), guide="none")

#### Bar width: to make the bars narrower or wider, set width in geom_bar().
#### The default value is 0.9; larger values make the bars wider, smaller values make them narrower.
> ggplot(pg_mean, aes(x=group, y=weight)) + geom_bar(stat="identity", width=0.5)

## For grouped bars, the default is to have no space between bars within each group.
## To add space between bars within a group, make width smaller and give
## position_dodge() a value larger than width.
> ggplot(cabbage_exp, aes(x=Date, y=Weight, fill=Cultivar)) +
    geom_bar(stat="identity", width=0.5, position=position_dodge(0.7))
# position="dodge" is the same as position=position_dodge(0.9)


#### Stacked bars: fill is mapped and no position is given, so the bars are stacked
> ggplot(cabbage_exp, aes(x=Date, y=Weight, fill=Cultivar)) +
    geom_bar(stat="identity")

# Reverse the order of the entries in the legend
> ggplot(cabbage_exp, aes(x=Date, y=Weight, fill=Cultivar)) +
    geom_bar(stat="identity") +
    guides(fill=guide_legend(reverse=TRUE))

# Reverse the stacking order of the bars.
# The `order` aesthetic has been deprecated since ggplot2 2.0.0 and has no
# effect in current releases, so aes(order=desc(Cultivar)) would leave the plot
# unchanged (plyr was only loaded for desc() and is not needed here).
# position_stack(reverse=TRUE), available since ggplot2 2.2.0, reverses the stack instead.
> ggplot(cabbage_exp, aes(x=Date, y=Weight, fill=Cultivar)) +
    geom_bar(stat="identity", position=position_stack(reverse=TRUE))
# Default stacking order with black outlines of size 0.8, a reversed legend
# and the Pastel1 palette
> ggplot(cabbage_exp, aes(x=Date, y=Weight, fill=Cultivar)) +
    geom_bar(stat="identity", colour="black", size=0.8) +
    guides(fill=guide_legend(reverse=TRUE)) +
    scale_fill_brewer(palette="Pastel1")

# Proportional (100%) stacked bars
> library(plyr)
# Do a group-wise transform(), splitting on "Date": within each Date, express
# every Weight as a percentage of that Date's total Weight
> ce <- ddply(cabbage_exp, "Date", transform,
            percent_weight = Weight / sum(Weight) * 100)

# Plot the percentages as stacked bars
> ggplot(ce, aes(x=Date, y=percent_weight, fill=Cultivar)) +
    geom_bar(stat="identity")

# Same plot with black outlines, a reversed legend and the Pastel1 palette
> ggplot(ce, aes(x=Date, y=percent_weight, fill=Cultivar)) +
    geom_bar(stat="identity", colour="black") +
    guides(fill=guide_legend(reverse=TRUE)) +
    scale_fill_brewer(palette="Pastel1")

#### Adding labels to the bars
# Labels below the top of each bar (vjust=1.5), drawn in white
> ggplot(cabbage_exp, aes(x=interaction(Date, Cultivar), y=Weight)) +
    geom_bar(stat="identity") +
    geom_text(aes(label=Weight), vjust=1.5, colour="white")

# Labels above the top of each bar (vjust=-0.2)
> ggplot(cabbage_exp, aes(x=interaction(Date, Cultivar), y=Weight)) +
    geom_bar(stat="identity") +
    geom_text(aes(label=Weight), vjust=-0.2)

# Dodged bars: the labels are dodged with position_dodge(.9) as well.
# size is the font size of the labels; the default is size = 5
> ggplot(cabbage_exp, aes(x=Date, y=Weight, fill=Cultivar)) +
    geom_bar(stat="identity", position="dodge") +
    geom_text(aes(label=Weight), vjust=1.5, colour="white",
              position=position_dodge(.9), size=3)

# Stacked bars with labels
> library(plyr)
# Sort by the Date and Cultivar columns
> ce <- arrange(cabbage_exp, Date, Cultivar)
# Get the cumulative sum of Weight within each Date; it is used as the y position of the labels
> ce <- ddply(ce, "Date", transform, label_y=cumsum(Weight))

# Draw the labels at label_y, shifted down with vjust=1.5, in white
> ggplot(ce, aes(x=Date, y=Weight, fill=Cultivar)) +
    geom_bar(stat="identity") +
    geom_text(aes(y=label_y, label=Weight), vjust=1.5, colour="white")

# Calculate the y position so that each label sits in the middle of its segment:
# cumulative sum minus half of the segment's own Weight
> ce <- arrange(cabbage_exp, Date, Cultivar)
> ce <- ddply(ce, "Date", transform, label_y=cumsum(Weight)-0.5*Weight)

> ggplot(ce, aes(x=Date, y=Weight, fill=Cultivar)) +
    geom_bar(stat="identity") +
    geom_text(aes(y=label_y, label=Weight), colour="white")

# Make a nicer plot: black outlines, labels built from format(Weight, nsmall=2)
# followed by "kg", a reversed legend and the Pastel1 palette
> ggplot(ce, aes(x=Date, y=Weight, fill=Cultivar)) +
    geom_bar(stat="identity", colour="black") +
    geom_text(aes(y=label_y, label=paste(format(Weight, nsmall=2), "kg")),
              size=4) +
    guides(fill=guide_legend(reverse=TRUE)) +
    scale_fill_brewer(palette="Pastel1")	
	
#### Cleveland dot plot
> library(gcookbook) # For the data set
> tophit <- tophitters2001[1:25, ] # Take the first 25 rows (the top 25) of the tophitters2001 data set

# Basic dot plot: avg on x, name on y
> ggplot(tophit, aes(x=avg, y=name)) + geom_point()

# Show only the name, lg and avg columns
> tophit[, c("name", "lg", "avg")]

# Sort the names by avg; keep only the horizontal grid lines and draw them dashed
> ggplot(tophit, aes(x=avg, y=reorder(name, avg))) +
    geom_point(size=3) +                        # Use a larger dot
    theme_bw() +
    theme(panel.grid.major.x = element_blank(),
          panel.grid.minor.x = element_blank(),
          panel.grid.major.y = element_line(colour="grey60", linetype="dashed"))

# Rotated version: names on x (labels turned 60 degrees), avg on y,
# dashed vertical grid lines only
> ggplot(tophit, aes(x=reorder(name, avg), y=avg)) +
    geom_point(size=3) +                        # Use a larger dot
    theme_bw() +
    theme(axis.text.x = element_text(angle=60, hjust=1),
          panel.grid.major.y = element_blank(),
          panel.grid.minor.y = element_blank(),
          panel.grid.major.x = element_line(colour="grey60", linetype="dashed"))

# Group the names by lg
# Get the names, sorted first by lg, then by avg
> nameorder <- tophit$name[order(tophit$lg, tophit$avg)]

# Turn name into a factor, with levels in the order of nameorder
> tophit$name <- factor(tophit$name, levels=nameorder)

# Draw a grey segment from each point to x = 0, colour the points by lg
# (Set1 palette, legend order "NL" then "AL")
> ggplot(tophit, aes(x=avg, y=name)) +
    geom_segment(aes(yend=name), xend=0, colour="grey50") +
    geom_point(size=3, aes(colour=lg)) +
    scale_colour_brewer(palette="Set1", limits=c("NL","AL")) +
    theme_bw() +
    theme(panel.grid.major.y = element_blank(),   # No horizontal grid lines
          legend.position=c(1, 0.55),             # Put legend inside plot area
          legend.justification=c(1, 0.5))