R语言dplyr包初探

　昨天学了一下R语言dplyr包，处理数据框还是很好用的。记录一下免得我忘记了... 先写一篇入门的，以后有空再写一篇详细的用法。
#dplyr learning

library(dplyr)

# filter() ----
# Keep only the rows of a data frame that satisfy the given conditions;
# the result is again a data frame.
#
# Usage:
#   filter(.data, ...)   # `...` holds the row conditions
#
# `x %>% f(y)` passes x as the first argument of f, i.e. f(x, y).

# Single condition
starwars %>% filter(species == "Human")
starwars %>% filter(mass > 1000)

# Several criteria combined explicitly with & (and) / | (or)
starwars %>% filter(hair_color == "none" & eye_color == "black")
starwars %>% filter(hair_color == "none" | eye_color == "black")

# Criteria given as separate arguments are combined with a logical AND
starwars %>% filter(hair_color == "none", eye_color == "black")

# arrange() ----
# Sort the rows of a data frame.
#
# Usage:
#   arrange(.data, ...)
#   arrange(.data, ..., .by_group = FALSE)   # S3 method for 'grouped_df'
#
# `x %>% f(y)` is the pipe: it passes x as the first argument of f.

mtcars %>% arrange(cyl, disp)     # sort by cyl first, then by disp
mtcars %>% arrange(desc(disp))    # desc() requests descending order

# A grouped data frame is sorted as a whole; the grouping is ignored ...
by_cyl <- group_by(mtcars, cyl)
arrange(by_cyl, desc(wt))

# ... unless .by_group = TRUE is given, which sorts by group first
arrange(by_cyl, desc(wt), .by_group = TRUE)

# select() ----
# Pick (and optionally rename) columns of a data frame.

# Convert to a tibble so it prints a little nicer. Note that this creates a
# global `iris` that masks the built-in dataset for the rest of the script.
iris <- as_tibble(iris)

select(iris, starts_with("Petal"))   # columns whose name starts with "Petal"
select(iris, ends_with("Width"))     # columns whose name ends with "Width"

# Move the Species variable to the front
select(iris, Species, everything())

# Ten random columns V1..V10, then shuffled into the order
# V3, V4, V7, V1, V9, V8, V5, V2, V6, V10.
df <- as.data.frame(matrix(runif(100), nrow = 10))
# as_tibble() replaces tbl_df(), which dplyr has deprecated.
df <- as_tibble(df[c(3, 4, 7, 1, 9, 8, 5, 2, 6, 10)])

# `V4:V6` is a positional slice: every column from V4 through V6 in the
# current column order (V4, V7, V1, V9, V8, V5, V2, V6).
select(df, V4:V6)

# num_range() matches by name instead, so this returns exactly V4, V5, V6.
select(df, num_range("V", 4:6))

# Drop variables with -
select(iris, -starts_with("Petal"))  # everything except the "Petal" columns

# Refer to columns by bare name. The earlier `.data$cyl` form is deprecated
# inside select(); for names held in a character vector use all_of() or
# any_of() instead.
select(mtcars, cyl)
select(mtcars, mpg:disp)

# Renaming -----------------------------------------

# * select() keeps only the variables you specify
select(iris, petal_length = Petal.Length)

# * rename() renames the variable and keeps all the others
rename(iris, petal_length = Petal.Length)

# mutate() ----
# Add new columns, or modify / remove existing ones.

mutate(
  as_tibble(mtcars),
  cyl2 = cyl * 2,
  cyl4 = cyl2 * 2   # refers to cyl2, created earlier in the same call
)

mutate(
  as_tibble(mtcars),
  mpg = NULL,               # assigning NULL drops the column, like select(-mpg)
  disp = disp * 0.0163871   # overwrite an existing column with a computed value
)

# mutate() vs transmute() --------------------------

# mutate() keeps all existing variables and adds displ_l
mutate(mtcars, displ_l = disp / 61.0237)

# transmute() keeps only the variables you create
transmute(mtcars, displ_l = disp / 61.0237)

# summarise() ----
# Collapse a data frame (or each group of a grouped data frame) into
# summary rows; the mean and the row count are used as examples here.

# No grouping: one summary row for the whole table
mtcars %>%
  summarise(mean = mean(disp), n = n())

# One summary row per value of cyl
mtcars %>%
  group_by(cyl) %>%
  summarise(mean = mean(disp), n = n())

# One summary row per (cyl, vs) combination.
# The argument separator must be an ASCII comma; the full-width comma
# (U+FF0C) previously used here is a parse error in R.
mtcars %>%
  group_by(cyl, vs) %>%
  summarise(cyl_n = n(), mean_disp = mean(disp))
秒客网

R语言dplyr包初探

相关文章