拆分字符串列以创建新的二进制列

时间:2021-10-23 12:36:45

My data has one column and I am trying to create additional columns with what’s after each “/” in the rows. Here are the first few rows of the data:

我的数据有一列,我正在尝试使用行中每个“/”之后的内容创建其他列。以下是数据的前几行:

> dput(mydata)
structure(list(ALL = structure(c(1L, 4L, 4L, 3L, 2L), .Label = c("/
ca/put/sent_1/fe.gr/eq2_on/eq2_off",
"/ca/put/sent_1/fe.gr/eq2_on/eq2_off/cbr_LBL", "/ca/put/sent_1/fe.g
r/eq2_on/eq2_off/cni_at.p3x.4",
"/ca/put/sent_1/fe.gr/eq2_on/eq2_off/hi.on/hi.ov"), class = "factor
")), .Names = "ALL", class = "data.frame", row.names = c(NA,
-5L))

The result should look like this (data frame) with a “1” in the new column if the variable appears in the row and “0” if not:

如果变量出现在行中,结果应该看起来像这样(数据框)在新列中带有“1”,否则为“0”:

> dput(Result)
structure(list(ALL = structure(c(1L, 4L, 5L, 3L, 2L), .Label = c("/ca
/put/sent_1/fe.gr/eq2_on/eq2_off",
"/ca/put/sent_1/fe.gr/eq2_on/eq2_off/cbr_LBL", "/ca/put/sent_1/fe.gr/
eq2_on/eq2_off/cni_at.p3x.4",
"/ca/put/sent_1/fe.gr/eq2_on/eq2_off/hi.on/hi.ov", "/ca/put/sent_1fe.
gr/eq2_on/eq2_off/hi.on/hi.ov"
), class = "factor"), ca = c(1L, 1L, 1L, 1L, 1L), put = c(1L,
1L, 1L, 1L, 1L), sent_1 = c(1L, 1L, 1L, 1L, 1L), fe.gr = c(1L,
1L, 1L, 1L, 1L), eq2_on = c(1L, 1L, 1L, 1L, 1L), eq2_off = c(1L,
1L, 1L, 1L, 1L), hi.on = c(0L, 1L, 1L, 0L, 0L), hi.ov = c(0L,
1L, 1L, 0L, 0L), cni_at.p3x.4 = c(0L, 0L, 0L, 1L, 0L), cbr_LBL = c(0L
,
0L, 0L, 0L, 1L)), .Names = c("ALL", "ca", "put", "sent_1", "fe.gr",
"eq2_on", "eq2_off", "hi.on", "hi.ov", "cni_at.p3x.4", "cbr_LBL"
), class = "data.frame", row.names = c(NA, -5L))

I have tried many functions including strsplit and sapply:

我尝试了很多功能,包括strsplit和sapply:

sapply(strsplit(as.character(mydata$ALL), “\\/”), “[[“, 2) #returns "ca"s only

sapply(strsplit(as.character(mydata$ALL), "\\/"), "[[", 3) #returns "put"s only

There are millions of rows and I’d greatly appreciate anything that is quick and efficient.

有数百万行,我非常欣赏任何快速有效的内容。

7 个解决方案

#1


3  

Using mtabuate from the qdapTools package that I maintain:

使用我维护的qdapTools包中的mtabuate:

library(qdapTools)
mtabulate(strsplit(as.character(dat[[1]]), "/"))

##   V1 ca cbr_LBL cni_at.p3x.4 eq2_off eq2_on fe.gr hi.on hi.ov put sent_1 sent_1fe.gr
## 1  1  1       0            0       1      1     1     0     0   1      1           0
## 2  1  1       0            0       1      1     1     1     1   1      1           0
## 3  1  1       0            0       1      1     0     1     1   1      0           1
## 4  1  1       0            1       1      1     1     0     0   1      1           0
## 5  1  1       1            0       1      1     1     0     0   1      1           0

#2


4  

You can use cSplit_e from my "splitstackshape" package:

您可以使用我的“splitstackshape”包中的cSplit_e:

library(splitstackshape)
cSplit_e(mydata, "ALL", "/", type = "character", fill = 0)
#                                                ALL ALL_ca ALL_cbr_LBL
# 1              /ca/put/sent_1/fe.gr/eq2_on/eq2_off      1           0
# 2  /ca/put/sent_1/fe.gr/eq2_on/eq2_off/hi.on/hi.ov      1           0
# 3   /ca/put/sent_1fe.gr/eq2_on/eq2_off/hi.on/hi.ov      1           0
# 4 /ca/put/sent_1/fe.gr/eq2_on/eq2_off/cni_at.p3x.4      1           0
# 5      /ca/put/sent_1/fe.gr/eq2_on/eq2_off/cbr_LBL      1           1
#   ALL_cni_at.p3x.4 ALL_eq2_off ALL_eq2_on ALL_fe.gr ALL_hi.on ALL_hi.ov ALL_put
# 1                0           1          1         1         0         0       1
# 2                0           1          1         1         1         1       1
# 3                0           1          1         0         1         1       1
# 4                1           1          1         1         0         0       1
# 5                0           1          1         1         0         0       1
#   ALL_sent_1 ALL_sent_1fe.gr
# 1          1               0
# 2          1               0
# 3          0               1
# 4          1               0
# 5          1               0

(Note: I think there's a problem in row 3 of your dput which is why it doesn't match your desired output. Notice that the third item in row 3 is "sent_1fe.gr" with no "/" between them.)

(注意:我认为你的dput的第3行存在问题,这就是为什么它与你想要的输出不匹配。请注意,第3行中的第三项是“sent_1fe.gr”,它们之间没有“/”。)

#3


2  

How about something like this

这样的事情怎么样?

spt <- strsplit(as.character(mydata$ALL),"/", fixed=T)
do.call(rbind, lapply(lapply(spt, factor, levels=unique(unlist(spt))), table))

which returns

       ca put sent_1 fe.gr eq2_on eq2_off hi.on hi.ov sent_1fe.gr cni_at.p3x.4 cbr_LBL
[1,] 1  1   1      1     1      1       1     0     0           0            0       0
[2,] 1  1   1      1     1      1       1     1     1           0            0       0
[3,] 1  1   1      1     0      1       1     1     1           1            0       0
[4,] 1  1   1      1     1      1       1     0     0           0            1       0
[5,] 1  1   1      1     1      1       1     0     0           0            0       1

#4


1  

Other option is to melt the split string in list to long form and then use table

其他选项是将列表中的拆分字符串熔化为长格式然后使用表格

library(reshape2)
as.data.frame.matrix(table(melt(strsplit(as.character(
                                   mydata[[1]]), "/"))[2:1]))[,-1]
#    ca eq2_off eq2_on fe.gr put sent_1 hi.on hi.ov sent_1fe.gr cni_at.p3x.4
#1  1       1      1     1   1      1     0     0           0            0
#2  1       1      1     1   1      1     1     1           0            0
#3  1       1      1     0   1      0     1     1           1            0
#4  1       1      1     1   1      1     0     0           0            1
#5  1       1      1     1   1      1     0     0           0            0
#  cbr_LBL
#1       0
#2       0
#3       0
#4       0
#5       1

#5


0  

Here's a solution that uses dplyr and tidyr (note: I cleaned up what appears to be an missing / in row three of your sample data):

这是一个使用dplyr和tidyr的解决方案(注意:我清理了样本数据中似乎缺少/第三行的内容):

## Input

input <- structure(
  list(ALL = structure(c(1L, 4L, 5L, 3L, 2L),
       .Label = c("/ca/put/sent_1/fe.gr/eq2_on/eq2_off",
                  "/ca/put/sent_1/fe.gr/eq2_on/eq2_off/cbr_LBL",
                  "/ca/put/sent_1/fe.gr/eq2_on/eq2_off/cni_at.p3x.4",
                  "/ca/put/sent_1/fe.gr/eq2_on/eq2_off/hi.on/hi.ov",
                  "/ca/put/sent_1/fe.gr/eq2_on/eq2_off/hi.on/hi.ov"),
       class = "factor")),
  .Names = "ALL", class = "data.frame", row.names = c(NA, -5L))


## Solution

require(dplyr)
require(tidyr)
solution <- input  %>%
  mutate(temp = sub("^/", "", ALL)) %>%
  separate(temp,
           c("ca", "put", "sent_1", "fe.gr", "eq2_on", "eq2_off",
             "hi.on", "hi.ov", "cni_at.p3x.4", "cbr_LBL"),
           "/", extra="merge") %>%
  mutate_each(funs(as.numeric(!is.na(.))), -ALL)

#6


0  

a tidyverse solution

一个整合的解决方案

library(tidyverse)

mydata %>%
  rownames_to_column() %>%
  mutate(key = strsplit(levels(ALL)[ALL],"/"),value=1) %>%
  unnest %>%
  spread(key,value,0) %>%
  select(-rowname)

#   ALL   ca cbr_LBL cni_at.p3x.4 eq2_off eq2_on fe.gr hi.on hi.ov put sent_1
# 1   1 1  1       0            0       1      1     1     0     0   1      1
# 2   4 1  1       0            0       1      1     1     1     1   1      1
# 3   4 1  1       0            0       1      1     1     1     1   1      1
# 4   3 1  1       0            1       1      1     1     0     0   1      1
# 5   2 1  1       1            0       1      1     1     0     0   1      1

data

mydata <- structure(list(ALL = structure(c(1L, 4L, 4L, 3L, 2L), .Label = c(
  "/ca/put/sent_1/fe.gr/eq2_on/eq2_off",
  "/ca/put/sent_1/fe.gr/eq2_on/eq2_off/cbr_LBL",
  "/ca/put/sent_1/fe.gr/eq2_on/eq2_off/cni_at.p3x.4",
  "/ca/put/sent_1/fe.gr/eq2_on/eq2_off/hi.on/hi.ov"), class = "factor
  ")), .Names = "ALL", class = "data.frame", row.names = c(NA,-5L))

#7


0  

Having myself similar, but more complex, problem I figured the following functional way that allows to cast in 'one hot' way string seperated character values of a column/s individually as well as categorical variables itself:

让我自己有类似但更复杂的问题我想出了以下功能方式,它允许以“一个热”的方式单独转换列/ s的字符串分隔字符值以及分类变量本身:

oneHotOnText <- function(datatable, columns, seperator=", "){ #argument columns is  character vector or numeric vector
  if(! "data.table" %in% .packages()) if(!require(data.table)) { install.packages("data.table"); library(data.table) }
  if(! "data.table" %in% class(datatable)) TempDT <- as.data.table(datatable) else TempDT <- copy(datatable)
  for(i in TempDT[, columns, with = F]){
    if(class(i) != "character") i <- as.character(i)
    uniqueValues <- unique(unlist(strsplit(unique(i), split=seperator)))
    if(any(uniqueValues %in% names(TempDT))) { print("Value/s of the selected column/s is/are present as variables name/s. Rename it/them.")
                                             rm(TempDT)
                                             break }
    for(j in uniqueValues) TempDT[, (j) := ifelse(grepl(j, i), 1L, 0L)]
  }
  if(exists("TempDT")) return(TempDT)
}

DF = data.frame(
  aColumn=rep(c("f", "b", "c"), 100000),
  xColumn=rep(c("N/W", "W", "R"), 100000), 
  yColumn=rep(c("A/B", "A/V", "B/G"), 100000),
  zColumn=rep(20:22, 100000))

str(DF) #factors are present in the data.frame

oneHotOnText(DF, columns = c("aColumn", "xColumn", "yColumn"), seperator="/")[] #applies the function, returns a data.table and prints the result

#        aColumn xColumn yColumn zColumn f b c N W R A B V G
#     1:       f     N/W     A/B      20 1 0 0 1 1 0 1 1 0 0
#     2:       b       W     A/V      21 0 1 0 0 1 0 1 0 1 0
#     3:       c       R     B/G      22 0 0 1 0 0 1 0 1 0 1
#     4:       f     N/W     A/B      20 1 0 0 1 1 0 1 1 0 0
#     5:       b       W     A/V      21 0 1 0 0 1 0 1 0 1 0
#    ---                                                    
#299996:       b       W     A/V      21 0 1 0 0 1 0 1 0 1 0
#299997:       c       R     B/G      22 0 0 1 0 0 1 0 1 0 1
#299998:       f     N/W     A/B      20 1 0 0 1 1 0 1 1 0 0
#299999:       b       W     A/V      21 0 1 0 0 1 0 1 0 1 0
#300000:       c       R     B/G      22 0 0 1 0 0 1 0 1 0 1

Similarly it applies to the OP´s problem:

同样,它适用于OP的问题:

input <- data.frame(ALL = c("/ca/put/sent_1/fe.gr/eq2_on/eq2_off",
                            "/ca/put/sent_1/fe.gr/eq2_on/eq2_off/hi.on/hi.ov",
                            "/ca/put/sent_1/fe.gr/eq2_on/eq2_off/hi.on/hi.ov",
                            "/ca/put/sent_1/fe.gr/eq2_on/eq2_off/cni_at.p3x.4",
                            "/ca/put/sent_1/fe.gr/eq2_on/eq2_off/cbr_LBL"
                            ))

oneHotOnText(input, columns = "ALL", seperator = "/")[] 

#1


3  

Using mtabuate from the qdapTools package that I maintain:

使用我维护的qdapTools包中的mtabuate:

library(qdapTools)
mtabulate(strsplit(as.character(dat[[1]]), "/"))

##   V1 ca cbr_LBL cni_at.p3x.4 eq2_off eq2_on fe.gr hi.on hi.ov put sent_1 sent_1fe.gr
## 1  1  1       0            0       1      1     1     0     0   1      1           0
## 2  1  1       0            0       1      1     1     1     1   1      1           0
## 3  1  1       0            0       1      1     0     1     1   1      0           1
## 4  1  1       0            1       1      1     1     0     0   1      1           0
## 5  1  1       1            0       1      1     1     0     0   1      1           0

#2


4  

You can use cSplit_e from my "splitstackshape" package:

您可以使用我的“splitstackshape”包中的cSplit_e:

library(splitstackshape)
cSplit_e(mydata, "ALL", "/", type = "character", fill = 0)
#                                                ALL ALL_ca ALL_cbr_LBL
# 1              /ca/put/sent_1/fe.gr/eq2_on/eq2_off      1           0
# 2  /ca/put/sent_1/fe.gr/eq2_on/eq2_off/hi.on/hi.ov      1           0
# 3   /ca/put/sent_1fe.gr/eq2_on/eq2_off/hi.on/hi.ov      1           0
# 4 /ca/put/sent_1/fe.gr/eq2_on/eq2_off/cni_at.p3x.4      1           0
# 5      /ca/put/sent_1/fe.gr/eq2_on/eq2_off/cbr_LBL      1           1
#   ALL_cni_at.p3x.4 ALL_eq2_off ALL_eq2_on ALL_fe.gr ALL_hi.on ALL_hi.ov ALL_put
# 1                0           1          1         1         0         0       1
# 2                0           1          1         1         1         1       1
# 3                0           1          1         0         1         1       1
# 4                1           1          1         1         0         0       1
# 5                0           1          1         1         0         0       1
#   ALL_sent_1 ALL_sent_1fe.gr
# 1          1               0
# 2          1               0
# 3          0               1
# 4          1               0
# 5          1               0

(Note: I think there's a problem in row 3 of your dput which is why it doesn't match your desired output. Notice that the third item in row 3 is "sent_1fe.gr" with no "/" between them.)

(注意:我认为你的dput的第3行存在问题,这就是为什么它与你想要的输出不匹配。请注意,第3行中的第三项是“sent_1fe.gr”,它们之间没有“/”。)

#3


2  

How about something like this

这样的事情怎么样?

spt <- strsplit(as.character(mydata$ALL),"/", fixed=T)
do.call(rbind, lapply(lapply(spt, factor, levels=unique(unlist(spt))), table))

which returns

       ca put sent_1 fe.gr eq2_on eq2_off hi.on hi.ov sent_1fe.gr cni_at.p3x.4 cbr_LBL
[1,] 1  1   1      1     1      1       1     0     0           0            0       0
[2,] 1  1   1      1     1      1       1     1     1           0            0       0
[3,] 1  1   1      1     0      1       1     1     1           1            0       0
[4,] 1  1   1      1     1      1       1     0     0           0            1       0
[5,] 1  1   1      1     1      1       1     0     0           0            0       1

#4


1  

Other option is to melt the split string in list to long form and then use table

其他选项是将列表中的拆分字符串熔化为长格式然后使用表格

library(reshape2)
as.data.frame.matrix(table(melt(strsplit(as.character(
                                   mydata[[1]]), "/"))[2:1]))[,-1]
#    ca eq2_off eq2_on fe.gr put sent_1 hi.on hi.ov sent_1fe.gr cni_at.p3x.4
#1  1       1      1     1   1      1     0     0           0            0
#2  1       1      1     1   1      1     1     1           0            0
#3  1       1      1     0   1      0     1     1           1            0
#4  1       1      1     1   1      1     0     0           0            1
#5  1       1      1     1   1      1     0     0           0            0
#  cbr_LBL
#1       0
#2       0
#3       0
#4       0
#5       1

#5


0  

Here's a solution that uses dplyr and tidyr (note: I cleaned up what appears to be an missing / in row three of your sample data):

这是一个使用dplyr和tidyr的解决方案(注意:我清理了样本数据中似乎缺少/第三行的内容):

## Input

input <- structure(
  list(ALL = structure(c(1L, 4L, 5L, 3L, 2L),
       .Label = c("/ca/put/sent_1/fe.gr/eq2_on/eq2_off",
                  "/ca/put/sent_1/fe.gr/eq2_on/eq2_off/cbr_LBL",
                  "/ca/put/sent_1/fe.gr/eq2_on/eq2_off/cni_at.p3x.4",
                  "/ca/put/sent_1/fe.gr/eq2_on/eq2_off/hi.on/hi.ov",
                  "/ca/put/sent_1/fe.gr/eq2_on/eq2_off/hi.on/hi.ov"),
       class = "factor")),
  .Names = "ALL", class = "data.frame", row.names = c(NA, -5L))


## Solution

require(dplyr)
require(tidyr)
solution <- input  %>%
  mutate(temp = sub("^/", "", ALL)) %>%
  separate(temp,
           c("ca", "put", "sent_1", "fe.gr", "eq2_on", "eq2_off",
             "hi.on", "hi.ov", "cni_at.p3x.4", "cbr_LBL"),
           "/", extra="merge") %>%
  mutate_each(funs(as.numeric(!is.na(.))), -ALL)

#6


0  

a tidyverse solution

一个整合的解决方案

library(tidyverse)

mydata %>%
  rownames_to_column() %>%
  mutate(key = strsplit(levels(ALL)[ALL],"/"),value=1) %>%
  unnest %>%
  spread(key,value,0) %>%
  select(-rowname)

#   ALL   ca cbr_LBL cni_at.p3x.4 eq2_off eq2_on fe.gr hi.on hi.ov put sent_1
# 1   1 1  1       0            0       1      1     1     0     0   1      1
# 2   4 1  1       0            0       1      1     1     1     1   1      1
# 3   4 1  1       0            0       1      1     1     1     1   1      1
# 4   3 1  1       0            1       1      1     1     0     0   1      1
# 5   2 1  1       1            0       1      1     1     0     0   1      1

data

mydata <- structure(list(ALL = structure(c(1L, 4L, 4L, 3L, 2L), .Label = c(
  "/ca/put/sent_1/fe.gr/eq2_on/eq2_off",
  "/ca/put/sent_1/fe.gr/eq2_on/eq2_off/cbr_LBL",
  "/ca/put/sent_1/fe.gr/eq2_on/eq2_off/cni_at.p3x.4",
  "/ca/put/sent_1/fe.gr/eq2_on/eq2_off/hi.on/hi.ov"), class = "factor
  ")), .Names = "ALL", class = "data.frame", row.names = c(NA,-5L))

#7


0  

Having myself similar, but more complex, problem I figured the following functional way that allows to cast in 'one hot' way string seperated character values of a column/s individually as well as categorical variables itself:

让我自己有类似但更复杂的问题我想出了以下功能方式,它允许以“一个热”的方式单独转换列/ s的字符串分隔字符值以及分类变量本身:

oneHotOnText <- function(datatable, columns, seperator=", "){ #argument columns is  character vector or numeric vector
  if(! "data.table" %in% .packages()) if(!require(data.table)) { install.packages("data.table"); library(data.table) }
  if(! "data.table" %in% class(datatable)) TempDT <- as.data.table(datatable) else TempDT <- copy(datatable)
  for(i in TempDT[, columns, with = F]){
    if(class(i) != "character") i <- as.character(i)
    uniqueValues <- unique(unlist(strsplit(unique(i), split=seperator)))
    if(any(uniqueValues %in% names(TempDT))) { print("Value/s of the selected column/s is/are present as variables name/s. Rename it/them.")
                                             rm(TempDT)
                                             break }
    for(j in uniqueValues) TempDT[, (j) := ifelse(grepl(j, i), 1L, 0L)]
  }
  if(exists("TempDT")) return(TempDT)
}

DF = data.frame(
  aColumn=rep(c("f", "b", "c"), 100000),
  xColumn=rep(c("N/W", "W", "R"), 100000), 
  yColumn=rep(c("A/B", "A/V", "B/G"), 100000),
  zColumn=rep(20:22, 100000))

str(DF) #factors are present in the data.frame

oneHotOnText(DF, columns = c("aColumn", "xColumn", "yColumn"), seperator="/")[] #applies the function, returns a data.table and prints the result

#        aColumn xColumn yColumn zColumn f b c N W R A B V G
#     1:       f     N/W     A/B      20 1 0 0 1 1 0 1 1 0 0
#     2:       b       W     A/V      21 0 1 0 0 1 0 1 0 1 0
#     3:       c       R     B/G      22 0 0 1 0 0 1 0 1 0 1
#     4:       f     N/W     A/B      20 1 0 0 1 1 0 1 1 0 0
#     5:       b       W     A/V      21 0 1 0 0 1 0 1 0 1 0
#    ---                                                    
#299996:       b       W     A/V      21 0 1 0 0 1 0 1 0 1 0
#299997:       c       R     B/G      22 0 0 1 0 0 1 0 1 0 1
#299998:       f     N/W     A/B      20 1 0 0 1 1 0 1 1 0 0
#299999:       b       W     A/V      21 0 1 0 0 1 0 1 0 1 0
#300000:       c       R     B/G      22 0 0 1 0 0 1 0 1 0 1

Similarly it applies to the OP´s problem:

同样,它适用于OP的问题:

input <- data.frame(ALL = c("/ca/put/sent_1/fe.gr/eq2_on/eq2_off",
                            "/ca/put/sent_1/fe.gr/eq2_on/eq2_off/hi.on/hi.ov",
                            "/ca/put/sent_1/fe.gr/eq2_on/eq2_off/hi.on/hi.ov",
                            "/ca/put/sent_1/fe.gr/eq2_on/eq2_off/cni_at.p3x.4",
                            "/ca/put/sent_1/fe.gr/eq2_on/eq2_off/cbr_LBL"
                            ))

oneHotOnText(input, columns = "ALL", seperator = "/")[]