My data has one column and I am trying to create additional columns with what’s after each “/” in the rows. Here are the first few rows of the data:
我的数据有一列,我正在尝试使用行中每个“/”之后的内容创建其他列。以下是数据的前几行:
> dput(mydata)
structure(list(ALL = structure(c(1L, 4L, 4L, 3L, 2L), .Label = c("/
ca/put/sent_1/fe.gr/eq2_on/eq2_off",
"/ca/put/sent_1/fe.gr/eq2_on/eq2_off/cbr_LBL", "/ca/put/sent_1/fe.g
r/eq2_on/eq2_off/cni_at.p3x.4",
"/ca/put/sent_1/fe.gr/eq2_on/eq2_off/hi.on/hi.ov"), class = "factor
")), .Names = "ALL", class = "data.frame", row.names = c(NA,
-5L))
The result should look like this (data frame) with a “1” in the new column if the variable appears in the row and “0” if not:
如果变量出现在行中,结果应该看起来像这样(数据框)在新列中带有“1”,否则为“0”:
> dput(Result)
structure(list(ALL = structure(c(1L, 4L, 5L, 3L, 2L), .Label = c("/ca
/put/sent_1/fe.gr/eq2_on/eq2_off",
"/ca/put/sent_1/fe.gr/eq2_on/eq2_off/cbr_LBL", "/ca/put/sent_1/fe.gr/
eq2_on/eq2_off/cni_at.p3x.4",
"/ca/put/sent_1/fe.gr/eq2_on/eq2_off/hi.on/hi.ov", "/ca/put/sent_1fe.
gr/eq2_on/eq2_off/hi.on/hi.ov"
), class = "factor"), ca = c(1L, 1L, 1L, 1L, 1L), put = c(1L,
1L, 1L, 1L, 1L), sent_1 = c(1L, 1L, 1L, 1L, 1L), fe.gr = c(1L,
1L, 1L, 1L, 1L), eq2_on = c(1L, 1L, 1L, 1L, 1L), eq2_off = c(1L,
1L, 1L, 1L, 1L), hi.on = c(0L, 1L, 1L, 0L, 0L), hi.ov = c(0L,
1L, 1L, 0L, 0L), cni_at.p3x.4 = c(0L, 0L, 0L, 1L, 0L), cbr_LBL = c(0L
,
0L, 0L, 0L, 1L)), .Names = c("ALL", "ca", "put", "sent_1", "fe.gr",
"eq2_on", "eq2_off", "hi.on", "hi.ov", "cni_at.p3x.4", "cbr_LBL"
), class = "data.frame", row.names = c(NA, -5L))
I have tried many functions including strsplit and sapply:
我尝试了很多功能,包括strsplit和sapply:
sapply(strsplit(as.character(mydata$ALL), “\\/”), “[[“, 2) #returns "ca"s only
sapply(strsplit(as.character(mydata$ALL), "\\/"), "[[", 3) #returns "put"s only
There are millions of rows and I’d greatly appreciate anything that is quick and efficient.
有数百万行,我非常欣赏任何快速有效的内容。
7 个解决方案
#1
3
Using mtabuate
from the qdapTools package that I maintain:
使用我维护的qdapTools包中的mtabuate:
library(qdapTools)
mtabulate(strsplit(as.character(dat[[1]]), "/"))
## V1 ca cbr_LBL cni_at.p3x.4 eq2_off eq2_on fe.gr hi.on hi.ov put sent_1 sent_1fe.gr
## 1 1 1 0 0 1 1 1 0 0 1 1 0
## 2 1 1 0 0 1 1 1 1 1 1 1 0
## 3 1 1 0 0 1 1 0 1 1 1 0 1
## 4 1 1 0 1 1 1 1 0 0 1 1 0
## 5 1 1 1 0 1 1 1 0 0 1 1 0
#2
4
You can use cSplit_e
from my "splitstackshape" package:
您可以使用我的“splitstackshape”包中的cSplit_e:
library(splitstackshape)
cSplit_e(mydata, "ALL", "/", type = "character", fill = 0)
# ALL ALL_ca ALL_cbr_LBL
# 1 /ca/put/sent_1/fe.gr/eq2_on/eq2_off 1 0
# 2 /ca/put/sent_1/fe.gr/eq2_on/eq2_off/hi.on/hi.ov 1 0
# 3 /ca/put/sent_1fe.gr/eq2_on/eq2_off/hi.on/hi.ov 1 0
# 4 /ca/put/sent_1/fe.gr/eq2_on/eq2_off/cni_at.p3x.4 1 0
# 5 /ca/put/sent_1/fe.gr/eq2_on/eq2_off/cbr_LBL 1 1
# ALL_cni_at.p3x.4 ALL_eq2_off ALL_eq2_on ALL_fe.gr ALL_hi.on ALL_hi.ov ALL_put
# 1 0 1 1 1 0 0 1
# 2 0 1 1 1 1 1 1
# 3 0 1 1 0 1 1 1
# 4 1 1 1 1 0 0 1
# 5 0 1 1 1 0 0 1
# ALL_sent_1 ALL_sent_1fe.gr
# 1 1 0
# 2 1 0
# 3 0 1
# 4 1 0
# 5 1 0
(Note: I think there's a problem in row 3 of your dput
which is why it doesn't match your desired output. Notice that the third item in row 3 is "sent_1fe.gr" with no "/" between them.)
(注意:我认为你的dput的第3行存在问题,这就是为什么它与你想要的输出不匹配。请注意,第3行中的第三项是“sent_1fe.gr”,它们之间没有“/”。)
#3
2
How about something like this
这样的事情怎么样?
spt <- strsplit(as.character(mydata$ALL),"/", fixed=T)
do.call(rbind, lapply(lapply(spt, factor, levels=unique(unlist(spt))), table))
which returns
ca put sent_1 fe.gr eq2_on eq2_off hi.on hi.ov sent_1fe.gr cni_at.p3x.4 cbr_LBL
[1,] 1 1 1 1 1 1 1 0 0 0 0 0
[2,] 1 1 1 1 1 1 1 1 1 0 0 0
[3,] 1 1 1 1 0 1 1 1 1 1 0 0
[4,] 1 1 1 1 1 1 1 0 0 0 1 0
[5,] 1 1 1 1 1 1 1 0 0 0 0 1
#4
1
Other option is to melt
the split
string in list
to long
form and then use table
其他选项是将列表中的拆分字符串熔化为长格式然后使用表格
library(reshape2)
as.data.frame.matrix(table(melt(strsplit(as.character(
mydata[[1]]), "/"))[2:1]))[,-1]
# ca eq2_off eq2_on fe.gr put sent_1 hi.on hi.ov sent_1fe.gr cni_at.p3x.4
#1 1 1 1 1 1 1 0 0 0 0
#2 1 1 1 1 1 1 1 1 0 0
#3 1 1 1 0 1 0 1 1 1 0
#4 1 1 1 1 1 1 0 0 0 1
#5 1 1 1 1 1 1 0 0 0 0
# cbr_LBL
#1 0
#2 0
#3 0
#4 0
#5 1
#5
0
Here's a solution that uses dplyr and tidyr (note: I cleaned up what appears to be an missing / in row three of your sample data):
这是一个使用dplyr和tidyr的解决方案(注意:我清理了样本数据中似乎缺少/第三行的内容):
## Input
input <- structure(
list(ALL = structure(c(1L, 4L, 5L, 3L, 2L),
.Label = c("/ca/put/sent_1/fe.gr/eq2_on/eq2_off",
"/ca/put/sent_1/fe.gr/eq2_on/eq2_off/cbr_LBL",
"/ca/put/sent_1/fe.gr/eq2_on/eq2_off/cni_at.p3x.4",
"/ca/put/sent_1/fe.gr/eq2_on/eq2_off/hi.on/hi.ov",
"/ca/put/sent_1/fe.gr/eq2_on/eq2_off/hi.on/hi.ov"),
class = "factor")),
.Names = "ALL", class = "data.frame", row.names = c(NA, -5L))
## Solution
require(dplyr)
require(tidyr)
solution <- input %>%
mutate(temp = sub("^/", "", ALL)) %>%
separate(temp,
c("ca", "put", "sent_1", "fe.gr", "eq2_on", "eq2_off",
"hi.on", "hi.ov", "cni_at.p3x.4", "cbr_LBL"),
"/", extra="merge") %>%
mutate_each(funs(as.numeric(!is.na(.))), -ALL)
#6
0
a tidyverse
solution
一个整合的解决方案
library(tidyverse)
mydata %>%
rownames_to_column() %>%
mutate(key = strsplit(levels(ALL)[ALL],"/"),value=1) %>%
unnest %>%
spread(key,value,0) %>%
select(-rowname)
# ALL ca cbr_LBL cni_at.p3x.4 eq2_off eq2_on fe.gr hi.on hi.ov put sent_1
# 1 1 1 1 0 0 1 1 1 0 0 1 1
# 2 4 1 1 0 0 1 1 1 1 1 1 1
# 3 4 1 1 0 0 1 1 1 1 1 1 1
# 4 3 1 1 0 1 1 1 1 0 0 1 1
# 5 2 1 1 1 0 1 1 1 0 0 1 1
data
mydata <- structure(list(ALL = structure(c(1L, 4L, 4L, 3L, 2L), .Label = c(
"/ca/put/sent_1/fe.gr/eq2_on/eq2_off",
"/ca/put/sent_1/fe.gr/eq2_on/eq2_off/cbr_LBL",
"/ca/put/sent_1/fe.gr/eq2_on/eq2_off/cni_at.p3x.4",
"/ca/put/sent_1/fe.gr/eq2_on/eq2_off/hi.on/hi.ov"), class = "factor
")), .Names = "ALL", class = "data.frame", row.names = c(NA,-5L))
#7
0
Having myself similar, but more complex, problem I figured the following functional way that allows to cast in 'one hot' way string seperated character values of a column/s individually as well as categorical variables itself:
让我自己有类似但更复杂的问题我想出了以下功能方式,它允许以“一个热”的方式单独转换列/ s的字符串分隔字符值以及分类变量本身:
oneHotOnText <- function(datatable, columns, seperator=", "){ #argument columns is character vector or numeric vector
if(! "data.table" %in% .packages()) if(!require(data.table)) { install.packages("data.table"); library(data.table) }
if(! "data.table" %in% class(datatable)) TempDT <- as.data.table(datatable) else TempDT <- copy(datatable)
for(i in TempDT[, columns, with = F]){
if(class(i) != "character") i <- as.character(i)
uniqueValues <- unique(unlist(strsplit(unique(i), split=seperator)))
if(any(uniqueValues %in% names(TempDT))) { print("Value/s of the selected column/s is/are present as variables name/s. Rename it/them.")
rm(TempDT)
break }
for(j in uniqueValues) TempDT[, (j) := ifelse(grepl(j, i), 1L, 0L)]
}
if(exists("TempDT")) return(TempDT)
}
DF = data.frame(
aColumn=rep(c("f", "b", "c"), 100000),
xColumn=rep(c("N/W", "W", "R"), 100000),
yColumn=rep(c("A/B", "A/V", "B/G"), 100000),
zColumn=rep(20:22, 100000))
str(DF) #factors are present in the data.frame
oneHotOnText(DF, columns = c("aColumn", "xColumn", "yColumn"), seperator="/")[] #applies the function, returns a data.table and prints the result
# aColumn xColumn yColumn zColumn f b c N W R A B V G
# 1: f N/W A/B 20 1 0 0 1 1 0 1 1 0 0
# 2: b W A/V 21 0 1 0 0 1 0 1 0 1 0
# 3: c R B/G 22 0 0 1 0 0 1 0 1 0 1
# 4: f N/W A/B 20 1 0 0 1 1 0 1 1 0 0
# 5: b W A/V 21 0 1 0 0 1 0 1 0 1 0
# ---
#299996: b W A/V 21 0 1 0 0 1 0 1 0 1 0
#299997: c R B/G 22 0 0 1 0 0 1 0 1 0 1
#299998: f N/W A/B 20 1 0 0 1 1 0 1 1 0 0
#299999: b W A/V 21 0 1 0 0 1 0 1 0 1 0
#300000: c R B/G 22 0 0 1 0 0 1 0 1 0 1
Similarly it applies to the OP´s problem:
同样,它适用于OP的问题:
input <- data.frame(ALL = c("/ca/put/sent_1/fe.gr/eq2_on/eq2_off",
"/ca/put/sent_1/fe.gr/eq2_on/eq2_off/hi.on/hi.ov",
"/ca/put/sent_1/fe.gr/eq2_on/eq2_off/hi.on/hi.ov",
"/ca/put/sent_1/fe.gr/eq2_on/eq2_off/cni_at.p3x.4",
"/ca/put/sent_1/fe.gr/eq2_on/eq2_off/cbr_LBL"
))
oneHotOnText(input, columns = "ALL", seperator = "/")[]
#1
3
Using mtabuate
from the qdapTools package that I maintain:
使用我维护的qdapTools包中的mtabuate:
library(qdapTools)
mtabulate(strsplit(as.character(dat[[1]]), "/"))
## V1 ca cbr_LBL cni_at.p3x.4 eq2_off eq2_on fe.gr hi.on hi.ov put sent_1 sent_1fe.gr
## 1 1 1 0 0 1 1 1 0 0 1 1 0
## 2 1 1 0 0 1 1 1 1 1 1 1 0
## 3 1 1 0 0 1 1 0 1 1 1 0 1
## 4 1 1 0 1 1 1 1 0 0 1 1 0
## 5 1 1 1 0 1 1 1 0 0 1 1 0
#2
4
You can use cSplit_e
from my "splitstackshape" package:
您可以使用我的“splitstackshape”包中的cSplit_e:
library(splitstackshape)
cSplit_e(mydata, "ALL", "/", type = "character", fill = 0)
# ALL ALL_ca ALL_cbr_LBL
# 1 /ca/put/sent_1/fe.gr/eq2_on/eq2_off 1 0
# 2 /ca/put/sent_1/fe.gr/eq2_on/eq2_off/hi.on/hi.ov 1 0
# 3 /ca/put/sent_1fe.gr/eq2_on/eq2_off/hi.on/hi.ov 1 0
# 4 /ca/put/sent_1/fe.gr/eq2_on/eq2_off/cni_at.p3x.4 1 0
# 5 /ca/put/sent_1/fe.gr/eq2_on/eq2_off/cbr_LBL 1 1
# ALL_cni_at.p3x.4 ALL_eq2_off ALL_eq2_on ALL_fe.gr ALL_hi.on ALL_hi.ov ALL_put
# 1 0 1 1 1 0 0 1
# 2 0 1 1 1 1 1 1
# 3 0 1 1 0 1 1 1
# 4 1 1 1 1 0 0 1
# 5 0 1 1 1 0 0 1
# ALL_sent_1 ALL_sent_1fe.gr
# 1 1 0
# 2 1 0
# 3 0 1
# 4 1 0
# 5 1 0
(Note: I think there's a problem in row 3 of your dput
which is why it doesn't match your desired output. Notice that the third item in row 3 is "sent_1fe.gr" with no "/" between them.)
(注意:我认为你的dput的第3行存在问题,这就是为什么它与你想要的输出不匹配。请注意,第3行中的第三项是“sent_1fe.gr”,它们之间没有“/”。)
#3
2
How about something like this
这样的事情怎么样?
spt <- strsplit(as.character(mydata$ALL),"/", fixed=T)
do.call(rbind, lapply(lapply(spt, factor, levels=unique(unlist(spt))), table))
which returns
ca put sent_1 fe.gr eq2_on eq2_off hi.on hi.ov sent_1fe.gr cni_at.p3x.4 cbr_LBL
[1,] 1 1 1 1 1 1 1 0 0 0 0 0
[2,] 1 1 1 1 1 1 1 1 1 0 0 0
[3,] 1 1 1 1 0 1 1 1 1 1 0 0
[4,] 1 1 1 1 1 1 1 0 0 0 1 0
[5,] 1 1 1 1 1 1 1 0 0 0 0 1
#4
1
Other option is to melt
the split
string in list
to long
form and then use table
其他选项是将列表中的拆分字符串熔化为长格式然后使用表格
library(reshape2)
as.data.frame.matrix(table(melt(strsplit(as.character(
mydata[[1]]), "/"))[2:1]))[,-1]
# ca eq2_off eq2_on fe.gr put sent_1 hi.on hi.ov sent_1fe.gr cni_at.p3x.4
#1 1 1 1 1 1 1 0 0 0 0
#2 1 1 1 1 1 1 1 1 0 0
#3 1 1 1 0 1 0 1 1 1 0
#4 1 1 1 1 1 1 0 0 0 1
#5 1 1 1 1 1 1 0 0 0 0
# cbr_LBL
#1 0
#2 0
#3 0
#4 0
#5 1
#5
0
Here's a solution that uses dplyr and tidyr (note: I cleaned up what appears to be an missing / in row three of your sample data):
这是一个使用dplyr和tidyr的解决方案(注意:我清理了样本数据中似乎缺少/第三行的内容):
## Input
input <- structure(
list(ALL = structure(c(1L, 4L, 5L, 3L, 2L),
.Label = c("/ca/put/sent_1/fe.gr/eq2_on/eq2_off",
"/ca/put/sent_1/fe.gr/eq2_on/eq2_off/cbr_LBL",
"/ca/put/sent_1/fe.gr/eq2_on/eq2_off/cni_at.p3x.4",
"/ca/put/sent_1/fe.gr/eq2_on/eq2_off/hi.on/hi.ov",
"/ca/put/sent_1/fe.gr/eq2_on/eq2_off/hi.on/hi.ov"),
class = "factor")),
.Names = "ALL", class = "data.frame", row.names = c(NA, -5L))
## Solution
require(dplyr)
require(tidyr)
solution <- input %>%
mutate(temp = sub("^/", "", ALL)) %>%
separate(temp,
c("ca", "put", "sent_1", "fe.gr", "eq2_on", "eq2_off",
"hi.on", "hi.ov", "cni_at.p3x.4", "cbr_LBL"),
"/", extra="merge") %>%
mutate_each(funs(as.numeric(!is.na(.))), -ALL)
#6
0
a tidyverse
solution
一个整合的解决方案
library(tidyverse)
mydata %>%
rownames_to_column() %>%
mutate(key = strsplit(levels(ALL)[ALL],"/"),value=1) %>%
unnest %>%
spread(key,value,0) %>%
select(-rowname)
# ALL ca cbr_LBL cni_at.p3x.4 eq2_off eq2_on fe.gr hi.on hi.ov put sent_1
# 1 1 1 1 0 0 1 1 1 0 0 1 1
# 2 4 1 1 0 0 1 1 1 1 1 1 1
# 3 4 1 1 0 0 1 1 1 1 1 1 1
# 4 3 1 1 0 1 1 1 1 0 0 1 1
# 5 2 1 1 1 0 1 1 1 0 0 1 1
data
mydata <- structure(list(ALL = structure(c(1L, 4L, 4L, 3L, 2L), .Label = c(
"/ca/put/sent_1/fe.gr/eq2_on/eq2_off",
"/ca/put/sent_1/fe.gr/eq2_on/eq2_off/cbr_LBL",
"/ca/put/sent_1/fe.gr/eq2_on/eq2_off/cni_at.p3x.4",
"/ca/put/sent_1/fe.gr/eq2_on/eq2_off/hi.on/hi.ov"), class = "factor
")), .Names = "ALL", class = "data.frame", row.names = c(NA,-5L))
#7
0
Having myself similar, but more complex, problem I figured the following functional way that allows to cast in 'one hot' way string seperated character values of a column/s individually as well as categorical variables itself:
让我自己有类似但更复杂的问题我想出了以下功能方式,它允许以“一个热”的方式单独转换列/ s的字符串分隔字符值以及分类变量本身:
oneHotOnText <- function(datatable, columns, seperator=", "){ #argument columns is character vector or numeric vector
if(! "data.table" %in% .packages()) if(!require(data.table)) { install.packages("data.table"); library(data.table) }
if(! "data.table" %in% class(datatable)) TempDT <- as.data.table(datatable) else TempDT <- copy(datatable)
for(i in TempDT[, columns, with = F]){
if(class(i) != "character") i <- as.character(i)
uniqueValues <- unique(unlist(strsplit(unique(i), split=seperator)))
if(any(uniqueValues %in% names(TempDT))) { print("Value/s of the selected column/s is/are present as variables name/s. Rename it/them.")
rm(TempDT)
break }
for(j in uniqueValues) TempDT[, (j) := ifelse(grepl(j, i), 1L, 0L)]
}
if(exists("TempDT")) return(TempDT)
}
DF = data.frame(
aColumn=rep(c("f", "b", "c"), 100000),
xColumn=rep(c("N/W", "W", "R"), 100000),
yColumn=rep(c("A/B", "A/V", "B/G"), 100000),
zColumn=rep(20:22, 100000))
str(DF) #factors are present in the data.frame
oneHotOnText(DF, columns = c("aColumn", "xColumn", "yColumn"), seperator="/")[] #applies the function, returns a data.table and prints the result
# aColumn xColumn yColumn zColumn f b c N W R A B V G
# 1: f N/W A/B 20 1 0 0 1 1 0 1 1 0 0
# 2: b W A/V 21 0 1 0 0 1 0 1 0 1 0
# 3: c R B/G 22 0 0 1 0 0 1 0 1 0 1
# 4: f N/W A/B 20 1 0 0 1 1 0 1 1 0 0
# 5: b W A/V 21 0 1 0 0 1 0 1 0 1 0
# ---
#299996: b W A/V 21 0 1 0 0 1 0 1 0 1 0
#299997: c R B/G 22 0 0 1 0 0 1 0 1 0 1
#299998: f N/W A/B 20 1 0 0 1 1 0 1 1 0 0
#299999: b W A/V 21 0 1 0 0 1 0 1 0 1 0
#300000: c R B/G 22 0 0 1 0 0 1 0 1 0 1
Similarly it applies to the OP´s problem:
同样,它适用于OP的问题:
input <- data.frame(ALL = c("/ca/put/sent_1/fe.gr/eq2_on/eq2_off",
"/ca/put/sent_1/fe.gr/eq2_on/eq2_off/hi.on/hi.ov",
"/ca/put/sent_1/fe.gr/eq2_on/eq2_off/hi.on/hi.ov",
"/ca/put/sent_1/fe.gr/eq2_on/eq2_off/cni_at.p3x.4",
"/ca/put/sent_1/fe.gr/eq2_on/eq2_off/cbr_LBL"
))
oneHotOnText(input, columns = "ALL", seperator = "/")[]