I'm trying to figure out an efficient way to go about splitting a string like
我想找到一种有效的方法来分解一个字符串
"111110000011110000111000"
into a vector
成一个向量
[1] "11111" "00000" "1111" "0000" "111" "000"
where "0" and "1" can be any alternating characters.
其中“0”和“1”可以是任何交替字符。
9 个解决方案
#1
84
Try
试一试
strsplit(str1, '(?<=1)(?=0)|(?<=0)(?=1)', perl=TRUE)[[1]]
#[1] "11111" "00000" "1111" "0000" "111" "000"
Update
A modification of @rawr's solution with stri_extract_all_regex
使用stri_extract_all_regex修改@rawr的解决方案
library(stringi)
stri_extract_all_regex(str1, '(?:(\\w))\\1*')[[1]]
#[1] "11111" "00000" "1111" "0000" "111" "000"
stri_extract_all_regex(x1, '(?:(\\w))\\1*')[[1]]
#[1] "11111" "00000" "222" "000" "3333" "000" "1111" "0000" "111"
#[10] "000"
stri_extract_all_regex(x2, '(?:(\\w))\\1*')[[1]]
#[1] "aaaaa" "bb" "ccccccc" "bbb" "a" "d" "11111"
#[8] "00000" "222" "aaa" "bb" "cc" "d" "11"
#[15] "D" "aa" "BB"
Benchmarks
library(stringi)
set.seed(24)
x3 <- stri_rand_strings(1, 1e4)
akrun <- function() stri_extract_all_regex(x3, '(?:(\\w))\\1*')[[1]]
#modified @thelatemail's function to make it bit more general
thelate <- function() regmatches(x3,gregexpr("(?:(\\w))\\1*", x3,
perl=TRUE))[[1]]
rawr <- function() strsplit(x3, '(?<=(\\w))(?!\\1)', perl=TRUE)[[1]]
ananda <- function() unlist(read.fwf(textConnection(x3),
rle(strsplit(x3, "")[[1]])$lengths,
colClasses = "character"))
Colonel <- function() with(rle(strsplit(x3,'')[[1]]),
mapply(function(u,v) paste0(rep(v,u), collapse=''), lengths, values))
Cryo <- function(){
res_vector=rep(NA_character_,nchar(x3))
res_vector[1]=substr(x3,1,1)
counter=1
old_tmp=''
for (i in 2:nchar(x3)) {
tmp=substr(x3,i,i)
if (tmp==old_tmp) {
res_vector[counter]=paste0(res_vector[counter],tmp)
} else {
res_vector[counter+1]=tmp
counter=counter+1
}
old_tmp=tmp
}
res_vector[!is.na(res_vector)]
}
richard <- function(){
cs <- cumsum(
rle(stri_split_boundaries(x3, type = "character")[[1L]])$lengths
)
stri_sub(x3, c(1, head(cs + 1, -1)), cs)
}
nicola<-function(x) {
indices<-c(0,which(diff(as.integer(charToRaw(x)))!=0),nchar(x))
substring(x,indices[-length(indices)]+1,indices[-1])
}
richard2 <- function() {
cs <- cumsum(rle(strsplit(x3, NULL)[[1L]])[[1L]])
stri_sub(x3, c(1, head(cs + 1, -1)), cs)
}
system.time(akrun())
# user system elapsed
# 0.003 0.000 0.003
system.time(thelate())
# user system elapsed
# 0.272 0.001 0.274
system.time(rawr())
# user system elapsed
# 0.397 0.001 0.398
system.time(ananda())
# user system elapsed
# 3.744 0.204 3.949
system.time(Colonel())
# user system elapsed
# 0.154 0.001 0.154
system.time(Cryo())
# user system elapsed
# 0.220 0.005 0.226
system.time(richard())
# user system elapsed
# 0.007 0.000 0.006
system.time(nicola(x3))
# user system elapsed
# 0.190 0.001 0.191
On a slightly bigger string,
在稍微大一点的弦上,
set.seed(24)
x3 <- stri_rand_strings(1, 1e6)
system.time(akrun())
#user system elapsed
#0.166 0.000 0.155
system.time(richard())
# user system elapsed
# 0.606 0.000 0.569
system.time(richard2())
# user system elapsed
# 0.518 0.000 0.487
system.time(Colonel())
# user system elapsed
# 9.631 0.000 9.358
library(microbenchmark)
microbenchmark(richard(), richard2(), akrun(), times=20L, unit='relative')
#Unit: relative
# expr min lq mean median uq max neval cld
# richard() 2.438570 2.633896 2.365686 2.315503 2.368917 2.124581 20 b
#richard2() 2.389131 2.533301 2.223521 2.143112 2.153633 2.157861 20 b
# akrun() 1.000000 1.000000 1.000000 1.000000 1.000000 1.000000 20 a
NOTE: Tried to run the other methods, but it takes a long time
注意:尝试运行其他方法,但是需要很长时间
data
str1 <- "111110000011110000111000"
x1 <- "1111100000222000333300011110000111000"
x2 <- "aaaaabbcccccccbbbad1111100000222aaabbccd11DaaBB"
#2
24
Variation on a theme:
旋律的变化,
x <- "111110000011110000111000"
regmatches(x,gregexpr("1+|0+",x))[[1]]
#[1] "11111" "00000" "1111" "0000" "111" "000"
#3
21
You could probably make use of substr
or read.fwf
along with rle
(though it is unlikely to be as efficient as any regex-based solution):
您可以使用substr或read。fwf与rle(尽管它不可能像任何基于regex的解决方案一样有效):
x <- "111110000011110000111000"
unlist(read.fwf(textConnection(x),
rle(strsplit(x, "")[[1]])$lengths,
colClasses = "character"))
# V1 V2 V3 V4 V5 V6
# "11111" "00000" "1111" "0000" "111" "000"
One advantage of this approach is that it would work even with, say:
这种方法的一个优点是,它甚至可以使用,比如:
x <- paste(c(rep("a", 5), rep("b", 2), rep("c", 7),
rep("b", 3), rep("a", 1), rep("d", 1)), collapse = "")
x
# [1] "aaaaabbcccccccbbbad"
unlist(read.fwf(textConnection(x),
rle(strsplit(x, "")[[1]])$lengths,
colClasses = "character"))
# V1 V2 V3 V4 V5 V6
# "aaaaa" "bb" "ccccccc" "bbb" "a" "d"
#4
20
Another way would be to add whitespace between the alternating digits. This would work for any two, not just 1s and 0s. Then use strsplit
on the whitespace:
另一种方法是在交替数字之间添加空格。这对任意两个都成立,不只是1和0。然后在空格上使用strsplit:
x <- "111110000011110000111000"
(y <- gsub('(\\d)(?!\\1)', '\\1 \\2', x, perl = TRUE))
# [1] "11111 00000 1111 0000 111 000 "
strsplit(y, ' ')[[1]]
# [1] "11111" "00000" "1111" "0000" "111" "000"
Or more succinctly as @akrun points out:
或者更简洁地说,正如@akrun指出的:
strsplit(x, '(?<=(\\d))(?!\\1)', perl=TRUE)[[1]]
# [1] "11111" "00000" "1111" "0000" "111" "000"
also changing \\d
to \\w
works also
也要改变\\w的工作。
x <- "aaaaabbcccccccbbbad"
strsplit(x, '(?<=(\\w))(?!\\1)', perl=TRUE)[[1]]
# [1] "aaaaa" "bb" "ccccccc" "bbb" "a" "d"
x <- "111110000011110000111000"
strsplit(x, '(?<=(\\w))(?!\\1)', perl=TRUE)[[1]]
# [1] "11111" "00000" "1111" "0000" "111" "000"
You could also use \K
(rather than explicitly using the capture groups, \\1
and \\2
) which I don't see used a lot nor do I know how to explain it :}
您也可以使用\K(而不是显式地使用捕获组、\\1和\\2),我没有看到太多的使用,也不知道如何解释它:}
AFAIK \\K
resets the starting point of the reported match and any previously consumed characters are no longer included, basically throwing away everything matched up to that point.
AFAIK \K重新设置所报告的匹配的起始点,并且不再包含以前使用的任何字符,基本上放弃与该点匹配的所有内容。
x <- "1111100000222000333300011110000111000"
(z <- gsub('(\\d)\\K(?!\\1)', ' ', x, perl = TRUE))
# [1] "11111 00000 222 000 3333 000 1111 0000 111 000 "
#5
14
Original Approach: Here is a stringi approach that incorporates rle()
.
原始方法:这里是一个包含rle()的stringi方法。
x <- "111110000011110000111000"
library(stringi)
cs <- cumsum(
rle(stri_split_boundaries(x, type = "character")[[1L]])$lengths
)
stri_sub(x, c(1L, head(cs + 1L, -1L)), cs)
# [1] "11111" "00000" "1111" "0000" "111" "000"
Or, you can use the length
argument in stri_sub()
或者,可以在stri_sub()中使用length参数
rl <- rle(stri_split_boundaries(x, type = "character")[[1L]])
with(rl, {
stri_sub(x, c(1L, head(cumsum(lengths) + 1L, -1L)), length = lengths)
})
# [1] "11111" "00000" "1111" "0000" "111" "000"
Updated for Efficiency: After realizing that base::strsplit()
is faster than stringi::stri_split_boundaries()
, here is a more efficient version of my previous answer using only base functions.
为提高效率而更新:在实现了base::strsplit()比stringi:: stri_split_boundary()更快之后,这里有一个更高效的版本,它只使用基本函数。
set.seed(24)
x3 <- stri_rand_strings(1L, 1e6L)
system.time({
cs <- cumsum(rle(strsplit(x3, NULL)[[1L]])[[1L]])
substring(x3, c(1L, head(cs + 1L, -1L)), cs)
})
# user system elapsed
# 0.686 0.012 0.697
#6
11
Another approach in case, using mapply
:
另一种方法是使用mapply:
x="111110000011110000111000"
with(rle(strsplit(x,'')[[1]]),
mapply(function(u,v) paste0(rep(v,u), collapse=''), lengths, values))
#[1] "11111" "00000" "1111" "0000" "111" "000"
#7
8
It's not really what the OP was looking for (concise R code), but thought I'd give it a try in Rcpp
, and turned out relatively simple and about 5x faster than the fastest R-based answers.
这并不是OP真正想要的(简洁的R代码),但是我想尝试一下Rcpp,结果是相对简单的,比基于R的最快答案快5倍。
library(Rcpp)
cppFunction(
'std::vector<std::string> split_str_cpp(std::string x) {
std::vector<std::string> parts;
int start = 0;
for(int i = 1; i <= x.length(); i++) {
if(x[i] != x[i-1]) {
parts.push_back(x.substr(start, i-start));
start = i;
}
}
return parts;
}')
And testing on these
和测试这些
str1 <- "111110000011110000111000"
x1 <- "1111100000222000333300011110000111000"
x2 <- "aaaaabbcccccccbbbad1111100000222aaabbccd11DaaBB"
Gives the following output
给以下输出
> split_str_cpp(str1)
[1] "11111" "00000" "1111" "0000" "111" "000"
> split_str_cpp(x1)
[1] "11111" "00000" "222" "000" "3333" "000" "1111" "0000" "111" "000"
> split_str_cpp(x2)
[1] "aaaaa" "bb" "ccccccc" "bbb" "a" "d" "11111" "00000" "222" "aaa" "bb" "cc" "d" "11"
[15] "D" "aa" "BB"
And a benchmark shows it's about 5-10x faster than R solutions.
基准测试显示它比R方案快5-10倍。
akrun <- function(str1) strsplit(str1, '(?<=1)(?=0)|(?<=0)(?=1)', perl=TRUE)[[1]]
richard1 <- function(x3){
cs <- cumsum(
rle(stri_split_boundaries(x3, type = "character")[[1L]])$lengths
)
stri_sub(x3, c(1, head(cs + 1, -1)), cs)
}
richard2 <- function(x3) {
cs <- cumsum(rle(strsplit(x3, NULL)[[1L]])[[1L]])
stri_sub(x3, c(1, head(cs + 1, -1)), cs)
}
library(microbenchmark)
library(stringi)
set.seed(24)
x3 <- stri_rand_strings(1, 1e6)
microbenchmark(split_str_cpp(x3), akrun(x3), richard1(x3), richard2(x3), unit = 'relative', times=20L)
Comparison:
比较:
Unit: relative
expr min lq mean median uq max neval
split_str_cpp(x3) 1.000000 1.000000 1.000000 1.000000 1.000000 1.000000 20
akrun(x3) 9.675613 8.952997 8.241750 8.689001 8.403634 4.423134 20
richard1(x3) 5.355620 5.226103 5.483171 5.947053 5.982943 3.379446 20
richard2(x3) 4.842398 4.756086 5.046077 5.389570 5.389193 3.669680 20
#8
2
Simple for
loop solution
简单的for循环的解决方案
x="aaaaabbcccccccbbbad1111100000222aaabbccd11DaaBB"
res_vector=substr(x,1,1)
for (i in 2:nchar(x)) {
tmp=substr(x,i,i)
if (tmp==substr(x,i-1,i-1)) {
res_vector[length(res_vector)]=paste0(res_vector[length(res_vector)],tmp)
} else {
res_vector[length(res_vector)+1]=tmp
}
}
res_vector
#[1] "aaaaa" "bb" "ccccccc" "bbb" "a" "d" "11111" "00000" "222" "aaa" "bb" "cc" "d" "11" "D" "aa" "BB"
Or a maybe a little bit faster with a pre-allocated results vector
或者用预先分配的结果向量来加快速度。
x="aaaaabbcccccccbbbad1111100000222aaabbccd11DaaBB"
res_vector=rep(NA_character_,nchar(x))
res_vector[1]=substr(x,1,1)
counter=1
old_tmp=''
for (i in 2:nchar(x)) {
tmp=substr(x,i,i)
if (tmp==old_tmp) {
res_vector[counter]=paste0(res_vector[counter],tmp)
} else {
res_vector[counter+1]=tmp
counter=counter+1
}
old_tmp=tmp
}
res_vector[!is.na(res_vector)]
#9
1
How about this:
这个怎么样:
s <- "111110000011110000111000"
spl <- strsplit(s,"10|01")[[1]]
l <- length(spl)
sapply(1:l, function(i) paste0(spl[i],i%%2,ifelse(i==1 | i==l, "",i%%2)))
# [1] "11111" "00000" "1111" "0000" "111" "000"
#1
84
Try
试一试
strsplit(str1, '(?<=1)(?=0)|(?<=0)(?=1)', perl=TRUE)[[1]]
#[1] "11111" "00000" "1111" "0000" "111" "000"
Update
A modification of @rawr's solution with stri_extract_all_regex
使用stri_extract_all_regex修改@rawr的解决方案
library(stringi)
stri_extract_all_regex(str1, '(?:(\\w))\\1*')[[1]]
#[1] "11111" "00000" "1111" "0000" "111" "000"
stri_extract_all_regex(x1, '(?:(\\w))\\1*')[[1]]
#[1] "11111" "00000" "222" "000" "3333" "000" "1111" "0000" "111"
#[10] "000"
stri_extract_all_regex(x2, '(?:(\\w))\\1*')[[1]]
#[1] "aaaaa" "bb" "ccccccc" "bbb" "a" "d" "11111"
#[8] "00000" "222" "aaa" "bb" "cc" "d" "11"
#[15] "D" "aa" "BB"
Benchmarks
library(stringi)
set.seed(24)
x3 <- stri_rand_strings(1, 1e4)
akrun <- function() stri_extract_all_regex(x3, '(?:(\\w))\\1*')[[1]]
#modified @thelatemail's function to make it bit more general
thelate <- function() regmatches(x3,gregexpr("(?:(\\w))\\1*", x3,
perl=TRUE))[[1]]
rawr <- function() strsplit(x3, '(?<=(\\w))(?!\\1)', perl=TRUE)[[1]]
ananda <- function() unlist(read.fwf(textConnection(x3),
rle(strsplit(x3, "")[[1]])$lengths,
colClasses = "character"))
Colonel <- function() with(rle(strsplit(x3,'')[[1]]),
mapply(function(u,v) paste0(rep(v,u), collapse=''), lengths, values))
Cryo <- function(){
res_vector=rep(NA_character_,nchar(x3))
res_vector[1]=substr(x3,1,1)
counter=1
old_tmp=''
for (i in 2:nchar(x3)) {
tmp=substr(x3,i,i)
if (tmp==old_tmp) {
res_vector[counter]=paste0(res_vector[counter],tmp)
} else {
res_vector[counter+1]=tmp
counter=counter+1
}
old_tmp=tmp
}
res_vector[!is.na(res_vector)]
}
richard <- function(){
cs <- cumsum(
rle(stri_split_boundaries(x3, type = "character")[[1L]])$lengths
)
stri_sub(x3, c(1, head(cs + 1, -1)), cs)
}
nicola<-function(x) {
indices<-c(0,which(diff(as.integer(charToRaw(x)))!=0),nchar(x))
substring(x,indices[-length(indices)]+1,indices[-1])
}
richard2 <- function() {
cs <- cumsum(rle(strsplit(x3, NULL)[[1L]])[[1L]])
stri_sub(x3, c(1, head(cs + 1, -1)), cs)
}
system.time(akrun())
# user system elapsed
# 0.003 0.000 0.003
system.time(thelate())
# user system elapsed
# 0.272 0.001 0.274
system.time(rawr())
# user system elapsed
# 0.397 0.001 0.398
system.time(ananda())
# user system elapsed
# 3.744 0.204 3.949
system.time(Colonel())
# user system elapsed
# 0.154 0.001 0.154
system.time(Cryo())
# user system elapsed
# 0.220 0.005 0.226
system.time(richard())
# user system elapsed
# 0.007 0.000 0.006
system.time(nicola(x3))
# user system elapsed
# 0.190 0.001 0.191
On a slightly bigger string,
在稍微大一点的弦上,
set.seed(24)
x3 <- stri_rand_strings(1, 1e6)
system.time(akrun())
#user system elapsed
#0.166 0.000 0.155
system.time(richard())
# user system elapsed
# 0.606 0.000 0.569
system.time(richard2())
# user system elapsed
# 0.518 0.000 0.487
system.time(Colonel())
# user system elapsed
# 9.631 0.000 9.358
library(microbenchmark)
microbenchmark(richard(), richard2(), akrun(), times=20L, unit='relative')
#Unit: relative
# expr min lq mean median uq max neval cld
# richard() 2.438570 2.633896 2.365686 2.315503 2.368917 2.124581 20 b
#richard2() 2.389131 2.533301 2.223521 2.143112 2.153633 2.157861 20 b
# akrun() 1.000000 1.000000 1.000000 1.000000 1.000000 1.000000 20 a
NOTE: Tried to run the other methods, but it takes a long time
注意:尝试运行其他方法,但是需要很长时间
data
str1 <- "111110000011110000111000"
x1 <- "1111100000222000333300011110000111000"
x2 <- "aaaaabbcccccccbbbad1111100000222aaabbccd11DaaBB"
#2
24
Variation on a theme:
旋律的变化,
x <- "111110000011110000111000"
regmatches(x,gregexpr("1+|0+",x))[[1]]
#[1] "11111" "00000" "1111" "0000" "111" "000"
#3
21
You could probably make use of substr
or read.fwf
along with rle
(though it is unlikely to be as efficient as any regex-based solution):
您可以使用substr或read。fwf与rle(尽管它不可能像任何基于regex的解决方案一样有效):
x <- "111110000011110000111000"
unlist(read.fwf(textConnection(x),
rle(strsplit(x, "")[[1]])$lengths,
colClasses = "character"))
# V1 V2 V3 V4 V5 V6
# "11111" "00000" "1111" "0000" "111" "000"
One advantage of this approach is that it would work even with, say:
这种方法的一个优点是,它甚至可以使用,比如:
x <- paste(c(rep("a", 5), rep("b", 2), rep("c", 7),
rep("b", 3), rep("a", 1), rep("d", 1)), collapse = "")
x
# [1] "aaaaabbcccccccbbbad"
unlist(read.fwf(textConnection(x),
rle(strsplit(x, "")[[1]])$lengths,
colClasses = "character"))
# V1 V2 V3 V4 V5 V6
# "aaaaa" "bb" "ccccccc" "bbb" "a" "d"
#4
20
Another way would be to add whitespace between the alternating digits. This would work for any two, not just 1s and 0s. Then use strsplit
on the whitespace:
另一种方法是在交替数字之间添加空格。这对任意两个都成立,不只是1和0。然后在空格上使用strsplit:
x <- "111110000011110000111000"
(y <- gsub('(\\d)(?!\\1)', '\\1 \\2', x, perl = TRUE))
# [1] "11111 00000 1111 0000 111 000 "
strsplit(y, ' ')[[1]]
# [1] "11111" "00000" "1111" "0000" "111" "000"
Or more succinctly as @akrun points out:
或者更简洁地说,正如@akrun指出的:
strsplit(x, '(?<=(\\d))(?!\\1)', perl=TRUE)[[1]]
# [1] "11111" "00000" "1111" "0000" "111" "000"
also changing \\d
to \\w
works also
也要改变\\w的工作。
x <- "aaaaabbcccccccbbbad"
strsplit(x, '(?<=(\\w))(?!\\1)', perl=TRUE)[[1]]
# [1] "aaaaa" "bb" "ccccccc" "bbb" "a" "d"
x <- "111110000011110000111000"
strsplit(x, '(?<=(\\w))(?!\\1)', perl=TRUE)[[1]]
# [1] "11111" "00000" "1111" "0000" "111" "000"
You could also use \K
(rather than explicitly using the capture groups, \\1
and \\2
) which I don't see used a lot nor do I know how to explain it :}
您也可以使用\K(而不是显式地使用捕获组、\\1和\\2),我没有看到太多的使用,也不知道如何解释它:}
AFAIK \\K
resets the starting point of the reported match and any previously consumed characters are no longer included, basically throwing away everything matched up to that point.
AFAIK \K重新设置所报告的匹配的起始点,并且不再包含以前使用的任何字符,基本上放弃与该点匹配的所有内容。
x <- "1111100000222000333300011110000111000"
(z <- gsub('(\\d)\\K(?!\\1)', ' ', x, perl = TRUE))
# [1] "11111 00000 222 000 3333 000 1111 0000 111 000 "
#5
14
Original Approach: Here is a stringi approach that incorporates rle()
.
原始方法:这里是一个包含rle()的stringi方法。
x <- "111110000011110000111000"
library(stringi)
cs <- cumsum(
rle(stri_split_boundaries(x, type = "character")[[1L]])$lengths
)
stri_sub(x, c(1L, head(cs + 1L, -1L)), cs)
# [1] "11111" "00000" "1111" "0000" "111" "000"
Or, you can use the length
argument in stri_sub()
或者,可以在stri_sub()中使用length参数
rl <- rle(stri_split_boundaries(x, type = "character")[[1L]])
with(rl, {
stri_sub(x, c(1L, head(cumsum(lengths) + 1L, -1L)), length = lengths)
})
# [1] "11111" "00000" "1111" "0000" "111" "000"
Updated for Efficiency: After realizing that base::strsplit()
is faster than stringi::stri_split_boundaries()
, here is a more efficient version of my previous answer using only base functions.
为提高效率而更新:在实现了base::strsplit()比stringi:: stri_split_boundary()更快之后,这里有一个更高效的版本,它只使用基本函数。
set.seed(24)
x3 <- stri_rand_strings(1L, 1e6L)
system.time({
cs <- cumsum(rle(strsplit(x3, NULL)[[1L]])[[1L]])
substring(x3, c(1L, head(cs + 1L, -1L)), cs)
})
# user system elapsed
# 0.686 0.012 0.697
#6
11
Another approach in case, using mapply
:
另一种方法是使用mapply:
x="111110000011110000111000"
with(rle(strsplit(x,'')[[1]]),
mapply(function(u,v) paste0(rep(v,u), collapse=''), lengths, values))
#[1] "11111" "00000" "1111" "0000" "111" "000"
#7
8
It's not really what the OP was looking for (concise R code), but thought I'd give it a try in Rcpp
, and turned out relatively simple and about 5x faster than the fastest R-based answers.
这并不是OP真正想要的(简洁的R代码),但是我想尝试一下Rcpp,结果是相对简单的,比基于R的最快答案快5倍。
library(Rcpp)
cppFunction(
'std::vector<std::string> split_str_cpp(std::string x) {
std::vector<std::string> parts;
int start = 0;
for(int i = 1; i <= x.length(); i++) {
if(x[i] != x[i-1]) {
parts.push_back(x.substr(start, i-start));
start = i;
}
}
return parts;
}')
And testing on these
和测试这些
str1 <- "111110000011110000111000"
x1 <- "1111100000222000333300011110000111000"
x2 <- "aaaaabbcccccccbbbad1111100000222aaabbccd11DaaBB"
Gives the following output
给以下输出
> split_str_cpp(str1)
[1] "11111" "00000" "1111" "0000" "111" "000"
> split_str_cpp(x1)
[1] "11111" "00000" "222" "000" "3333" "000" "1111" "0000" "111" "000"
> split_str_cpp(x2)
[1] "aaaaa" "bb" "ccccccc" "bbb" "a" "d" "11111" "00000" "222" "aaa" "bb" "cc" "d" "11"
[15] "D" "aa" "BB"
And a benchmark shows it's about 5-10x faster than R solutions.
基准测试显示它比R方案快5-10倍。
akrun <- function(str1) strsplit(str1, '(?<=1)(?=0)|(?<=0)(?=1)', perl=TRUE)[[1]]
richard1 <- function(x3){
cs <- cumsum(
rle(stri_split_boundaries(x3, type = "character")[[1L]])$lengths
)
stri_sub(x3, c(1, head(cs + 1, -1)), cs)
}
richard2 <- function(x3) {
cs <- cumsum(rle(strsplit(x3, NULL)[[1L]])[[1L]])
stri_sub(x3, c(1, head(cs + 1, -1)), cs)
}
library(microbenchmark)
library(stringi)
set.seed(24)
x3 <- stri_rand_strings(1, 1e6)
microbenchmark(split_str_cpp(x3), akrun(x3), richard1(x3), richard2(x3), unit = 'relative', times=20L)
Comparison:
比较:
Unit: relative
expr min lq mean median uq max neval
split_str_cpp(x3) 1.000000 1.000000 1.000000 1.000000 1.000000 1.000000 20
akrun(x3) 9.675613 8.952997 8.241750 8.689001 8.403634 4.423134 20
richard1(x3) 5.355620 5.226103 5.483171 5.947053 5.982943 3.379446 20
richard2(x3) 4.842398 4.756086 5.046077 5.389570 5.389193 3.669680 20
#8
2
Simple for
loop solution
简单的for循环的解决方案
x="aaaaabbcccccccbbbad1111100000222aaabbccd11DaaBB"
res_vector=substr(x,1,1)
for (i in 2:nchar(x)) {
tmp=substr(x,i,i)
if (tmp==substr(x,i-1,i-1)) {
res_vector[length(res_vector)]=paste0(res_vector[length(res_vector)],tmp)
} else {
res_vector[length(res_vector)+1]=tmp
}
}
res_vector
#[1] "aaaaa" "bb" "ccccccc" "bbb" "a" "d" "11111" "00000" "222" "aaa" "bb" "cc" "d" "11" "D" "aa" "BB"
Or a maybe a little bit faster with a pre-allocated results vector
或者用预先分配的结果向量来加快速度。
x="aaaaabbcccccccbbbad1111100000222aaabbccd11DaaBB"
res_vector=rep(NA_character_,nchar(x))
res_vector[1]=substr(x,1,1)
counter=1
old_tmp=''
for (i in 2:nchar(x)) {
tmp=substr(x,i,i)
if (tmp==old_tmp) {
res_vector[counter]=paste0(res_vector[counter],tmp)
} else {
res_vector[counter+1]=tmp
counter=counter+1
}
old_tmp=tmp
}
res_vector[!is.na(res_vector)]
#9
1
How about this:
这个怎么样:
s <- "111110000011110000111000"
spl <- strsplit(s,"10|01")[[1]]
l <- length(spl)
sapply(1:l, function(i) paste0(spl[i],i%%2,ifelse(i==1 | i==l, "",i%%2)))
# [1] "11111" "00000" "1111" "0000" "111" "000"