R语言学习笔记之八

时间:2022-12-25 22:55:25

摘要: 仅用于记录R语言学习过程:

内容提要:

字符串的处理、正则表达式、stringi包和stringr包

正文:

  字符串的处理

n  导读:

nchar(x)函数:返回每个字符串所含字符的个数:

> x <- c('fudan','jiaoda')

> nchar(x)

[1] 5 6   #返回每个字符串中字符的个数

length()函数:返回元素的个数

> length(x)

[1] 2

u  toupper()函数:小写转大写

> toupper('abc')

[1] "ABC"

u  tolower()函数:大写转小写

> tolower('ABKC')

[1] "abkc"

u  paste()函数:(sep参数和collapse参数;注意参数名是sep,不是seq)粘贴功能

> stringa <- LETTERS[1:5]

> STRINGB <- 1:5

> paste(stringa,STRINGB)

[1] "A 1" "B 2" "C 3" "D 4" "E 5"

> paste(stringa,STRINGB,seq = '-')  #注意:paste()没有seq参数,'-'被当作第三个向量参与粘贴,所以结果末尾多出" -";指定分隔符应写sep = '-',结果为"A-1" "B-2" …

[1] "A 1 -" "B 2 -" "C 3 -" "D 4 -" "E 5 -"

> paste(stringa,STRINGB,collapse = '-')   # collapse分隔符

[1] "A 1-B 2-C 3-D 4-E 5"

u  paste0()函数:相当于sep = ''的paste(),去掉了A和1之间的空格;paste0()没有sep参数,写入的seq(或sep)会被当作普通字符串粘贴到每个元素末尾,collapse的用法与paste()相同

> paste0(stringa,STRINGB)

[1] "A1" "B2" "C3" "D4" "E5"

> paste0(stringa,STRINGB,seq = '-')

[1] "A1-" "B2-" "C3-" "D4-" "E5-"

> paste0(stringa,STRINGB,collapse = '-')

[1] "A1-B2-C3-D4-E5"

u  strsplit()函数:字符串拆分功能

> stringC <- paste(stringa, STRINGB, seq = '/')

> strsplit(stringC,split = '/')   #根据/ 进行拆分

[[1]]

[1] "A 1 "

 

[[2]]

[1] "B 2 "

 

[[3]]

[1] "C 3 "

 

[[4]]

[1] "D 4 "

 

[[5]]

[1] "E 5 "

u  substr()函数:字符串截取函数;同时具有赋值功能

> stringd <- c('python','java','ruby','php','linux')

> sub_str <- substr(stringd,start = 2,stop = 4) #截取2-4位的字符,如果不够,就有几个返回几个

> sub_str

[1] "yth" "ava" "uby" "hp"  "inu"

#实现赋值的功能

> substr(stringd,start = 2,stop = 4) <- 'aaa'

> stringd

[1] "paaaon" "jaaa"   "raaa"   "paa"    "laaax"

grep()函数:用于提取字符串中指定的字符,可返回位置,也可返回具体的值。

> seq_names <- c('EU_FRA02_C1_S2008','AF_COM12_80_20014','AF_COM17_F0_S2008',

+                'AS_CHN11_C3_2004','EU-FRA-C3-S2007','NAUSA02E02005','AS_CHN12_N0_05',

+                'NA_USA03_C2_S2007','NA USA04 A3 2004',

+                'EU_UK01_A0_2009','eu_fra_a2_s98','SA/BRA08/00/1996')

> fra_seq <- grep(pattern = 'FRA|fra',x =seq_names)

> fra_seq

[1]  1  5 11

> seq_names[fra_seq]

[1] "EU_FRA02_C1_S2008" "EU-FRA-C3-S2007" 

[3] "eu_fra_a2_s98"   

> fra_seq <- grep(pattern = 'FRA|fra',x =seq_names,value = TRUE)

> fra_seq

[1] "EU_FRA02_C1_S2008" "EU-FRA-C3-S2007" 

[3] "eu_fra_a2_s98"

u  grepl()函数:返回的是逻辑值。没有value参数。ignore.case参数表示是否忽略大小写,TRUE为忽略。

> grepl(pattern = 'FRA|fra',x =seq_names)

 [1]  TRUE FALSE FALSE FALSE  TRUE FALSE FALSE FALSE FALSE

[10] FALSE  TRUE FALSE

> fra_seq <- grepl(pattern = 'FRA|fra',x =seq_names,value = TRUE)  #错误示例:grepl()没有value参数,此调用会报错(unused argument);要返回具体的值请用grep(..., value = TRUE)

u  正则表达式:提取元素

> spe_seq <- seq_names[!grepl(pattern = '[s|S][0-9]{2,4}\\b',seq_names)]  #匹配右边界

> spe_seq

[1] "AF_COM12_80_20014" "AS_CHN11_C3_2004"

[3] "NAUSA02E02005"     "AS_CHN12_N0_05"  

[5] "NA USA04 A3 2004"  "EU_UK01_A0_2009" 

[7] "SA/BRA08/00/1996"

找到以ab开头的

my_string <- c('above','about','abrotion','cab')

grep(pattern = '\\bab',x = my_string,value = T) #匹配左边界

u  gsub()函数:会把找到的所有匹配内容都替换掉(下例先去掉$,再转成数值;注意要把gsub()的结果赋值回money,否则as.numeric(money)会得到NA)

money <- c('$1888','$2888','$3888')

gsub('\\$',replacement = '',money)

as.numeric(money)

u  sub()函数:只会替换掉找到的第一个字符

> money <- c('$1888 $2888 $3888')

> sub('\\$',replacement = '',money)

[1] "1888 $2888 $3888"

> gsub('\\$',replacement = '',money)

[1] "1888 2888 3888"

regexpr()函数

> test_string <- c('happy','apple','application','apolitic')

> regexpr('pp',test_string)

[1]  3  2  2 -1   #返回pp出现的位置,-1表示没有

attr(,"match.length")

[1]  2  2  2 -1

attr(,"useBytes")

[1] TRUE

> test_string[regexpr('pp',test_string)>0]  #提取含pp的字符串

[1] "happy"       "apple"       "application"

gregexpr()函数:用法同regexpr()函数,但会返回每个字符串中所有匹配的位置(结果为列表),而regexpr()只返回第一个匹配

regexec()函数:用法同regexpr()函数,但返回的是列表,除第一个匹配的位置外,还包含括号内各子表达式的匹配位置

u  agrep()函数:近似(模糊)匹配,例如可以匹配英美单词的不同写法

> string1 <- c('I need a favour','my favorite sport','you made an error')

> agrep('favor',string1)

[1] 1 2

  正则表达式

n  原义表达式:只代表自己

> mystring1 <- c('apple','orange')

> grep('p',mystring1)

[1] 1

n  转义表达式:代表其他含义

> # . 匹配任意单个字符

> mystring2 <- c('shudo','.dfs','-dsfd')

> grep('.',mystring2)

[1] 1 2 3

>

> mystring3 <- c('9anv','fss7','1000','ss7')

> grep('[7-9]',mystring3)

[1] 1 2 4

>

> # ^a,匹配a开头的

> mystring4 <- c('apple','application','abb')

> grep('^ap',mystring4)

[1] 1 2

> # [^0-1]表示匹配不是0或1的字符(方括号内开头的^表示取反)

> mystring5 <- c('9anv','fss7','1000','ss7')

> grep('[^0-1]',mystring5)

[1] 1 2 4

> #{}代表重复的次数,{1,}表示至少重复1次,{2,3}表示重复2到3次

> mystring6 <- c('1220','2289','2228','10002')

> grep('2{2,3}',mystring6)

[1] 1 2 3

> # + 表示其前面紧邻的字符重复1次或多次,()表示把括号内的内容看成一个整体

> mystring7 <- c('food','foot','foul','fans')

> grep ('fo+',mystring7)

[1] 1 2 3

> grep('fo{1,}',mystring7)

[1] 1 2 3

> grep('(fo){1,}',mystring7)

[1] 1 2 3

>

> #* 匹配0次或以上

> #| 管道符  或,满足其中之一就可被返回

>

> mystring8 <- c('kobe','messi','neymar')

> grep('^k|^m',mystring8)

[1] 1 2

> # $表示匹配字符串末尾

> mystring9 <- c('active','positive','negative','iention')

> grep('ive$',mystring9)  #匹配字符串末尾

[1] 1 2 3

> grep('ive\\b',mystring9)

[1] 1 2 3

n  保义字符:

# \

mystring10 <- c('ac^bb','^df')

grep('\\^',mystring10)

[1] 1 2

\\d = [0-9]  匹配数字0-9

\\D = [^0-9] 匹配非数字

\\s   匹配空白字符,空格,制表符,换行符

\\S  匹配非空白字符

\\w  匹配字母、数字和下划线   =[a-zA-Z0-9_]

\\W  匹配非字母、数字和下划线的字符  =[^a-zA-Z0-9_]

\\b   匹配单词的边界(单词的开头或结尾)

\\B   匹配非单词边界的位置

\\<   匹配单词的开头(词首边界)  如‘the republic’中the的起始位置

\\>   匹配单词的结尾(词尾边界)  如‘the republic’中the的结束位置

示例:

> mystring11 <- c('2013','abcd','13sg')

> grep('\\d',mystring11)

[1] 1 3

> grep('\\D',mystring11)

[1] 2 3

> mystring12 <- c('foo t','    able','   moth  er','happy')

> grep('\\s',mystring12)

[1] 1 2 3

> grep('\\S',mystring12)

[1] 1 2 3 4

> mystring13 <- c('theory','the republic','they')

> grep('\\<the\\>',mystring13)   #以the作为边界的字符串,the为一个单独的单词

[1] 2

  stringr与stringi包

n  stringi包更加依赖正则表达式

stringr中的常用函数

str_c()函数:类似paste0()函数(默认sep = '',可用sep和collapse参数)

> str_c('a','b')

[1] "ab"

> str_c('a','b',sep = '-')

[1] "a-b"

str_length()函数:用于字符串计数

> str_length('abdc')

[1] 4

str_sub()函数:用于字符串提取,类似substr()函数,有三个参数:数据名,开始位置,结束位置(可以接受向量),可以接受赋值

> yxf <- 'yi xue fang'

> str_sub(yxf,c(1,4,8),c(2,6,11))

[1] "yi"   "xue"  "fang"

>

> str_sub(yxf,1,1) <- 'Y'     #可以接受赋值

> yxf

[1] "Yi xue fang"

str_dup()函数:用于复制

> fruit <- c('apple','pear','banana')

> str_dup(fruit,2)

[1] "appleapple"   "pearpear"     "bananabanana"

> fruit <- c('apple','pear','banana')

> str_dup(fruit,2:4)

[1] "appleapple"               "pearpearpear"           

[3] "bananabananabananabanana"

str_trim()函数:去掉字符串首尾的空格,也可以设置成right和left,分别去掉右边和左边的空格

> string <- ' Eternal love for YanQ '

> str_trim(string,side = 'both')

[1] "Eternal love for YanQ"

str_extract()函数:用于提取

phones <- c('219 733 8965','329-293-8753','banana','595 794 7569',

            '387 287 6718','apple','233.398.9187','482 952 3315',

            '239 923 8115 and 842 566 4692','Work: 579-499-7527','$1000',

            'Home:543.355.3679')

str_extract(phones,'([0-9]{3})[- .]([0-9]{3})[- .]([0-9]{4})\\b')

[1] "219 733 8965" "329-293-8753" NA             "595 794 7569" "387 287 6718"

 [6] NA             "233.398.9187" "482 952 3315" "239 923 8115" "579-499-7527"

[11] NA             "543.355.3679"

或写成:str_extract(phones,'([2-9][0-9]{2})[- .]([0-9]{3})[- .]([0-9]{4})')

str_replace()函数:用于字符串替换,只替换找到的第一个

> fruits <- c('one apple','two pears','three bananas')

> str_replace(fruits,'[aeiou]','-')  #[被替换的对象] ,‘拟替换成的对象’

[1] "-ne apple"     "tw- pears"     "thr-e bananas"

str_replace_all()函数:替换所有

> fruits <- c('one apple','two pears','three bananas')

> str_replace_all(fruits,'[aeiou]','-')

[1] "-n- -ppl-"     "tw- p--rs"     "thr-- b-n-n-s"

 

n  stringi中的常用函数

u  stri_join()函数:

> stri_join(1:7,letters[1:7],sep = '-')

[1] "1-a" "2-b" "3-c" "4-d" "5-e" "6-f" "7-g"

> stri_join(1:7,letters[1:7],collapse = '-')

[1] "1a-2b-3c-4d-5e-6f-7g"

u  stri_cmp_eq() & stri_cmp_neq()函数:

> stri_cmp_eq('ab','ab')

[1] TRUE

> stri_cmp_neq('ab','ab')

[1] FALSE

u  stri_cmp_lt() & stri_cmp_gt()函数:用于字符串比大小,lt 前者小于后者,gt前者大于后者

> stri_cmp_lt('121','221')

[1] TRUE

> stri_cmp_lt('a121','b221')

[1] TRUE

> stri_cmp_gt('121','221')

[1] FALSE

u  stri_count()函数:用于计数

> language <- c('python','R','PHP','Ruby','Java',

+               'JavaScript','C','Oracle','C++','C#','Spark',

+               'Go','Room','Good','Pathon','ScriptJava','R2R','C+','C*')

> stri_count(language,fixed = 'R')

 [1] 0 1 0 1 0 0 0 0 0 0 0 0 1 0 0 0 2 0 0

> stri_count(language,regex = '^J')

      [1] 0 0 0 0 1 1 0 0 0 0 0 0 0 0 0 0 0 0 0

u  stri_count_boundaries()函数:字符串元素个数的计数

> test <- 'The\u00a0above-mentioned     features are very useful.

+ Warm thanks to their developers. Tomorrow is a ,new$% day###'

> stri_count_boundaries(test,type = 'word')

[1] 45

> stri_count_boundaries(test,type = 'sentence')

[1] 3

> stri_count_boundaries(test,type = 'character')

[1] 110

u  stri_duplicated()函数:识别重复的字符串

> stri_duplicated(c('a','b','a',NA,'a',NA))

[1] FALSE FALSE  TRUE FALSE  TRUE  TRUE

> stri_duplicated(c('a','b','a',NA,'a',NA),fromLast = T)  #从最后开始看

[1]  TRUE FALSE  TRUE  TRUE FALSE FALSE

> stri_duplicated_any(c('a','b','a',NA,'a',NA))

[1] 3

u  stri_dup()函数:重复

> stri_dup(c('abc','parst'),c(4,2))

[1] "abcabcabcabc" "parstparst" 

u  stri_detect_fixed()函数:检测字符串中是否包含指定的固定子串

> stri_detect_fixed(c('stringi R','REXAMINE','123'),c('i','R','0'))

[1]  TRUE  TRUE FALSE

u  stri_detect_regex()函数:

> stri_detect_regex(c('above','abort','about','abnormal','abandon'),'^ab')

[1] TRUE TRUE TRUE TRUE TRUE

> stri_detect_regex(c('above','abort','about','abnormal','abandon'),'t\\b')

[1] FALSE  TRUE  TRUE FALSE FALSE

> stri_detect_regex(c('ABOUT','abort','AboVE'),'^ab',case_insensitive = TRUE)  #忽略大小写

[1] TRUE TRUE TRUE

u  stri_startswith_fixed()函数:

> stri_startswith_fixed(c('a1','a2','b3','a4','c5'),'a')

[1]  TRUE  TRUE FALSE  TRUE FALSE

>

> stri_startswith_fixed(c('a1','a2','b3','a4','c5'),'a1')

[1]  TRUE FALSE FALSE FALSE FALSE

>

> stri_startswith_fixed(c('abaDc','aabadc','ababa'),'ba',from = 2)  #从哪个字符开始匹配,从第二个字符开始匹配

[1]  TRUE FALSE  TRUE

u  stri_endswith_fixed()函数:

> stri_endswith_fixed(c('abaDc','aabadc','ababa'),'ba')

[1] FALSE FALSE  TRUE

> stri_endswith_fixed(c('abaDc','aabadc','ababa'),'ba', to = 3)  #匹配到第几位,匹配到第三位

[1]  TRUE FALSE  TRUE

u  stri_extract_all()函数:提取

> tEmp_text <- c('EU_FRA02_C1_S2008','AF_COM12_80_20014','AF_COM17_F0_S2008',

+                'AS_CHN11_C3_2004','EU-FRA-C3-S2007','NAUSA02E02005','AS_CHN12_N0_05',

+                'NA_USA03_C2_S2007','NA USA04 A3 2004',

+                'EU_UK01_A0_2009','eu_fra_a2_s98','SA/BRA08/00/1996')

>

> # Build a character vector made up of several sequence names.

>

> stri_extract_all(tEmp_text,regex = '[0-9]{2,4}\\b')

[[1]]

[1] "2008"

 

[[2]]

[1] "0014"

 

[[3]]

[1] "2008"

 

[[4]]

[1] "2004"

 

[[5]]

[1] "2007"

 

[[6]]

[1] "2005"

 

[[7]]

[1] "05"

 

[[8]]

[1] "2007"

 

[[9]]

[1] "04"   "2004"

 

[[10]]

[1] "2009"

 

[[11]]

[1] "98"

 

[[12]]

[1] "08"   "00"   "1996"

u  stri_extract_all_fixed()函数:

> stri_extract_all_fixed('abaBAba','Aba',case_insensitive = T, overlap =T)

[[1]]   #可交叉

[1] "aba" "aBA" "Aba"

u  stri_extract_all_boundaries()函数:按文本边界把字符串切分成片段并提取

> stri_extract_all_boundaries('stringi: THE string processing package 123.48...')

[[1]]

[1] "stringi: "   "THE "        "string "     "processing " "package "  

[6] "123.48..."   #但是带出来单词后面的空格

u  stri_extract_all_words()函数:提取字符串中的单词,去掉空格和标点

> stri_extract_all_words('stringi: THE string processing package 123.48...')

[[1]]

[1] "stringi"    "THE"        "string"     "processing" "package"    "123.48"

u  stri_isempty()函数:字符串内是否为空

> stri_isempty(c(',','','abc','123','\u0105\u0104',' '))

[1] FALSE  TRUE FALSE FALSE FALSE FALSE

u  stri_locate_all()函数:定位函数

> stri_locate_all('I want to learn R to promote my statistical skills',fixed = 'to')

[[1]]

     start end

[1,]     8   9

[2,]    19  20  #返回的是位置,起始和结束,可用于提取