有时候,我们想修改数据中的字符串数据列。下面的方法供参考:
str.extract()
str.upper()
str.lower()
str.len()
str.split()
str.replace()
参考实例:
>>>import pandas as pd
>>>df = pd.DataFrame([
['RAVENA COEYMANS SELKIRK CENTRAL SCHOOL DISTRICT'],
['RAVENA COEYMANS SELKIRK CENTRAL SCHOOL DISTRICT'],
['RAVENA COEYMANS SELKIRK CENTRAL SCHOOL DISTRICT'],
['COHOES CITY SCHOOL DISTRICT'],
['COHOES CITY SCHOOL DISTRICT']])
>>>df.columns = ['AREA NAME']
>>>df
----------
AREA NAME
0 RAVENA COEYMANS SELKIRK CENTRAL SCHOOL DISTRICT
1 RAVENA COEYMANS SELKIRK CENTRAL SCHOOL DISTRICT
2 RAVENA COEYMANS SELKIRK CENTRAL SCHOOL DISTRICT
3 COHOES CITY SCHOOL DISTRICT
4 COHOES CITY SCHOOL DISTRICT
str.extract()
>>>df['AREA NAME'].str.extract(r'(\w+)', expand=False)#提取AREA NAME列字符串中的第一个单词(expand=False时返回Series)
----------
0 RAVENA
1 RAVENA
2 RAVENA
3 COHOES
4 COHOES
Name: AREA NAME, dtype: object
>>>df['AREA NAME'].str.extract(r'(\w+)\s(\w+)')#提取AREA NAME列中的前两个单词,每个捕获组各作为单独的一列返回
----------
0 1
0 RAVENA COEYMANS
1 RAVENA COEYMANS
2 RAVENA COEYMANS
3 COHOES CITY
4 COHOES CITY
str.upper()
>>>df['AREA NAME'].str.upper()#因为数据已经是大写,所以没有改变
----------
0 RAVENA COEYMANS SELKIRK CENTRAL SCHOOL DISTRICT
1 RAVENA COEYMANS SELKIRK CENTRAL SCHOOL DISTRICT
2 RAVENA COEYMANS SELKIRK CENTRAL SCHOOL DISTRICT
3 COHOES CITY SCHOOL DISTRICT
4 COHOES CITY SCHOOL DISTRICT
Name: AREA NAME, dtype: object
str.lower()
>>>df['AREA NAME'].str.lower()#将AREA NAME列中的字符串转化为小写
----------
0 ravena coeymans selkirk central school district
1 ravena coeymans selkirk central school district
2 ravena coeymans selkirk central school district
3 cohoes city school district
4 cohoes city school district
Name: AREA NAME, dtype: object
str.len()
>>>df['AREA NAME'].str.len()#AREA NAME列中每个元素的长度
----------
0 47
1 47
2 47
3 27
4 27
Name: AREA NAME, dtype: int64
str.split()
>>>df['AREA NAME'].str.split(' ')#用空格分割AREA NAME列中的字符串
----------
0 [RAVENA, COEYMANS, SELKIRK, CENTRAL, SCHOOL, D...
1 [RAVENA, COEYMANS, SELKIRK, CENTRAL, SCHOOL, D...
2 [RAVENA, COEYMANS, SELKIRK, CENTRAL, SCHOOL, D...
3 [COHOES, CITY, SCHOOL, DISTRICT]
4 [COHOES, CITY, SCHOOL, DISTRICT]
Name: AREA NAME, dtype: object
str.replace()
>>>df['AREA NAME'].str.replace('DISTRICT$', 'DIST', regex=True)#将AREA NAME列中每个元素末尾的DISTRICT替换为DIST(pandas 2.0起regex默认为False,使用正则时需显式指定regex=True)
----------
0 RAVENA COEYMANS SELKIRK CENTRAL SCHOOL DIST
1 RAVENA COEYMANS SELKIRK CENTRAL SCHOOL DIST
2 RAVENA COEYMANS SELKIRK CENTRAL SCHOOL DIST
3 COHOES CITY SCHOOL DIST
4 COHOES CITY SCHOOL DIST
Name: AREA NAME, dtype: object
str.cat()
>>>df['AREA NAME'].str.cat(['a', 'b', 'c', 'd', 'e'],sep=' ')#拼接字符串,在原有字符串后各拼接一个字符串
----------
0 RAVENA COEYMANS SELKIRK CENTRAL SCHOOL DISTRICT a
1 RAVENA COEYMANS SELKIRK CENTRAL SCHOOL DISTRICT b
2 RAVENA COEYMANS SELKIRK CENTRAL SCHOOL DISTRICT c
3 COHOES CITY SCHOOL DISTRICT d
4 COHOES CITY SCHOOL DISTRICT e
Name: AREA NAME, dtype: object
>>>df['AREA NAME'].str.cat([['a', 'b', 'c', 'd', 'e'], ['1', '2', '3', '4', '5']], sep=' ')#在原有字符串后各拼接两个字符串
----------
0 RAVENA COEYMANS SELKIRK CENTRAL SCHOOL DISTRIC...
1 RAVENA COEYMANS SELKIRK CENTRAL SCHOOL DISTRIC...
2 RAVENA COEYMANS SELKIRK CENTRAL SCHOOL DISTRIC...
3 COHOES CITY SCHOOL DISTRICT d 4
4 COHOES CITY SCHOOL DISTRICT e 5
Name: AREA NAME, dtype: object
>>>df['AREA NAME'].str.cat(sep=',')#将某一列拼接成一个完整的字符串
----------
RAVENA COEYMANS SELKIRK CENTRAL SCHOOL DISTRICT,RAVENA COEYMANS SELKIRK CENTRAL SCHOOL DISTRICT,RAVENA COEYMANS SELKIRK CENTRAL SCHOOL DISTRICT,COHOES CITY SCHOOL DISTRICT,COHOES CITY SCHOOL DISTRICT
str.get()
>>>df['AREA NAME'].str.get(0)#获取每个字符串中指定位置的字符
----------
0 R
1 R
2 R
3 C
4 C
Name: AREA NAME, dtype: object
str.contains()
>>>df['AREA NAME'].str.contains('RAVENA')#是否包含表达式
----------
0 True
1 True
2 True
3 False
4 False
Name: AREA NAME, dtype: bool
str.pad()
>>>df['AREA NAME'].str.pad(47, fillchar='?')#左补齐
----------
0 RAVENA COEYMANS SELKIRK CENTRAL SCHOOL DISTRICT
1 RAVENA COEYMANS SELKIRK CENTRAL SCHOOL DISTRICT
2 RAVENA COEYMANS SELKIRK CENTRAL SCHOOL DISTRICT
3 ????????????????????COHOES CITY SCHOOL DISTRICT
4 ????????????????????COHOES CITY SCHOOL DISTRICT
Name: AREA NAME, dtype: object
>>>df['AREA NAME'].str.pad(47, side='right', fillchar='?')#右补齐
----------
0 RAVENA COEYMANS SELKIRK CENTRAL SCHOOL DISTRICT
1 RAVENA COEYMANS SELKIRK CENTRAL SCHOOL DISTRICT
2 RAVENA COEYMANS SELKIRK CENTRAL SCHOOL DISTRICT
3 COHOES CITY SCHOOL DISTRICT????????????????????
4 COHOES CITY SCHOOL DISTRICT????????????????????
Name: AREA NAME, dtype: object
str.center()
>>>df['AREA NAME'].str.center(47, fillchar=' ')#中间补齐
----------
0 RAVENA COEYMANS SELKIRK CENTRAL SCHOOL DISTRICT
1 RAVENA COEYMANS SELKIRK CENTRAL SCHOOL DISTRICT
2 RAVENA COEYMANS SELKIRK CENTRAL SCHOOL DISTRICT
3 COHOES CITY SCHOOL DISTRICT
4 COHOES CITY SCHOOL DISTRICT
Name: AREA NAME, dtype: object
str.ljust()
>>>df['AREA NAME'].str.ljust(47, fillchar='?')#右边补齐
----------
0 RAVENA COEYMANS SELKIRK CENTRAL SCHOOL DISTRICT
1 RAVENA COEYMANS SELKIRK CENTRAL SCHOOL DISTRICT
2 RAVENA COEYMANS SELKIRK CENTRAL SCHOOL DISTRICT
3 COHOES CITY SCHOOL DISTRICT????????????????????
4 COHOES CITY SCHOOL DISTRICT????????????????????
Name: AREA NAME, dtype: object
str.rjust()
>>>df['AREA NAME'].str.rjust(47, fillchar='?')#左边补齐
----------
0 RAVENA COEYMANS SELKIRK CENTRAL SCHOOL DISTRICT
1 RAVENA COEYMANS SELKIRK CENTRAL SCHOOL DISTRICT
2 RAVENA COEYMANS SELKIRK CENTRAL SCHOOL DISTRICT
3 ????????????????????COHOES CITY SCHOOL DISTRICT
4 ????????????????????COHOES CITY SCHOOL DISTRICT
Name: AREA NAME, dtype: object
str.zfill()
>>>df['AREA NAME'].str.zfill(47)#左边补0
----------
0 RAVENA COEYMANS SELKIRK CENTRAL SCHOOL DISTRICT
1 RAVENA COEYMANS SELKIRK CENTRAL SCHOOL DISTRICT
2 RAVENA COEYMANS SELKIRK CENTRAL SCHOOL DISTRICT
3 00000000000000000000COHOES CITY SCHOOL DISTRICT
4 00000000000000000000COHOES CITY SCHOOL DISTRICT
Name: AREA NAME, dtype: object
str.slice()
>>>df['AREA NAME'].str.slice(8,23)#按给定的开始结束位置切割字符串
----------
0 OEYMANS SELKIRK
1 OEYMANS SELKIRK
2 OEYMANS SELKIRK
3 ITY SCHOOL DIST
4 ITY SCHOOL DIST
Name: AREA NAME, dtype: object
str.slice_replace()
>>>df['AREA NAME'].str.slice_replace(8, 23, '??')#使用给定的字符串,替换指定位置的字符
----------
0 RAVENA C?? CENTRAL SCHOOL DISTRICT
1 RAVENA C?? CENTRAL SCHOOL DISTRICT
2 RAVENA C?? CENTRAL SCHOOL DISTRICT
3 COHOES C??RICT
4 COHOES C??RICT
Name: AREA NAME, dtype: object
str.count()
>>>df['AREA NAME'].str.count('A')#计算给定字符(或正则模式)在每个字符串中出现的次数
----------
0 4
1 4
2 4
3 0
4 0
Name: AREA NAME, dtype: int64
str.startswith()
>>>df['AREA NAME'].str.startswith('R')#判断是否以给定的字符串开头
----------
0 True
1 True
2 True
3 False
4 False
Name: AREA NAME, dtype: bool
str.endswith()
>>>df['AREA NAME'].str.endswith('T')#判断是否以给定的字符串结束
----------
0 True
1 True
2 True
3 True
4 True
Name: AREA NAME, dtype: bool
str.findall()
>>>df['AREA NAME'].str.findall('[A-D]')#查找所有符合正则表达式的字符,以数组形式返回
----------
0 [A, A, C, A, C, A, C, D, C]
1 [A, A, C, A, C, A, C, D, C]
2 [A, A, C, A, C, A, C, D, C]
3 [C, C, C, D, C]
4 [C, C, C, D, C]
Name: AREA NAME, dtype: object
str.match()
>>>df['AREA NAME'].str.match('[A-D]')#检测字符串的开头是否匹配给定的正则表达式(若需整串完全匹配,请使用str.fullmatch)
----------
0 False
1 False
2 False
3 True
4 True
Name: AREA NAME, dtype: bool
str.isalnum()
>>>df['AREA NAME'].str.isalnum()#是否全部是数字和字母组成
----------
0 False
1 False
2 False
3 False
4 False
Name: AREA NAME, dtype: bool
str.swapcase()
>>>df['AREA NAME'].str.swapcase()#大小写互换
----------
0 ravena coeymans selkirk central school district
1 ravena coeymans selkirk central school district
2 ravena coeymans selkirk central school district
3 cohoes city school district
4 cohoes city school district
Name: AREA NAME, dtype: object