Python修改数据中的字符串数据列

时间:2022-10-31 07:17:03

有时候,我们想修改数据中的字符串数据列。下面的方法供参考:

  • str.extract()
  • str.upper()
  • str.lower()
  • str.len()
  • str.split()
  • str.replace()

参考实例:

>>>import pandas as pd
>>>df = pd.DataFrame([
['RAVENA COEYMANS SELKIRK CENTRAL SCHOOL DISTRICT'],
['RAVENA COEYMANS SELKIRK CENTRAL SCHOOL DISTRICT'],
['RAVENA COEYMANS SELKIRK CENTRAL SCHOOL DISTRICT'],
['COHOES CITY SCHOOL DISTRICT'],
['COHOES CITY SCHOOL DISTRICT']])
>>>df.columns = ['AREA NAME']
>>>df


----------
AREA NAME
0 RAVENA COEYMANS SELKIRK CENTRAL SCHOOL DISTRICT
1 RAVENA COEYMANS SELKIRK CENTRAL SCHOOL DISTRICT
2 RAVENA COEYMANS SELKIRK CENTRAL SCHOOL DISTRICT
3 COHOES CITY SCHOOL DISTRICT
4 COHOES CITY SCHOOL DISTRICT

str.extract()

>>>df['AREA NAME'].str.extract(r'(\w+)', expand=False)#提取AREA NAME列字符串中的第一个单词(expand=False时返回Series)

----------
0 RAVENA
1 RAVENA
2 RAVENA
3 COHOES
4 COHOES
Name: AREA NAME, dtype: object

>>>df['AREA NAME'].str.extract(r'(\w+)\s(\w+)')#提取AREA NAME列中的前两个单词,分别作为单独的列

----------

0 1
0 RAVENA COEYMANS
1 RAVENA COEYMANS
2 RAVENA COEYMANS
3 COHOES CITY
4 COHOES CITY

str.upper()

>>>df['AREA NAME'].str.upper()#因为数据已经是大写,所以没有改变

----------
0 RAVENA COEYMANS SELKIRK CENTRAL SCHOOL DISTRICT
1 RAVENA COEYMANS SELKIRK CENTRAL SCHOOL DISTRICT
2 RAVENA COEYMANS SELKIRK CENTRAL SCHOOL DISTRICT
3 COHOES CITY SCHOOL DISTRICT
4 COHOES CITY SCHOOL DISTRICT
Name: AREA NAME, dtype: object

str.lower()

>>>df['AREA NAME'].str.lower()#将AREA NAME列中的字符串转化为小写

----------
0 ravena coeymans selkirk central school district
1 ravena coeymans selkirk central school district
2 ravena coeymans selkirk central school district
3 cohoes city school district
4 cohoes city school district
Name: AREA NAME, dtype: object

str.len()

>>>df['AREA NAME'].str.len()#AREA NAME列中每个元素的长度

----------
0 47
1 47
2 47
3 27
4 27
Name: AREA NAME, dtype: int64

str.split()

>>>df['AREA NAME'].str.split(' ')#用空格分割AREA NAME列中的字符串

----------
0 [RAVENA, COEYMANS, SELKIRK, CENTRAL, SCHOOL, D...
1 [RAVENA, COEYMANS, SELKIRK, CENTRAL, SCHOOL, D...
2 [RAVENA, COEYMANS, SELKIRK, CENTRAL, SCHOOL, D...
3 [COHOES, CITY, SCHOOL, DISTRICT]
4 [COHOES, CITY, SCHOOL, DISTRICT]
Name: AREA NAME, dtype: object

str.replace()

>>>df['AREA NAME'].str.replace('DISTRICT$', 'DIST', regex=True)#将AREA NAME列中每个元素末尾的DISTRICT替换为DIST(使用正则表达式时需指定regex=True)

----------
0 RAVENA COEYMANS SELKIRK CENTRAL SCHOOL DIST
1 RAVENA COEYMANS SELKIRK CENTRAL SCHOOL DIST
2 RAVENA COEYMANS SELKIRK CENTRAL SCHOOL DIST
3 COHOES CITY SCHOOL DIST
4 COHOES CITY SCHOOL DIST
Name: AREA NAME, dtype: object

str.cat()

>>>df['AREA NAME'].str.cat(['a', 'b', 'c', 'd', 'e'],sep=' ')#拼接字符串,在原有字符串后各拼接一个字符串

----------
0 RAVENA COEYMANS SELKIRK CENTRAL SCHOOL DISTRICT a
1 RAVENA COEYMANS SELKIRK CENTRAL SCHOOL DISTRICT b
2 RAVENA COEYMANS SELKIRK CENTRAL SCHOOL DISTRICT c
3 COHOES CITY SCHOOL DISTRICT d
4 COHOES CITY SCHOOL DISTRICT e
Name: AREA NAME, dtype: object

>>>df['AREA NAME'].str.cat([['a', 'b', 'c', 'd', 'e'], ['1', '2', '3', '4', '5']], sep=' ')#在原有字符串后各拼接两个字符串

----------
0 RAVENA COEYMANS SELKIRK CENTRAL SCHOOL DISTRIC...
1 RAVENA COEYMANS SELKIRK CENTRAL SCHOOL DISTRIC...
2 RAVENA COEYMANS SELKIRK CENTRAL SCHOOL DISTRIC...
3 COHOES CITY SCHOOL DISTRICT d 4
4 COHOES CITY SCHOOL DISTRICT e 5
Name: AREA NAME, dtype: object

>>>df['AREA NAME'].str.cat(sep=',')#将某一列拼接成一个完整的字符串
----------
RAVENA COEYMANS SELKIRK CENTRAL SCHOOL DISTRICT,RAVENA COEYMANS SELKIRK CENTRAL SCHOOL DISTRICT,RAVENA COEYMANS SELKIRK CENTRAL SCHOOL DISTRICT,COHOES CITY SCHOOL DISTRICT,COHOES CITY SCHOOL DISTRICT

str.get()

>>>df['AREA NAME'].str.get(0)#获取每个字符串中指定位置的字符

----------
0 R
1 R
2 R
3 C
4 C
Name: AREA NAME, dtype: object

str.contains()

>>>df['AREA NAME'].str.contains('RAVENA')#是否包含表达式

----------
0 True
1 True
2 True
3 False
4 False
Name: AREA NAME, dtype: bool

str.pad()

>>>df['AREA NAME'].str.pad(47, fillchar='?')#左补齐

----------
0 RAVENA COEYMANS SELKIRK CENTRAL SCHOOL DISTRICT
1 RAVENA COEYMANS SELKIRK CENTRAL SCHOOL DISTRICT
2 RAVENA COEYMANS SELKIRK CENTRAL SCHOOL DISTRICT
3 ????????????????????COHOES CITY SCHOOL DISTRICT
4 ????????????????????COHOES CITY SCHOOL DISTRICT
Name: AREA NAME, dtype: object

>>>df['AREA NAME'].str.pad(47, side='right', fillchar='?')#右补齐

----------

0 RAVENA COEYMANS SELKIRK CENTRAL SCHOOL DISTRICT
1 RAVENA COEYMANS SELKIRK CENTRAL SCHOOL DISTRICT
2 RAVENA COEYMANS SELKIRK CENTRAL SCHOOL DISTRICT
3 COHOES CITY SCHOOL DISTRICT????????????????????
4 COHOES CITY SCHOOL DISTRICT????????????????????
Name: AREA NAME, dtype: object

str.center()

>>>df['AREA NAME'].str.center(47, fillchar=' ')#中间补齐

----------
0 RAVENA COEYMANS SELKIRK CENTRAL SCHOOL DISTRICT
1 RAVENA COEYMANS SELKIRK CENTRAL SCHOOL DISTRICT
2 RAVENA COEYMANS SELKIRK CENTRAL SCHOOL DISTRICT
3 COHOES CITY SCHOOL DISTRICT
4 COHOES CITY SCHOOL DISTRICT
Name: AREA NAME, dtype: object

str.ljust()

>>>df['AREA NAME'].str.ljust(47, fillchar='?')#右边补齐

----------
0 RAVENA COEYMANS SELKIRK CENTRAL SCHOOL DISTRICT
1 RAVENA COEYMANS SELKIRK CENTRAL SCHOOL DISTRICT
2 RAVENA COEYMANS SELKIRK CENTRAL SCHOOL DISTRICT
3 COHOES CITY SCHOOL DISTRICT????????????????????
4 COHOES CITY SCHOOL DISTRICT????????????????????
Name: AREA NAME, dtype: object

str.rjust()

>>>df['AREA NAME'].str.rjust(47, fillchar='?')#左边补齐

----------
0 RAVENA COEYMANS SELKIRK CENTRAL SCHOOL DISTRICT
1 RAVENA COEYMANS SELKIRK CENTRAL SCHOOL DISTRICT
2 RAVENA COEYMANS SELKIRK CENTRAL SCHOOL DISTRICT
3 ????????????????????COHOES CITY SCHOOL DISTRICT
4 ????????????????????COHOES CITY SCHOOL DISTRICT
Name: AREA NAME, dtype: object

str.zfill()

>>>df['AREA NAME'].str.zfill(47)#左边补0

----------
0 RAVENA COEYMANS SELKIRK CENTRAL SCHOOL DISTRICT
1 RAVENA COEYMANS SELKIRK CENTRAL SCHOOL DISTRICT
2 RAVENA COEYMANS SELKIRK CENTRAL SCHOOL DISTRICT
3 00000000000000000000COHOES CITY SCHOOL DISTRICT
4 00000000000000000000COHOES CITY SCHOOL DISTRICT
Name: AREA NAME, dtype: object

str.slice()

>>>df['AREA NAME'].str.slice(8,23)#按给定的开始结束位置切割字符串

----------
0 OEYMANS SELKIRK
1 OEYMANS SELKIRK
2 OEYMANS SELKIRK
3 ITY SCHOOL DIST
4 ITY SCHOOL DIST
Name: AREA NAME, dtype: object

str.slice_replace()

>>>df['AREA NAME'].str.slice_replace(8, 23, '??')#使用给定的字符串,替换指定位置的字符

----------
0 RAVENA C?? CENTRAL SCHOOL DISTRICT
1 RAVENA C?? CENTRAL SCHOOL DISTRICT
2 RAVENA C?? CENTRAL SCHOOL DISTRICT
3 COHOES C??RICT
4 COHOES C??RICT
Name: AREA NAME, dtype: object

str.count()

>>>df['AREA NAME'].str.count('A')#计算给定字符(正则表达式)在每个字符串中出现的次数

----------
0 4
1 4
2 4
3 0
4 0
Name: AREA NAME, dtype: int64

str.startswith()

>>>df['AREA NAME'].str.startswith('R')#判断是否以给定的字符串开头

----------
0 True
1 True
2 True
3 False
4 False
Name: AREA NAME, dtype: bool

str.endswith()

>>>df['AREA NAME'].str.endswith('T')#判断是否以给定的字符串结束

----------
0 True
1 True
2 True
3 True
4 True
Name: AREA NAME, dtype: bool

str.findall()

>>>df['AREA NAME'].str.findall('[A-D]')#查找所有符合正则表达式的字符,以数组形式返回

----------
0 [A, A, C, A, C, A, C, D, C]
1 [A, A, C, A, C, A, C, D, C]
2 [A, A, C, A, C, A, C, D, C]
3 [C, C, C, D, C]
4 [C, C, C, D, C]
Name: AREA NAME, dtype: object

str.match()

>>>df['AREA NAME'].str.match('[A-D]')#检测字符串的开头是否匹配给定的正则表达式(如需整个字符串完全匹配,请使用str.fullmatch)

----------
0 False
1 False
2 False
3 True
4 True
Name: AREA NAME, dtype: bool

str.isalnum()

>>>df['AREA NAME'].str.isalnum()#是否全部是数字和字母组成

----------
0 False
1 False
2 False
3 False
4 False
Name: AREA NAME, dtype: bool

str.swapcase()

>>>df['AREA NAME'].str.swapcase()#大小写互换

----------
0 ravena coeymans selkirk central school district
1 ravena coeymans selkirk central school district
2 ravena coeymans selkirk central school district
3 cohoes city school district
4 cohoes city school district
Name: AREA NAME, dtype: object