Pandas(二) —— Dataframe数据结构
Dataframe数据结构
Dataframe的基本概念
import numpy as np
import pandas as pd
#Dataframe的基本概念
'''
是一个表格型的数据结构,包含一组有序的列,其列的值类型可以是数值、字符串、布尔值等。
Dataframe中的数据以一个或多个二维块存放,不是列表、字典或一维数组结构。
'''
# A DataFrame carries both an index (row labels) and columns (column labels).
data = {
    'name': ['jack', 'mike', 'gpp'],
    'age': [12, 34, 56],
    'hight': [123, 145, 167],
}
frame = pd.DataFrame(data)

# Printing the frame itself shows the whole table.
print(frame)

# Show, in turn, each of the three building blocks followed by its type:
#   .index   -> the row labels
#   .columns -> the column labels
#   .values  -> the underlying data as a NumPy ndarray
for part in (frame.index, frame.columns, frame.values):
    print(part, '\n该数据类型为:', type(part))
age hight name
0 12 123 jack
1 34 145 mike
2 56 167 gpp
RangeIndex(start=0, stop=3, step=1)
该数据类型为: <class 'pandas.core.indexes.range.RangeIndex'>
Index(['age', 'hight', 'name'], dtype='object')
该数据类型为: <class 'pandas.core.indexes.base.Index'>
[[12 123 'jack']
[34 145 'mike']
[56 167 'gpp']]
该数据类型为: <class 'numpy.ndarray'>
DataFrame的创建
#dataframe的创建方法一:
'''
由数组/list 组成的字典
创建方法:pandas.DataFrame()
'''
data1 = {
    'a': [1, 2, 3],
    'b': [3, 4, 5],
    'c': [5, 6, 7],
}
# The generated arrays must all have the same length.
data2 = {
    'one': np.random.rand(3),
    'two': np.random.rand(3),
}
for raw in (data1, data2):
    print(raw)
print('--------------')

# Building a DataFrame from a dict of arrays/lists: the dict keys become the
# columns; the index is the default integer range unless index= is given.
# Every value in the dict must have the same length.
df1 = pd.DataFrame(data1, index=['y', 'z', 'x'])
df2 = pd.DataFrame(data2)
for table in (df1, df2):
    print(table)
print('--------------')

# columns= takes a list that re-orders the columns. A name that is not in the
# data (here 'd') yields a column of NaN, and the list may also name fewer
# columns than the data holds.
for wanted in (['b', 'c', 'a', 'd'], ['b', 'c']):
    df1 = pd.DataFrame(data1, columns=wanted)
    print(df1)

# index= takes a list of row labels whose length must match the data
# (three rows here, so a four-label list such as [..., 'f4'] does not fit).
df2 = pd.DataFrame(data2, index=['f1', 'f2', 'f3'])
print(df2)
{'a': [1, 2, 3], 'b': [3, 4, 5], 'c': [5, 6, 7]}
{'one': array([ 0.57370145, 0.68641906, 0.09765431]), 'two': array([ 0.03788015, 0.10822633, 0.33414233])}
--------------
a b c
y 1 3 5
z 2 4 6
x 3 5 7
one two
0 0.573701 0.037880
1 0.686419 0.108226
2 0.097654 0.334142
--------------
b c a d
0 3 5 1 NaN
1 4 6 2 NaN
2 5 7 3 NaN
b c
0 3 5
1 4 6
2 5 7
one two
f1 0.573701 0.037880
f2 0.686419 0.108226
f3 0.097654 0.334142
# dataframe创建方法二:
'''
由series组成的字典
'''
# Series without an explicit index (they get the default integer labels).
data1 = {
    'one': pd.Series(np.random.rand(2)),
    'two': pd.Series(np.random.rand(3)),
}
# Series with an explicit label index.
data2 = {
    'one': pd.Series(np.random.rand(2), index=['a', 'b']),
    'two': pd.Series(np.random.rand(3), index=['a', 'b', 'c']),
}
for raw in (data1, data2):
    print(raw)
print('-------------')

# Building a DataFrame from a dict of Series: the dict keys become the columns
# and the Series labels become the index (default integers when the Series has
# no explicit labels).
# The Series may differ in length; the gaps show up as NaN in the result.
df1 = pd.DataFrame(data1)
df2 = pd.DataFrame(data2)
for table in (df1, df2):
    print(table)
{'one': 0 0.708273
1 0.707102
dtype: float64, 'two': 0 0.038779
1 0.190305
2 0.166894
dtype: float64}
{'one': a 0.453956
b 0.169642
dtype: float64, 'two': a 0.547766
b 0.525751
c 0.422529
dtype: float64}
-------------
one two
0 0.708273 0.038779
1 0.707102 0.190305
2 NaN 0.166894
one two
a 0.453956 0.547766
b 0.169642 0.525751
c NaN 0.422529
# Dataframe 创建方法三:
'''
通过二维数组直接创建
'''
# Start from a 3x3 two-dimensional array.
ar = np.random.rand(9).reshape(3, 3)
print(ar)

# Building a DataFrame straight from a 2-D array gives a table of the same
# shape. Without index=/columns= both axes get the default integer labels.
df1 = pd.DataFrame(ar)

# When index= and columns= are supplied, their lengths must agree with the
# array's shape (three rows, three columns here).
row_labels = ['a', 'b', 'c']
col_labels = ['one', 'two', 'three']
df2 = pd.DataFrame(ar, index=row_labels, columns=col_labels)

for table in (df1, df2):
    print(table)
[[ 0.7863483 0.72837569 0.95403682]
[ 0.47387473 0.66190233 0.99712499]
[ 0.02783402 0.70966132 0.54108408]]
0 1 2
0 0.786348 0.728376 0.954037
1 0.473875 0.661902 0.997125
2 0.027834 0.709661 0.541084
one two three
a 0.786348 0.728376 0.954037
b 0.473875 0.661902 0.997125
c 0.027834 0.709661 0.541084
# DataFrame creation, method 4: from a list of dicts.
data = [
    {'one': 1, 'two': 2},
    {'one': 5, 'two': 10, 'three': 20},
]
print(data)

# Each dict is one row and the dict keys become the columns; without index=
# the rows get the default integer labels.
df1 = pd.DataFrame(data)
# index= supplies the row labels ...
df2 = pd.DataFrame(data, index=['a', 'b'])
# ... and columns= picks which columns to keep.
df3 = pd.DataFrame(data, columns=['one', 'two'])

for table in (df1, df2, df3):
    print(table)
[{'one': 1, 'two': 2}, {'one': 5, 'two': 10, 'three': 20}]
one three two
0 1 NaN 2
1 5 20.0 10
one three two
a 1 NaN 2
b 5 20.0 10
one two
0 1 2
1 5 10
# Dataframe 创建方法五:
'''
由字典组成的字典
'''
data = {
    'Jack': {'math': 90, 'english': 89, 'art': 78},
    'Marry': {'math': 82, 'english': 95, 'art': 92},
    'Tom': {'math': 78, 'english': 67},
}

# Building a DataFrame from a dict of dicts: the outer keys become the columns
# and the inner keys become the index.
df1 = pd.DataFrame(data)
print(df1)

# columns= can both drop existing columns and add new ones; a new column
# (here 'Bob') is filled with NaN.
df2 = pd.DataFrame(data, columns=['Jack', 'Tom', 'Bob'])

# Important: unlike the earlier creation methods, index= does NOT rename the
# existing rows here. Labels that are not inner-dict keys produce all-NaN rows.
df3 = pd.DataFrame(data, index=['a', 'b', 'c'])

for table in (df2, df3):
    print(table)
Jack Marry Tom
art 78 92 NaN
english 89 95 67.0
math 90 82 78.0
Jack Tom Bob
art 78 NaN NaN
english 89 67.0 NaN
math 90 78.0 NaN
Jack Marry Tom
a NaN NaN NaN
b NaN NaN NaN
c NaN NaN NaN
DataFrame的索引
选择列
# Selecting columns
cell_values = np.random.rand(12).reshape(3, 4) * 100
df = pd.DataFrame(
    cell_values,
    index=['one', 'two', 'three'],
    columns=['a', 'b', 'c', 'd'],
)
print(df)
print('-------------')

# df[...] selects columns by name by default. (That is why columns are usually
# given explicit names instead of the default integers: it avoids confusion
# with the index.)
# One name gives a Series; a list of names gives a DataFrame.
column_a = df['a']
print(column_a, type(column_a))
print(df[['a', 'c']])

# With a numeric slice, df[...] selects rows instead. Only slices work this
# way, a single position such as df[0] does not.
# The result is a DataFrame even when the slice covers a single row.
# df[...] cannot pick a row by its index label either (df['one']).
data3 = df[:1]
print(data3, type(data3))
a b c d
one 76.686850 33.084433 92.247401 3.232769
two 69.210400 19.430214 72.479291 24.497907
three 33.574062 34.241901 13.176080 45.500454
-------------
one 76.686850
two 69.210400
three 33.574062
Name: a, dtype: float64 <class 'pandas.core.series.Series'>
a c
one 76.686850 92.247401
two 69.210400 72.479291
three 33.574062 13.176080
a b c d
one 76.68685 33.084433 92.247401 3.232769 <class 'pandas.core.frame.DataFrame'>
选择行
按照标签索引
# df.loc[] selects rows by index label.
df1 = pd.DataFrame(np.random.rand(16).reshape(4,4)*100,
                   index = ['one','two','three','four'],
                   columns = ['a','b','c','d'])
df2 = pd.DataFrame(np.random.rand(16).reshape(4,4)*100,
                   columns = ['a','b','c','d'])
print(df1)
print(df2)
print('-----')

# A single label returns that row as a Series.
data1 = df1.loc['one']
data2 = df2.loc[1]
print(data1)
print(data2)
print('单标签索引\n-----')

# A list of labels returns a DataFrame whose rows follow the order of the list.
# 'five' is not in df1's index, so .reindex() is used for that lookup: the
# missing row comes back filled with NaN.
data3 = df1.reindex(['two','three','five'])
data4 = df2.loc[[3,2,1]]
print(data3)
print(data4)
print('多标签索引\n-----')

# Label slices are allowed, and the end label is included.
data5 = df1.loc['one':'three']
data6 = df2.loc[1:3]
# This line used to read `print(data` (unclosed call, wrong name), which made
# the whole snippet a SyntaxError.
print(data5)
print(data6)
print('切片索引')

# Key takeaway: df.loc[label] selects rows through the index, and works both
# with explicitly assigned labels and with the default integer index.
a b c d
one 0.704809 50.882799 31.748359 4.008618
two 15.502812 65.400805 59.699475 5.715281
three 17.719915 1.108207 8.478434 53.255034
four 55.125978 58.878202 58.578265 78.005093
a b c d
0 94.323711 31.623252 79.545395 65.614674
1 18.653849 68.460337 46.089197 36.713253
2 63.245660 68.602988 78.669926 85.773112
3 11.840830 53.954041 18.657871 56.381644
-----
a 0.704809
b 50.882799
c 31.748359
d 4.008618
Name: one, dtype: float64
a 18.653849
b 68.460337
c 46.089197
d 36.713253
Name: 1, dtype: float64
单标签索引
-----
a b c d
two 15.502812 65.400805 59.699475 5.715281
three 17.719915 1.108207 8.478434 53.255034
five NaN NaN NaN NaN
a b c d
3 11.840830 53.954041 18.657871 56.381644
2 63.245660 68.602988 78.669926 85.773112
1 18.653849 68.460337 46.089197 36.713253
多标签索引
-----
a b c d
one 0.704809 50.882799 31.748359 4.008618
two 15.502812 65.400805 59.699475 5.715281
three 17.719915 1.108207 8.478434 53.255034
a b c d
1 18.653849 68.460337 46.089197 36.713253
2 63.245660 68.602988 78.669926 85.773112
3 11.840830 53.954041 18.657871 56.381644
切片索引
按照位置索引(类似list)
# df.iloc[] selects rows by integer position (0 .. length-1 along the axis).
# It behaves like list indexing: positions follow the row order of the frame
# and counting starts at 0.
cell_values = np.random.rand(16).reshape(4, 4) * 100
df = pd.DataFrame(
    cell_values,
    index=['one', 'two', 'three', 'four'],
    columns=['a', 'b', 'c', 'd'],
)
print(df)
print('------')

# Single position: first row, then last row (negative positions count from
# the end).
first_row = df.iloc[0]
last_row = df.iloc[-1]
print(first_row)
print(last_row)
# Unlike label lookups, a position beyond the number of rows cannot be
# indexed, so df.iloc[4] is not attempted on this 4-row frame.
print('单位置索引\n-----')

# Several positions at once; the result follows the order given.
for positions in ([0, 2], [3, 2, 1]):
    print(df.iloc[positions])
print('多位置索引\n-----')

# Position slices: the end position is NOT included.
for window in (slice(1, 3), slice(None, None, 2)):
    print(df.iloc[window])
print('切片索引')
a b c d
one 51.040031 29.879548 31.532430 97.647893
two 1.090956 21.844713 93.315747 7.065472
three 86.012962 3.038697 38.600913 29.515306
four 11.752251 40.550808 97.113613 4.374101
------
a 51.040031
b 29.879548
c 31.532430
d 97.647893
Name: one, dtype: float64
a 11.752251
b 40.550808
c 97.113613
d 4.374101
Name: four, dtype: float64
单位置索引
-----
a b c d
one 51.040031 29.879548 31.532430 97.647893
three 86.012962 3.038697 38.600913 29.515306
a b c d
four 11.752251 40.550808 97.113613 4.374101
three 86.012962 3.038697 38.600913 29.515306
two 1.090956 21.844713 93.315747 7.065472
多位置索引
-----
a b c d
two 1.090956 21.844713 93.315747 7.065472
three 86.012962 3.038697 38.600913 29.515306
a b c d
one 51.040031 29.879548 31.532430 97.647893
three 86.012962 3.038697 38.600913 29.515306
切片索引
布尔型索引
# Boolean indexing
# Works on the same principle as for a Series.
cell_values = np.random.rand(16).reshape(4, 4) * 100
df = pd.DataFrame(
    cell_values,
    index=['one', 'two', 'three', 'four'],
    columns=['a', 'b', 'c', 'd'],
)
print(df)
print('------')

# Whole-frame test: every cell is checked. Indexing with it keeps the full
# shape: True cells keep their value, False cells become NaN.
# (Can also be written inline as df[df < 20].)
b1 = df < 20

# Single-column test: gives a boolean Series. Indexing with it keeps the rows
# where the test is True, with all of their columns.
# (Inline form: df[df['a'] > 50].)
b2 = df['a'] > 50

# Multi-column test: gives a boolean DataFrame over the tested columns only.
# Indexing keeps the full shape; True cells keep their value, everything else
# becomes NaN. (Inline form: df[df[['a','b']] > 50].)
b3 = df[['a', 'b']] > 50

# Multi-row test: same idea, restricted to the chosen rows. Indexing keeps the
# full shape; True cells keep their value, everything else becomes NaN.
# (Inline form: df[df.loc[['one','three']] < 50].)
b4 = df.loc[['one', 'three']] < 50

# For each mask: show the mask with its type, the filtered frame, then the
# section separator.
mask_sections = (
    (b1, '------'),
    (b2, '单列判断------'),
    (b3, '多列判断------'),
    (b4, '------'),
)
for mask, footer in mask_sections:
    print(mask, type(mask))
    print(df[mask])
    print(footer)
a b c d
one 74.965896 10.849874 23.036455 73.213216
two 57.286029 44.344496 69.367778 58.836892
three 67.542466 91.403359 32.930365 23.509080
four 44.582918 62.278704 12.818474 77.478616
------
a b c d
one False True False False
two False False False False
three False False False False
four False False True False <class 'pandas.core.frame.DataFrame'>
a b c d
one NaN 10.849874 NaN NaN
two NaN NaN NaN NaN
three NaN NaN NaN NaN
four NaN NaN 12.818474 NaN
------
one True
two True
three True
four False
Name: a, dtype: bool <class 'pandas.core.series.Series'>
a b c d
one 74.965896 10.849874 23.036455 73.213216
two 57.286029 44.344496 69.367778 58.836892
three 67.542466 91.403359 32.930365 23.509080
单列判断------
a b
one True False
two True False
three True True
four False True <class 'pandas.core.frame.DataFrame'>
a b c d
one 74.965896 NaN NaN NaN
two 57.286029 NaN NaN NaN
three 67.542466 91.403359 NaN NaN
four NaN 62.278704 NaN NaN
多列判断------
a b c d
one False True True False
three False False True True <class 'pandas.core.frame.DataFrame'>
a b c d
one NaN 10.849874 23.036455 NaN
two NaN NaN NaN NaN
three NaN NaN 32.930365 23.50908
four NaN NaN NaN NaN
------
多重索引
# Chained selection: picking rows and columns in the same expression.
# Columns first, then rows, i.e. narrow down the fields of a data set first
# and then choose how many records to take.
cell_values = np.random.rand(16).reshape(4, 4) * 100
df = pd.DataFrame(
    cell_values,
    index=['one', 'two', 'three', 'four'],
    columns=['a', 'b', 'c', 'd'],
)
print(df)
print('------')

# Column 'a', rows 'one' and 'three'.
column_a = df['a']
print(column_a.loc[['one', 'three']])

# Columns b, c, d, every second row (rows 'one' and 'three').
bcd_columns = df[['b', 'c', 'd']]
print(bcd_columns.iloc[::2])
print('-----------')

# Rows where column 'a' is below 50, then the first two of those rows.
rows_a_below_50 = df[df['a'] < 50]
print(rows_a_below_50.iloc[:2])

# Cell-wise test over the whole frame (cells failing it become NaN), then
# only columns a and b of the result.
cells_below_50 = df[df < 50]
print(cells_below_50[['a', 'b']])
a b c d
one 63.114540 38.567125 88.018529 71.156573
two 39.640530 2.518002 86.175475 59.508718
three 96.924110 21.059748 39.764780 57.627406
four 12.960049 26.735557 26.079749 27.259359
------
one 63.11454
three 96.92411
Name: a, dtype: float64
b c d
one 38.567125 88.018529 71.156573
three 21.059748 39.764780 57.627406
-----------
a b c d
two 39.640530 2.518002 86.175475 59.508718
four 12.960049 26.735557 26.079749 27.259359
a b
one NaN 38.567125
two 39.640530 2.518002
three NaN 21.059748
four 12.960049 26.735557
下篇会讲到DataFrame的基本操作用法