02. Pandas 1|数据结构Series、Dataframe

1."一维数组"Series

Pandas数据结构Series：基本概念及创建

s.index 、 s.values

# Series 数据结构

# Series 是带有标签的一维数组，可以保存任何数据类型（整数，字符串，浮点数，Python对象等）,轴标签统称为索引

import numpy as np
import pandas as pd
>>> s = pd.Series(np.random.rand(5))

>>> print(s,type(s))

0    0.610318

1    0.235660

2    0.606445

3    0.070794

4    0.217530

dtype: float64 <class 'pandas.core.series.Series'>

>>> print(s.index,type(s.index))

RangeIndex(start=0, stop=5, step=1) <class 'pandas.core.indexes.range.RangeIndex'>

>>> print(s.values, type(s.values))

[0.61031815 0.23566007 0.60644485 0.0707941  0.21753049] <class 'numpy.ndarray'>

>>>

# .index查看series索引，类型为rangeindex

# .values查看series值，类型是ndarray

# 核心：series相比于ndarray，是一个自带索引index的数组 → 一维数组 + 对应索引

# 所以当只看series的值的时候，就是一个ndarray

# series和ndarray较相似，索引切片功能差别不大

# series和dict相比，series更像一个有顺序的字典（Python 3.7之前dict不保证顺序），其索引原理与字典相似（一个用key，一个用index）

1.1 Series 创建方法

由字典创建，字典的key就是index，values就是values

#Series 创建方法一：由字典创建，字典的key就是index，values就是values

>>> dic = {'a':1,'b':2,'c':3,'4':4,'5':5}

>>> s = pd.Series(dic)

>>> print(s)

a    1

b    2

c    3

4    4

5    5

dtype: int64

# 注意：key肯定是字符串，假如values类型不止一个会怎么样？ → dic = {'a':1 ,'b':'hello' , 'c':3, '4':4, '5':5}

>>> dic = {'a':1 ,'b':'hello' , 'c':3, '4':4, '5':5}

>>> s = pd.Series(dic)

>>> print(s)

a        1

b    hello

c        3

4        4

5        5

dtype: object

>>>

# Series 创建方法二：由数组创建(一维数组)

>>> arr = np.random.randn(5)

>>> s = pd.Series(arr) # 默认index是从0开始，步长为1的数字

>>> print(arr)

[1.08349965 0.52441811 0.76972371 0.35454797 0.39607907]

>>> print(s)

0    1.083500

1    0.524418

2    0.769724

3    0.354548

4    0.396079

dtype: float64

>>>

>>> s = pd.Series(arr,index = ['a','b','c','d','e'],dtype = object)  # np.object 已在 NumPy 1.24 中移除，直接使用内置 object

>>> print(s)

a    1.083500

b    0.524418

c    0.769724

d    0.354548

e    0.396079

dtype: object

# index参数：设置index，长度保持一致

# dtype参数：设置数值类型

# Series 创建方法三：由标量创建

>>> s = pd.Series(10,index = range(4))

>>> print(s)

0    10

1    10

2    10

3    10

dtype: int64

# 如果data是标量值，则必须提供索引。该值会重复，来匹配索引的长度

# Series 名称属性：name

>>> s1 = pd.Series(np.random.randn(5))

>>> print(s1)

0   -0.441627

1   -0.082186

2    0.379461

3    0.163183

4    0.851316

dtype: float64

>>> s2 = pd.Series(np.random.randn(5),name='test')

>>> print(s2)

0   -0.951756

1    0.039272

2    0.618596

3   -0.027975

4    0.409068

Name: test, dtype: float64

>>> print(s1.name,s2.name,type(s2.name))

None test <class 'str'>

# name为Series的一个参数，创建一个数组的 名称

# .name属性：输出数组的名称，输出格式为str，如果没有定义名称，输出为None

>>> s3 = s2.rename('hahaha')

>>> print(s3)

0   -0.951756

1    0.039272

2    0.618596

3   -0.027975

4    0.409068

Name: hahaha, dtype: float64

>>> print(s3.name,s2.name)

hahaha test

>>>

# .rename()重命名一个数组的名称，并且新指向一个数组，原数组不变

1.2 Series：索引

位置下标索引： s[0] 、s[-1]不存在会报错哦、s[1:4]左闭右开；

标签索引： s['b'] 、s[ ['a', 'b', 'c' ] ] 、s['a':'c']末端包含哦；

布尔索引： s.isnull () s.notnull() s[s>50] s[ s.notnull() ]

# 位置下标，类似序列

>>> s = pd.Series(np.random.rand(5))

>>> print(s)

0    0.233801

1    0.828125

2    0.184925

3    0.297279

4    0.346561

dtype: float64

>>> print(s[0],type(s[0]),s[0].dtype)

0.23380091830372507 <class 'numpy.float64'> float64

>>> print(float(s[0]),type(float(s[0])))

0.23380091830372507 <class 'float'>

#print(s[-1])

# 位置下标从0开始

# 输出结果为numpy.float格式，

# 可以通过float()函数转换为python float格式

# numpy.float与float占用字节不同

# s[-1]结果如何？ 会报错

# 标签索引

>>> s = pd.Series(np.random.rand(5),index=['a','b','c','d','e'])

>>> print(s)

a    0.685577

b    0.998041

c    0.451358

d    0.832554

e    0.090653

dtype: float64

>>> print(s['a'],type(s['a']),s['a'].dtype)

0.6855772922411842 <class 'numpy.float64'> float64

# 方法类似下标索引，用[]表示，内写上index，注意index是字符串

>>> sci = s[['a','b','e']]

>>> print(sci,type(sci))

a    0.685577

b    0.998041

e    0.090653

dtype: float64 <class 'pandas.core.series.Series'>

>>>

# 如果需要选择多个标签的值，用[[]]来表示（相当于[]中包含一个列表）

# 多标签索引结果是新的数组

# 切片索引

>>> s1 = pd.Series(np.random.rand(5))

>>> s2 = pd.Series(np.random.rand(5),index=['a','b','c','d','e'])

>>> print(s1,'\n',s2)

0    0.917653

1    0.763179

2    0.837807

3    0.344435

4    0.360922

dtype: float64

a    0.126537

b    0.699155

c    0.289233

d    0.831209

e    0.273572

dtype: float64

>>>

>>> print(s1[1:4],s1[4]) #左闭右开
1    0.763179

2    0.837807

3    0.344435

dtype: float64 0.36092197040034457

>>> print(s2['a':'c'],s2['c'])   #用index做切片末端是包含的 

a    0.126537

b    0.699155

c    0.289233

dtype: float64 0.28923306798234194

>>> print(s2[0:3],s2[3])

a    0.126537

b    0.699155

c    0.289233

dtype: float64 0.8312088483742163
# 注意：用index做切片是末端包含

>>> print(s2[:-1])

a    0.126537

b    0.699155

c    0.289233

d    0.831209   ##不包含末端的e

dtype: float64

>>> print(s2[::2])

a    0.126537

c    0.289233

e    0.273572

dtype: float64
# 下标索引做切片，和list写法一样

# 布尔型索引

>>> s = pd.Series(np.random.rand(3)*100)

>>> s[4] = None

>>> print(s)

0    19.9515

1    59.9133

2    97.9854

4       None

dtype: object

>>> bs1 = s > 50

>>> bs2 = s.isnull()
>>> bs3 = s.notnull()
>>> print(bs1, type(bs1),bs1.dtype)

0    False

1     True

2     True

4    False

dtype: bool <class 'pandas.core.series.Series'> bool

>>> print(bs2, type(bs2),bs2.dtype)

0    False

1    False

2    False

4     True

dtype: bool <class 'pandas.core.series.Series'> bool

>>> print(bs3, type(bs3),bs3.dtype)

0     True

1     True

2     True

4    False

dtype: bool <class 'pandas.core.series.Series'> bool

>>>

# 数组做判断之后，返回的是一个由布尔值组成的新的数组

# .isnull() / .notnull() 判断是否为空值 (None代表空值，NaN代表有问题的数值，两个都会识别为空值)

>>> print(s[s > 50])

1    59.9133

2    97.9854

dtype: object

>>> print(s[bs3])

0    19.9515

1    59.9133

2    97.9854

dtype: object

>>>

# 布尔型索引方法：用[判断条件]表示，其中判断条件可以是 一个语句，或者是 一个布尔型数组！

1.3 Series：基本技巧

数据查看（.head() .tail() ） / 重新索引就是对index做重新排序（reindex(列表)） / 对齐（ s1+s2 ）/ 添加（s1.append(s2)）、修改s['a']=10 、删除值s.drop('a')

# 数据查看

>>> s = pd.Series(np.random.rand(50))

>>> print(s.head(10))

0    0.282475

1    0.012153

2    0.642487

3    0.906513

4    0.195709

5    0.828506

6    0.194632

7    0.197138

8    0.503566

9    0.897846

dtype: float64

>>> print(s.tail())

45    0.963916

46    0.642688

47    0.865840

48    0.835746

49    0.905786

dtype: float64

# .head()查看头部数据

# .tail()查看尾部数据

# 默认查看5条

# 重新索引reindex

# .reindex将会根据索引重新排序，如果当前索引不存在，则引入缺失值

>>> s = pd.Series(np.random.rand(3),index=['a','b','c'])

>>> print(s)

a    0.239126

b    0.862137

c    0.501479

dtype: float64

>>> s1 = s.reindex(['c','b','a','d'])

>>> print(s1)

c    0.501479

b    0.862137

a    0.239126

d         NaN

dtype: float64

# .reindex()中也是写列表

# 这里'd'索引不存在，所以值为NaN

>>> s2 = s.reindex(['c','b','a','d'],fill_value=0)  # fill_value参数：填充缺失值的值

>>> print(s2)

c    0.501479

b    0.862137

a    0.239126

d    0.000000

dtype: float64

# Series对齐

>>> s1 = pd.Series(np.random.rand(3),index=['Jack','Marry','Kris'])

>>> s2 = pd.Series(np.random.rand(3),index=['Wang','Jack','Marry'])

>>> print(s1)

Jack     0.583406

Marry    0.603579

Kris     0.812511

dtype: float64

>>> print(s2)

Wang     0.582852

Jack     0.975184

Marry    0.990203

dtype: float64

>>> print(s1+s2)

Jack     1.558589

Kris          NaN

Marry    1.593783

Wang          NaN

dtype: float64

# Series 和 ndarray 之间的主要区别是，Series 上的操作会根据标签自动对齐

# index顺序不会影响数值计算，以标签来计算

# 空值和任何值计算结果仍为空值

# 删除：.drop

>>> s = pd.Series(np.random.rand(5),index=list('ngjur'))

>>> print(s)

n    0.239752

g    0.643085

j    0.313229

u    0.231923

r    0.836070

dtype: float64

>>> s1 = s.drop('n')

>>> print(s1)

g    0.643085

j    0.313229

u    0.231923

r    0.836070

dtype: float64

>>> s2 = s.drop(['g','j'])

>>> print(s2)

n    0.239752

u    0.231923

r    0.836070

dtype: float64

>>> print(s)

n    0.239752

g    0.643085

j    0.313229

u    0.231923

r    0.836070

dtype: float64

# drop 删除元素之后返回副本(inplace=False)

# 添加

>>> s1 = pd.Series(np.random.rand(5))

>>> s2 = pd.Series(np.random.rand(5),index=list('ngjur'))

>>> print(s1,'\n',s2)

0    0.417249

1    0.226655

2    0.798018

3    0.984398

4    0.304693

dtype: float64

n    0.354443

g    0.609306

j    0.103994

u    0.392755

r    0.302959

dtype: float64

>>> s1[5] = 100

>>> s2['a'] = 100

>>> print(s1,'\n',s2)

0      0.417249

1      0.226655

2      0.798018

3      0.984398

4      0.304693

5    100.000000

dtype: float64

n      0.354443

g      0.609306

j      0.103994

u      0.392755

r      0.302959

a    100.000000

dtype: float64

# 直接通过下标索引/标签index添加值  

>>> s3 = s1.append(s2)

>>> print(s3,'\n',s1)

0      0.417249

1      0.226655

2      0.798018

3      0.984398

4      0.304693

5    100.000000

n      0.354443

g      0.609306

j      0.103994

u      0.392755

r      0.302959

a    100.000000

dtype: float64

 0      0.417249

1      0.226655

2      0.798018

3      0.984398

4      0.304693

5    100.000000

dtype: float64

# 通过.append方法，直接添加一个数组

# .append方法生成一个新的数组，不改变之前的数组（注：Series.append 已在 pandas 2.0 中移除，新版本请使用 pd.concat([s1, s2])）

# 修改

>>> s = pd.Series(np.random.rand(3),index=['a','b','c'])

>>> print(s)

a    0.246992

b    0.349735

c    0.395859

dtype: float64

>>> s['a'] = 100

>>> s[['b','c']] = 200

>>> print(s)

a    100.0

b    200.0

c    200.0

dtype: float64

>>>

# 通过索引直接修改，类似序列

2. Pandas数据结构Dataframe

2.1 基本概念及创建

"二维数组"Dataframe：是一个表格型的数据结构，包含一组有序的列，其列的值类型可以是数值、字符串、布尔值等。

Dataframe中的数据以一个或多个二维块存放，不是列表、字典或一维数组结构。

# Dataframe 数据结构

# Dataframe是一个表格型的数据结构，“带有标签的二维数组”。

# Dataframe带有index（行标签）和columns（列标签）

>>> data = {'name':['Jack','Tom','Marry'],

... 'age':[18,19,20],

... 'gender':['m','m','w']}

>>> frame = pd.DataFrame(data)

>>> print(frame)

    name  age gender

0   Jack   18      m

1    Tom   19      m

2  Marry   20      w

>>> print(type(frame))

<class 'pandas.core.frame.DataFrame'>

>>> print(frame.index,'\n该数据类型为:',type(frame.index))

RangeIndex(start=0, stop=3, step=1)

该数据类型为: <class 'pandas.core.indexes.range.RangeIndex'>

>>> print(frame.columns,'\n该数据类型为:',type(frame.columns))

Index(['name', 'age', 'gender'], dtype='object')

该数据类型为: <class 'pandas.core.indexes.base.Index'>

>>> print(frame.values,'\n该数据类型为:',type(frame.values))

[['Jack' 18 'm']

 ['Tom' 19 'm']

 ['Marry' 20 'w']]

该数据类型为: <class 'numpy.ndarray'>

# 查看数据，数据类型为dataframe

# .index查看行标签

# .columns查看列标签

# .values查看值，数据类型为ndarray

# Dataframe 创建方法一：由数组/list组成的字典

# 创建方法:pandas.DataFrame()

>>> data1 = {'a':[1,2,3], 'b':[3,4,5], 'c':[5,6,7]}

>>> data2 = {'one':np.random.rand(3),'two':np.random.rand(3)} ## 这里如果尝试  'two':np.random.rand(4) 会怎么样？转为DataFrame会报错--> {'one': array([0.938673  , 0.90796881, 0.8890414 ]), 'two': array([0.37261493, 0.70430298, 0.24494145, 0.3924875 ])},转为DataFrame 则 ValueError: arrays must all be same length

>>> print(data1,'\n',data2)

{'a': [1, 2, 3], 'b': [3, 4, 5], 'c': [5, 6, 7]}

 {'one': array([0.76701471, 0.01005053, 0.09453216]), 'two': array([0.58442534, 0.14610703, 0.03588291])}

>>> df1 = pd.DataFrame(data1)

>>> df2 = pd.DataFrame(data2)

>>> print(df1,'\n',df2)

   a  b  c

0  1  3  5

1  2  4  6

2  3  5  7

         one       two

0  0.767015  0.573035

1  0.010051  0.892624

2  0.094532  0.228811

>>>

# 由数组/list组成的字典 创建Dataframe，columns为字典key，index为默认数字标签

# 字典的值的长度必须保持一致！

>>> df1 = pd.DataFrame(data1,columns=['b','c','a','d'])

>>> print(df1)

   b  c  a    d

0  3  5  1  NaN

1  4  6  2  NaN

2  5  7  3  NaN

>>> df1 = pd.DataFrame(data1,columns=['b','c'])

>>> print(df1)

   b  c

0  3  5

1  4  6

2  5  7

# columns参数：可以重新指定列的顺序，格式为list，如果现有数据中没有该列（比如'd'），则产生NaN值

# 如果columns重新指定时候，列的数量可以少于原数据

>>> df2 = pd.DataFrame(data2,index=['f1','f2','f3'])  # 这里如果尝试  index = ['f1','f2','f3','f4'] 会怎么样？长度不一致，报错

>>> print(df2)

         one       two

f1  0.767015  0.573035

f2  0.010051  0.892624

f3  0.094532  0.228811

>>>

# index参数：重新定义index，格式为list，长度必须保持一致

# Dataframe 创建方法二：由Series组成的字典

>>> data1 = {'one':pd.Series(np.random.rand(2)),'two':pd.Series(np.random.rand(3))} # 没有设置index的Series

>>> data2 = {'one':pd.Series(np.random.rand(2),index=['a','b']),'two':pd.Series(np.random.rand(3),index=['a','b','c'])} # 设置了index的Series

>>> print(data1,'\n',data2)
{'one': 0    0.682455

1    0.282592

dtype: float64, 'two': 0    0.995054

1    0.781587

2    0.959304

dtype: float64}

 {'one': a    0.940915

b    0.792245

dtype: float64, 'two': a    0.609878

b    0.910182

c    0.245590

dtype: float64}
>>> df1 = pd.DataFrame(data1)

>>> df2 = pd.DataFrame(data2)

>>> print(df1)

        one       two

0  0.682455  0.995054

1  0.282592  0.781587

2       NaN  0.959304

>>> print(df2)

        one       two

a  0.940915  0.609878

b  0.792245  0.910182

c       NaN  0.245590

>>>

# 由Series组成的字典 创建Dataframe，columns为字典key，index为Series的标签（如果Series没有指定标签，则是默认数字标签）

# Series可以长度不一样，生成的Dataframe会出现NaN值

# Dataframe 创建方法三：通过二维数组直接创建               

>>> ar = np.random.rand(9).reshape(3,3)

>>> print(ar)

[[0.43760945 0.3563898  0.16767573]

 [0.26565413 0.61673585 0.54037501]

 [0.95541978 0.05395517 0.02045977]]

>>> df1 = pd.DataFrame(ar)

>>> df2 = pd.DataFrame(ar,index=['a','b','c'],columns=['one','two','three'])
>>> print(df1,'\n',df2)

          0         1         2

0  0.437609  0.356390  0.167676

1  0.265654  0.616736  0.540375

2  0.955420  0.053955  0.020460

         one       two     three

a  0.437609  0.356390  0.167676

b  0.265654  0.616736  0.540375

c  0.955420  0.053955  0.020460

>>>

# 通过二维数组直接创建Dataframe，得到一样形状的结果数据，如果不指定index和columns，两者均返回默认数字格式

# index和columns指定长度与原数组保持一致

# Dataframe 创建方法四：由字典组成的列表  

>>> data = [{'one':1,'two':2},{'one':5,'two':10,'three':20}]

>>> print(data)

[{'one': 1, 'two': 2}, {'one': 5, 'two': 10, 'three': 20}]

>>> df1 = pd.DataFrame(data)

>>> df2 = pd.DataFrame(data,index = ['a','b'])

>>> df3 = pd.DataFrame(data,columns = ['one','two'])

>>> print(df1,'\n',df2,'\n',df3)
   one  three  two

0    1    NaN    2

1    5   20.0   10

    one  three  two

a    1    NaN    2

b    5   20.0   10

    one  two

0    1    2

1    5   10

>>>

# 由字典组成的列表创建Dataframe，columns为字典的key，index不做指定则为默认数字标签

# columns和index参数分别重新指定相应列及行标签

# Dataframe 创建方法五：由字典组成的字典

data = {'Jack':{'math':90,'english':89,'art':78},

       'Marry':{'math':82,'english':95,'art':92},

       'Tom':{'math':78,'english':67}}

df1 = pd.DataFrame(data)

print(df1)

# 由字典组成的字典创建Dataframe，columns为字典的key，index为子字典的key

df2 = pd.DataFrame(data, columns = ['Jack','Tom','Bob'])

df3 = pd.DataFrame(data, index = ['a','b','c'])

print(df2)

print(df3)

# columns参数可以增加和减少现有列，如出现新的列，值为NaN

# index在这里和之前不同，并不能改变原有index，如果指向新的标签，值为NaN （非常重要！）

#在cmd或pycharm里边报错。AttributeError: 'list' object has no attribute 'astype'

         Jack  Marry   Tom

art        78     92   NaN

english    89     95  67.0

math       90     82  78.0

         Jack   Tom  Bob

art        78   NaN  NaN

english    89  67.0  NaN

math       90  78.0  NaN

   Jack  Marry  Tom

a   NaN    NaN  NaN

b   NaN    NaN  NaN

c   NaN    NaN  NaN

2.2 Dataframe：索引

Dataframe既有行索引也有列索引，可以被看做由Series组成的字典（共用一个索引）

选择列 / 选择行 / 切片 / 布尔判断

df [ 'a' ] df [ ['a', 'b'] ] 选择列、 df.loc [ 'one' ] 按index选择行

#选择行df.loc[] 与列 df[ ]  
>>> df = pd.DataFrame(np.random.rand(12).reshape(3,4)*100,index=['one','two','three'],columns=['a','b','c','d'])

>>> print(df)

               a          b          c          d

one    15.715854  86.084608  22.376152  66.760504

two     3.761389  63.610935  85.752549  19.065568

three  77.277233  24.776938  13.159774  46.518796

>>> data1 = df['a']

>>> data2 = df[['a','c']]

>>> print(data1,type(data1))

one      15.715854

two       3.761389

three    77.277233

Name: a, dtype: float64 <class 'pandas.core.series.Series'>

>>> print(data2,type(data2))

               a          c

one    15.715854  22.376152

two     3.761389  85.752549

three  77.277233  13.159774 <class 'pandas.core.frame.DataFrame'>

>>>

# 按照列名选择列，只选择一列输出Series，选择多列输出Dataframe 

>>> data3 = df.loc['one']

>>> data4 = df.loc[['one','two']]

>>> print(data3,type(data3))

a    15.715854

b    86.084608

c    22.376152

d    66.760504

Name: one, dtype: float64 <class 'pandas.core.series.Series'>

>>> print(data4,type(data4))

             a          b          c          d

one  15.715854  86.084608  22.376152  66.760504

two   3.761389  63.610935  85.752549  19.065568 <class 'pandas.core.frame.DataFrame'>

>>>

# 按照index选择行，只选择一行输出Series，选择多行输出Dataframe

2.2.1 df[ ] -- 选择列

#1. df[] - 选择列

# 一般用于选择列，也可以选择行

>>> df = pd.DataFrame(np.random.rand(12).reshape(3,4)*100,index=['one','two','three'],columns=['a','b','c','d'])

>>> print(df)

               a          b          c          d

one    94.536247  33.478780  10.738060  52.679418

two    37.573186  95.915130   8.529743  11.367094

three  80.758763   0.000355  36.136580  95.739389

>>> data1 = df['a']

>>> data2 = df[['b','c']] # 尝试输入 data2 = df[['b','c','e']]会报错

>>> print(data1)

one      94.536247

two      37.573186

three    80.758763

Name: a, dtype: float64

>>> print(data2)

               b          c

one    33.478780  10.738060

two    95.915130   8.529743

three   0.000355  36.136580

>>>

# df[]默认选择列，[]中写列名（所以一般数据columns都会单独指定，不会用默认数字列名，以免和index冲突）

# 单选列为Series，print结果为Series格式

# 多选列为Dataframe，print结果为Dataframe格式

>>> data3 = df[:1]

#data3 = df[0] #这两种都是错误的，0  'one'

#data3 = df['one']

>>> print(data3,type(data3))

             a         b         c          d

one  94.536247  33.47878  10.73806  52.679418 <class 'pandas.core.frame.DataFrame'>

# df[]中为数字时，默认选择行，且只能进行切片的选择，不能单独选择（df[0]）

# 输出结果为Dataframe，即便只选择一行

# df[]不能通过索引标签名来选择行(df['one'])

# 核心笔记：df[col]一般用于选择列，[]中写列名

2.2.2df.loc[ ] - 按index选择行

#2. df.loc[] - 按index选择行                                                                

>>> df1 = pd.DataFrame(np.random.rand(16).reshape(4,4)*100,index=['one','two','three','four'],columns=['a','b','c','d'])

>>> df2 = pd.DataFrame(np.random.rand(16).reshape(4,4)*100,columns=['a','b','c','d'])

>>> print(df1,'\n',df2)

               a          b          c          d

one    36.881890  13.897714   5.237098  24.676327

two    42.183000  27.146129  49.074872  56.447147

three   6.935006  16.742130   5.955048   2.576066

four   49.843982  64.641184  70.038643  75.103787

            a          b          c          d

0  60.589246  60.305811  90.306763  46.761824

1  59.296330   6.039652  52.296003  97.149954

2  58.255476  13.837192  74.255506  84.082167

3  55.204207  17.340171  25.056553  84.518804

#单标签索引，返回Series

>>> data1 = df1.loc['one'] #单标签索引返回Series

>>> data2 = df2.loc[1]

>>> print(data1,'\n',data2)

a    36.881890

b    13.897714

c     5.237098

d    24.676327

Name: one, dtype: float64

 a    59.296330

b     6.039652

c    52.296003

d    97.149954

Name: 1, dtype: float64

>>>

#多标签索引,顺序可变

>>> data3 = df1.loc[['two','three','five']]

>>> data4 = df2.loc[[3,2,1]]

>>> print(data3)

               a          b          c          d

two    42.183000  27.146129  49.074872  56.447147

three   6.935006  16.742130   5.955048   2.576066

five         NaN        NaN        NaN        NaN    #多标签索引，如果标签不存在则返回NaN（注：pandas 1.0 起 .loc 遇到不存在的标签会抛出 KeyError，需改用 df1.reindex([...])）

>>> print(data4)

           a          b          c          d

3  55.204207  17.340171  25.056553  84.518804

2  58.255476  13.837192  74.255506  84.082167

1  59.296330   6.039652  52.296003  97.149954

#切片索引 ，可以做切片对象 

>>> data5 = df1.loc['one':'three']   #末端包含  

>>> data6 = df2.loc[1:3]

>>> print(data5)

               a          b          c          d

one    36.881890  13.897714   5.237098  24.676327

two    42.183000  27.146129  49.074872  56.447147

three   6.935006  16.742130   5.955048   2.576066

>>> print(data6)

           a          b          c          d

1  59.296330   6.039652  52.296003  97.149954

2  58.255476  13.837192  74.255506  84.082167

3  55.204207  17.340171  25.056553  84.518804

# 核心笔记：df.loc[label]主要针对index选择行，同时支持指定index，及默认数字index

2.2.3 df.iloc[ ] - 按整数位置选择行

# df.iloc[] - 按照整数位置（从轴的0到length-1）选择行

# 类似list的索引，其顺序就是dataframe的整数位置，从0开始计

>>> df = pd.DataFrame(np.random.rand(16).reshape(4,4)*100,index=['one','two','three','four'],columns=['a','b','c','d'])

>>> print(df)

               a          b          c          d

one    21.693396  38.203531  85.439983   9.740751

two    28.940287  57.861274  68.467893  60.788056

three  81.871777  57.813973  60.092876   1.637220

four   67.789269  95.648501  62.837383  65.794259


# 单位置索引; 和loc索引不同，不能索引超出数据行数的整数位置

>>> print(df.iloc[0])

a    21.693396

b    38.203531

c    85.439983

d     9.740751

Name: one, dtype: float64

>>> print(df.iloc[-1])

a    67.789269

b    95.648501

c    62.837383

d    65.794259

Name: four, dtype: float64

>>> print(df.iloc[4]) #索引超过行数了

IndexError: single positional indexer is out-of-bounds

# 多位置索引，顺序可变

>>> print(df.iloc[[0,2]])  ##位置从0开始，选取位置0和位置2，即第1行和第3行

               a          b          c         d

one    21.693396  38.203531  85.439983  9.740751

three  81.871777  57.813973  60.092876  1.637220

>>> print(df.iloc[[3,2,1]])

               a          b          c          d

four   67.789269  95.648501  62.837383  65.794259

three  81.871777  57.813973  60.092876   1.637220

two    28.940287  57.861274  68.467893  60.788056

#切片索引    

>>> print(df.iloc[1:3]) #末端不包含    

               a          b          c          d

two    28.940287  57.861274  68.467893  60.788056

three  81.871777  57.813973  60.092876   1.637220

>>> print(df.iloc[::2])

               a          b          c         d

one    21.693396  38.203531  85.439983  9.740751

three  81.871777  57.813973  60.092876  1.637220

>>>

2.2.4布尔型索引

df < 20 df [ df < 20 ] 、单列做判断df [ 'a' ] >20 、多列做判断df [ ['a', 'b'] ] >20 、多行做判断 df.loc[ ['one', 'three'] ] < 50

#布尔型索引

# 和Series原理相同

>>> df = pd.DataFrame(np.random.rand(16).reshape(4,4)*100,index=['one','two','three','four'],columns=['a','b','c','d'])

>>> print(df)

               a          b          c          d

one    17.575951  66.534852  96.774872  94.415801

two    67.485820  11.871447  19.140092   9.634462

three  32.052532   8.891445  63.209949  92.451412

four    6.931403   0.622515  29.972335  24.438536

>>> b1 = df < 20  # 得到布尔型Dataframe；直接做布尔索引可以书写为 df[df < 20]

>>> print(b1,type(b1))

           a      b      c      d

one     True  False  False  False

two    False   True   True   True

three  False   True  False  False

four    True   True  False  False <class 'pandas.core.frame.DataFrame'>

>>> print(df[b1])

               a          b          c         d

one    17.575951        NaN        NaN       NaN

two          NaN  11.871447  19.140092  9.634462

three        NaN   8.891445        NaN       NaN

four    6.931403   0.622515        NaN       NaN

>>>

# 不做索引则会对数据每个值进行判断

# 索引结果保留 所有数据：True返回原数据，False返回值为NaN


>>> b2 = df['a'] > 50

>>> print(b2,type(b2))

one      False
two       True

three    False

four     False

Name: a, dtype: bool <class 'pandas.core.series.Series'>

>>> print(df[b2]) #会把two为True的行保留，包括小于50的数

            a          b          c         d

two  67.48582  11.871447  19.140092  9.634462

# 单列做判断，索引结果保留单列判断为True的行数据，包括其他列


>>> b3 = df[['a','b']] > 50

>>> print(b3,type(b3))

           a      b

one    False   True

two     True  False

three  False  False

four   False  False <class 'pandas.core.frame.DataFrame'>

>>> print(df[b3])

              a          b   c   d

one         NaN  66.534852 NaN NaN

two    67.48582        NaN NaN NaN

three       NaN        NaN NaN NaN

four        NaN        NaN NaN NaN

# 多列做判断，索引结果保留所有数据：True返回原数据，False返回值为NaN

>>>

>>> b4 = df.loc[['one','three']] < 50

>>> print(b4,type(b4))

          a      b      c      d

one    True  False  False  False

three  True   True  False  False <class 'pandas.core.frame.DataFrame'>

>>> print(df[b4])

               a         b   c   d

one    17.575951       NaN NaN NaN

two          NaN       NaN NaN NaN

three  32.052532  8.891445 NaN NaN

four         NaN       NaN NaN NaN

>>>

# 多行做判断，索引结果保留 所有数据：True返回原数据，False返回值为NaN

2.2.5 多重索引：比如同时索引行和列

先选择列再选择行：df[ 'a' ].loc[ ['one', 'three'] ] df [ df [ 'a' ] < 50 ].iloc[ :2 ]

#多重索引：比如同时索引行和列

# 先选择列再选择行 —— 相当于对于一个数据，先筛选字段，再选择数据量

>>> df = pd.DataFrame(np.random.rand(16).reshape(4,4)*100,index=['one','two','three','four'],columns=['a','b','c','d'])

>>> print(df)

               a          b          c          d

one    12.408141  98.202562  38.715980  62.978631

two    93.980397  39.455335  77.214844  42.495949

three   4.210569  48.999179  10.320513  51.919796

four   73.838276  72.854442  98.555301  27.902682

>>> print(df['a'].loc[['one','three']]) # 选择a列的one，three行

one      12.408141

three     4.210569

Name: a, dtype: float64

>>> print(df[['b','c','d']].iloc[::2]) # 选择b，c，d列的one，three行

               b          c          d

one    98.202562  38.715980  62.978631

three  48.999179  10.320513  51.919796

>>> print(df[df['a'] < 50].iloc[:2]) # 选择满足判断索引的前两行数据

               a          b          c          d

one    12.408141  98.202562  38.715980  62.978631

three   4.210569  48.999179  10.320513  51.919796

>>>

2.3 Dataframe：基本技巧

数据查看、转置 / 添加、修改、删除值 / 对齐 / 排序

######数据查看（.head()  .tail()  ）与转置（ .T ） 
>>> df = pd.DataFrame(np.random.rand(16).reshape(8,2)*100,columns=['a','b'])

>>> print(df)

           a          b

0  41.447858  93.937878

1  29.684415  58.637993

2   2.260561  23.601327

3  79.555013  55.611010

4  64.825361  92.444769

5  53.716091  40.166872

6  19.657354  47.842487

7  22.705715  26.977886

>>>

>>> print(df.head(2))

           a          b

0  41.447858  93.937878

1  29.684415  58.637993

>>> print(df.tail())

           a          b

3  79.555013  55.611010

4  64.825361  92.444769

5  53.716091  40.166872

6  19.657354  47.842487

7  22.705715  26.977886

　　# .head()查看头部数据
　　# .tail()查看尾部数据
　　# 默认查看5条

>>> print(df.T)

           0          1          2          3          4          5          6          7

a  41.447858  29.684415   2.260561  79.555013  64.825361  53.716091  19.657354  22.705715

b  93.937878  58.637993  23.601327  55.611010  92.444769  40.166872  47.842487  26.977886
# .T 转置

# 添加与修改 

>>> df = pd.DataFrame(np.random.rand(16).reshape(4,4)*100,columns=['a','b','c','d'])

>>> print(df)

           a          b          c          d

0  40.395591  38.023720  64.954712  82.833601

1  69.405393  77.664903  76.566145  11.218753

2  61.793220  95.929196  15.415231  79.368691

3  29.482119  85.228170  94.134330  25.678733

>>> df['e'] = 10

>>> df.loc[4] = 20

>>> print(df)

           a          b          c          d   e

0  40.395591  38.023720  64.954712  82.833601  10

1  69.405393  77.664903  76.566145  11.218753  10

2  61.793220  95.929196  15.415231  79.368691  10

3  29.482119  85.228170  94.134330  25.678733  10

4  20.000000  20.000000  20.000000  20.000000  20

>>>

# 新增列/行并赋值

>>> df['e'] = 20

>>> df[['a','c']] = 100

>>> print(df)

     a          b    c          d   e

0  100  38.023720  100  82.833601  20

1  100  77.664903  100  11.218753  20

2  100  95.929196  100  79.368691  20

3  100  85.228170  100  25.678733  20

4  100  20.000000  100  20.000000  20

>>>

# 索引后直接修改值

# 删除  del / drop() ；inplace = False/True 、 axis = 0 为行  |  axis = 1 为列 

>>> df = pd.DataFrame(np.random.rand(16).reshape(4,4)*100,columns=['a','b','c','d'])

>>> print(df)

           a          b          c          d

0  76.082974  91.636219  70.831268  82.900443

1  16.328769   9.910538  36.670726  67.187492

2  96.234567  16.699254   0.257354  31.032239

3  16.659137  85.438085  91.993957  33.055454

>>> del df['a']
>>> print(df)

           b          c          d

0  91.636219  70.831268  82.900443

1   9.910538  36.670726  67.187492

2  16.699254   0.257354  31.032239

3  85.438085  91.993957  33.055454

>>>

# del语句 - 删除列

>>> print(df.drop(0))              ##删除行  

           b          c          d

1   9.910538  36.670726  67.187492

2  16.699254   0.257354  31.032239

3  85.438085  91.993957  33.055454

>>> print(df.drop([1,2]))

           b          c          d

0  91.636219  70.831268  82.900443

3  85.438085  91.993957  33.055454

>>> print(df)                     ##原数据不改变 

           b          c          d

0  91.636219  70.831268  82.900443

1   9.910538  36.670726  67.187492

2  16.699254   0.257354  31.032239

3  85.438085  91.993957  33.055454

>>>

# drop()删除行，inplace=False → 删除后生成新的数据，不改变原数据

>>> print(df1.drop(['1'], inplace=True)) #默认为 inplace=False；inplace = True 会直接在原数据上删除，不生成新数据，因此返回值为None
None

>>> print(df.drop(['d'],axis = 1)) #axis = 1是删除列 ---> 用 [ ]，不改变原数据 ;  axis = 0是删除行,不改变原数据。

           b          c

0  91.636219  70.831268

1   9.910538  36.670726

2  16.699254   0.257354

3  85.438085  91.993957

>>> print(df)

           b          c          d

0  91.636219  70.831268  82.900443

1   9.910538  36.670726  67.187492

2  16.699254   0.257354  31.032239

3  85.438085  91.993957  33.055454

>>>

# drop()删除列，需要加上axis = 1，inplace=False → 删除后生成新的数据，不改变原数据

#对齐 +

>>> df1 = pd.DataFrame(np.random.randn(10,4),columns=['A','B','C','D'])

>>> df2 = pd.DataFrame(np.random.randn(7,3),columns=['A','B','C'])

>>> print(df1)

          A         B         C         D

0 -0.711905  1.102947 -0.203125  0.464160

1 -1.633976 -0.126530  1.437948  1.721049

2  1.323383 -0.277546  0.060134  0.207093

3  1.708294  0.815721 -0.151322  0.522937

4  0.263572 -0.674251 -1.325148 -2.702464

5  1.659823 -0.131172 -1.114735 -2.182527

6 -0.186723 -0.071455 -1.370213  0.513062

7  0.381603  1.265310  0.083247  1.084061

8  0.399770  0.765438 -1.066299  0.626402

9  0.781321 -1.612135 -0.387417 -0.673143

>>> print(df2)

          A         B         C

0  0.012025 -0.488556  0.243515

1 -0.751000  0.277448  0.013675

2  1.008712 -1.231084 -0.523329

3  0.663029 -0.752602 -0.724749

4 -0.755075  0.303930  1.288335

5 -1.233975 -1.241185 -0.414564

6 -0.251519 -1.384259 -0.996120

>>> print(df1+df2)                  #DataFrame对象之间的数据自动按照列和索引（行标签）对齐。

          A         B         C   D

0 -0.699879  0.614391  0.040390 NaN

1 -2.384977  0.150917  1.451622 NaN

2  2.332095 -1.508629 -0.463195 NaN

3  2.371323  0.063119 -0.876071 NaN

4 -0.491503 -0.370321 -0.036813 NaN

5  0.425847 -1.372357 -1.529299 NaN

6 -0.438242 -1.455714 -2.366333 NaN

7       NaN       NaN       NaN NaN

8       NaN       NaN       NaN NaN

9       NaN       NaN       NaN NaN

>>>

# 排序1 - 按值排序 .sort_values

# 同样适用于Series

>>> df1 = pd.DataFrame(np.random.rand(16).reshape(4,4)*100,columns=['a','b','c','d'])

>>> print(df)

           b          c          d

0  91.636219  70.831268  82.900443

1   9.910538  36.670726  67.187492

2  16.699254   0.257354  31.032239

3  85.438085  91.993957  33.055454

>>> print(df1.sort_values(['a'],ascending=True)) #升序； ascending参数：设置升序降序，默认升序。

           a          b          c          d

0   3.255012  35.188882  99.290551  67.897580

1  43.221583  36.144081  84.124544  18.844967

3  47.364524  41.530226  20.800088  22.597198

2  83.170528   1.550416   7.810286  61.375057

>>> print(df1.sort_values(['a'],ascending=False)) #降序

           a          b          c          d

2  83.170528   1.550416   7.810286  61.375057

3  47.364524  41.530226  20.800088  22.597198

1  43.221583  36.144081  84.124544  18.844967

0   3.255012  35.188882  99.290551  67.897580

# 单列排序


>>> df2 = pd.DataFrame({'a':[1,1,1,1,2,2,2,2],'b':list(range(8)),'c':list(range(8,0,-1))})

>>> print(df2)

   a  b  c

0  1  0  8

1  1  1  7

2  1  2  6

3  1  3  5

4  2  4  4

5  2  5  3

6  2  6  2

7  2  7  1

>>> print(df2.sort_values(['a','c']))  #多列排序，按列顺序排序。

   a  b  c

3  1  3  5

2  1  2  6

1  1  1  7

0  1  0  8

7  2  7  1

6  2  6  2

5  2  5  3

4  2  4  4

>>>

# 排序2 - 索引排序 .sort_index

>>> df1 = pd.DataFrame(np.random.rand(16).reshape(4,4)*100,index=[5,4,3,2],columns=['a','b','c','d'])

>>> df2 = pd.DataFrame(np.random.rand(16).reshape(4,4)*100,index=['h','s','x','g'],columns=['a','b','c','d'])

>>> print(df1)

           a          b          c          d

5  70.899006  29.653652  38.273239  99.254931

4  68.173016  27.051275  43.236560  48.573018

3  35.870577  41.990773  78.055733  63.581352

2  20.946046  19.712039  33.906534  89.749668

>>> print(df1.sort_index())

           a          b          c          d

2  20.946046  19.712039  33.906534  89.749668

3  35.870577  41.990773  78.055733  63.581352

4  68.173016  27.051275  43.236560  48.573018

5  70.899006  29.653652  38.273239  99.254931

>>> print(df2)

           a          b          c          d

h  62.234181  32.481881  83.483145  39.145470

s  41.003081  16.515826  19.958257  30.331726

x  60.486728  20.206607  91.149820  31.731089

g  22.132468  61.116998  19.929379  98.976248

>>> print(df2.sort_index())

           a          b          c          d

g  22.132468  61.116998  19.929379  98.976248

h  62.234181  32.481881  83.483145  39.145470

s  41.003081  16.515826  19.958257  30.331726

x  60.486728  20.206607  91.149820  31.731089

>>>

# 按照index排序

# 默认 ascending=True, inplace=False

秒客网