Pandas(二) —— Dataframe数据结构

Dataframe数据结构

Dataframe的基本概念

import numpy as np
import pandas as pd

# Basic concepts of a DataFrame
#
# A DataFrame is a table-like data structure holding an ordered set of
# columns; the values in a column may be numeric, string, boolean, etc.
# Its data is stored as one or more two-dimensional blocks, not as a list,
# a dict, or a one-dimensional array.
#
# A DataFrame carries both an index (row labels) and columns (column labels).

data = dict(
    name=['jack', 'mike', 'gpp'],
    age=[12, 34, 56],
    hight=[123, 145, 167],
)
frame = pd.DataFrame(data)

# Printing the object shows the data; the object itself is a DataFrame.
print(frame)

# .index   -> the row labels
# .columns -> the column labels
# .values  -> the underlying values
# Each one is printed followed by its type.
for attr_name in ('index', 'columns', 'values'):
    attr_value = getattr(frame, attr_name)
    print(attr_value, '\n该数据类型为：', type(attr_value))

   age  hight  name
0   12    123  jack
1   34    145  mike
2   56    167   gpp
RangeIndex(start=0, stop=3, step=1) 
该数据类型为： <class 'pandas.core.indexes.range.RangeIndex'>
Index(['age', 'hight', 'name'], dtype='object') 
该数据类型为： <class 'pandas.core.indexes.base.Index'>
[[12 123 'jack']
 [34 145 'mike']
 [56 167 'gpp']] 
该数据类型为： <class 'numpy.ndarray'>

dataframe的创建

# DataFrame creation, method 1:
'''
From a dict of arrays / lists
Constructor: pandas.DataFrame()
'''

data1 = {'a':[1,2,3],
'b':[3,4,5],
'c':[5,6,7]}
data2 = {'one':np.random.rand(3),
'two':np.random.rand(3)}    # the arrays generated here must have the same length

print(data1)
print(data2)
print('--------------')

df1 = pd.DataFrame(data1,index= ['y','z','x'])
df2 = pd.DataFrame(data2)
print(df1)
print(df2)
# Building a DataFrame from a dict of arrays/lists: the dict keys become the columns,
# and the index is the default integer labels unless index= is given (as for df1 here)
# The dict values must all have the same length!

print('--------------')
df1 = pd.DataFrame(data1, columns = ['b','c','a','d'])
print(df1)
df1 = pd.DataFrame(data1, columns = ['b','c'])
print(df1)
# columns parameter: a list that re-specifies the column order; a column that is
# not present in the data (e.g. 'd') is filled with NaN
# The re-specified columns may also be fewer than the columns in the original data

df2 = pd.DataFrame(data2, index = ['f1','f2','f3',])  # what happens if you try index = ['f1','f2','f3','f4'] here?
print(df2)
# index parameter: a list that redefines the index; its length must match the data

{'a': [1, 2, 3], 'b': [3, 4, 5], 'c': [5, 6, 7]}
{'one': array([ 0.57370145,  0.68641906,  0.09765431]), 'two': array([ 0.03788015,  0.10822633,  0.33414233])}
--------------
   a  b  c
y  1  3  5
z  2  4  6
x  3  5  7
        one       two
0  0.573701  0.037880
1  0.686419  0.108226
2  0.097654  0.334142
--------------
   b  c  a    d
0  3  5  1  NaN
1  4  6  2  NaN
2  5  7  3  NaN
   b  c
0  3  5
1  4  6
2  5  7
         one       two
f1  0.573701  0.037880
f2  0.686419  0.108226
f3  0.097654  0.334142

# DataFrame creation, method 2:
'''
From a dict of Series
'''

data1 = {'one':pd.Series(np.random.rand(2)),
'two':pd.Series(np.random.rand(3))}    # Series without an explicit index
data2 = {'one':pd.Series(np.random.rand(2), index = ['a','b']),
'two':pd.Series(np.random.rand(3),index = ['a','b','c'])}   # Series with an explicit index
print(data1)
print(data2)
print('-------------')
df1 = pd.DataFrame(data1)
df2 = pd.DataFrame(data2)
print(df1)
print(df2)
# Building a DataFrame from a dict of Series: the dict keys become the columns and the
# index comes from the Series labels (default integer labels if the Series has none)
# The Series may differ in length; the resulting DataFrame then contains NaN values

{'one': 0    0.708273
1    0.707102
dtype: float64, 'two': 0    0.038779
1    0.190305
2    0.166894
dtype: float64}
{'one': a    0.453956
b    0.169642
dtype: float64, 'two': a    0.547766
b    0.525751
c    0.422529
dtype: float64}
-------------
        one       two
0  0.708273  0.038779
1  0.707102  0.190305
2       NaN  0.166894
        one       two
a  0.453956  0.547766
b  0.169642  0.525751
c       NaN  0.422529

# DataFrame creation, method 3:
'''
Directly from a two-dimensional array
'''

ar = np.random.rand(9).reshape(3,3)     # first build a 2-D array (see the earlier article if this is unclear)
print(ar)
df1 = pd.DataFrame(ar)
df2 = pd.DataFrame(ar, index = ['a', 'b', 'c'], columns = ['one','two','three'])  # try an index or columns whose length differs from the array's
print(df1)
print(df2)

# Building a DataFrame directly from a 2-D array gives data of the same shape; when index
# and columns are not specified, both fall back to default integer labels
# The lengths given for index and columns must match the original array

[[ 0.7863483   0.72837569  0.95403682]
 [ 0.47387473  0.66190233  0.99712499]
 [ 0.02783402  0.70966132  0.54108408]]
          0         1         2
0  0.786348  0.728376  0.954037
1  0.473875  0.661902  0.997125
2  0.027834  0.709661  0.541084
        one       two     three
a  0.786348  0.728376  0.954037
b  0.473875  0.661902  0.997125
c  0.027834  0.709661  0.541084

# DataFrame creation, method 4: from a list of dicts
#
# The dict keys become the columns; when no index is given, the default
# integer labels are used. The columns= and index= parameters re-specify
# the column labels and the row labels respectively.

data = [
    {'one': 1, 'two': 2},
    {'one': 5, 'two': 10, 'three': 20},
]
print(data)

df1 = pd.DataFrame(data)                          # default index, all keys as columns
df2 = pd.DataFrame(data, index=['a', 'b'])        # custom row labels
df3 = pd.DataFrame(data, columns=['one', 'two'])  # only the listed columns

for built in (df1, df2, df3):
    print(built)

[{'one': 1, 'two': 2}, {'one': 5, 'two': 10, 'three': 20}]
   one  three  two
0    1    NaN    2
1    5   20.0   10
   one  three  two
a    1    NaN    2
b    5   20.0   10
   one  two
0    1    2
1    5   10

# DataFrame creation, method 5: from a dict of dicts
#
# The outer dict keys become the columns and the inner dict keys become
# the index.

data = {
    'Jack': {'math': 90, 'english': 89, 'art': 78},
    'Marry': {'math': 82, 'english': 95, 'art': 92},
    'Tom': {'math': 78, 'english': 67},
}

df1 = pd.DataFrame(data)
print(df1)

# columns= can add or drop columns; a column that is new (e.g. 'Bob')
# is filled with NaN.
df2 = pd.DataFrame(data, columns=['Jack', 'Tom', 'Bob'])
print(df2)

# Very important: unlike the earlier creation methods, index= here does NOT
# rename the existing index. Labels that are not among the inner dict keys
# produce rows of NaN.
df3 = pd.DataFrame(data, index=['a', 'b', 'c'])
print(df3)

         Jack  Marry   Tom
art        78     92   NaN
english    89     95  67.0
math       90     82  78.0
         Jack   Tom  Bob
art        78   NaN  NaN
english    89  67.0  NaN
math       90  78.0  NaN
   Jack  Marry  Tom
a   NaN    NaN  NaN
b   NaN    NaN  NaN
c   NaN    NaN  NaN

dataframe的索引

选择列

# Selecting columns

df = pd.DataFrame(np.random.rand(12).reshape(3,4)*100,
                 index=['one','two','three'],
                 columns=['a','b','c','d'])
print(df)

print('-------------')

print(df['a'],type(df['a']))
print(df[['a','c']])
# df[] selects columns by default; put the column name(s) inside [] (so columns are usually
# named explicitly rather than left as default integers, to avoid clashing with the index)
# Selecting by column name: a single column yields a Series, several columns yield a DataFrame

data3 = df[:1]
print(data3,type(data3))
# When df[] is given numbers it selects rows, and only slices are allowed;
# a single position (df[0]) cannot be selected this way
# The result is a DataFrame, even when only one row is selected
# df[] cannot select a row by its index label name (df['one'])

               a          b          c          d
one    76.686850  33.084433  92.247401   3.232769
two    69.210400  19.430214  72.479291  24.497907
three  33.574062  34.241901  13.176080  45.500454
-------------
one      76.686850
two      69.210400
three    33.574062
Name: a, dtype: float64 <class 'pandas.core.series.Series'>
               a          c
one    76.686850  92.247401
two    69.210400  72.479291
three  33.574062  13.176080
            a          b          c         d
one  76.68685  33.084433  92.247401  3.232769 <class 'pandas.core.frame.DataFrame'>

选择行

按照标签索引

# df.loc[] - select rows by index label

# df1 has string row labels; df2 keeps the default integer row labels.
df1 = pd.DataFrame(np.random.rand(16).reshape(4,4)*100,
                   index = ['one','two','three','four'],
                   columns = ['a','b','c','d'])
df2 = pd.DataFrame(np.random.rand(16).reshape(4,4)*100,
                   columns = ['a','b','c','d'])
print(df1)
print(df2)
print('-----')

data1 = df1.loc['one']
data2 = df2.loc[1]
print(data1)
print(data2)
print('单标签索引\n-----')
# Single-label lookup: the result is a Series

# 'five' is not in df1's index, so .reindex() is used here; the missing
# label comes back as a row of NaN
data3 = df1.reindex(['two','three','five'])
data4 = df2.loc[[3,2,1]]
print(data3)
print(data4)
print('多标签索引\n-----')
# Multi-label lookup: rows come back in the order requested, so the order may vary

data5 = df1.loc['one':'three']      # the end label is included
data6 = df2.loc[1:3]            # the end label is included
print(data5)    # fixed: this line was the truncated call `print(data`, a syntax error
print(data6)
print('切片索引')
# Label slices are supported
# The end of the slice is inclusive

# Key takeaway: df.loc[label] selects rows by index label, and works with both
# explicitly assigned labels and the default integer index

               a          b          c          d
one     0.704809  50.882799  31.748359   4.008618
two    15.502812  65.400805  59.699475   5.715281
three  17.719915   1.108207   8.478434  53.255034
four   55.125978  58.878202  58.578265  78.005093
           a          b          c          d
0  94.323711  31.623252  79.545395  65.614674
1  18.653849  68.460337  46.089197  36.713253
2  63.245660  68.602988  78.669926  85.773112
3  11.840830  53.954041  18.657871  56.381644
-----
a     0.704809
b    50.882799
c    31.748359
d     4.008618
Name: one, dtype: float64
a    18.653849
b    68.460337
c    46.089197
d    36.713253
Name: 1, dtype: float64
单标签索引
-----
               a          b          c          d
two    15.502812  65.400805  59.699475   5.715281
three  17.719915   1.108207   8.478434  53.255034
five         NaN        NaN        NaN        NaN
           a          b          c          d
3  11.840830  53.954041  18.657871  56.381644
2  63.245660  68.602988  78.669926  85.773112
1  18.653849  68.460337  46.089197  36.713253
多标签索引
-----
               a          b          c          d
one     0.704809  50.882799  31.748359   4.008618
two    15.502812  65.400805  59.699475   5.715281
three  17.719915   1.108207   8.478434  53.255034
           a          b          c          d
1  18.653849  68.460337  46.089197  36.713253
2  63.245660  68.602988  78.669926  85.773112
3  11.840830  53.954041  18.657871  56.381644
切片索引

按照位置索引（类似list）

# df.iloc[] - select rows by integer position (from 0 to length-1 along the axis)
# Works like list indexing: the order is the DataFrame's integer position, counted from 0


df = pd.DataFrame(np.random.rand(16).reshape(4,4)*100,
                   index = ['one','two','three','four'],
                   columns = ['a','b','c','d'])
print(df)
print('------')

print(df.iloc[0])
print(df.iloc[-1])
#print(df.iloc[4])
print('单位置索引\n-----')
# Single-position lookup
# Unlike loc, an integer position beyond the number of rows cannot be indexed
# (which is why the df.iloc[4] line above is left commented out)

print(df.iloc[[0,2]])
print(df.iloc[[3,2,1]])
print('多位置索引\n-----')
# Multi-position lookup
# The order may vary

print(df.iloc[1:3])
print(df.iloc[::2])
print('切片索引')
# Slice lookup
# The end of the slice is NOT included

               a          b          c          d
one    51.040031  29.879548  31.532430  97.647893
two     1.090956  21.844713  93.315747   7.065472
three  86.012962   3.038697  38.600913  29.515306
four   11.752251  40.550808  97.113613   4.374101
------
a    51.040031
b    29.879548
c    31.532430
d    97.647893
Name: one, dtype: float64
a    11.752251
b    40.550808
c    97.113613
d     4.374101
Name: four, dtype: float64
单位置索引
-----
               a          b          c          d
one    51.040031  29.879548  31.532430  97.647893
three  86.012962   3.038697  38.600913  29.515306
               a          b          c          d
four   11.752251  40.550808  97.113613   4.374101
three  86.012962   3.038697  38.600913  29.515306
two     1.090956  21.844713  93.315747   7.065472
多位置索引
-----
               a          b          c          d
two     1.090956  21.844713  93.315747   7.065472
three  86.012962   3.038697  38.600913  29.515306
               a          b          c          d
one    51.040031  29.879548  31.532430  97.647893
three  86.012962   3.038697  38.600913  29.515306
切片索引

布尔型索引

# Boolean indexing
# Same principle as with a Series

df = pd.DataFrame(np.random.rand(16).reshape(4,4)*100,
                   index = ['one','two','three','four'],
                   columns = ['a','b','c','d'])
print(df)
print('------')

b1 = df < 20
print(b1,type(b1))
print(df[b1])  # can also be written as df[df < 20]
print('------')
# The comparison is evaluated for every value in the data
# The result keeps every cell: True returns the original value, False becomes NaN

b2 = df['a'] > 50
print(b2,type(b2))
print(df[b2])  # can also be written as df[df['a'] > 50]
print('单列判断------')
# Condition on a single column
# The result keeps the rows where the condition is True, including their other columns
# (in the sample run shown with this tutorial, the first three rows were kept)

b3 = df[['a','b']] > 50
print(b3,type(b3))
print(df[b3])  # can also be written as df[df[['a','b']] > 50]
print('多列判断------')
# Condition on several columns
# The result keeps every cell: True returns the original value, False becomes NaN

b4 = df.loc[['one','three']] < 50
print(b4,type(b4))
print(df[b4])  # can also be written as df[df.loc[['one','three']] < 50]
print('------')
# Condition on several rows
# The result keeps every cell: True returns the original value, False becomes NaN

               a          b          c          d
one    74.965896  10.849874  23.036455  73.213216
two    57.286029  44.344496  69.367778  58.836892
three  67.542466  91.403359  32.930365  23.509080
four   44.582918  62.278704  12.818474  77.478616
------
           a      b      c      d
one    False   True  False  False
two    False  False  False  False
three  False  False  False  False
four   False  False   True  False <class 'pandas.core.frame.DataFrame'>
        a          b          c   d
one   NaN  10.849874        NaN NaN
two   NaN        NaN        NaN NaN
three NaN        NaN        NaN NaN
four  NaN        NaN  12.818474 NaN
------
one       True
two       True
three     True
four     False
Name: a, dtype: bool <class 'pandas.core.series.Series'>
               a          b          c          d
one    74.965896  10.849874  23.036455  73.213216
two    57.286029  44.344496  69.367778  58.836892
three  67.542466  91.403359  32.930365  23.509080
单列判断------
           a      b
one     True  False
two     True  False
three   True   True
four   False   True <class 'pandas.core.frame.DataFrame'>
               a          b   c   d
one    74.965896        NaN NaN NaN
two    57.286029        NaN NaN NaN
three  67.542466  91.403359 NaN NaN
four         NaN  62.278704 NaN NaN
多列判断------
           a      b     c      d
one    False   True  True  False
three  False  False  True   True <class 'pandas.core.frame.DataFrame'>
        a          b          c         d
one   NaN  10.849874  23.036455       NaN
two   NaN        NaN        NaN       NaN
three NaN        NaN  32.930365  23.50908
four  NaN        NaN        NaN       NaN
------

多重索引

# Combined indexing: e.g. selecting rows and columns at the same time
# Select columns first, then rows -- i.e. first pick the fields of a dataset, then pick the records


df = pd.DataFrame(np.random.rand(16).reshape(4,4)*100,
                   index = ['one','two','three','four'],
                   columns = ['a','b','c','d'])
print(df)
print('------')

print(df['a'].loc[['one','three']])   # rows 'one' and 'three' of column a
print(df[['b','c','d']].iloc[::2])   # rows 'one' and 'three' of columns b, c, d
print('-----------')

print(df[df['a'] < 50].iloc[:2])   # the first two rows that satisfy the condition
print(df[df < 50][['a','b']])

               a          b          c          d
one    63.114540  38.567125  88.018529  71.156573
two    39.640530   2.518002  86.175475  59.508718
three  96.924110  21.059748  39.764780  57.627406
four   12.960049  26.735557  26.079749  27.259359
------
one      63.11454
three    96.92411
Name: a, dtype: float64
               b          c          d
one    38.567125  88.018529  71.156573
three  21.059748  39.764780  57.627406
-----------
              a          b          c          d
two   39.640530   2.518002  86.175475  59.508718
four  12.960049  26.735557  26.079749  27.259359
               a          b
one          NaN  38.567125
two    39.640530   2.518002
three        NaN  21.059748
four   12.960049  26.735557

下篇会讲到DataFrame的基本操作用法

秒客网

Python科学计算工具Pandas(二) —— Dataframe数据结构

Pandas(二) —— Dataframe数据结构

Dataframe数据结构

Dataframe的基本概念

dataframe的创建

dataframe的索引

选择列

选择行

按照标签索引

按照位置索引（类似list）

布尔型索引

多重索引

相关文章