Pandas之Series

# Series 数据结构

# Series 是带有标签的一维数组，可以保存任何数据类型（整数，字符串，浮点数，Python对象等）,轴标签统称为索引

import numpy as np

import pandas as pd

# 导入numpy、pandas模块

s = pd.Series(np.random.rand(5))

print(s)

print(type(s))

# 查看数据、数据类型

print(s.index,type(s.index))

print(s.values,type(s.values))

# .index查看series索引，类型为rangeindex

# .values查看series值，类型是ndarray

# 核心：series相比于ndarray，是一个自带索引index的数组 → 一维数组 + 对应索引

# 所以当只看series的值的时候，就是一个ndarray

# series和ndarray较相似，索引切片功能差别不大

# series和dict相比，series更像一个有顺序的字典（dict本身不存在顺序），其索引原理与字典相似（一个用key，一个用index）

　　输出：

0    0.229773

1    0.357622

2    0.546116

3    0.734517

4    0.686645

dtype: float64

<class 'pandas.core.series.Series'>

RangeIndex(start=0, stop=5, step=1) <class 'pandas.indexes.range.RangeIndex'>

[ 0.22977307  0.35762236  0.54611623  0.73451707  0.68664496] <class 'numpy.ndarray'>

# Series 创建方法一：由字典创建，字典的key就是index，values就是values

dic = {'a':1 ,'b':2 , 'c':3, '4':4, '5':5}

s = pd.Series(dic)

print(s)

# 注意：key肯定是字符串，假如values类型不止一个会怎么样？ → dic = {'a':1 ,'b':'hello' , 'c':3, '4':4, '5':5}

　　输出：

4    4

5    5

a    1

b    2

c    3

dtype: int64

# Series 创建方法二：由数组创建(一维数组)

arr = np.random.randn(5)

s = pd.Series(arr)

print(arr)

print(s)

# 默认index是从0开始，步长为1的数字

s = pd.Series(arr, index = ['a','b','c','d','e'],dtype = np.object)

print(s)

# index参数：设置index，长度保持一致

# dtype参数：设置数值类型

　　输出：

[ 0.11206121  0.1324684   0.59930544  0.34707543 -0.15652941]

0    0.112061

1    0.132468

2    0.599305

3    0.347075

4   -0.156529

dtype: float64

a    0.112061

b    0.132468

c    0.599305

d    0.347075

e   -0.156529

dtype: object

# Series 创建方法三：由标量创建

s = pd.Series(10, index = range(4))

print(s)

# 如果data是标量值，则必须提供索引。该值会重复，来匹配索引的长度

　　输出：

0    10

1    10

2    10

3    10

dtype: int64

# Series 名称属性：name

s1 = pd.Series(np.random.randn(5))

print(s1)

print('-----')

s2 = pd.Series(np.random.randn(5),name = 'test')

print(s2)

print(s1.name, s2.name,type(s2.name))

# name为Series的一个参数，创建一个数组的 名称

# .name方法：输出数组的名称，输出格式为str，如果没有定义输出名称，输出为None

s3 = s2.rename('hehehe')

print(s3)

print(s3.name, s2.name)

# .rename()重命名一个数组的名称，并且新指向一个数组，原数组不变

　　输出：

0   -0.403084

1    1.369383

2    1.134319

3   -0.635050

4    1.680211

dtype: float64

-----

0   -0.120014

1    1.967648

2    1.142626

3    0.234079

4    0.761357

Name: test, dtype: float64

None test <class 'str'>

0   -0.120014

1    1.967648

2    1.142626

3    0.234079

4    0.761357

Name: hehehe, dtype: float64

hehehe test

# 位置下标，类似序列

s = pd.Series(np.random.rand(5))

print(s)

print(s[0],type(s[0]),s[0].dtype)

print(float(s[0]),type(float(s[0])))

#print(s[-1])

# 位置下标从0开始

# 输出结果为numpy.float格式，

# 可以通过float()函数转换为python float格式

# numpy.float与float占用字节不同

# s[-1]结果如何？

　　输出：

0    0.924575

1    0.988654

2    0.426333

3    0.216504

4    0.453570

dtype: float64

0.924575004833 <class 'numpy.float64'> float64

0.9245750048328816 <class 'float'>

# 标签索引

s = pd.Series(np.random.rand(5), index = ['a','b','c','d','e'])

print(s)

print(s['a'],type(s['a']),s['a'].dtype)

# 方法类似下标索引，用[]表示，内写上index，注意index是字符串

sci = s[['a','b','e']]

print(sci,type(sci))

# 如果需要选择多个标签的值，用[[]]来表示（相当于[]中包含一个列表）

# 多标签索引结果是新的数组

　　输出：

a    0.714630

b    0.213957

c    0.172188

d    0.972158

e    0.875175

dtype: float64

0.714630383451 <class 'numpy.float64'> float64

a    0.714630

b    0.213957

e    0.875175

dtype: float64 <class 'pandas.core.series.Series'>

# 切片索引

s1 = pd.Series(np.random.rand(5))

s2 = pd.Series(np.random.rand(5), index = ['a','b','c','d','e'])

print(s1[1:4],s1[4])

print(s2['a':'c'],s2['c'])

print(s2[0:3],s2[3])

print('-----')

# 注意：用index做切片是末端包含

print(s2[:-1])

print(s2[::2])

# 下标索引做切片，和list写法一样

　　输出：

    0.865967

    0.114500

    0.369301

dtype: float64 0.411702342342

a    0.717378

b    0.642561

c    0.391091

dtype: float64 0.39109096261

a    0.717378

b    0.642561

c    0.391091

dtype: float64 0.998978363818

-----

a    0.717378

b    0.642561

c    0.391091

d    0.998978

dtype: float64

a    0.717378

c    0.391091

e    0.957639

dtype: float64

# 布尔型索引

s = pd.Series(np.random.rand(3)*100)

s[4] = None  # 添加一个空值

print(s)

bs1 = s > 50

bs2 = s.isnull()

bs3 = s.notnull()

print(bs1, type(bs1), bs1.dtype)

print(bs2, type(bs2), bs2.dtype)

print(bs3, type(bs3), bs3.dtype)

print('-----')

# 数组做判断之后，返回的是一个由布尔值组成的新的数组

# .isnull() / .notnull() 判断是否为空值 (None代表空值，NaN代表有问题的数值，两个都会识别为空值)

print(s[s > 50])

print(s[bs3])

# 布尔型索引方法：用[判断条件]表示，其中判断条件可以是 一个语句，或者是 一个布尔型数组！

　　输出：

0    2.03802

1    40.3989

2    25.2001

4       None

dtype: object

0    False

1    False

2    False

4    False

dtype: bool <class 'pandas.core.series.Series'> bool

0    False

1    False

2    False

4     True

dtype: bool <class 'pandas.core.series.Series'> bool

0     True

1     True

2     True

4    False

dtype: bool <class 'pandas.core.series.Series'> bool

-----

Series([], dtype: object)

0    2.03802

1    40.3989

2    25.2001

dtype: object

'''

【课程2.】  Pandas数据结构Series：基本技巧

数据查看 / 重新索引 / 对齐 / 添加、修改、删除值

'''

# 数据查看

s = pd.Series(np.random.rand(50))

print(s.head(10))

print(s.tail())

# .head()查看头部数据

# .tail()查看尾部数据

# 默认查看5条

　　输出：

0    0.730540

1    0.116711

2    0.787693

3    0.969764

4    0.324540

5    0.061827

6    0.377060

7    0.820383

8    0.964477

9    0.451936

dtype: float64

45    0.899540

46    0.237008

47    0.298762

48    0.848487

49    0.829858

dtype: float64

# 重新索引reindex

# .reindex将会根据索引重新排序，如果当前索引不存在，则引入缺失值

s = pd.Series(np.random.rand(3), index = ['a','b','c'])

print(s)

s1 = s.reindex(['c','b','a','d'])

print(s1)

# .reindex()中也是写列表

# 这里'd'索引不存在，所以值为NaN

s2 = s.reindex(['c','b','a','d'], fill_value = 0)

print(s2)

# fill_value参数：填充缺失值的值

　　输出：

a    0.343718

b    0.322228

c    0.746720

dtype: float64

c    0.746720

b    0.322228

a    0.343718

d         NaN

dtype: float64

c    0.746720

b    0.322228

a    0.343718

d    0.000000

dtype: float64

# Series对齐

s1 = pd.Series(np.random.rand(3), index = ['Jack','Marry','Tom'])

s2 = pd.Series(np.random.rand(3), index = ['Wang','Jack','Marry'])

print(s1)

print(s2)

print(s1+s2)

# Series 和 ndarray 之间的主要区别是，Series 上的操作会根据标签自动对齐

# index顺序不会影响数值计算，以标签来计算

# 空值和任何值计算结果仍为空值

　　输出：

Jack     0.753732

Marry    0.180223

Tom      0.283704

dtype: float64

Wang     0.309128

Jack     0.533997

Marry    0.626126

dtype: float64

Jack     1.287729

Marry    0.806349

Tom           NaN

Wang          NaN

dtype: float64

# 删除：.drop

s = pd.Series(np.random.rand(5), index = list('ngjur'))

print(s)

s1 = s.drop('n')

s2 = s.drop(['g','j'])

print(s1)

print(s2)

print(s)

# drop 删除元素之后返回副本(inplace=False)

　　输出：

n    0.876587

g    0.594053

j    0.628232

u    0.360634

r    0.454483

dtype: float64

g    0.594053

j    0.628232

u    0.360634

r    0.454483

dtype: float64

n    0.876587

u    0.360634

r    0.454483

dtype: float64

n    0.876587

g    0.594053

j    0.628232

u    0.360634

r    0.454483

dtype: float64

# 添加

s1 = pd.Series(np.random.rand(5))

s2 = pd.Series(np.random.rand(5), index = list('ngjur'))

print(s1)

print(s2)

s1[5] = 100

s2['a'] = 100

print(s1)

print(s2)

print('-----')

# 直接通过下标索引/标签index添加值

s3 = s1.append(s2)

print(s3)

print(s1)

# 通过.append方法，直接添加一个数组

# .append方法生成一个新的数组，不改变之前的数组

　　输出：

0    0.516447

1    0.699382

2    0.469513

3    0.589821

4    0.402188

dtype: float64

n    0.615641

g    0.451192

j    0.022328

u    0.977568

r    0.902041

dtype: float64

0      0.516447

1      0.699382

2      0.469513

3      0.589821

4      0.402188

5    100.000000

dtype: float64

n      0.615641

g      0.451192

j      0.022328

u      0.977568

r      0.902041

a    100.000000

dtype: float64

-----

0      0.516447

1      0.699382

2      0.469513

3      0.589821

4      0.402188

5    100.000000

n      0.615641

g      0.451192

j      0.022328

u      0.977568

r      0.902041

a    100.000000

dtype: float64

0      0.516447

1      0.699382

2      0.469513

3      0.589821

4      0.402188

5    100.000000

dtype: float64

# 修改

s = pd.Series(np.random.rand(3), index = ['a','b','c'])

print(s)

s['a'] = 100

s[['b','c']] = 200

print(s)

# 通过索引直接修改，类似序列

　　输出：

a    0.873604

b    0.244707

c    0.888685

dtype: float64

a    100.0

b    200.0

c    200.0

dtype: float64

相关文章