import numpy as np
import pandas as pd
# 1. Series: a one-dimensional labelled array that behaves much like a dict.
#    The labels are held in a pandas Index object, the data in an ndarray:
#
#        index   value
#        a       1
#        b       2
#        c       3

# --- Built from a scalar: the scalar is repeated once per index label.
Series_1_1 = pd.Series(data=25, index=('a', ))
Series_1_2 = pd.Series(data=25, index=['a', 'b', 'c'])
Series_1_1.index.name = 'Series_1_1_index_name'
Series_1_1.name = 'Series_1_1'
print("由标量创建:\n", Series_1_1, "\n", Series_1_2)
print("typeof Index:", type(Series_1_1.index))
print("typeof Value:", type(Series_1_1.values))

# Element access.  Positional lookups go through .iloc: a plain
# ``Series_1_2[0]`` on a string-labelled Series depends on an
# integer-to-position fallback that pandas has deprecated.  Label lookups
# keep using ``[]``; ``.get`` supplies a default for a missing label and
# ``in`` tests membership against the index labels.
print("索引:",
      Series_1_2.iloc[0], Series_1_2['b'], "\n",
      Series_1_2[['b', 'c']], "\n",
      Series_1_2.iloc[[0, 1]], '\n',
      Series_1_2.get('a', default=10), '\n',
      'a' in Series_1_2)

# --- Built from Python sequences (list / tuple) with an explicit index.
Series_2_1 = pd.Series([1, 2, 3], index=range(3))
Series_2_2 = pd.Series(data=(1, 2, 3), index=range(6, 3, -1))
print("由python序列创建:\n", Series_2_1, "\n", Series_2_2)

# --- Built from a dict: keys become the labels.  When an index is passed as
# well, the result follows that index and labels missing from the dict
# ('d' here) get NaN.
Series_3_1 = pd.Series(data={'a': 1, 'b': 2, 'c': 3})
Series_3_2 = pd.Series(
    data={'a': 1,
          'b': 2,
          'c': 3}, index=('b', 'c', 'a', 'd'))
print("由Python字典创建:\n", Series_3_1, "\n", Series_3_2)

# --- Built from an ndarray.
Series_4_1 = pd.Series(
    data=np.arange(start=2, stop=6, step=1), index=(1, 4, 6, 3))
print("由ndarray数组创建:\n", Series_4_1)

# --- Arithmetic: scalars and NumPy ufuncs apply element-wise; Series-to-Series
# operations align on labels first, so labels present on only one side
# produce NaN.
print("Series_1_2+25:", Series_1_2 + 25)
print("np.log(Series_1_2):", np.log(Series_1_2))
print("Series_1_2+Series_1_1:", Series_1_2 + Series_1_1)
# 2. DataFrame: a two-dimensional labelled array.
#
#        columns   column_1   column_2
#        index
#        row_1     1          2
#        row_2     3          4
#
#    axis=0 is the index axis (it runs down each column);
#    axis=1 is the columns axis (it runs across each row).
#    dataFrame.columns -> Index object
#    dataFrame.index   -> Index object
#    dataFrame.values  -> two-dimensional ndarray

# --- Built directly from a 2-D ndarray: 20 random numbers scaled by 10 and
# reshaped to 4 rows x 5 columns.
dataFrame_1 = pd.DataFrame(
    data=10 * np.random.rand(20).reshape(4, 5),
    index=['row_1', 'row_2', 'row_3', 'row_4'],
    columns=['1', '2', '3', '4', '5'])
print("直接利用nparray多维数组创建:", dataFrame_1)

# --- Built from a dict that maps each column name to that column's values.
# The values may be a Python sequence, a Python dict (keys become row labels)
# or a Series.  Row labels are combined across all columns, so a label missing
# from a column yields NaN there.
data_2 = {
    'column_1': {
        'a': 1,
        'b': 2
    },
    'column_2': {
        'a': 2,
        'b': 3
    },
    'column_3': pd.Series(data=np.arange(2), index=['a', 'c'])
}
dataFrame_2 = pd.DataFrame(data=data_2)
print("利用python字典创建:", dataFrame_2)

# --- Label-based indexing and slicing: [], .loc and .at select by the
# user-defined labels.
print("\ndataFrame类型的索引 切片操作loc at\n")
print("单个元素选取时是选取的列,比如:dataFrame_2['column_1']", dataFrame_2['column_1'])
print("切片时又是选择行,eg:dataFrame_2[0:2]", dataFrame_2[0:2])
# The original used the ``.ix`` accessor, which no longer exists in pandas
# (removed in 1.0); ``.loc`` is the label-based replacement, and the printed
# label is updated to name the accessor actually used.
print(":.loc", dataFrame_2.loc['a'])
print("eg:loc多个元素", dataFrame_2.loc['a':'c', 'column_1':'column_2'])
print("eg:loc单个元素", dataFrame_2.loc['a', 'column_1'])
print("eg:at单个元素", dataFrame_2.at['a', 'column_1'])

# --- Position-based indexing and slicing: .iloc and .iat work like ndarray
# indexing, take integer positions only and do not accept the custom labels.
print("\ndataFrame类型的索引 切片操作iloc iat,类似于ndarray数组的切片操作,采用下标切片,不能用自定义的索引\n")
print(dataFrame_2.iloc[2])
print("\n", dataFrame_2.iloc[1, 0:2])
print(dataFrame_2.iloc[2, 1])
print(dataFrame_2.iat[0, 0])

# --- Filtering with a boolean mask.  Comparing a column with a scalar gives
# a boolean Series; indexing the frame with it keeps the rows where the mask
# is True.  Masks can be combined with | (or), & (and), ~ (not) and ^ (xor).
print("\nDataFrame 对象的筛选 \n", dataFrame_2[dataFrame_2['column_1'] > 0.5])
print('\nDataFrame 对象的筛选 ,选取特定的列\n',
      dataFrame_2['column_1'][~(dataFrame_2['column_1'] > 0.5)])

# --- Index objects.  Each of these methods returns a NEW Index and leaves
# the original untouched:
#   .append(idx)         concatenate another Index
#   .difference(idx)     set difference
#   .intersection(idx)   set intersection
#   .union(idx)          set union
#   .delete(loc)         drop the entry at position loc
#   .insert(loc, value)  add value at position loc
# The insertion position must lie inside the Index, so the new label is
# appended at len(columns) rather than at a hard-coded position past the end.
new_columns = dataFrame_2.columns.insert(
    len(dataFrame_2.columns), 'new_column')
print(new_columns)

# --- DataFrame methods.
# 1. insert(loc, column, value, allow_duplicates=False) adds a column IN
#    PLACE at position loc (0 <= loc <= len(columns)).  ``column`` is the new
#    column's label and ``value`` a scalar, Series or array-like.  Unless
#    allow_duplicates is True, an already existing column label raises.
# 2. drop(labels, axis=0) returns a new frame without the given labels:
#    rows with axis=0 (the default), columns with axis=1.
dataFrame_2.insert(
    loc=1, column='new_column',
    value=dataFrame_2['column_1'])
dataFrame_2_2 = dataFrame_2.drop('column_1', axis=1)
dataFrame_2_3 = dataFrame_2.drop('a')
print("dataFrame_2:", dataFrame_2)
print("dataFrame_2_2:", dataFrame_2_2)
print("dataFrame_2_3:", dataFrame_2_3)
# Reindex Series_1_2 onto the labels 2..7.  Any requested label that the
# Series does not already carry is filled with -1 instead of NaN.
New_Series_1_2_1 = Series_1_2.reindex(list(range(2, 8)), fill_value=-1)
print("New_Series_1_2_1:", New_Series_1_2_1)
# Broadcasting between a DataFrame and a Series: the Series labels are matched
# against the DataFrame's column labels, and the matching Series value is
# added to every row of that column.
series_GB = pd.Series(data=np.arange(2), index=['a', 'b'])
dict_GB = {
    column: pd.Series(np.arange(first, first + 2), index=['row_1', 'row_2'])
    for column, first in (('a', 2), ('b', 4))
}
dataFrame_GB = pd.DataFrame(data=dict_GB)
print(dataFrame_GB + series_GB)
# --- Sorting.  Both methods return a new object and leave the original
# unchanged:
#   .sort_index(axis=0, ascending=True)       sort by the labels of an axis
#   .sort_values(by, axis=0, ascending=True)  sort by the values of the column
#                                             labelled ``by``; NaN goes last
dataFrame_2_4 = dataFrame_2.sort_values(by='column_2', ascending=False)
print("sorted:", dataFrame_2_4)

# --- Basic statistics.
# 1. Shared by Series and DataFrame; on a DataFrame they are computed along
#    axis 0 by default, i.e. separately for every column:
#      .sum  .count (number of non-NaN values)  .mean  .median  .min  .max
#      .describe  (summary of several statistics at once, also per column)
# 2. Locating extremes:
#      Series.argmin / Series.argmax  return the integer position
#      idxmin / idxmax                return the label (also on DataFrame)
dataFrame_2_describe = dataFrame_2.describe()
print("dataFrame_2_describe:", dataFrame_2_describe)

# --- Cumulative and rolling computations, all along axis 0 by default:
#   .cumsum   running sum (each position adds up everything before it)
#   .cumprod  running product
#   .cummax   running maximum
#   .cummin   running minimum
#   .rolling(w)  moving window of w consecutive elements; positions that do
#                not yet have w elements behind them give NaN
print("滚动计算:", dataFrame_2_describe.rolling(3).sum())