​量化交易之python篇 - 统计学习导论python版(第二章习题)

时间:2021-03-10 00:40:47


import pandas as pd
import numpy as np

import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.datasets import load_boston

from public_module.tqz_extern.tools.pandas_operator.pandas_operator import pandas

# Chapter 2, Exercise 8
def charpter_2_test8():
    """Explore the College data set and plot three binned distributions.

    Reads ``data/College.csv``, adds an ``Elite`` flag derived from
    ``Top10perc``, replaces ``Enroll``, ``PhD`` and ``Terminal`` with
    equal-width categorical bins, and shows one bar chart of bin counts
    for each of those three columns.
    """
    raw = pd.read_csv(r'data/College.csv')

    # Keep the default row number as the first index level and append the
    # 'Unnamed: 0' column as a second level, which is then labelled 'Name'.
    college = raw.set_index(
        ['Unnamed: 0'], append=True, verify_integrity=True
    ).rename_axis([None, 'Name'])

    # Flag rows whose Top10perc exceeds 50 as 'Yes', everything else as 'No'.
    is_elite = college['Top10perc'] > 50
    college['Elite'] = np.where(is_elite, 'Yes', 'No')

    # Each column is cut into as many equal-width bins as it has labels.
    bin_labels = {
        'Enroll': ['Low', 'Medium', 'High'],
        'PhD': ['Very low', 'Low', 'Medium', 'High', 'Very High'],
        'Terminal': ['Very low', 'Low', 'High', 'Very High'],
    }
    for column, labels in bin_labels.items():
        college[column] = pd.cut(
            college[column], bins=len(labels), labels=labels
        )

    # One bar chart per binned column, laid out on a 2x2 grid.
    fig = plt.figure()
    for position, column in enumerate(bin_labels, start=1):
        plt.subplot(2, 2, position)
        college[column].value_counts().plot.bar(title=column)

    # Extra vertical spacing so titles do not collide with tick labels.
    fig.subplots_adjust(hspace=1)

    plt.show()


def charpter_2_test9():
    """Summarise the quantitative columns of the Auto data set.

    Reads ``data/Auto.csv``, converts ``horsepower`` to a numeric column
    (non-numeric entries such as ``'?'`` become missing values), drops rows
    with any missing value, and computes per-column statistics.

    Returns:
        pandas.DataFrame: one row per numeric column, with the columns
        ``mean``, ``range`` (max - min) and ``std``.
    """
    auto = pd.read_csv(r'data/Auto.csv')

    # A '?' placeholder makes pandas load horsepower as strings. Replacing
    # it with NaN alone leaves the column as object dtype, which describe()
    # skips, so convert explicitly; unparseable entries become NaN.
    auto['horsepower'] = pd.to_numeric(auto['horsepower'], errors='coerce')
    auto = auto.dropna()

    info = auto.describe().T
    info['range'] = info['max'] - info['min']
    return info[['mean', 'range', 'std']]


def charpter_2_test10():
    """Build the Boston housing frame and compute two exploratory views.

    Loads the data set through ``load_boston``, appends the response as a
    ``target`` column, then computes the correlation matrix and a copy of
    the frame sorted in descending order by ``CRIM``, ``TAX`` and
    ``PTRATIO``.

    NOTE(review): ``load_boston`` was deprecated in scikit-learn 1.0 and
    removed in 1.2, so this needs an older scikit-learn -- confirm the
    pinned version.

    Returns:
        tuple: ``(corr_matrix, ranked)`` -- the correlation matrix of all
        columns and the sorted DataFrame.
    """
    # Load the data set once and reuse the resulting object.
    dataset = load_boston()

    boston = pd.DataFrame(dataset.data, columns=dataset.feature_names)
    boston['target'] = dataset.target

    # Row and column counts are boston.shape[0] and boston.shape[1].
    corr_matrix = boston.corr()  # Pearson correlation is the pandas default
    ranked = boston.sort_values(
        by=['CRIM', 'TAX', 'PTRATIO'], ascending=False
    )
    return corr_matrix, ranked


# Script entry point: intentionally a no-op. Call one of the exercise
# functions from here to run it.
if __name__ == "__main__":
    pass