import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.datasets import load_boston
from public_module.tqz_extern.tools.pandas_operator.pandas_operator import pandas
# Chapter 2, exercise 8
def charpter_2_test8():
    """Load the College data, derive categorical columns and plot their counts.

    Reads ``data/College.csv``, appends the ``'Unnamed: 0'`` column to the
    index (the resulting index levels are named ``[None, 'Name']``), adds an
    ``'Elite'`` flag, bins ``Enroll``/``PhD``/``Terminal`` into equal-width
    labelled categories and shows one bar chart of category counts per binned
    column. Returns nothing; the only visible effect is the plot window.
    """
    # Equal-width bin labels per column; the number of bins is the number
    # of labels (3, 5 and 4 respectively).
    bin_labels = {
        'Enroll': ['Low', 'Medium', 'High'],
        'PhD': ['Very low', 'Low', 'Medium', 'High', 'Very High'],
        'Terminal': ['Very low', 'Low', 'High', 'Very High'],
    }

    college = (
        pd.read_csv(r'data/College.csv')
        # verify_integrity raises if the combined index contains duplicates.
        .set_index(['Unnamed: 0'], append=True, verify_integrity=True)
        .rename_axis([None, 'Name'])
    )

    # 'Yes' when more than 50 in Top10perc, otherwise 'No'.
    is_elite = college['Top10perc'] > 50
    college['Elite'] = np.where(is_elite, 'Yes', 'No')

    for column, labels in bin_labels.items():
        college[column] = pd.cut(college[column], bins=len(labels), labels=labels)

    # One bar chart per binned column on a 2x2 grid (last cell stays empty).
    figure = plt.figure()
    for position, column in enumerate(bin_labels, start=1):
        plt.subplot(2, 2, position)
        counts = college[column].value_counts()
        counts.plot.bar(title=column)
    figure.subplots_adjust(hspace=1)
    plt.show()
def charpter_2_test9(path=r'data/Auto.csv'):
    """Summarise the numeric columns of the Auto data set.

    Args:
        path: CSV location (or any file-like object accepted by
            ``pandas.read_csv``). Defaults to ``data/Auto.csv``.

    Returns:
        A DataFrame indexed by numeric column name with the columns
        ``'mean'``, ``'range'`` (max - min) and ``'std'``, computed after
        dropping every row that contains a missing value.
    """
    auto = pd.read_csv(path)

    # Missing horsepower values are written as '?'. Replacing them with NaN
    # alone leaves the column as strings, so describe() would silently skip
    # it; convert to a numeric dtype so horsepower is part of the summary.
    horsepower = auto['horsepower'].replace('?', np.nan)
    auto['horsepower'] = pd.to_numeric(horsepower)

    auto = auto.dropna()

    info = auto.describe().T
    info['range'] = info['max'] - info['min']
    return info[['mean', 'range', 'std']]
def charpter_2_test10():
    """Build the Boston housing frame, its correlations and a sorted view.

    Returns:
        A ``(corr_matrix, ranked)`` tuple: the pairwise correlation matrix of
        all columns (pandas' default method, Pearson) and the rows sorted in
        descending order by ``CRIM``, then ``TAX``, then ``PTRATIO``.
    """
    # NOTE: load_boston was deprecated in scikit-learn 1.0 and removed in
    # 1.2, so this function needs an older scikit-learn to run.
    # Load the bunch once and reuse it instead of calling the loader three times.
    dataset = load_boston()
    boston = pd.DataFrame(dataset.data, columns=dataset.feature_names)
    boston['target'] = dataset.target

    # boston.shape is (rows, columns): shape[0] is the row count and
    # shape[1] the column count.
    corr_matrix = boston.corr()
    ranked = boston.sort_values(by=['CRIM', 'TAX', 'PTRATIO'], ascending=False)
    return corr_matrix, ranked
# Script entry point: intentionally a no-op. Call one of the exercise
# functions defined in this file here to run it.
if __name__ == "__main__":
    pass