python数据处理与机器学习

时间:2021-06-17 08:21:51

提纲

python数据处理与机器学习

numpy:

# NumPy basics walkthrough: array creation, boolean masks, reductions,
# matrix arithmetic, reshaping, aliasing vs. view vs. copy, tiling and sorting.
# Loading data from a text file with genfromtxt
import numpy as np
#genfromtxtdata=np.genfromtxt("genfromtxtdata")
#print(help(np.genfromtxt))
# A matrix is built from a list of lists.
matrix=np.array([[12,12],[12,12],[1,13]])
print(matrix)
# np.array coerces all elements to one common dtype (the 4.0 makes this a float array).
dataa=np.array([1,2,4.0,1])
# Slicing and element-wise comparison
#datab=dataa
# A comparison yields an array of True/False values;
# indexing with that boolean array extracts the elements equal to the value.
#booldata=(datab==1)
#print(datab[booldata])
# Select the rows whose column 1 equals 13.
boolmatrix =(matrix[:,1]==13)
print(matrix[boolmatrix,:])
# Boolean masks are combined with & (and) and | (or).
# Type conversion: astype returns a new array; the result is not stored here.
dataa.astype(float)
# Extreme values
dataa.min()
# Sum along an axis: axis=1 adds up the entries of each row.
matrix.sum(axis=1)
# np.zeros((3,4)) -> the shape is passed as a tuple
# np.arange(15).reshape(3,5) -> 15 elements cannot be reshaped to (3,4)
# np.random.random() -> function inside the np.random module; samples lie in [0.0, 1.0)
# np.linspace(0,2*np.pi,100) -> 100 evenly spaced values
# np.exp()
# Subtraction: arrays of equal shape subtract element by element;
# a scalar operand is subtracted from every element.
A=np.array([[1,2],[1,1]])
B=np.array([[1,2],[1,1]])
print(A*B)#element-wise product
print(A.dot(B))#matrix product
print(np.dot(A,B))
# Matrix manipulation
# Round down (floor)
a=np.floor(10*np.random.random((3,4)))
b=np.floor(10*np.random.random((3,4)))
# Flatten the matrix into a vector
print(a)
print(a.ravel())
# Concatenation
#print(np.hstack((a,b)))
#print(np.vstack((a,b)))
# Splitting (vsplit(a,2) needs an even number of rows; a has 3 here)
#print(np.hsplit(a,2))
#print(np.vsplit(a,2))
# Copying: plain assignment binds a second name to the same array object.
b=a
b.shape=4,3
# Changing b's shape changed a's shape as well.
print(a)
# a and b have the same id: both names refer to the same object.
print(id(a),id(b))
# Shallow copy: c is a different object from a but shares a's data,
# so writing a value through c is visible in a, while a keeps its own shape.
c=a.view()
c.shape=2,6
c[1,1]=11
print(a.shape)
print(a)
# Deep copy: d has its own data and is independent of a.
d=a.copy()
# Index operations
# Position of the maximum: axis=0 gives, per column, the row index of the max.
intt=a.argmax(axis=0)
print(intt)
# Tiling: repeat the array 2 times vertically and 3 times horizontally.
a=np.arange(1,20,10)
b=np.tile(a,(2,3))
print(b)
# Sorting
a=np.array([[1,2,3],[3,2,1]])
# argsort returns the indices that would sort each row in ascending order
# (computed here before a itself is sorted in place).
j=np.argsort(a)
a.sort(axis=1)
print(j)
print(a)

pandas:

# pandas basics: inspecting a DataFrame, sorting, missing values, pivot tables.
import os

import pandas as pd
import numpy as np

# os.getcwd() replaces the IPython-only `%pwd` magic so this snippet also runs
# as a plain Python script; both yield the current working directory.
current_path = os.getcwd()
print(current_path)
#food_info=pd.read_csv("food_info.csv")
# read_csv returns a DataFrame
#print(type(food_info))
#print(food_info.dtypes)
#food_info.head()
#food_info.tail(4)
#print(food_info.columns)
#print(food_info.shape)

# Indexing and computation
#print(food_info.loc[0])
# Pass a list of names to select several columns
#print(food_info[["NDB_No","Shrt_Desc"]])
#column_list=food_info.columns.tolist()
#print(column_list)

# Data preprocessing
#food_info.sort_values("NDB_No",inplace=True)
# After sorting, missing values are placed last
# Ascending order
#print(food_info["NDB_No"])
# Descending order
#food_info.sort_values("NDB_No",inplace=True,ascending=False)
#print(food_info["NDB_No"])

# This load had been fused onto the end of a comment line, which left
# titanic_train_info undefined for everything below.
titanic_train_info = pd.read_csv("titanic_train.csv")
#print(titanic_train_info.head())
#age=titanic_train_info["Age"]
#print(age.loc[0:10])
#age_is_null=pd.isnull(age)
#print(age_is_null)
#age_null_true=age[age_is_null]
#age_null_count=len(age_null_true)
#print(age_null_count)
# Mean age with the missing values removed
#age_null_false=titanic_train_info["Age"][age_is_null==False]
#average_age=sum(age_null_false)/len(age_null_false)
#average_age1=titanic_train_info["Age"].mean()
#print(average_age,average_age1)

# Pivot tables: index = grouping key, values = column to aggregate,
# aggfunc = aggregation (the mean is pivot_table's default)
#passager_survival=titanic_train_info.pivot_table(index="Pclass",values="Survived",aggfunc=np.mean)
#print(passager_survival)
#passager_age=titanic_train_info.pivot_table(index="Pclass",values="Age",aggfunc=np.mean)
#print(passager_age)
#port_stats=titanic_train_info.pivot_table(index="Embarked",values=["Fare","Survived"],aggfunc=np.sum)
#print(port_stats)

# Dropping missing values
# This alias was commented out although the following lines use it.
titanic_train_info1 = titanic_train_info
# axis=0 drops rows: every row with a missing "Age" or "Sex" is removed.
drop_na_columns = titanic_train_info1.dropna(axis=0, subset=["Age", "Sex"])
drop_na_columns.head()

# Look up a single value by row label and column name
row_index_83_age = titanic_train_info1.loc[83, "Age"]
print(row_index_83_age)

# Custom functions: pass the function to apply
#titanic_train_info1.apply(function_name)
# Series: the one-dimensional labelled structure behind each DataFrame column.
import pandas as pd
# This import had been fused onto the end of a comment line, so Series was
# undefined when the custom-indexed Series below was built.
from pandas import Series

score_csv = pd.read_csv("fandango_score_comparison.csv")
series_FILM = score_csv["FILM"]
#print(type(series_FILM))
film_names = series_FILM.values
#print(type(film_names))
series_rt = score_csv["RottenTomatoes"]
#print(series_rt)
rt_scores = series_rt.values
print(rt_scores)
# Use the film names as the index
series_customer = Series(rt_scores, index=film_names)
# Label-based lookup and positional slicing both work on the new Series.
series_customer["Minions (2015)"]
series_customer[5:10]

matplotlib:

# Line charts
import pandas as pd
# This import had been fused onto the end of a comment line, leaving plt undefined.
import matplotlib.pyplot as plt

unrate = pd.read_csv("UNRATE.csv")
unrate["DATE"] = pd.to_datetime(unrate["DATE"])
#print(unrate.head(12))

#first_twelve=unrate[0:100]
#plt.plot(first_twelve["DATE"],first_twelve["VALUE"])
#plt.xticks(rotation=45)
#plt.xlabel("month")
#plt.ylabel("rate")
#plt.title("失业率")
#plt.show()

# Subplot grids
#fig=plt.figure()
#ax1=fig.add_subplot(4,3,1)
#ax2=fig.add_subplot(4,3,2)
#ax2=fig.add_subplot(4,3,6)

#import numpy as np
#fig=plt.figure(figsize=(10,6))
#ax1=fig.add_subplot(2,1,1)
#ax2=fig.add_subplot(2,1,2)
#ax1.plot(np.random.randint(1,5,5),np.arange(5))
#ax2.plot(np.arange(10)*3,np.arange(10))
#plt.show()

# The "Month" column is read by the loop below; this assignment had been
# fused onto the end of a comment line and therefore never ran.
unrate["Month"] = unrate["DATE"].dt.month
#fig=plt.figure(figsize=(6,3))
#plt.plot(unrate[0:12]["Month"],unrate[0:12]["VALUE"],c="red")
#plt.plot(unrate[12:24]["Month"],unrate[12:24]["VALUE"],c="blue")

# Draw five consecutive 12-row slices on one figure, one colour per slice,
# labelled 1948, 1949, ... in the legend.
fig = plt.figure(figsize=(10, 5))
colors = ["red", "blue", "green", "orange", "black"]
for i in range(5):
    start_index = i * 12
    end_index = (i + 1) * 12
    subset = unrate[start_index:end_index]
    label = str(1948 + i)
    plt.plot(subset["Month"], subset["VALUE"], c=colors[i], label=label)
plt.legend(loc="best")
plt.show()

# Bar charts
import pandas as pd
# This import had been fused onto the end of a comment line.
import matplotlib.pyplot as plt
from numpy import arange

reviews = pd.read_csv('fandango_scores.csv')
cols = ['FILM', 'RT_user_norm', 'Metacritic_user_nom', 'IMDB_norm', 'Fandango_Ratingvalue', 'Fandango_Stars']
norm_reviews = reviews[cols]
#print(norm_reviews[:1])

# Axes.bar() takes the x coordinates of the bars and their heights as its
# first two arguments (the first was named `left` in old matplotlib releases);
# the third positional argument is the bar width.
num_cols = ['RT_user_norm', 'Metacritic_user_nom', 'IMDB_norm', 'Fandango_Ratingvalue', 'Fandango_Stars']
# DataFrame.ix was removed in pandas 1.0. Row 0 of a freshly read CSV has
# label 0, so the label-based .loc selects the same row.
bar_heights = norm_reviews.loc[0, num_cols].values
bar_positions = arange(5) + 0.75
tick_positions = range(1, 6)
fig, ax = plt.subplots()
ax.bar(bar_positions, bar_heights, 0.5)
# Horizontal bars
# NOTE(review): this draws onto the same axes as the vertical bars above, and
# the tick/label calls below match the vertical chart. These look like two
# alternative demos pasted together; confirm which one is wanted.
ax.barh(bar_positions, bar_heights, 0.5)
ax.set_xticks(tick_positions)
ax.set_xticklabels(num_cols, rotation=45)
ax.set_xlabel('Rating Source')
ax.set_ylabel('Average Rating')
ax.set_title('Average User Rating For Avengers: Age of Ultron (2015)')
plt.show()

# Scatter plots
# Scatter plots show many individual points at once.
# The figure and its two stacked subplots are created explicitly here
# (plt.subplots() would instead return the figure together with its axes).
fig = plt.figure(figsize=(10, 5))
ax1, ax2 = (fig.add_subplot(2, 1, position) for position in (1, 2))
# Each entry: axes, x column, y column, x label, y label. The second panel is
# the first one with its axes swapped.
panels = (
    (ax1, 'Fandango_Ratingvalue', 'RT_user_norm', 'Fandango', 'Rotten Tomatoes'),
    (ax2, 'RT_user_norm', 'Fandango_Ratingvalue', 'Rotten Tomatoes', 'Fandango'),
)
for axis, x_col, y_col, x_label, y_label in panels:
    axis.scatter(norm_reviews[x_col], norm_reviews[y_col])
    axis.set_xlabel(x_label)
    axis.set_ylabel(y_label)
plt.show()
# Histograms
import pandas as pd
import matplotlib.pyplot as plt

reviews = pd.read_csv('fandango_scores.csv')
cols = ['FILM', 'RT_user_norm', 'Metacritic_user_nom', 'IMDB_norm', 'Fandango_Ratingvalue']
norm_reviews = reviews[cols]
#print(norm_reviews[:5])

# Count how often each rating occurs, then order the counts by rating value.
fandango_distribution = norm_reviews['Fandango_Ratingvalue'].value_counts().sort_index()
imdb_distribution = norm_reviews['IMDB_norm'].value_counts().sort_index()
#print(fandango_distribution)
#print(imdb_distribution)

fig, ax = plt.subplots()
fandango_ratings = norm_reviews['Fandango_Ratingvalue']
#ax.hist(fandango_ratings)
# bins sets the number of bins, range restricts the interval that is binned
ax.hist(fandango_ratings, bins=20)
ax.hist(fandango_ratings, bins=20, range=(4, 5))
ax.set_ylim(0, 20)

# Box plot, one box per rating source
num_cols = ['RT_user_norm', 'Metacritic_user_nom', 'IMDB_norm', 'Fandango_Ratingvalue']
fig, ax = plt.subplots()
box_values = norm_reviews[num_cols].values
ax.boxplot(box_values)
ax.set_xticklabels(num_cols, rotation=90)
ax.set_ylim(0, 5)
plt.show()
# Finishing touches: tick marks, spines, titles and legends
import pandas as pd
import matplotlib.pyplot as plt

# NOTE(review): the visible source never loads `women_degrees`, so every line
# below raised NameError. The file name here is an assumption - TODO confirm
# the actual data file (it must provide a 'Year' column plus one percentage
# column per degree subject).
women_degrees = pd.read_csv('percent-bachelors-degrees-women-usa.csv')

fig, ax = plt.subplots()
ax.plot(women_degrees['Year'], women_degrees['Biology'], label='Women')
ax.plot(women_degrees['Year'], 100 - women_degrees['Biology'], label='Men')
# Hide the tick marks. tick_params expects booleans; the string "off" was
# only accepted by old matplotlib releases.
ax.tick_params(bottom=False, top=False, left=False, right=False)
ax.set_title('Percentage of Biology Degrees Awarded By Gender')
ax.legend(loc="upper right")

# First version of the 2x2 grid, kept disabled. Its list and figure setup are
# disabled too, so no stray empty figure is created.
#major_cats = ['Biology', 'Computer Science', 'Engineering', 'Math and Statistics']
#fig = plt.figure(figsize=(12, 12))
#for sp in range(0,4):
#    ax = fig.add_subplot(2,2,sp+1)
#    ax.plot(women_degrees['Year'], women_degrees[major_cats[sp]], c='blue', label='Women')
#    ax.plot(women_degrees['Year'], 100-women_degrees[major_cats[sp]], c='green', label='Men')
#    # Add your code here.
#
## Calling pyplot.legend() here will add the legend to the last subplot that was created.
#plt.legend(loc='upper right')
#plt.show()

# 2x2 grid, one subplot per subject, with spines and tick marks removed.
major_cats = ['Biology', 'Computer Science', 'Engineering', 'Math and Statistics']
fig = plt.figure(figsize=(12, 12))
for sp in range(0, 4):
    ax = fig.add_subplot(2, 2, sp + 1)
    ax.plot(women_degrees['Year'], women_degrees[major_cats[sp]], c='blue', label='Women')
    ax.plot(women_degrees['Year'], 100 - women_degrees[major_cats[sp]], c='green', label='Men')
    for key, spine in ax.spines.items():
        spine.set_visible(False)
    ax.set_xlim(1968, 2011)
    ax.set_ylim(0, 100)
    ax.set_title(major_cats[sp])
    ax.tick_params(bottom=False, top=False, left=False, right=False)
# Calling pyplot.legend() here will add the legend to the last subplot that was created.
plt.legend(loc='upper right')
plt.show()

# Setting Line Width
# Custom line colours as RGB tuples scaled to the 0-1 range.
# This section uses `plt`, `women_degrees` and `major_cats`, which must
# already be defined by the time it runs.
cb_dark_blue = (0/255, 107/255, 164/255)
cb_orange = (255/255, 128/255, 14/255)

fig = plt.figure(figsize=(12, 12))
for sp in range(0, 4):
    ax = fig.add_subplot(2, 2, sp + 1)
    # Set the line width when specifying how each line should look.
    ax.plot(women_degrees['Year'], women_degrees[major_cats[sp]], c=cb_dark_blue, label='Women', linewidth=10)
    ax.plot(women_degrees['Year'], 100 - women_degrees[major_cats[sp]], c=cb_orange, label='Men', linewidth=10)
    for key, spine in ax.spines.items():
        spine.set_visible(False)
    ax.set_xlim(1968, 2011)
    ax.set_ylim(0, 100)
    ax.set_title(major_cats[sp])
    # tick_params expects booleans; the string "off" was only accepted by
    # old matplotlib releases.
    ax.tick_params(bottom=False, top=False, left=False, right=False)
plt.legend(loc='upper right')
plt.show()

# One row of six subplots; instead of a legend, the first and last subplot
# carry 'Men' / 'Women' text annotations.
stem_cats = ['Engineering', 'Computer Science', 'Psychology', 'Biology', 'Physical Sciences', 'Math and Statistics']
fig = plt.figure(figsize=(18, 3))
for sp in range(0, 6):
    ax = fig.add_subplot(1, 6, sp + 1)
    ax.plot(women_degrees['Year'], women_degrees[stem_cats[sp]], c=cb_dark_blue, label='Women', linewidth=3)
    ax.plot(women_degrees['Year'], 100 - women_degrees[stem_cats[sp]], c=cb_orange, label='Men', linewidth=3)
    for key, spine in ax.spines.items():
        spine.set_visible(False)
    ax.set_xlim(1968, 2011)
    ax.set_ylim(0, 100)
    ax.set_title(stem_cats[sp])
    ax.tick_params(bottom=False, top=False, left=False, right=False)
    if sp == 0:
        ax.text(2005, 87, 'Men')
        ax.text(2002, 8, 'Women')
    elif sp == 5:
        ax.text(2005, 62, 'Men')
        ax.text(2001, 35, 'Women')
plt.show()

seaborn:

#seaborn风格模板
import seaborn as sns
import matplotlib as mpl
import matplotlib.pyplot as plt
import numpy as np
%matplotlib inline
def sinplot(flip=1):
    """Draw six phase-shifted sine curves on the current matplotlib axes.

    Curve ``i`` (1..6) is ``sin(x + 0.5 * i) * (7 - i) * flip`` sampled at
    100 points on [0, 14], so the amplitude shrinks from 6 down to 1.

    Args:
        flip: Factor applied to every curve; pass -1 to mirror the plot
            vertically.
    """
    x = np.linspace(0, 14, 100)
    for i in range(1, 7):
        plt.plot(x, np.sin(x + i * 0.5) * (7 - i) * flip)
# seaborn's default style (there are five built-in themes:
# darkgrid, whitegrid, dark, white, ticks)
#sns.set()
#sinplot()
#sns.set_style("whitegrid")
#sns.set_style("dark")
#sns.set_style("white")
#sns.set_style("ticks")
#data=np.random.normal(size=(20,6))+np.arange(6)/2
#sns.boxplot(data=data)
# Remove the top and right axis lines
#sns.despine()
#sns.despine(offset=10)
#sns.despine(left=True)
# Everything drawn inside the with-block uses that style only
#with sns.axes_style("darkgrid"):
#    plt.subplot(211)
#    sinplot()
#plt.subplot(212)
#sinplot(-1)
# Overall layout: pick the style, then the plotting context
# (font scale and line width), then draw.
sns.set_style("whitegrid")
sns.set_context("paper",font_scale=2.5,rc=({"lines.linewidth":4.5}))#other contexts: poster/notebook
plt.figure(figsize=(8,6))
sinplot()
# Colours (discrete and continuous palettes)
# - Colour matters.
# - color_palette() accepts any colour that matplotlib supports.
# - color_palette() without arguments returns the default colours.
# - set_palette() sets the colours for all plots.

# Categorical palettes
# The default plotting colours
current_palette = sns.color_palette()
sns.palplot(current_palette)
# "hls" colour space, here with 8 colours
sns.palplot(sns.color_palette("hls", 8))
# Apply a palette to data
fig = plt.figure(figsize=(10, 6))
data = np.random.normal(size=(20, 6)) + np.arange(6) / 2
sns.boxplot(data=data, palette=sns.color_palette("hls", 8))
# Adjust the lightness (l) and saturation (s) of the palette
# (the disabled calls originally passed h=, which is the hue, not the saturation)
#fig=plt.figure(figsize=(10,6))
#sns.palplot(sns.hls_palette(8,l=.2,s=.9))
#sns.boxplot(data=data,palette=sns.hls_palette(8,l=.2,s=.9))

# Colours that come in pairs
sns.palplot(sns.color_palette("Paired", 8))

# Naming colours with xkcd
# xkcd ran a crowd-sourced effort to name random RGB colours; it produced
# 954 named colours, available at any time through the xkcd_rgb dictionary.
plt.plot([0, 1], [0, 1], sns.xkcd_rgb["pale red"], lw=3)
plt.plot([0, 1], [0, 2], sns.xkcd_rgb["medium green"], lw=3)
plt.plot([0, 1], [0, 3], sns.xkcd_rgb["denim blue"], lw=3)
# Sequential palettes
# The colour changes gradually, e.g. to show how important a value is.
sns.palplot(sns.color_palette("Blues"))
# "_r" reverses the palette: dark to light
sns.palplot(sns.color_palette("Blues_r"))
# cubehelix palettes
sns.palplot(sns.color_palette("cubehelix", 8))
sns.palplot(sns.cubehelix_palette(8, start=.5, rot=-0.75))
# Light and dark ramps built from one named colour
sns.palplot(sns.light_palette("green"))
sns.palplot(sns.dark_palette("purple"))
# 300 samples from a 2-D normal distribution with negative correlation;
# .T turns the (300, 2) sample array into separate x and y vectors.
x, y = np.random.multivariate_normal([0, 0], [[1, -.5], [-.5, 1]], size=300).T
#plt.scatter(x,y)
fig = plt.figure(figsize=(10, 6))
pal = sns.dark_palette("green", as_cmap=True)
# Pass the vectors by keyword: current seaborn no longer accepts x and y
# as positional arguments to kdeplot.
sns.kdeplot(x=x, y=y, cmap=pal)