python数据处理与机器学习

提纲

python数据处理与机器学习

numpy:

#genfromtxt

import numpy as np

#genfromtxtdata=np.genfromtxt("genfromtxtdata")

#print(help(np.genfromtxt))

#matrix-list of list

# A matrix is built from a list of lists.
matrix = np.array([
    [12, 12],
    [12, 12],
    [1, 13],
])
print(matrix)

# Mixed input values are converted to one common data type.
dataa = np.array([1, 2, 4.0, 1])

# Slicing
# Comparison
#datab=dataa
# A comparison returns an array of True/False values.
# Extract the elements equal to a given value.
#booldata=(datab==1)
#print(datab[booldata])

# Pick out the rows whose second column holds a given value.
boolmatrix = matrix[:, 1] == 13
print(matrix[boolmatrix])

# & and | combine boolean masks (and / or).

# Type conversion
dataa.astype(float)

# Extreme values
np.min(dataa)

# Sum along an axis (axis=1: one total per row)
np.sum(matrix, axis=1)

#np.zeros((3,4))->形状以元组格式传入

#np.arange(15).reshape(3,5)

#np.random.random()->先进入random模块，默认范围[0,1)

#np.linspace(0,2*np.pi,100)->均匀取值

#np.exp()

#相减：维度一样对应相减，不一样都减去后一个数

A = np.array([[1, 2], [1, 1]])
B = A.copy()  # a second, independent array holding the same values

# Element-wise product
print(np.multiply(A, B))

# Matrix product, written as a method call ...
print(A.dot(B))
# ... and as a function call
print(np.dot(A, B))

#矩阵操作

#向下取

# Round scaled random values down to whole numbers.
a = np.floor(np.random.random((3, 4)) * 10)
b = np.floor(np.random.random((3, 4)) * 10)

# Flatten the matrix into a vector.
print(a)
print(a.ravel())

# Concatenation
#print(np.hstack((a,b)))
#print(np.vstack((a,b)))

# Splitting
#print(np.hsplit(a,2))
#print(np.vsplit(a,2))

# Copying
# Plain assignment makes no copy: b is just another name for a.
b = a
b.shape = (4, 3)
# Changing b's shape changed a's shape too.
print(a)
# a and b have the same id, i.e. they are the same object.
print(id(a), id(b))

# Shallow copy
# c is a different object from a but shares its values:
# writing through c also changes a.
c = a.view()
c.shape = (2, 6)
c[1, 1] = 11
print(a.shape)
print(a)

# Deep copy
# d no longer has any connection to a.
d = a.copy()

# Indexing
# Position of the maximum along axis 0.
intt = np.argmax(a, axis=0)
print(intt)

#扩展数组

# Repeat the array as a tile on a 2 x 3 grid.
a = np.arange(1, 20, 10)
b = np.tile(a, (2, 3))
print(b)

# Sorting
a = np.array([[1, 2, 3], [3, 2, 1]])
# Indices that would order the values from smallest to largest
# (taken before the in-place sort below).
j = a.argsort()
a.sort(axis=1)
print(j)
print(a)

pandas:

import pandas as pd

import numpy as np

# Record the current working directory. os.getcwd() replaces the `%pwd`
# magic, which only exists inside IPython/Jupyter and is a syntax error in
# plain Python.
import os

current_path = os.getcwd()
print(current_path)

#food_info=pd.read_csv("food_info.csv")

#DataFrame数据类型

#print(type(food_info))

#print(food_info.dtypes)

#food_info.head()

#food_info.tail(4)

#print(food_info.columns)

#print(food_info.shape)

#索引与计算

#print(food_info.loc[0])

#传入一个list->多列

#print(food_info[["NDB_No","Shrt_Desc"]])

#column_list=food_info.columns.tolist()

#print(column_list)

##数据预处理

#food_info.sort_values("NDB_No",inplace=True)

##排序后缺失值会被放到最后

##从小到大排序

#print(food_info["NDB_No"])

##从大到小

#food_info.sort_values("NDB_No",inplace=True,ascending=False)

#print(food_info["NDB_No"])

titanic_train_info=pd.read_csv("titanic_train.csv")
#print(titanic_train_info.head())
#age=titanic_train_info["Age"]
#print(age.loc[0:10])
#age_is_null=pd.isnull(age)
#print(age_is_null)
#age_null_true=age[age_is_null]
#age_null_count=len(age_null_true)
#print(age_null_count)

# Mean age with the missing values excluded
#age_null_false=titanic_train_info["Age"][age_is_null==False]
#average_age=sum(age_null_false)/len(age_null_false)
#average_age1=titanic_train_info["Age"].mean()
#print(average_age,average_age1)

# Pivot tables: index (grouping key) - values (column to summarise) - aggfunc (how)
# Taking the mean is the default aggfunc.
#passager_survival=titanic_train_info.pivot_table(index="Pclass",values="Survived",aggfunc=np.mean)
#print(passager_survival)
#passager_age=titanic_train_info.pivot_table(index="Pclass",values="Age",aggfunc=np.mean)
#print(passager_age)
#port_stats=titanic_train_info.pivot_table(index="Embarked",values=["Fare","Survived"],aggfunc=np.sum)
#print(port_stats)

## Dropping missing values
# This assignment has to be live: every statement below reads
# titanic_train_info1, and with the line commented out they raised NameError.
titanic_train_info1=titanic_train_info
# axis=0 drops rows; only the "Age" and "Sex" columns are checked for gaps.
drop_na_columns=titanic_train_info1.dropna(axis=0,subset=["Age","Sex"])
drop_na_columns.head()

# Look up one specific value: row label 83, column "Age".
row_index_83_age=titanic_train_info1.loc[83,"Age"]
print(row_index_83_age)

#自定义函数

#titanic_train_info1.apply("函数名")

#series结构

import pandas as pd

score_csv = pd.read_csv("fandango_score_comparison.csv")
series_FILM = score_csv["FILM"]
#print(type(series_FILM))
from pandas import Series

# .values exposes the underlying array of a column.
film_names = series_FILM.values
#print(type(film_names))
series_rt = score_csv["RottenTomatoes"]
#print(series_rt)
rt_scores = series_rt.values
print(rt_scores)

# Build a Series that uses the film names as its index.
series_customer = Series(data=rt_scores, index=film_names)
# Look up by label ...
series_customer.loc["Minions (2015)"]
# ... or by position.
series_customer.iloc[5:10]

matplotlib:

#折线图

import pandas as pd

unrate = pd.read_csv("UNRATE.csv")
# Parse the DATE column into datetimes so the .dt accessor can be used below.
unrate["DATE"] = pd.to_datetime(unrate["DATE"])
#print(unrate.head(12))
import matplotlib.pyplot as plt
#first_twelve=unrate[0:100]
#plt.plot(first_twelve["DATE"],first_twelve["VALUE"])
#plt.xticks(rotation=45)
#plt.xlabel("month")
#plt.ylabel("rate")
#plt.title("失业率")
#plt.show()
#fig=plt.figure()
#ax1=fig.add_subplot(4,3,1)
#ax2=fig.add_subplot(4,3,2)
#ax2=fig.add_subplot(4,3,6)
import numpy as np
#fig=plt.figure(figsize=(10,6))
#ax1=fig.add_subplot(2,1,1)
#ax2=fig.add_subplot(2,1,2)
#ax1.plot(np.random.randint(1,5,5),np.arange(5))
#ax2.plot(np.arange(10)*3,np.arange(10))
#plt.show()

unrate["Month"] = unrate["DATE"].dt.month
#fig=plt.figure(figsize=(6,3))
#plt.plot(unrate[0:12]["Month"],unrate[0:12]["VALUE"],c="red")
#plt.plot(unrate[12:24]["Month"],unrate[12:24]["VALUE"],c="blue")

fig = plt.figure(figsize=(10, 5))
colors = ["red", "blue", "green", "orange", "black"]
# One line per colour: each consecutive 12-row slice is drawn against its
# month number and labelled 1948, 1949, ...
for i, color in enumerate(colors):
    start_index = 12 * i
    end_index = start_index + 12
    subset = unrate.iloc[start_index:end_index]
    label = str(1948 + i)
    plt.plot(subset["Month"], subset["VALUE"], c=color, label=label)
plt.legend(loc="best")
plt.show()

#bar

import pandas as pd

reviews = pd.read_csv('fandango_scores.csv')
cols = ['FILM', 'RT_user_norm', 'Metacritic_user_nom', 'IMDB_norm', 'Fandango_Ratingvalue', 'Fandango_Stars']
norm_reviews = reviews[cols]
#print(norm_reviews[:1])
import matplotlib.pyplot as plt
from numpy import arange

# Axes.bar() needs the x coordinates of the bars and their heights.
# (The first parameter was named `left` in old Matplotlib releases and is
# named `x` in current ones; it is passed positionally here.)
num_cols = ['RT_user_norm', 'Metacritic_user_nom', 'IMDB_norm', 'Fandango_Ratingvalue', 'Fandango_Stars']
# DataFrame.ix has been removed from pandas; .loc performs the same
# label-based lookup (row label 0, the listed columns).
bar_heights = norm_reviews.loc[0, num_cols].values
bar_positions = arange(5) + 0.75
tick_positions = range(1, 6)

fig, ax = plt.subplots()
ax.bar(bar_positions, bar_heights, 0.5)
# Horizontal version of the same bars; note that it is drawn on the same
# axes as the vertical bars above.
ax.barh(bar_positions, bar_heights, 0.5)
ax.set_xticks(tick_positions)
ax.set_xticklabels(num_cols, rotation=45)
ax.set_xlabel('Rating Source')
ax.set_ylabel('Average Rating')
ax.set_title('Average User Rating For Avengers: Age of Ultron (2015)')
plt.show()

#散点图

#Let's look at a plot that can help us visualize many points.

#函数返回一个figure图像和一个子图ax的array列表。

fig = plt.figure(figsize=(10, 5))
ax1 = fig.add_subplot(2, 1, 1)
ax2 = fig.add_subplot(2, 1, 2)

fandango = norm_reviews['Fandango_Ratingvalue']
rotten = norm_reviews['RT_user_norm']

# Upper panel: Fandango on x, Rotten Tomatoes on y.
ax1.scatter(fandango, rotten)
ax1.set_xlabel('Fandango')
ax1.set_ylabel('Rotten Tomatoes')

# Lower panel: the same two columns with the axes swapped.
ax2.scatter(rotten, fandango)
ax2.set_xlabel('Rotten Tomatoes')
ax2.set_ylabel('Fandango')

plt.show()

#直方图

import pandas as pd

import matplotlib.pyplot as plt

reviews = pd.read_csv('fandango_scores.csv')
cols = ['FILM', 'RT_user_norm', 'Metacritic_user_nom', 'IMDB_norm', 'Fandango_Ratingvalue']
norm_reviews = reviews[cols]
#print(norm_reviews[:5])

# Count how often each rating occurs, then order the counts by rating
# value (ascending index).
fandango_distribution = norm_reviews['Fandango_Ratingvalue'].value_counts().sort_index()
imdb_distribution = norm_reviews['IMDB_norm'].value_counts().sort_index()
#print(fandango_distribution)
#print(imdb_distribution)

fig, ax = plt.subplots()
#ax.hist(norm_reviews['Fandango_Ratingvalue'])
# bins sets the number of bins, range sets the interval that is binned.
fandango_ratings = norm_reviews['Fandango_Ratingvalue']
ax.hist(fandango_ratings, bins=20)
ax.hist(fandango_ratings, bins=20, range=(4, 5))
ax.set_ylim(0, 20)

# Box plot (quartiles)
num_cols = ['RT_user_norm', 'Metacritic_user_nom', 'IMDB_norm', 'Fandango_Ratingvalue']
fig, ax = plt.subplots()
ax.boxplot(norm_reviews[num_cols].values)
ax.set_xticklabels(num_cols, rotation=90)
ax.set_ylim(0, 5)
plt.show()

#一些细节

import pandas as pd

import matplotlib.pyplot as plt

# Load the table that the plots read through `women_degrees`; these notes
# use the name without ever assigning it.
# NOTE(review): the file name is an assumption (percentage of bachelor's
# degrees awarded to women, one column per major plus 'Year') - confirm
# the path before running.
women_degrees = pd.read_csv('percent-bachelors-degrees-women-usa.csv')

fig, ax = plt.subplots()
# The columns are plotted as percentages for women; 100 minus that is
# drawn as the men's share.
ax.plot(women_degrees['Year'], women_degrees['Biology'], label='Women')
ax.plot(women_degrees['Year'], 100-women_degrees['Biology'], label='Men')
# Hide the tick marks. Booleans are required here: the old "off" strings
# are no longer translated by Matplotlib and, being non-empty strings,
# count as True.
ax.tick_params(bottom=False, top=False, left=False, right=False)
ax.set_title('Percentage of Biology Degrees Awarded By Gender')
ax.legend(loc="upper right")

major_cats = ['Biology', 'Computer Science', 'Engineering', 'Math and Statistics']
fig = plt.figure(figsize=(12, 12))
#for sp in range(0,4):
#    ax = fig.add_subplot(2,2,sp+1)
#    ax.plot(women_degrees['Year'], women_degrees[major_cats[sp]], c='blue', label='Women')
#    ax.plot(women_degrees['Year'], 100-women_degrees[major_cats[sp]], c='green', label='Men')
#    # Add your code here.
#
## Calling pyplot.legend() here will add the legend to the last subplot that was created.
#plt.legend(loc='upper right')
#plt.show()

major_cats = ['Biology', 'Computer Science', 'Engineering', 'Math and Statistics']
fig = plt.figure(figsize=(12, 12))
# One panel per category on a 2 x 2 grid.
for sp, category in enumerate(major_cats):
    ax = fig.add_subplot(2, 2, sp + 1)
    ax.plot(women_degrees['Year'], women_degrees[category], c='blue', label='Women')
    ax.plot(women_degrees['Year'], 100 - women_degrees[category], c='green', label='Men')
    # Remove the frame lines (spines) of the panel.
    for spine in ax.spines.values():
        spine.set_visible(False)
    ax.set_xlim(1968, 2011)
    ax.set_ylim(0, 100)
    ax.set_title(category)
    # Hide the tick marks. Booleans are required: the old "off" strings are
    # no longer translated by Matplotlib and count as True.
    ax.tick_params(bottom=False, top=False, left=False, right=False)

# Calling pyplot.legend() here will add the legend to the last subplot that was created.
plt.legend(loc='upper right')
plt.show()

#Setting Line Width

# Two line colours as RGB tuples with components scaled to the 0-1 range.
cb_dark_blue = (0/255, 107/255, 164/255)
cb_orange = (255/255, 128/255, 14/255)

fig = plt.figure(figsize=(12, 12))
# One panel per category on a 2 x 2 grid.
for sp, category in enumerate(major_cats):
    ax = fig.add_subplot(2, 2, sp + 1)
    # Set the line width when specifying how each line should look.
    ax.plot(women_degrees['Year'], women_degrees[category], c=cb_dark_blue, label='Women', linewidth=10)
    ax.plot(women_degrees['Year'], 100 - women_degrees[category], c=cb_orange, label='Men', linewidth=10)
    # Remove the frame lines (spines) of the panel.
    for spine in ax.spines.values():
        spine.set_visible(False)
    ax.set_xlim(1968, 2011)
    ax.set_ylim(0, 100)
    ax.set_title(category)
    # Hide the tick marks. Booleans are required: the old "off" strings are
    # no longer translated by Matplotlib and count as True.
    ax.tick_params(bottom=False, top=False, left=False, right=False)

plt.legend(loc='upper right')
plt.show()

stem_cats = ['Engineering', 'Computer Science', 'Psychology', 'Biology', 'Physical Sciences', 'Math and Statistics']
fig = plt.figure(figsize=(18, 3))
# One panel per category, laid out in a single row of six.
for sp, category in enumerate(stem_cats):
    ax = fig.add_subplot(1, 6, sp + 1)
    ax.plot(women_degrees['Year'], women_degrees[category], c=cb_dark_blue, label='Women', linewidth=3)
    ax.plot(women_degrees['Year'], 100 - women_degrees[category], c=cb_orange, label='Men', linewidth=3)
    # Remove the frame lines (spines) of the panel.
    for spine in ax.spines.values():
        spine.set_visible(False)
    ax.set_xlim(1968, 2011)
    ax.set_ylim(0, 100)
    ax.set_title(category)
    # Hide the tick marks. Booleans are required: the old "off" strings are
    # no longer translated by Matplotlib and count as True.
    ax.tick_params(bottom=False, top=False, left=False, right=False)

    # Text labels are placed by hand on the first and last panels only,
    # at the data coordinates given.
    if sp == 0:
        ax.text(2005, 87, 'Men')
        ax.text(2002, 8, 'Women')
    elif sp == 5:
        ax.text(2005, 62, 'Men')
        ax.text(2001, 35, 'Women')

plt.show()

seaborn:

#seaborn风格模板

import seaborn as sns

import matplotlib as mpl

import matplotlib.pyplot as plt

import numpy as np

%matplotlib inline

def sinplot(flip=1):
    """Plot six shifted, scaled sine curves with ``plt.plot``.

    Curve ``i`` (for i = 1..6) is ``sin(x + 0.5 * i) * (7 - i) * flip``,
    evaluated at 100 evenly spaced x values from 0 to 14.

    Args:
        flip: Factor applied to every curve; the default 1 leaves the
            curves as they are.
    """
    xs = np.linspace(0, 14, 100)
    for i in range(1, 7):
        phase = i * 0.5
        amplitude = 7 - i
        plt.plot(xs, np.sin(xs + phase) * amplitude * flip)

#sns默认风格（有五种主题风格）

#sns.set()

#sinplot()

#sns.set_style("whitegrid")

#sns.set_style("dark")

#sns.set_style("white")

#sns.set_style("ticks")

#data=np.random.normal(size=(20,6))+np.arange(6)/2

#sns.boxplot(data=data)

#去掉上方和右边的线条

#sns.despine()

#sns.despine(offset=10)

#sns.despine(left=True)

#with内执行的都是当前风格

#with sns.axes_style("darkgrid"):

#    plt.subplot(211)

#    sinplot()

#plt.subplot(212)

#sinplot(-1)

##设置整体布局

sns.set_style("whitegrid")
# Other context names: "poster", "notebook". font_scale and rc adjust the
# chosen context; here rc overrides the line width.
context_overrides = {"lines.linewidth": 4.5}
sns.set_context("paper", font_scale=2.5, rc=context_overrides)
plt.figure(figsize=(8, 6))
sinplot()

# 颜色（离散型与连续型）

>颜色很重要

>color_palette()能传入任何matplotlib所支持的颜色

>color_palette()不写参数则默认颜色

>set_palette()设置所有图的颜色

#分类色板

#默认的绘图颜色

current_palette = sns.color_palette()
sns.palplot(current_palette)

# Eight colours taken from the "hls" colour space.
hls_eight = sns.color_palette("hls", 8)
sns.palplot(hls_eight)

# Apply a palette to plotted data.
fig = plt.figure(figsize=(10, 6))
data = np.random.normal(size=(20, 6)) + np.arange(6) / 2
sns.boxplot(data=data, palette=hls_eight)

# Adjusting a palette. As written, the calls below set lightness (l) and
# hue (h); hls_palette's saturation parameter is s.
#fig=plt.figure(figsize=(10,6))
#sns.palplot(sns.hls_palette(8,l=.2,h=.9))
#sns.boxplot(data=data,palette=sns.hls_palette(8,l=.2,h=.9))

# A palette whose colours come in pairs.
sns.palplot(sns.color_palette("Paired", 8))

使用xkcd来命名颜色

xkcd 通过众包的方式为随机 RGB 颜色命名，得到了 954 个命名颜色，可以随时通过 xkcd_rgb 字典调用

# Three line segments from the origin, rising to 1, 2 and 3, each drawn in
# a colour looked up by name in the xkcd_rgb dictionary.
for rise, colour_name in enumerate(("pale red", "medium green", "denim blue"), start=1):
    plt.plot([0, 1], [0, rise], sns.xkcd_rgb[colour_name], lw=3)

#连续画板

#色彩可以变换，比如用颜色的变化表示值重要性的变化

# Build every palette first, then show them one after another in the same
# order as before.
sequential_palettes = [
    sns.color_palette("Blues"),
    # "_r" variant: runs from dark to light
    sns.color_palette("Blues_r"),
    # Linear palettes
    sns.color_palette("cubehelix", 8),
    sns.cubehelix_palette(8, start=.5, rot=-0.75),
    # Light and dark shades built from one named colour
    sns.light_palette("green"),
    sns.dark_palette("purple"),
]
for palette in sequential_palettes:
    sns.palplot(palette)

# Draw 300 samples from a 2-D normal distribution (zero mean, the given
# covariance) and split the transposed result into x and y.
x, y = np.random.multivariate_normal([0, 0], [[1, -.5], [-.5, 1]], size=300).T
#plt.scatter(x,y)
fig = plt.figure(figsize=(10, 6))
pal = sns.dark_palette("green", as_cmap=True)
# Pass the two data vectors by keyword: current seaborn releases accept
# only `data` positionally, so kdeplot(x, y, ...) is rejected.
# NOTE(review): the x=/y= keywords need seaborn 0.11 or newer - confirm
# the installed version.
sns.kdeplot(x=x, y=y, cmap=pal)

秒客网

python数据处理与机器学习

numpy:

pandas:

matplotlib:

seaborn:

相关文章