整理自慕课网课程http://www.imooc.com/learn/843

Python数据分析包含的包

numpy：数据结构基础

scipy：强大的科学计算方法（矩阵分析、信号分析、数理分析…）

matplotlib：丰富的可视化套件

pandas：基础数据分析套件

scikit-learn：强大的数据分析建模库

keras：人工神经网络

下载 Anaconda 工具集，其中已集成 Python 解释器。

numpy

关键词：开源数据计算扩展

功能：ndarray、多维操作、线性代数

官网：http://www.numpy.org

ndarray

numpy中基本数据结构。

import numpy as np

# ndarray basics: build an array from a nested list and inspect its metadata.
# Every print() below takes a single argument so the snippet behaves the same
# on Python 2 and Python 3.
lst = [[1, 3, 5], [2, 4, 6]]
np_lst = np.array(lst)
print(type(lst))
print(type(np_lst))

# dtype selects the element type explicitly (an integer type here).  The
# builtin int is what the alias np.int referred to before NumPy removed it.
# Other common dtypes: bool, int8/16/32/64, uint8/16/32/64, float16/32/64, ...
np_lst = np.array(lst, dtype=int)
print(np_lst.shape)     # length of each dimension -> (2, 3)
print(np_lst.ndim)      # number of dimensions -> 2
print(np_lst.dtype)     # element type
print(np_lst.itemsize)  # bytes per element
print(np_lst.size)      # total number of elements -> 6
print(np.arange(1, 11).reshape(2, 5))  # 1..10 laid out as 2 rows x 5 columns

array

import numpy as np

# Array constructors and random sampling.  Each value is printed with its own
# single-argument print() call, which works on both Python 2 and Python 3.
print(np.zeros([2, 3]))                  # 2x3 array filled with 0
print(np.ones([2, 3]))                   # 2x3 array filled with 1
print(np.random.rand(2, 3))              # 2x3 array of uniform random samples
print(np.random.rand())                  # a single uniform random sample
print(np.random.randint(1, 10, [2, 3]))  # random integers from 1 up to (not including) 10
print(np.random.randn(2, 3))             # 2x3 samples from the standard normal distribution
print(np.random.randn())                 # a single standard normal sample
print(np.random.choice(['a', 'b', 'c'], [2, 3]))  # random picks from the given values
print(np.random.beta(1, 10, [2, 3]))     # samples from a Beta(1, 10) distribution

random 中可以生成许多种分布的随机值，例如 randn 生成服从正态分布的随机值。

array操作

import numpy as np

# Element-wise math, reductions and stacking on arrays.
# -1 asks reshape to infer that dimension: 10 elements / 2 rows -> 5 columns.
np_lst = np.arange(1, 11).reshape([2, -1])
print(np.exp(np_lst))
print(np.exp2(np_lst))
print(np.sqrt(np_lst))
print(np.sin(np_lst))
print(np.log(np_lst))       # natural logarithm (base e)
# axis ranges from 0 to ndim-1 and selects the dimension that is reduced.
print(np_lst.sum(axis=1))
print(np_lst.max(axis=1))
print(np_lst.min())         # no axis: reduce over every element
# Arithmetic operators act element by element; ** 3 is the cube.
print(np_lst + np_lst)
print(np_lst ** 3)

l1 = np.array([1, 2, 3, 4])
l2 = np.array([4, 3, 2, 1])
m1 = l1.reshape([2, 2])
m2 = l2.reshape([2, 2])
print(m1)
print(m2)
print(np.dot(m1, m2))                    # matrix product
print(np.concatenate((m1, m2), axis=1))  # join along axis 1 (side by side)
print(np.vstack((m1, m2)))               # stack vertically
print(np.hstack((m1, m2)))               # stack horizontally
print(np.split(l1, 4))  # split into 4 equal parts; 4 must divide the array length
print(np.copy(l1))      # independent copy

axis参数表示深入的维数，=0表示最外层，=n-1表示最深层，不加axis表示整体进行操作。以对上面 2×5 的 np_lst 求和（sum）为例：

不加时，所有元素相加。

=0时，两个子数组对应元素相加，操作的是最外层的元素（子数组）

=1时，最内层，两个子数组内部，各自求和。

线性方程组和矩阵运算

import numpy as np
# Import only the routines that are used instead of a star import, so the
# snippet does not flood the namespace with every numpy.linalg name.
from numpy.linalg import det, eig, inv, solve

# Linear algebra on a 2x2 matrix.
print(np.eye(3))  # 3x3 identity matrix
lst = np.array([[1, 2],
                [3, 4]])
print(inv(lst))          # inverse matrix
print(lst.transpose())   # transposed matrix
print(det(lst))          # determinant
print(eig(lst))          # pair: eigenvalues first, eigenvectors second
y = np.array([[5.], [7.]])
print(solve(lst, y))     # solve the linear system lst * x = y for x

其他

import numpy as np

# Miscellaneous helpers: FFT, correlation, polynomials and axis swapping.
print(np.fft.fft(np.array([1, 1, 1, 1, 1, 1])))  # discrete Fourier transform
print(np.corrcoef([[1, 0, 1], [0, 2, 1]]))       # Pearson correlation coefficients
print(np.poly1d([2, 1, 2, 3]))                   # polynomial built from these coefficients

arr = np.arange(15).reshape((3, 5))
print(arr)
print(arr.transpose(1, 0))  # reorder axes: shape (3, 5) -> (5, 3)
print(arr.swapaxes(0, 1))   # swap axis 0 and axis 1 (same result for a 2-D array)

matplotlib

关键词：绘图库

官网：http://matplotlib.org/

线图

#-*-coding:utf-8 -*-
# Line chart of cos(x) and sin(x) on [-pi, pi] with centred axes, a legend,
# shaded regions and one annotated point.
import numpy as np
import matplotlib.pyplot as plt
# x axis: 256 points from -pi to pi; endpoint=True includes the last point
x=np.linspace(-np.pi,np.pi,256,endpoint=True)
c,s=np.cos(x),np.sin(x)  # cosine and sine sampled on x
plt.figure(1)  # select figure 1
# draw cos: colour, line width, line style (-, --, :), legend label, opacity
plt.plot(x,c,color="blue",linewidth=3.0,linestyle=":",label="COS",alpha=0.5)
plt.plot(x,s,color="red",linewidth=2.0,linestyle="--",label="SIN")  # draw sin
plt.title("Title:cos&sin")  # figure title
ax=plt.gca()  # current axes, used below to edit spines and ticks
ax.spines["right"].set_color("none")  # hide the right spine
ax.spines["top"].set_color("none")  # hide the top spine
ax.spines["left"].set_position(("data",0))  # move the left spine to data coordinate 0
ax.spines["bottom"].set_position(("data",0))  # move the bottom spine to data coordinate 0
ax.xaxis.set_ticks_position("bottom")  # which side of each axis carries the ticks
ax.yaxis.set_ticks_position("left")
plt.xticks([-np.pi,-np.pi/2,0,np.pi/2,np.pi],  # x tick positions, then their labels
           [r'$-\pi$',r'$-\pi/2$',r'$0$',r'$+\pi/2$',r'$+\pi$'])
plt.yticks(np.linspace(-1,1,5,endpoint=True))  # y tick positions
for label in ax.get_xticklabels()+ax.get_yticklabels():
    label.set_fontsize(8)  # font size of every tick label
    # tick-label box: background colour, border colour, opacity
    label.set_bbox(dict(facecolor="red",edgecolor="blue",alpha=0.2))
plt.legend(loc="upper left")  # legend position
plt.grid()  # show grid lines
# plt.axis([-1,1,-0.5,1])  # optional display range: x range, then y range

# fill the shape traced by the sine curve (the original notes describe this as
# the area between the function and the axis)
plt.fill(x,s,color="red",alpha=0.2)
plt.fill_between(x,c,s,color="blue",alpha=0.2)  # fill the region between the two curves
# interpolate: when the x samples are far apart, the shaded region between two
# curves can show gaps; interpolate=True is meant to fill those gaps.
# plt.fill_between(x, y1, y2, where= y1 >= y2, facecolor = "blue", interpolate= True)
# plt.fill_between(x, y1, y2, where= y2 > y1, facecolor = "yellow", interpolate= True)

t=1
# dashed vertical segment from the x axis up to the point (t, cos(t))
plt.plot([t,t],[0,np.cos(t)],"y",linewidth=3,linestyle="--")
plt.annotate("cos(1)",xy=(t,np.cos(1)),  # annotation text and the point it refers to
             xycoords="data",xytext=(+10,+30),  # offset of the text from that point
             textcoords="offset points",
             arrowprops=dict(arrowstyle="->",connectionstyle="arc3,rad=.4"))  # arrow style
plt.show()  # display the figure

其他类型图

# Gallery of other chart types, drawn as subplots of one figure.
import numpy as np
import matplotlib.pyplot as plt
from matplotlib import cm
# Axes3D is not referenced by name below; NOTE(review): older Matplotlib
# releases need this import to make projection="3d" available.
from mpl_toolkits.mplot3d import Axes3D

fig = plt.figure()  # the figure that holds every subplot below

# --- scatter plot: cell 1 of a 3x3 grid ---
fig.add_subplot(3, 3, 1)
n = 300
X = np.random.normal(0, 1, n)
Y = np.random.normal(0, 1, n)
T = np.arctan2(Y, X)  # angle of each point; only used as the marker colour value
# plt.axes([0.025, 0.025, 0.95, 0.95])  # optional: explicit axes rectangle
plt.scatter(X, Y, s=75, c=T, alpha=0.5)  # s: marker size, c: colour values
plt.xlim(-1.5, 1.5)  # x range
plt.xticks([])       # hide x ticks
plt.ylim(-1.5, 1.5)
plt.yticks([])
plt.title("scatter")
plt.xlabel("x")
plt.ylabel("y")

# --- bar chart: cell 2 of the 3x3 grid ---
fig.add_subplot(3, 3, 2)
n = 10
X = np.arange(n)
Y1 = (1 - X / float(n)) * np.random.uniform(0.5, 1.0, n)
Y2 = (1 - X / float(n)) * np.random.uniform(0.5, 1.0, n)
plt.bar(X, +Y1, facecolor='#9999ff', edgecolor='white')  # bars pointing up
plt.bar(X, -Y2, facecolor='#ff9999', edgecolor='white')  # mirrored bars pointing down
# Write each bar's value just beyond its tip.
for x, y in zip(X, Y1):
    plt.text(x + 0.4, y + 0.05, '%.2f' % y, ha='center', va='bottom')
for x, y in zip(X, Y2):
    plt.text(x + 0.4, -y - 0.05, '%.2f' % y, ha='center', va='top')

# --- pie chart: cell 3 of the 3x3 grid ---
fig.add_subplot(3, 3, 3)
n = 20
Z = np.ones(n)
Z[-1] *= 2  # the last wedge is twice as large as the others
fractions = [i / float(n) for i in range(n)]
# explode: how far each wedge is pushed out from the centre;
# colours are grey levels given as float strings.
plt.pie(Z, explode=Z * 0.05,
        colors=['%f' % v for v in fractions],
        labels=['%.2f' % v for v in fractions])
plt.gca().set_aspect('equal')  # keep the pie circular
plt.xticks([])
plt.yticks([])

# --- polar plot: cell 4 of the 3x3 grid ---
fig.add_subplot(3, 3, 4, polar=True)  # polar=True selects polar coordinates
n = 20
theta = np.arange(0.0, 2 * np.pi, 2 * np.pi / n)  # n angles from 0 up to 2*pi
radii = 10 * np.random.rand(n)
plt.plot(theta, radii)  # without polar=True this would be an ordinary line chart
# alternative: plt.polar(theta, radii)

# --- heat map: cell 5 of the 3x3 grid ---
fig.add_subplot(3, 3, 5)
data = np.random.rand(3, 3)
cmap = cm.Reds
# Named heat_img rather than "map" so the builtin map() is not shadowed.
heat_img = plt.imshow(data, interpolation='nearest', cmap=cmap,
                      aspect='auto', vmin=0, vmax=1)

# --- 3-D scatter: cell 6 of the 3x3 grid ---
ax = fig.add_subplot(3, 3, 6, projection="3d")
x = np.array(range(0, 10))
y = x * np.random.rand(10)
z = y + np.random.randn(10)
ax.scatter(x, y, z, s=10)

# --- filled contour plot (called "hotmap" in the original notes): row 3 of a 3x1 grid ---
fig.add_subplot(3, 1, 3)


def f(x, y):
    """Return the height of the demo surface at (x, y); works element-wise on arrays."""
    return (1 - x / 2 + x ** 5 + y ** 3) * np.exp(-x ** 2 - y ** 2)


n = 256
x = np.linspace(-3, 3, n)
y = np.linspace(-3, 3, n)
X, Y = np.meshgrid(x, y)
plt.contourf(X, Y, f(X, Y), 8, alpha=.75, cmap=plt.cm.hot)

plt.savefig("./data/fig.jpg")  # save the figure to disk
plt.show()

scipy

关键词：数值计算库

官网：https://www.scipy.org

SciPy被组织成包含不同科学计算域的子包。这些总结如下表:

cluster 聚类算法

constants 物理和数学常数

fftpack 快速傅里叶变换例程

integrate 积分和常微分方程求解器

interpolate 插值和平滑样条

io 输入和输出

linalg 线性代数

ndimage n维图像处理

odr 正交距离回归

optimize 优化和找根程序

signal 信号处理

sparse 稀疏矩阵和相关例程

spatial 空间数据结构和算法

special 特殊函数

stats 统计分布和函数

scipy积分

这里的几个概念需要先复习一下数学知识，暂时也没用到，日后更新……

scipy优化器

scipy插值

scipy线性计算与矩阵分解

scipy学习

pandas

关键词：数据分析库

官网：http://pandas.pydata.org

Series&DataFrame

DataFrame由一列列的Series组成

import numpy as np
import pandas as pd

# Data structures: Series (one labelled column) and DataFrame (a table).
# Each print() takes a single argument so the snippet behaves the same on
# Python 2 and Python 3.
s = pd.Series([i * 2 for i in range(1, 11)])
print(type(s))
print(s)
dates = pd.date_range("2017-03-01", periods=8)
print(dates)  # index of 8 consecutive dates

# 8x5 table of random values, indexed by date, with columns A..E.
df = pd.DataFrame(np.random.randn(8, 5),
                  index=dates, columns=list("ABCDE"))
print(df)
# Alternative: build a DataFrame from a dict of columns.
# df=pd.DataFrame({"A":1,"B":pd.Timestamp("20170907"),
#                  "C":["张三","李四","王五","赵六"]})
# print(df)

Basic&Select&Set

# Basic inspection of the DataFrame `df` (indexed by `dates`) built in the
# previous snippet.
print(df.head(3))  # first three rows
print(df.tail(3))  # last three rows
print(df.index)
print(df.values)   # the data as a 2-D array
print(df.T)        # transpose
print(df.sort_values("C"))  # sort rows by column C
# Sort by index labels; axis selects which index (axis=1: the column labels).
print(df.sort_index(axis=1, ascending=False))
print(df.describe())

# Selection / slicing
print(df["A"])        # column A
print(type(df["A"]))  # a DataFrame is made of Series columns
print(df[:3])         # first three rows
print(df["2017-03-01":"2017-03-01"])  # rows selected by a date-label slice
print(df.loc[dates[0]])  # the row labelled with the first date
print(df.loc["2017-03-01":"2017-03-02", ["B", "E"]])  # label-based sub-table
print(df.at[dates[0], "C"])  # a single value by label
print(df.iloc[1:3, 2:4])     # position-based slice
print(df.iloc[1, 4])
print(df.iat[1, 4])
# Combine both conditions into one mask; chaining df[mask1][mask2] applies the
# second full-length mask to an already filtered frame.
print(df[(df.B > 0) & (df.A < 0)])
print(df[df > 0])  # entries that fail the condition are shown as NaN
print(df[df["E"].isin([0.0, 2.0])])
# The selections above can also be used as assignment targets to modify values.

Missing Data Processing

# Missing values.  Reindexing to a column that does not exist ("G") creates
# it filled with NaN; `df` and `dates` come from the earlier snippet.
df1 = df.reindex(index=dates[:4], columns=list("ABCD") + ["G"])
df1.loc[dates[0]:dates[1], "G"] = 1  # fill G for the first two rows only
print(df1)
# Missing values are usually either dropped or filled, either with a fixed
# value or by interpolation.
print(df1.dropna())         # drop rows that contain a missing value
print(df1.fillna(value=1))  # fill missing values with a fixed value

Merge&Reshape

# Statistics (uses `df` and `dates` from the earlier snippet)
print(df.mean())  # per-column mean; the result is a Series
print(df.var())   # per-column variance
s = pd.Series([1, 2, 4, np.nan, 5, 7, 9, 10], index=dates)
print(s)
print(s.shift(2))        # values moved 2 positions later
print(s.diff())          # difference between consecutive values
print(s.value_counts())  # how many times each value occurs
print(df.apply(np.cumsum))                   # cumulative sum of each column
print(df.apply(lambda x: x.max() - x.min())) # range (max - min) of each column

# Concatenate and merge
print(pd.concat([df[:3], df[-3:]]))  # first three rows followed by last three
left = pd.DataFrame({"Key": ["x", "y"], "value": [1, 2]})
right = pd.DataFrame({"Key": ["x", "z"], "value": [3, 4]})
print(left)
print(right)
print(pd.merge(left, right, on="Key", how="outer"))  # like an SQL outer join
# Group rows by the value in column A and sum each group.  To sort by column A
# instead, use df.sort_values("A").
print(df.groupby("A").sum())

# Reshape
df["C"] = list("12345678")  # replace column C with one label per row
# Pivot table: values from D, rows keyed by (A, B), one column per C label.
print(pd.pivot_table(df, values="D", index=["A", "B"], columns=["C"]))

Time Series&Graph&Files

# Time series
timeTest = pd.date_range("20170907", periods=10, freq="S")  # 10 timestamps, 1 second apart
print(timeTest)

# Graph: cumulative sum of 1000 random daily values.
ts = pd.Series(np.random.randn(1000), index=pd.date_range("20170907", periods=1000))
ts = ts.cumsum()
# Explicit pyplot import instead of `from pylab import *`, which would pull
# every pylab name into this namespace.
import matplotlib.pyplot as plt
ts.plot()
plt.show()

# Files: write `df` (from the earlier snippet) to CSV and Excel, then read back.
df.to_csv("./data/test.csv")
df.to_excel("./data/test.xlsx", sheet_name="Sheet1")  # sheet name passed by keyword
dfcsv = pd.read_csv("./data/test.csv")
dfexc = pd.read_excel("./data/test.xlsx", "Sheet1")

scikit-learn

关键词：数据挖掘、建模、机器学习

官网：http://scikit-learn.org

github

决策树

# Dataset
from sklearn.datasets import load_iris
iris = load_iris()
print(iris)
print(len(iris["data"]))  # number of samples

# Preprocessing: split the data into a training part and a test part
# (20% held out for testing, fixed seed for a repeatable split).
from sklearn.model_selection import train_test_split
train_data, test_data, train_target, test_target = \
    train_test_split(iris.data, iris.target, test_size=0.2, random_state=1)

# Model
from sklearn import tree
clf = tree.DecisionTreeClassifier(criterion="entropy")
clf.fit(train_data, train_target)  # train
y_pred = clf.predict(test_data)    # predict labels for the held-out data

# Verify
from sklearn import metrics
print(metrics.accuracy_score(y_true=test_target, y_pred=y_pred))
print(metrics.confusion_matrix(y_true=test_target, y_pred=y_pred))

# Export the fitted tree in Graphviz .dot format.
with open("./data/tree.dot", "w") as fw:
    tree.export_graphviz(clf, out_file=fw)

支持向量机

# Support vector machine, trained and evaluated on the train/test split
# (train_data, train_target, test_data, test_target) made in the decision-tree
# snippet.
# Banner built with %-formatting so Python 2 and 3 print the same single line.
print("%s %s" % ("=" * 40, "支持向量机"))
from sklearn import metrics  # imported here like the other model snippets do
from sklearn.svm import SVC
clf = SVC(probability=True)
clf.fit(train_data, train_target)
y_pred = clf.predict(test_data)  # predict labels for the held-out data
# Verify
print(metrics.accuracy_score(y_true=test_target, y_pred=y_pred))
print(metrics.confusion_matrix(y_true=test_target, y_pred=y_pred))

朴素贝叶斯

# Gaussian naive Bayes, trained and evaluated on the train/test split made in
# the decision-tree snippet.
# Banner built with %-formatting so Python 2 and 3 print the same single line.
print("%s %s" % ("=" * 40, "朴素贝叶斯"))
from sklearn import metrics
from sklearn.naive_bayes import GaussianNB
model = GaussianNB()
model.fit(train_data, train_target)
print(model)
# make predictions
expected = test_target
predicted = model.predict(test_data)
# summarize the fit of the model
print(metrics.accuracy_score(y_true=expected, y_pred=predicted))
# print(metrics.classification_report(expected, predicted))
print(metrics.confusion_matrix(expected, predicted))

k近邻

# k-nearest neighbours, trained and evaluated on the train/test split made in
# the decision-tree snippet.
# Banner built with %-formatting so Python 2 and 3 print the same single line.
print("%s %s" % ("=" * 40, "k近邻"))
from sklearn import metrics
from sklearn.neighbors import KNeighborsClassifier
# fit a k-nearest neighbor model to the data
model = KNeighborsClassifier()
model.fit(train_data, train_target)
print(model)
# make predictions
expected = test_target
predicted = model.predict(test_data)
# summarize the fit of the model
print(metrics.accuracy_score(y_true=expected, y_pred=predicted))
# print(metrics.classification_report(expected, predicted))
print(metrics.confusion_matrix(expected, predicted))

逻辑回归

# Logistic regression.
# Mostly used for (binary) classification, although multi-class problems can
# be handled too (the so-called one-vs-rest approach).  A strength of the
# algorithm is that every prediction comes with a per-class probability.
# Banner now names this section (it previously repeated the k-NN banner).
print("%s %s" % ("=" * 40, "逻辑回归"))
import numpy as np
from contextlib import closing
try:
    from urllib.request import urlopen  # Python 3
except ImportError:
    from urllib import urlopen  # Python 2
# url with dataset
url = "http://archive.ics.uci.edu/ml/machine-learning-databases/pima-indians-diabetes/pima-indians-diabetes.data"
# download the file and load the CSV content as a numpy matrix; closing()
# releases the connection once the data has been read
with closing(urlopen(url)) as raw_data:
    dataset = np.loadtxt(raw_data, delimiter=",")
# separate the data from the target attributes: the label is column 8, so the
# features are columns 0..7 (a 0:7 slice would silently drop column 7)
X = dataset[:, 0:8]
y = dataset[:, 8]

from sklearn import metrics
from sklearn.linear_model import LogisticRegression
model = LogisticRegression()
model.fit(X, y)
print(model)
# make predictions (on the training data itself; no held-out split here)
expected = y
predicted = model.predict(X)
# summarize the fit of the model
print(metrics.classification_report(expected, predicted))
print(metrics.confusion_matrix(expected, predicted))

keras

关键词：人工神经网络

官网：http://keras.io

两个著名的框架

import numpy as np
from keras.models import Sequential
from keras.layers import Dense, Activation
from keras.optimizers import SGD
# If the imports fail with "ImportError: No module named tensorflow", edit
# keras.json in the user's .keras directory (C:\Users\SUN\.keras on the
# author's machine) - presumably to select an installed backend.

# Dataset
from sklearn.datasets import load_iris
iris = load_iris()
print(iris["target"])
# One-hot encode the class labels
from sklearn.preprocessing import LabelBinarizer
print(LabelBinarizer().fit_transform(iris["target"]))

# Preprocessing: split the data into a training part and a test part.
from sklearn.model_selection import train_test_split
train_data, test_data, train_target, test_target = \
    train_test_split(iris.data, iris.target, test_size=0.2, random_state=1)
# Fit the encoder once on the training labels and reuse it for the test
# labels, so both parts share the same class-to-column mapping.
binarizer = LabelBinarizer()
labels_train = binarizer.fit_transform(train_target)
labels_test = binarizer.transform(test_target)

model = Sequential(
    [
        Dense(5, input_dim=4),  # first layer: 4 input features -> 5 units
        Activation("relu"),     # activation of the first layer
        Dense(3),               # second layer: input size 5 is inferred -> 3 classes
        # NOTE(review): softmax is the usual pairing with
        # categorical_crossentropy; sigmoid is kept to preserve behaviour.
        Activation("sigmoid"),
    ]
)
# The layers can also be added one at a time:
# model=Sequential()
# model.add(Dense(5,input_dim=4))

# NOTE(review): the keyword names lr / nb_epoch and predict_classes() follow
# the Keras release these notes were written against - confirm against the
# installed version.
sgd = SGD(lr=0.01, decay=1e-6, momentum=0.9, nesterov=True)
model.compile(optimizer=sgd, loss="categorical_crossentropy")
model.fit(train_data, labels_train, nb_epoch=200, batch_size=40)

print(model.predict_classes(test_data))

model.save_weights("./data/weight")  # save the trained weights
model.load_weights("./data/weight")  # load previously saved weights

慕课网

Python科学计算——使用Anaconda