最近在看《利用Python进行数据分析》,练习书中例子
%matplotlib inline
import numpy as np
import matplotlib.pyplot as plt
# A slice of a NumPy array is a view onto the original array: writing through
# the slice also changes the data held by the original.
arr = np.arange(10)
print("arr", arr)
arr_slice = arr[slice(5, 8)]
print("arr_slice", arr_slice)
arr_slice[1] = 100  # lands in arr[6], as the final print of arr shows
print("arr_slice", arr_slice)
print("arr", arr)
arr [0 1 2 3 4 5 6 7 8 9]
arr_slice [5 6 7]
arr_slice [ 5 100 7]
arr [ 0 1 2 3 4 5 100 7 8 9]
# Take an explicit copy so that later writes do not reach the original array.
arr_copy = arr[slice(5, 8)].copy()
arr_copy[1] = 666
print("arr_copy", arr_copy)
print("arr", arr)  # arr is not affected by the assignment to arr_copy
arr_copy [ 5 666 7]
arr [ 0 1 2 3 4 5 100 7 8 9]
# 2-D slicing: rows from index 2 onward, columns from index 2 onward.
arr2d = np.array([
    [1, 2, 3],
    [4, 5, 6],
    [7, 8, 9],
])
arr2d[2:, 2:]
array([[9]])
布尔型索引
# Boolean indexing: a boolean array with one entry per row selects rows of data.
names = np.array(['Bob', 'Joe', 'Will', 'Bob', 'Will', 'Joe', 'Joe'])
data = np.random.randn(7, 4)  # 7 rows, 4 columns
print(data)
names == 'Bob'  # array([ True, False, False, True, False, False, False])
# The mask picks rows 0 and 3.
print("data[names=='Bob']\n", data[names == 'Bob'])
# Same rows, restricted to columns 0 and 1.
print("data[names=='Bob',:2]\n", data[names == 'Bob', :2])
[[ 0.68373329 0.16341463 0.69461828 -0.31262088]
[-0.58232105 0.07963629 0.51687759 -2.37428294]
[ 0.2084486 0.31391127 0.75911201 0.40985095]
[ 1.55252237 -1.13394508 2.21888179 0.85675417]
[-0.35051167 -1.54033925 -1.28263148 0.24368071]
[-1.4963781 0.92840301 -0.13512835 0.7639603 ]
[-1.54994869 0.93570074 -1.46281639 -0.93760228]]
data[names=='Bob']
[[ 0.68373329 0.16341463 0.69461828 -0.31262088]
[ 1.55252237 -1.13394508 2.21888179 0.85675417]]
data[names=='Bob',:2]
[[ 0.68373329 0.16341463]
[ 1.55252237 -1.13394508]]
# Assign through a boolean mask: every negative entry of data is set to 0.
data[data < 0] = 0
data
array([[0.68373329, 0.16341463, 0.69461828, 0. ],
[0. , 0.07963629, 0.51687759, 0. ],
[0.2084486 , 0.31391127, 0.75911201, 0.40985095],
[1.55252237, 0. , 2.21888179, 0.85675417],
[0. , 0. , 0. , 0.24368071],
[0. , 0.92840301, 0. , 0.7639603 ],
[0. , 0.93570074, 0. , 0. ]])
花式索引(利用整数数组进行索引)
# Fancy indexing: index an array with a list/array of integers.
arr = np.empty((8, 4))
for i in range(8):
    # The loop body must be indented; each row is filled with its own row number.
    arr[i] = i
# Rows are returned in the requested order; fancy indexing yields a copy, not a view.
print("arr[[4,5,7,6]]\n", arr[[4, 5, 7, 6]])
# Negative indices count from the end of the array.
print("arr[[-3,-1,-5]]\n", arr[[-3, -1, -5]])
arr[[4,5,7,6]]
[[4. 4. 4. 4.]
[5. 5. 5. 5.]
[7. 7. 7. 7.]
[6. 6. 6. 6.]]
arr[[-3,-1,-5]]
[[5. 5. 5. 5.]
[7. 7. 7. 7.]
[3. 3. 3. 3.]]
arr = np.arange(32).reshape(8, 4)
print("arr\n", arr)
# Two-step selection: take rows 1, 5, 7, 2 first, then reorder the columns of
# that intermediate result as 0, 3, 1, 2.
print("\n", arr[[1, 5, 7, 2]][:, [0, 3, 1, 2]])
# np.ix_ turns two 1-D integer sequences into an indexer that selects the
# rectangular region formed by those rows and columns
# (note: only columns 0, 3, 1 are requested here).
print("使用np.ix_函数\n", arr[np.ix_([1, 5, 7, 2], [0, 3, 1])])
arr
[[ 0 1 2 3]
[ 4 5 6 7]
[ 8 9 10 11]
[12 13 14 15]
[16 17 18 19]
[20 21 22 23]
[24 25 26 27]
[28 29 30 31]]
[[ 4 7 5 6]
[20 23 21 22]
[28 31 29 30]
[ 8 11 9 10]]
使用np.ix_函数
[[ 4 7 5]
[20 23 21]
[28 31 29]
[ 8 11 9]]
数组转置和轴对换(.T,transpose,swapaxes)
arr = np.arange(15).reshape(3, 5)
print("arr\n", arr)
# The .T attribute gives the transposed array.
print("arr.T\n", arr.T)
np.dot(arr.T, arr)  # matrix product of arr.T (5x3) with arr (3x5)
arr
[[ 0 1 2 3 4]
[ 5 6 7 8 9]
[10 11 12 13 14]]
arr.T
[[ 0 5 10]
[ 1 6 11]
[ 2 7 12]
[ 3 8 13]
[ 4 9 14]]
array([[125, 140, 155, 170, 185],
[140, 158, 176, 194, 212],
[155, 176, 197, 218, 239],
[170, 194, 218, 242, 266],
[185, 212, 239, 266, 293]])
# transpose() with an explicit axis order
arr = np.arange(16).reshape(2, 2, 4)
print("arr\n", arr)
print("\narr.transpose((1,0,2))\n", arr.transpose((1, 0, 2)))
# Reading the printed result:
#   4  is at index (0, 1, 0) originally and at (1, 0, 0) after the transpose
#   5  is at index (0, 1, 1) originally and at (1, 0, 1) after the transpose
#   10 is at index (1, 0, 2) originally and at (0, 1, 2) after the transpose
# So the untransposed array corresponds to the axis order (0, 1, 2), and
# arr.transpose((1, 0, 2)) swaps the first two index components of every element.
arr
[[[ 0 1 2 3]
[ 4 5 6 7]]
[[ 8 9 10 11]
[12 13 14 15]]]
arr.transpose((1,0,2))
[[[ 0 1 2 3]
[ 8 9 10 11]]
[[ 4 5 6 7]
[12 13 14 15]]]
# swapaxes: exchange two axes of the array
print("arr\n", arr)
arr.swapaxes(1, 2)
# 4 is at index (0, 1, 0) in arr; swapping axes 1 and 2 moves it to (0, 0, 1).
arr
[[[ 0 1 2 3]
[ 4 5 6 7]]
[[ 8 9 10 11]
[12 13 14 15]]]
array([[[ 0, 4],
[ 1, 5],
[ 2, 6],
[ 3, 7]],
[[ 8, 12],
[ 9, 13],
[10, 14],
[11, 15]]])
利用数组进行数据处理
# Evaluate sqrt(x^2 + y^2) on a regular 2-D grid and display it as an image.
points = np.arange(-5, 5, 0.01)  # 10 * 100 equally spaced points
print('points\n', points[:100])
# meshgrid: each row of xs is a copy of points; each column of ys is a copy of points.
xs, ys = np.meshgrid(points, points)
print('\nxs\n', xs)
print('\nys\n', ys)
z = np.sqrt(np.power(xs, 2) + np.power(ys, 2))
# Equivalent spelling: z = np.sqrt(xs**2 + ys**2)
plt.imshow(z, cmap=plt.cm.winter)  # imshow draws the 2-D array as a colour-mapped image
plt.colorbar()
# Raw string: "\s" is not a valid escape sequence, so a plain literal triggers
# an invalid-escape warning. The resulting title text is unchanged.
plt.title(r"Image plot of $\sqrt{x^2+y^2}$ for a grid of value")
points
[-5. -4.99 -4.98 -4.97 -4.96 -4.95 -4.94 -4.93 -4.92 -4.91 -4.9 -4.89
-4.88 -4.87 -4.86 -4.85 -4.84 -4.83 -4.82 -4.81 -4.8 -4.79 -4.78 -4.77
-4.76 -4.75 -4.74 -4.73 -4.72 -4.71 -4.7 -4.69 -4.68 -4.67 -4.66 -4.65
-4.64 -4.63 -4.62 -4.61 -4.6 -4.59 -4.58 -4.57 -4.56 -4.55 -4.54 -4.53
-4.52 -4.51 -4.5 -4.49 -4.48 -4.47 -4.46 -4.45 -4.44 -4.43 -4.42 -4.41
-4.4 -4.39 -4.38 -4.37 -4.36 -4.35 -4.34 -4.33 -4.32 -4.31 -4.3 -4.29
-4.28 -4.27 -4.26 -4.25 -4.24 -4.23 -4.22 -4.21 -4.2 -4.19 -4.18 -4.17
-4.16 -4.15 -4.14 -4.13 -4.12 -4.11 -4.1 -4.09 -4.08 -4.07 -4.06 -4.05
-4.04 -4.03 -4.02 -4.01]
xs
[[-5. -4.99 -4.98 ... 4.97 4.98 4.99]
[-5. -4.99 -4.98 ... 4.97 4.98 4.99]
[-5. -4.99 -4.98 ... 4.97 4.98 4.99]
...
[-5. -4.99 -4.98 ... 4.97 4.98 4.99]
[-5. -4.99 -4.98 ... 4.97 4.98 4.99]
[-5. -4.99 -4.98 ... 4.97 4.98 4.99]]
ys
[[-5. -5. -5. ... -5. -5. -5. ]
[-4.99 -4.99 -4.99 ... -4.99 -4.99 -4.99]
[-4.98 -4.98 -4.98 ... -4.98 -4.98 -4.98]
...
[ 4.97 4.97 4.97 ... 4.97 4.97 4.97]
[ 4.98 4.98 4.98 ... 4.98 4.98 4.98]
[ 4.99 4.99 4.99 ... 4.99 4.99 4.99]]
Text(0.5,1,'Image plot of $\\sqrt{x^2+y^2}$ for a grid of value')
Matplotlib示例库(Gallery):https://matplotlib.org/gallery/index.html
# cumsum:  cumulative sum
# cumprod: cumulative product
arr = np.arange(9).reshape(3, 3)
print("arr\n", arr)
# axis 0: running sum going down the rows (top to bottom within each column)
print("\narr.cumsum(0)\n", arr.cumsum(0))
# axis 1: running sum going across the columns (left to right within each row)
print("\narr.cumsum(1)\n", arr.cumsum(1))
# axis 0: running product going down the rows
print("\narr.cumprod(0)\n", arr.cumprod(0))
# axis 1: running product going across the columns
print("\narr.cumprod(1)\n", arr.cumprod(1))
arr
[[0 1 2]
[3 4 5]
[6 7 8]]
arr.cumsum(0)
[[ 0 1 2]
[ 3 5 7]
[ 9 12 15]]
arr.cumsum(1)
[[ 0 1 3]
[ 3 7 12]
[ 6 13 21]]
arr.cumprod(0)
[[ 0 1 2]
[ 0 4 10]
[ 0 28 80]]
arr.cumprod(1)
[[ 0 0 0]
[ 3 12 60]
[ 6 42 336]]
线性代数
# Linear algebra: build X^T X from a random 5x5 matrix and invert it.
from numpy.linalg import inv, qr
X = np.random.randn(5, 5)
mat = X.T.dot(X)  # X^T X, a 5x5 matrix
inv(mat)
array([[ 0.34978728, 0.46906327, -0.37457211, 0.09158803, 0.15775448],
[ 0.46906327, 1.52050783, -0.9237394 , 0.38114752, 0.45703812],
[-0.37457211, -0.9237394 , 0.91674887, -0.38206074, -0.36762396],
[ 0.09158803, 0.38114752, -0.38206074, 0.34549788, 0.13592124],
[ 0.15775448, 0.45703812, -0.36762396, 0.13592124, 0.32259262]])
随机数
# Summary (in Chinese) of the np.random functions covered in this section:
# seed, permutation, shuffle, rand, randint, randn, binomial, normal, beta,
# chisquare, gamma and uniform. The literal is left as-is because it is the
# value this cell evaluates to.
""" np.random.seed() 确定随机数生成器的种子 np.random.permutation() 返回一个序列的随机排列或返回一个随机排列的范围 np.random.shuffle() 对一个序列就地随机排序 np.random.rand() 产生均匀分布的样本值 np.random.randint() 从给定的上下限范围内随机选取整数 np.random.randn() 产生正态分布(平均值为0,标准差为1)的样本值 np.random.binomial() 产生二项分布的样本值 np.random.normal() 产生正态(高斯)分布的样本值 np.random.beta() 产生Beta分布的样本值 np.random.chisquare() 产生卡方分布的样本值 np.random.gamma() 产生Gamma分布的样本值 np.random.uniform() 产生在[0,1)中均匀分布的样本值 """
'\nnp.random.rand() 产生均匀分布的样本值\nnp.random.randint() 从给定的上下限范围内随机选取整数\nnp.random.randn() 产生正态分布(平均值为0,标准差为1)的样本值\nnp.random.binomial() 产生二项分布的样本值\nnp.random.normal() 产生正态(高斯)分布的样本值\nnp.random.beta() 产生Beta分布的样本值\nnp.random.chisquare() 产生卡方分布的样本值\nnp.random.gamma() 产生Gamma分布的样本值\nnp.random.uniform() 产生在[0,1)中均匀分布的样本值\n'
# Benchmark: draw N standard-normal samples one at a time with the standard
# library's random.normalvariate inside a list comprehension.
from random import normalvariate
N=1000000
# %timeit is an IPython magic; this line only runs inside IPython/Jupyter.
%timeit samples=[normalvariate(0,1) for _ in range(N)]
996 ms ± 5.86 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)
%timeit np.random.normal(size=N) # by comparison, np.random.normal is faster
32.7 ms ± 432 µs per loop (mean ± std. dev. of 7 runs, 10 loops each)
""" np.random.seed() seed确定随机数生成器的种子 使用同样的seed则得到的随机数相同 不使用seed或使用不同的seed,随机数可能不同 设置的seed值仅一次有效?(还不理解) """
# np.random.seed(n) resets the state of NumPy's global random generator.
# Every draw after that continues one deterministic sequence until seed() is
# called again, so the seed is not "valid only once": the draw printed under
# the "no seed" banner is simply the 4th value of the seed=1 sequence.
# Re-seeding with the same value restarts the sequence, which is why the last
# three printed numbers repeat the first three.
num = 0
np.random.seed(1)
print(np.random.random())
print(np.random.random())
print(np.random.random())

print("---------下面不使用seed---------------")
print(np.random.random())

print("---------下面使用seed=1---------------")
np.random.seed(1)
print(np.random.random())
print(np.random.random())
print(np.random.random())
0.417022004702574
0.7203244934421581
0.00011437481734488664
---------下面不使用seed---------------
0.30233257263183977
---------下面使用seed=1---------------
0.417022004702574
0.7203244934421581
0.00011437481734488664
""" np.random.permutation() 返回一个序列的随机排列或返回一个随机排列的范围 np.random.shuffle() 对一个序列就地随机排序 permutation()和shuffle()的区别: permutation返回将序列随机排列的副本 shuffle无返回值,直接对原序列进行排序 """
# permutation() returns a shuffled copy and leaves its argument untouched;
# shuffle() reorders its argument in place and returns None.
arr1 = [2, 3, 4, 5]
print("arr1\n", arr1)
print("permutation\n", np.random.permutation(arr1))
print("permulation后的arr1\n", arr1)  # still in the original order
print("shuffle\n", np.random.shuffle(arr1))  # prints None: shuffle has no return value
print("shuffle后的arr1\n", arr1)  # arr1 itself has been reordered
# Optional experiment with a 2-D array (NOTE(review): per the NumPy docs,
# permutation shuffles a multi-dimensional array along its first axis only):
# arr=np.arange(8).reshape((2,4))
# np.random.permutation(arr)
arr1
[2, 3, 4, 5]
permutation
[3 4 2 5]
permulation后的arr1
[2, 3, 4, 5]
shuffle
None
shuffle后的arr1
[2, 5, 4, 3]
范例1:随机漫步
""" 纯python方式完成 """
# Random walk implemented with the standard library only: start at 0 and take
# `steps` steps of +1 or -1, recording the position after every step.
import random
position = 0
walk = [position]
steps = 1000
for i in range(steps):
    # The loop body must be indented. randint(0, 1) returns 0 or 1;
    # 1 is truthy and gives a +1 step, 0 gives a -1 step.
    step = 1 if random.randint(0, 1) else -1
    position += step
    walk.append(position)
plt.plot(walk)
plt.title("Random walk with +1/-1 steps")
Text(0.5,1,'Random walk with +1/-1 steps')
""" 使用np.random模块 """
# The same random walk, vectorised: draw all coin flips at once, map them to
# +1/-1 steps and take the running sum to get the position after each step.
import numpy as np
nsteps = 1000
draws = np.random.randint(0, 2, size=nsteps)  # integers from [0, 2), i.e. 0 or 1
steps = np.where(draws > 0, 1, -1)
walk = steps.cumsum()
plt.plot(walk)
plt.title("Random walk with +1/-1 steps")
Text(0.5,1,'Random walk with +1/-1 steps')
# Extremes reached by the walk.
print("walk.min\n", walk.min())
print("walk.max\n", walk.max())
""" 随机漫步多久才能得到距离初始0点10步远距离? """
# How many steps until the walk is first at least 10 away from 0?
# argmax on a boolean array gives the index of the first True.
# NOTE(review): if the walk never gets 10 away, the mask is all False and
# argmax still returns 0 - check (np.abs(walk) >= 10).any() if that matters.
(np.abs(walk) >= 10).argmax()
walk.min
-18
walk.max
26
67
# Small illustration of cumsum on a 5x4 array of +1/-1 steps.
d = np.random.randint(0, 2, size=(5, 4))  # random 0/1 draws
s = np.where(d > 0, 1, -1)  # map 1 -> +1 and 0 -> -1
print(s)
# cumsum(1) accumulates from left to right within each row.
print('累加\n', s.cumsum(1))
# The note below (in Chinese) contrasts cumsum(1) with cumsum(0), which
# accumulates from top to bottom within each column.
""" cumsum(1) 列数据的累加 cumsum(0) 行数据相累加:例如在第0列,第1行=第0行+第1行,第2行=第0行+第1行+第2行 """
[[-1 1 -1 -1]
[-1 1 1 1]
[ 1 1 1 1]
[ 1 -1 1 1]
[-1 1 -1 -1]]
累加
[[-1 0 -1 -2]
[-1 0 1 2]
[ 1 2 3 4]
[ 1 0 1 2]
[-1 0 -1 -2]]