pandas: Data Aggregation and Group Operations

Date: 2022-11-02 21:49:57
In [1]:
import pandas as pd
import numpy as np
from pandas import DataFrame, Series

df = DataFrame({'key1': ['a', 'a', 'b', 'b', 'a'],
                'key2': ['one', 'two', 'one', 'two', 'one'],
                'data1': np.random.randn(5),
                'data2': np.random.randn(5)})
df
Out[1]:
  data1 data2 key1 key2
0 1.711100 1.265615 a one
1 -0.937672 0.505905 a two
2 0.332802 0.544956 b one
3 -0.888079 0.024554 b two
4 0.542374 -0.455125 a one
In [4]:
grouped=df['data1'].groupby(df['key1'])
print(grouped)   # the GroupBy object
grouped.mean()
<pandas.core.groupby.SeriesGroupBy object at 0x00000223D6390208>
Out[4]:
key1
a 0.249353
b -0.092150
Name: data1, dtype: float64
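As a side note, the same result can be obtained by grouping the whole DataFrame first and then selecting the column; a small equivalent sketch (not part of the original run):

df.groupby('key1')['data1'].mean()   # equivalent to df['data1'].groupby(df['key1']).mean()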
In [6]:
means=df['data1'].groupby([df['key1'],df['key2']]).mean()
means
Out[6]:
key1  key2
a one 0.507231
two -0.266402
b one -1.247499
two 1.063199
Name: data1, dtype: float64
In [7]:
means.unstack()    # hierarchical index!
Out[7]:
key2 one two
key1    
a 0.507231 -0.266402
b -1.247499 1.063199
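unstack() and stack() are inverses, so the wide table above can be turned back into the hierarchically indexed Series; a quick check (not run in the original):

means.unstack().stack()   # recovers the (key1, key2)-indexed Series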
In [8]:
states=np.array(['Ohio','California','California','Ohio','Ohio'])
years=np.array([2005,2005,2006,2005,2006])
df['data1'].groupby([states,years]).mean()
Out[8]:
California  2005   -0.266402
2006 -1.247499
Ohio 2005 0.607577
2006 0.862507
Name: data1, dtype: float64
In [9]:
df
Out[9]:
  data1 data2 key1 key2
0 0.151954 -0.834709 a one
1 -0.266402 0.469696 a two
2 -1.247499 1.402765 b one
3 1.063199 0.650468 b two
4 0.862507 -1.519559 a one
In [14]:
print(df.groupby('key1').mean())
print(df.groupby(['key1','key2']).mean())
df.groupby(['key1','key2']).size() # size(): number of rows in each group
         data1     data2
key1
a 0.249353 -0.628191
b -0.092150 1.026616
data1 data2
key1 key2
a one 0.507231 -1.177134
two -0.266402 0.469696
b one -1.247499 1.402765
two 1.063199 0.650468
Out[14]:
key1  key2
a one 2
two 1
b one 1
two 1
dtype: int64
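A note on size() vs count(): size() reports the number of rows per group, NaN included, while count() reports the non-NaN values per column. A minimal sketch with a hypothetical NaN inserted for illustration (not part of the original run):

df2 = df.copy()
df2.loc[0, 'data1'] = np.nan          # hypothetical missing value
print(df2.groupby('key1').size())     # row counts: the NaN row is still counted
print(df2.groupby('key1').count())    # per-column non-NaN counts: group 'a' loses one in data1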
In [16]:
for name, group in df.groupby('key1'):
    print(name)
    print(group)   # inspect each group's contents
a
data1 data2 key1 key2
0 0.151954 -0.834709 a one
1 -0.266402 0.469696 a two
4 0.862507 -1.519559 a one
b
data1 data2 key1 key2
2 -1.247499 1.402765 b one
3 1.063199 0.650468 b two
In [18]:
for (k1, k2), group in df.groupby(['key1', 'key2']):    # grouping by two keys
    print(k1, k2)
    print(group)
a one
data1 data2 key1 key2
0 0.151954 -0.834709 a one
4 0.862507 -1.519559 a one
a two
data1 data2 key1 key2
1 -0.266402 0.469696 a two
b one
data1 data2 key1 key2
2 -1.247499 1.402765 b one
b two
data1 data2 key1 key2
3 1.063199 0.650468 b two
In [20]:
pieces=dict(list(df.groupby('key1')))    # the groups can also be kept as a dict
print(pieces['a'])
print(pieces['b'])
      data1     data2 key1 key2
0 0.151954 -0.834709 a one
1 -0.266402 0.469696 a two
4 0.862507 -1.519559 a one
data1 data2 key1 key2
2 -1.247499 1.402765 b one
3 1.063199 0.650468 b two
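The same dict-of-groups trick works along the columns, for example grouping by dtype; a sketch (not in the original run; note that axis=1 grouping is deprecated in recent pandas versions):

grouped = df.groupby(df.dtypes, axis=1)   # one group of float columns, one of object columns
dict(list(grouped))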
In [21]:
df.groupby(['key1','key2'])[['data2']].mean()  # look at data2 only
Out[21]:
    data2
key1 key2  
a one -1.177134
two 0.469696
b one 1.402765
two 0.650468
In [22]:
people = DataFrame(np.random.randn(5, 5),
                   columns=['a', 'b', 'c', 'd', 'e'],
                   index=['Joe', 'Steve', 'Wes', 'Jim', 'Travis'])
people.loc['Wes', ['b', 'c']] = np.nan   # .ix is deprecated; label-based .loc does the same thing
people
Out[22]:
  a b c d e
Joe 0.523195 0.625608 -1.833872 0.102788 -0.080301
Steve -1.789058 1.342658 1.592765 -0.452598 -0.651140
Wes 0.217548 NaN NaN 2.802176 -1.738924
Jim -0.655526 -1.468537 -1.281012 0.162456 -0.558902
Travis 0.051424 -1.124781 1.375842 0.562162 -0.032887
In [24]:
mapping={'a':'red','b':'red','c':'blue','d':'blue','e':'red','f':'orange'}
by_column=people.groupby(mapping,axis=1)
by_column
by_column.sum()
Out[24]:
  blue red
Joe -1.731084 1.068502
Steve 1.140167 -1.097540
Wes 2.802176 -1.521376
Jim -1.118556 -2.682965
Travis 1.938004 -1.106244
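The mapping does not have to be a dict; a Series works the same way, with its values used as the group labels. A small sketch (not part of the original run):

map_series = Series(mapping)
people.groupby(map_series, axis=1).count()   # non-NaN counts per color group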
In [25]:
# grouping via a function (see the sketch after the output below)
people
Out[25]:
  a b c d e
Joe 0.523195 0.625608 -1.833872 0.102788 -0.080301
Steve -1.789058 1.342658 1.592765 -0.452598 -0.651140
Wes 0.217548 NaN NaN 2.802176 -1.738924
Jim -0.655526 -1.468537 -1.281012 0.162456 -0.558902
Travis 0.051424 -1.124781 1.375842 0.562162 -0.032887
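The cell above only displays the frame again; the promised function-based grouping can be sketched as follows (not in the original run). Any function passed as the grouping key is called once per index value, so grouping by len buckets people by the length of their names:

people.groupby(len).sum()                 # Joe/Wes/Jim -> 3, Steve -> 5, Travis -> 6
key_list = ['one', 'one', 'one', 'two', 'two']
people.groupby([len, key_list]).min()     # functions can be mixed with arrays, dicts or Series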
In [28]:
# grouping by index level
columns = pd.MultiIndex.from_arrays([['US', 'US', 'US', 'JP', 'JP'],
                                     [1, 3, 5, 1, 3]], names=['cty', 'tenor'])
hier_df=DataFrame(np.random.randn(4,5),columns=columns)
print(hier_df)
hier_df.groupby(level='cty',axis=1).count()
cty          US                            JP          
tenor 1 3 5 1 3
0 0.005531 0.565845 -2.146574 -1.020747 2.167246
1 -0.391606 -0.278086 1.004674 -2.228639 0.112917
2 2.564529 -0.490555 -0.029702 0.717107 0.071027
3 -0.639489 0.283407 -0.848292 1.316535 0.578851
Out[28]:
cty JP US
0 2 3
1 2 3
2 2 3
3 2 3
In [31]:
print(df)
grouped=df.groupby('key1')
grouped['data1'].quantile(1) # the 1.0 quantile, i.e. each group's maximum
      data1     data2 key1 key2
0 0.151954 -0.834709 a one
1 -0.266402 0.469696 a two
2 -1.247499 1.402765 b one
3 1.063199 0.650468 b two
4 0.862507 -1.519559 a one
Out[31]:
key1
a 0.862507
b 1.063199
Name: data1, dtype: float64
In [32]:
def peak_to_peak(arr):
    return arr.max() - arr.min()
grouped.agg(peak_to_peak)   # aggregate with a user-defined function
Out[32]:
  data1 data2
key1    
a 1.128910 1.989255
b 2.310698 0.752297
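Methods such as describe() also work on a GroupBy even though they are not aggregations in the strict sense; a short sketch (not in the original run):

grouped.describe()   # count/mean/std/min/quartiles/max of data1 and data2 per key1 group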
In [3]:
tips=pd.read_csv('tips.csv')
tips.head()
Out[3]:
  total_bill tip sex smoker day time size
0 16.99 1.01 Female No Sun Dinner 2
1 10.34 1.66 Male No Sun Dinner 3
2 21.01 3.50 Male No Sun Dinner 3
3 23.68 3.31 Male No Sun Dinner 2
4 24.59 3.61 Female No Sun Dinner 4
In [6]:
tips['tip_pct']=tips['tip']/tips['total_bill']     # a commonly used derived column
tips.head()
Out[6]:
  total_bill tip sex smoker day time size tip_pct
0 16.99 1.01 Female No Sun Dinner 2 0.059447
1 10.34 1.66 Male No Sun Dinner 3 0.160542
2 21.01 3.50 Male No Sun Dinner 3 0.166587
3 23.68 3.31 Male No Sun Dinner 2 0.139780
4 24.59 3.61 Female No Sun Dinner 4 0.146808
In [42]:
grouped=tips.groupby(['sex','smoker'])
grouped_pct=grouped['tip_pct']
print(grouped_pct.agg('mean'))
grouped_pct.agg(['mean','std',peak_to_peak])
sex     smoker
Female No 0.156921
Yes 0.182150
Male No 0.160669
Yes 0.152771
Name: tip_pct, dtype: float64
Out[42]:
    mean std peak_to_peak
sex smoker      
Female No 0.156921 0.036421 0.195876
Yes 0.182150 0.071595 0.360233
Male No 0.160669 0.041849 0.220186
Yes 0.152771 0.090588 0.674707
In [43]:
grouped_pct.agg([('foo','mean'),('bar',np.std)])       # (name, function) tuples give the result columns custom names
Out[43]:
    foo bar
sex smoker    
Female No 0.156921 0.036421
Yes 0.182150 0.071595
Male No 0.160669 0.041849
Yes 0.152771 0.090588
In [45]:
functions = ['count', 'mean', 'max']
result = grouped[['tip_pct', 'total_bill']].agg(functions)   # three statistics aggregated over two columns
result
Out[45]:
    tip_pct total_bill
    count mean max count mean max
sex smoker            
Female No 54 0.156921 0.252672 54 18.105185 35.83
Yes 33 0.182150 0.416667 33 17.977879 44.30
Male No 97 0.160669 0.291990 97 19.791237 48.33
Yes 60 0.152771 0.710345 60 22.284500 50.81
In [46]:
result['tip_pct']
Out[46]:
    count mean max
sex smoker      
Female No 54 0.156921 0.252672
Yes 33 0.182150 0.416667
Male No 97 0.160669 0.291990
Yes 60 0.152771 0.710345
In [48]:
ftuples = [('Durchschnitt', 'mean'), ('Abweichung', np.var)]    # custom column names
grouped[['tip_pct', 'total_bill']].agg(ftuples)
Out[48]:
    tip_pct total_bill
    Durchschnitt Abweichung Durchschnitt Abweichung
sex smoker        
Female No 0.156921 0.001327 18.105185 53.092422
Yes 0.182150 0.005126 17.977879 84.451517
Male No 0.160669 0.001751 19.791237 76.152961
Yes 0.152771 0.008206 22.284500 98.244673
In [49]:
grouped.agg({'tip':np.max,'size':'sum'})
Out[49]:
    tip size
sex smoker    
Female No 5.2 140
Yes 6.5 74
Male No 9.0 263
Yes 10.0 150
In [50]:
grouped.agg({'tip_pct': ['min', 'max', 'mean', 'std'],
             'size': 'sum'})   # different aggregations for different columns
Out[50]:
    tip_pct size
    min max mean std sum
sex smoker          
Female No 0.056797 0.252672 0.156921 0.036421 140
Yes 0.056433 0.416667 0.182150 0.071595 74
Male No 0.071804 0.291990 0.160669 0.041849 263
Yes 0.035638 0.710345 0.152771 0.090588 150
In [51]:
# transform apply
df
Out[51]:
  data1 data2 key1 key2
0 0.151954 -0.834709 a one
1 -0.266402 0.469696 a two
2 -1.247499 1.402765 b one
3 1.063199 0.650468 b two
4 0.862507 -1.519559 a one
In [54]:
k1_means=df.groupby('key1').mean().add_prefix('mean_')
print(k1_means)
pd.merge(df,k1_means,left_on='key1',right_index=True)
      mean_data1  mean_data2
key1
a 0.249353 -0.628191
b -0.092150 1.026616
Out[54]:
  data1 data2 key1 key2 mean_data1 mean_data2
0 0.151954 -0.834709 a one 0.249353 -0.628191
1 -0.266402 0.469696 a two 0.249353 -0.628191
4 0.862507 -1.519559 a one 0.249353 -0.628191
2 -1.247499 1.402765 b one -0.092150 1.026616
3 1.063199 0.650468 b two -0.092150 1.026616
In [55]:
people
Out[55]:
  a b c d e
Joe 0.523195 0.625608 -1.833872 0.102788 -0.080301
Steve -1.789058 1.342658 1.592765 -0.452598 -0.651140
Wes 0.217548 NaN NaN 2.802176 -1.738924
Jim -0.655526 -1.468537 -1.281012 0.162456 -0.558902
Travis 0.051424 -1.124781 1.375842 0.562162 -0.032887
In [57]:
key=['one','two','one','two','one']
print(people.groupby(key).mean())
people.groupby(key).transform(np.mean) # simpler than the merge-based approach above!
            a         b         c         d         e
one 0.264055 -0.249586 -0.229015 1.155709 -0.617371
two -1.222292 -0.062939 0.155876 -0.145071 -0.605021
Out[57]:
  a b c d e
Joe 0.264055 -0.249586 -0.229015 1.155709 -0.617371
Steve -1.222292 -0.062939 0.155876 -0.145071 -0.605021
Wes 0.264055 -0.249586 -0.229015 1.155709 -0.617371
Jim -1.222292 -0.062939 0.155876 -0.145071 -0.605021
Travis 0.264055 -0.249586 -0.229015 1.155709 -0.617371
In [60]:
def demean(arr):
    return arr - arr.mean()
demeaned = people.groupby(key).transform(demean)   # subtract each group's mean (zero-centering)
print(demeaned)
               a         b         c         d         e
Joe 0.259139 0.875194 -1.604857 -1.052920 0.537070
Steve -0.566766 1.405598 1.436889 -0.307527 -0.046119
Wes -0.046508 NaN NaN 1.646467 -1.121553
Jim 0.566766 -1.405598 -1.436889 0.307527 0.046119
Travis -0.212631 -0.875194 1.604857 -0.593547 0.584483
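A quick sanity check (not in the original run): after demeaning, every group mean should be zero up to floating-point error:

demeaned.groupby(key).mean()   # all entries ~0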
In [8]:
###### apply: the most general groupwise method!
def top(df, n=5, column='tip_pct'):
    return df.sort_values(by=column)[-n:]
top(tips, n=6)
Out[8]:
  total_bill tip sex smoker day time size tip_pct
109 14.31 4.00 Female Yes Sat Dinner 2 0.279525
183 23.17 6.50 Male Yes Sun Dinner 4 0.280535
232 11.61 3.39 Male No Sat Dinner 2 0.291990
67 3.07 1.00 Female Yes Sat Dinner 1 0.325733
178 9.60 4.00 Female Yes Sun Dinner 2 0.416667
172 7.25 5.15 Male Yes Sun Dinner 2 0.710345
In [9]:
tips.groupby('smoker').apply(top)     # group by smoker, then return each group's five highest tip_pct rows
Out[9]:
    total_bill tip sex smoker day time size tip_pct
smoker                  
No 88 24.71 5.85 Male No Thur Lunch 2 0.236746
185 20.69 5.00 Male No Sun Dinner 5 0.241663
51 10.29 2.60 Female No Sun Dinner 2 0.252672
149 7.51 2.00 Male No Thur Lunch 2 0.266312
232 11.61 3.39 Male No Sat Dinner 2 0.291990
Yes 109 14.31 4.00 Female Yes Sat Dinner 2 0.279525
183 23.17 6.50 Male Yes Sun Dinner 4 0.280535
67 3.07 1.00 Female Yes Sat Dinner 1 0.325733
178 9.60 4.00 Female Yes Sun Dinner 2 0.416667
172 7.25 5.15 Male Yes Sun Dinner 2 0.710345
In [10]:
tips.groupby(['smoker','day']).apply(top,n=1,column='total_bill') # return the row with the largest total_bill in each group
Out[10]:
      total_bill tip sex smoker day time size tip_pct
smoker day                  
No Fri 94 22.75 3.25 Female No Fri Dinner 2 0.142857
Sat 212 48.33 9.00 Male No Sat Dinner 4 0.186220
Sun 156 48.17 5.00 Male No Sun Dinner 6 0.103799
Thur 142 41.19 5.00 Male No Thur Lunch 5 0.121389
Yes Fri 95 40.17 4.73 Male Yes Fri Dinner 4 0.117750
Sat 170 50.81 10.00 Male Yes Sat Dinner 3 0.196812
Sun 182 45.35 3.50 Male Yes Sun Dinner 3 0.077178
Thur 197 43.11 5.00 Female Yes Thur Lunch 4 0.115982
In [12]:
result=tips.groupby('smoker')['tip_pct'].describe()
print(result)
result.unstack('smoker')
        count      mean       std       min       25%       50%       75%  \
smoker
No 151.0 0.159328 0.039910 0.056797 0.136906 0.155625 0.185014
Yes 93.0 0.163196 0.085119 0.035638 0.106771 0.153846 0.195059

max
smoker
No 0.291990
Yes 0.710345
Out[12]:
       smoker
count No 151.000000
Yes 93.000000
mean No 0.159328
Yes 0.163196
std No 0.039910
Yes 0.085119
min No 0.056797
Yes 0.035638
25% No 0.136906
Yes 0.106771
50% No 0.155625
Yes 0.153846
75% No 0.185014
Yes 0.195059
max No 0.291990
Yes 0.710345
dtype: float64
In [13]:
tips.groupby('smoker',group_keys=False).apply(top)   # suppress the group keys in the result index
Out[13]:
  total_bill tip sex smoker day time size tip_pct
88 24.71 5.85 Male No Thur Lunch 2 0.236746
185 20.69 5.00 Male No Sun Dinner 5 0.241663
51 10.29 2.60 Female No Sun Dinner 2 0.252672
149 7.51 2.00 Male No Thur Lunch 2 0.266312
232 11.61 3.39 Male No Sat Dinner 2 0.291990
109 14.31 4.00 Female Yes Sat Dinner 2 0.279525
183 23.17 6.50 Male Yes Sun Dinner 4 0.280535
67 3.07 1.00 Female Yes Sat Dinner 1 0.325733
178 9.60 4.00 Female Yes Sun Dinner 2 0.416667
172 7.25 5.15 Male Yes Sun Dinner 2 0.710345
In [14]:
# quantile and bucket analysis
frame = DataFrame({'data1': np.random.randn(1000),
                   'data2': np.random.randn(1000)})
factor=pd.cut(frame.data1,4)
factor[:10]
Out[14]:
0      (0.137, 1.626]
1 (0.137, 1.626]
2 (-2.848, -1.353]
3 (-1.353, 0.137]
4 (0.137, 1.626]
5 (0.137, 1.626]
6 (-1.353, 0.137]
7 (-1.353, 0.137]
8 (-1.353, 0.137]
9 (-1.353, 0.137]
Name: data1, dtype: category
Categories (4, interval[float64]): [(-2.848, -1.353] < (-1.353, 0.137] < (0.137, 1.626] < (1.626, 3.116]]
In [20]:
def get_stats(group):
    return {'min': group.min(), 'max': group.max(),
            'count': group.count(), 'mean': group.mean()}
grouped = frame.data2.groupby(factor)
print(grouped.apply(get_stats))
grouped.apply(get_stats).unstack()   # one row of statistics per bucket
data1                  
(-2.848, -1.353] count 87.000000
max 2.230317
mean -0.073813
min -3.102882
(-1.353, 0.137] count 461.000000
max 2.600677
mean -0.060566
min -2.994502
(0.137, 1.626] count 394.000000
max 2.596374
mean -0.081585
min -3.647241
(1.626, 3.116] count 58.000000
max 2.463651
mean -0.091304
min -2.815372
Name: data2, dtype: float64
Out[20]:
  count max mean min
data1        
(-2.848, -1.353] 87.0 2.230317 -0.073813 -3.102882
(-1.353, 0.137] 461.0 2.600677 -0.060566 -2.994502
(0.137, 1.626] 394.0 2.596374 -0.081585 -3.647241
(1.626, 3.116] 58.0 2.463651 -0.091304 -2.815372
In [22]:
grouping=pd.qcut(frame.data1,10)   # quantile cut: every bucket holds the same number of points
grouped=frame.data2.groupby(grouping)
grouped.apply(get_stats).unstack()
Out[22]:
  count max mean min
data1        
(-2.843, -1.234] 100.0 2.230317 -0.105405 -3.102882
(-1.234, -0.843] 100.0 2.118366 -0.102994 -2.319278
(-0.843, -0.475] 100.0 1.923203 -0.030938 -2.510866
(-0.475, -0.214] 100.0 2.529378 -0.007671 -2.994502
(-0.214, 0.0224] 100.0 2.600677 -0.031192 -2.474029
(0.0224, 0.271] 100.0 2.596374 -0.089898 -2.900781
(0.271, 0.545] 100.0 1.891903 -0.183422 -3.647241
(0.545, 0.873] 100.0 2.596245 -0.022401 -3.420276
(0.873, 1.309] 100.0 1.915218 -0.083389 -1.763932
(1.309, 3.116] 100.0 2.463651 -0.060514 -3.046467
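Passing labels=False to qcut returns quantile numbers instead of interval labels, which gives a more compact bucket index; a sketch (not in the original run):

grouping = pd.qcut(frame.data1, 10, labels=False)   # bucket numbers 0..9
frame.data2.groupby(grouping).apply(get_stats).unstack()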
In [25]:
# fill missing values with group-specific values (quite common!)
s=Series(np.random.randn(6))
print(s)
s[::2]=np.nan
s
0   -0.137469
1 1.614162
2 0.661473
3 0.361981
4 -1.316889
5 0.436819
dtype: float64
Out[25]:
0         NaN
1 1.614162
2 NaN
3 0.361981
4 NaN
5 0.436819
dtype: float64
In [26]:
s.fillna(s.mean())
Out[26]:
0    0.804320
1 1.614162
2 0.804320
3 0.361981
4 0.804320
5 0.436819
dtype: float64
In [30]:
states = ['Ohio', 'New York', 'Vermont', 'Florida',
          'Oregon', 'Nevada', 'California', 'Idaho']
group_key=['East']*4+['West']*4
print(group_key)
data=Series(np.random.randn(8),index=states)
data[['Vermont','Nevada','Idaho']]=np.nan
data
['East', 'East', 'East', 'East', 'West', 'West', 'West', 'West']
Out[30]:
Ohio         -0.441768
New York -0.536480
Vermont NaN
Florida -0.227419
Oregon 0.408258
Nevada NaN
California -0.360140
Idaho NaN
dtype: float64
In [32]:
print(data.groupby(group_key).mean())
fill_mean=lambda g: g.fillna(g.mean())
data.groupby(group_key).apply(fill_mean) # fill NaNs with each region's (East/West) group mean
East   -0.401889
West 0.024059
dtype: float64
Out[32]:
Ohio         -0.441768
New York -0.536480
Vermont -0.401889
Florida -0.227419
Oregon 0.408258
Nevada 0.024059
California -0.360140
Idaho 0.024059
dtype: float64
In [33]:
fill_values={'East':0.5,'West':-1}  
fill_func=lambda g:g.fillna(fill_values[g.name])
data.groupby(group_key).apply(fill_func) # predefined fill value per group
Out[33]:
Ohio         -0.441768
New York -0.536480
Vermont 0.500000
Florida -0.227419
Oregon 0.408258
Nevada -1.000000
California -0.360140
Idaho -1.000000
dtype: float64
In [52]:
suits=['H','S','C','D']
card_val=([1,2,3,4,5,6,7,8,9,10]+[10]*3)*4
base_names=['A']+[2,3,4,5,6,7,8,9,10]+['J','K','Q']
cards=[]
for suit in suits:
    cards.extend(str(num) + suit for num in base_names)
deck=Series(card_val,index=cards)
deck[:26]
Out[52]:
AH      1
2H 2
3H 3
4H 4
5H 5
6H 6
7H 7
8H 8
9H 9
10H 10
JH 10
KH 10
QH 10
AS 1
2S 2
3S 3
4S 4
5S 5
6S 6
7S 7
8S 8
9S 9
10S 10
JS 10
KS 10
QS 10
dtype: int64
In [54]:
def draw(deck, n=5):
    return deck.take(np.random.permutation(len(deck))[:n])   # permutation shuffles the 52 positions (0 to 51)
draw(deck)   # draw 5 cards
Out[54]:
3S    3
3C 3
8H 8
8C 8
2H 2
dtype: int64
In [55]:
# draw two cards from each suit (the suit is the last character of the card name)
get_suit=lambda card:card[-1]
deck.groupby(get_suit).apply(draw,n=2)
Out[55]:
C  6C     6
4C 4
D JD 10
8D 8
H 2H 2
5H 5
S KS 10
8S 8
dtype: int64
In [56]:
# groupwise weighted averages and correlations
df = DataFrame({'category': ['a', 'a', 'a', 'a', 'b', 'b', 'b', 'b'],
                'data': np.random.randn(8),
                'weights': np.random.rand(8)})
df
Out[56]:
  category data weights
0 a 0.713305 0.063614
1 a -0.627942 0.564585
2 a 1.404638 0.759202
3 a -0.392145 0.695168
4 b 0.145668 0.449950
5 b 0.956057 0.674082
6 b -0.763412 0.001428
7 b 0.570525 0.731245
In [57]:
grouped=df.groupby('category')
get_wavg=lambda g:np.average(g['data'],weights=g['weights']) # weighted average of data within each group
grouped.apply(get_wavg)
Out[57]:
category
a 0.232716
b 0.606508
dtype: float64
In [60]:
close_px=pd.read_csv('stock_px.csv',parse_dates=True,index_col=0)
close_px.describe()
Out[60]:
  AAPL MSFT XOM SPX
count 2214.000000 2214.000000 2214.000000 2214.000000
mean 125.516147 23.945452 59.558744 1183.773311
std 107.394693 3.255198 16.725025 180.983466
min 6.560000 14.330000 26.210000 676.530000
25% 37.135000 21.700000 49.492500 1077.060000
50% 91.455000 24.000000 62.970000 1189.260000
75% 185.605000 26.280000 72.510000 1306.057500
max 422.000000 34.070000 87.480000 1565.150000
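The stock data is loaded but not used further here; the "correlations" half of the section comment above can be illustrated with a short sketch (not part of the original run) that computes, per year, the correlation of each ticker's daily returns with SPX:

rets = close_px.pct_change().dropna()          # daily percent returns
spx_corr = lambda x: x.corrwith(x['SPX'])      # correlation of every column with SPX
by_year = rets.groupby(lambda ts: ts.year)     # group by the year of the DatetimeIndex
by_year.apply(spx_corr)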
In [68]:
# Pivot tables and cross-tabulations!
# the group mean is pivot_table's default aggregation
tips.pivot_table(index=['sex','smoker']) # the old rows/cols arguments are now called index/columns
Out[68]:
    size tip tip_pct total_bill
sex smoker        
Female No 2.592593 2.773519 0.156921 18.105185
Yes 2.242424 2.931515 0.182150 17.977879
Male No 2.711340 3.113402 0.160669 19.791237
Yes 2.500000 3.051167 0.152771 22.284500
In [69]:
tips.pivot_table(['tip_pct','size'],index=['sex','day'],columns='smoker')
Out[69]:
    size tip_pct
  smoker No Yes No Yes
sex day        
Female Fri 2.500000 2.000000 0.165296 0.209129
Sat 2.307692 2.200000 0.147993 0.163817
Sun 3.071429 2.500000 0.165710 0.237075
Thur 2.480000 2.428571 0.155971 0.163073
Male Fri 2.000000 2.125000 0.138005 0.144730
Sat 2.656250 2.629630 0.162132 0.139067
Sun 2.883721 2.600000 0.158291 0.173964
Thur 2.500000 2.300000 0.165706 0.164417
In [70]:
# partial totals (margins)
tips.pivot_table(['tip_pct', 'size'], index=['sex', 'day'],
                 columns='smoker', margins=True)
Out[70]:
    size tip_pct
  smoker No Yes All No Yes All
sex day            
Female Fri 2.500000 2.000000 2.111111 0.165296 0.209129 0.199388
Sat 2.307692 2.200000 2.250000 0.147993 0.163817 0.156470
Sun 3.071429 2.500000 2.944444 0.165710 0.237075 0.181569
Thur 2.480000 2.428571 2.468750 0.155971 0.163073 0.157525
Male Fri 2.000000 2.125000 2.100000 0.138005 0.144730 0.143385
Sat 2.656250 2.629630 2.644068 0.162132 0.139067 0.151577
Sun 2.883721 2.600000 2.810345 0.158291 0.173964 0.162344
Thur 2.500000 2.300000 2.433333 0.165706 0.164417 0.165276
All   2.668874 2.408602 2.569672 0.159328 0.163196 0.160803
In [73]:
tips.pivot_table('tip_pct', index=['sex', 'smoker'], columns='day',
                 aggfunc=len, margins=True)
Out[73]:
  day Fri Sat Sun Thur All
sex smoker          
Female No 2.0 13.0 14.0 25.0 54.0
Yes 7.0 15.0 4.0 7.0 33.0
Male No 2.0 32.0 43.0 20.0 97.0
Yes 8.0 27.0 15.0 10.0 60.0
All   19.0 87.0 76.0 62.0 244.0
In [76]:
tips.pivot_table('size', index=['time', 'sex', 'smoker'], columns='day',
                 aggfunc='sum', fill_value=0)
Out[76]:
    day Fri Sat Sun Thur
time sex smoker        
Dinner Female No 2 30 43 2
Yes 8 33 10 0
Male No 4 85 124 0
Yes 12 71 39 0
Lunch Female No 3 0 0 60
Yes 6 0 0 17
Male No 0 0 0 50
Yes 5 0 0 23
In [81]:
# cross-tabulation (crosstab)
from io import StringIO

data = """\
Sample Gender Handedness
1 Female Right-handed
2 Male Left-handed
3 Female Right-handed
4 Male Right-handed
5 Male Left-handed
6 Male Right-handed
7 Female Right-handed
8 Female Left-handed
9 Male Right-handed
10 Female Right-handed"""
data = pd.read_table(StringIO(data), sep=r'\s+')   # raw string for the regex separator
data
Out[81]:
  Sample Gender Handedness
0 1 Female Right-handed
1 2 Male Left-handed
2 3 Female Right-handed
3 4 Male Right-handed
4 5 Male Left-handed
5 6 Male Right-handed
6 7 Female Right-handed
7 8 Female Left-handed
8 9 Male Right-handed
9 10 Female Right-handed
In [88]:
data.pivot_table(index=['Gender'], columns='Handedness',
                 aggfunc=len, margins=True)
Out[88]:
  Sample
Handedness Left-handed Right-handed All
Gender      
Female 1.0 4.0 5.0
Male 2.0 3.0 5.0
All 3.0 7.0 10.0
In [91]:
print(pd.crosstab(data.Gender,data.Handedness,margins=True))     # crosstab is more convenient for plain counts?
pd.crosstab([tips.time,tips.day],tips.smoker,margins=True)
Handedness  Left-handed  Right-handed  All
Gender
Female 1 4 5
Male 2 3 5
All 3 7 10
Out[91]:
  smoker No Yes All
time day      
Dinner Fri 3 9 12
Sat 45 42 87
Sun 57 19 76
Thur 1 0 1
Lunch Fri 1 6 7
Thur 44 17 61
All   151 93 244
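In recent pandas versions crosstab can also return proportions directly via the normalize parameter; a brief sketch (not in the original run):

pd.crosstab(tips.time, tips.smoker, normalize='index')   # row proportions instead of raw counts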