In [1]:
import pandas as pd
import numpy as np
from pandas import *
df=DataFrame({'key1':['a','a','b','b','a'],
'key2':['one','two','one','two','one'],
'data1':np.random.randn(5),
'data2':np.random.randn(5)})
df
Out[1]:
In [4]:
grouped=df['data1'].groupby(df['key1'])
print(grouped) #分组对象
grouped.mean()
Out[4]:
In [6]:
means=df['data1'].groupby([df['key1'],df['key2']]).mean()
means
Out[6]:
In [7]:
means.unstack() #层次化索引!
Out[7]:
In [8]:
states=np.array(['Ohio','California','California','Ohio','Ohio'])
years=np.array([2005,2005,2006,2005,2006])
df['data1'].groupby([states,years]).mean()
Out[8]:
In [9]:
df
Out[9]:
In [14]:
print(df.groupby('key1').mean())
print(df.groupby(['key1','key2']).mean())
df.groupby(['key1','key2']).size() #size()用法
Out[14]:
In [16]:
for name,group in df.groupby('key1'):
print (name)
print (group) #查看分组结果
In [18]:
for (k1,k2),group in df.groupby(['key1','key2']): #根据两个键分组
print (k1,k2)
print (group)
In [20]:
pieces=dict(list(df.groupby('key1'))) #可以将分组结果以字典的形式保存
print(pieces['a'])
print(pieces['b'])
In [21]:
df.groupby(['key1','key2'])[['data2']].mean() #只看data2
Out[21]:
In [22]:
people=DataFrame(np.random.randn(5,5),
columns=['a','b','c','d','e'],
index=['Joe','Steve','Wes','Jim','Travis'])
people.ix[2:3,['b','c']]=np.nan
people
Out[22]:
In [24]:
mapping={'a':'red','b':'red','c':'blue','d':'blue','e':'red','f':'orange'}
by_column=people.groupby(mapping,axis=1)
by_column
by_column.sum()
Out[24]:
In [25]:
#通过函数进行分组
people
Out[25]:
In [28]:
#根据索引级别分组
columns=pd.MultiIndex.from_arrays([['US','US','US','JP','JP'],
[1,3,5,1,3]],names=['cty','tenor'])
hier_df=DataFrame(np.random.randn(4,5),columns=columns)
print(hier_df)
hier_df.groupby(level='cty',axis=1).count()
Out[28]:
In [31]:
print(df)
grouped=df.groupby('key1')
grouped['data1'].quantile(1) #分位点
Out[31]:
In [32]:
def peak_to_peak(arr):
return arr.max()-arr.min()
grouped.agg(peak_to_peak) #自己定义函数去做聚合
Out[32]:
In [3]:
tips=pd.read_csv('tips.csv')
tips.head()
Out[3]:
In [6]:
tips['tip_pct']=tips['tip']/tips['total_bill'] #常用
tips.head()
Out[6]:
In [42]:
grouped=tips.groupby(['sex','smoker'])
grouped_pct=grouped['tip_pct']
print(grouped_pct.agg('mean'))
grouped_pct.agg(['mean','std',peak_to_peak])
Out[42]:
In [43]:
grouped_pct.agg([('foo','mean'),('bar',np.std)]) #aggregate聚合
Out[43]:
In [45]:
functions=['count','mean','max']
result=grouped['tip_pct','total_bill'].agg(functions) #在两种数据里聚合了三种统计函数
result
Out[45]:
In [46]:
result['tip_pct']
Out[46]:
In [48]:
ftuples=[('Durchschnitt','mean'),('Abweichung',np.var)] #自命名
grouped['tip_pct','total_bill'].agg(ftuples)
Out[48]:
In [49]:
grouped.agg({'tip':np.max,'size':'sum'})
Out[49]:
In [50]:
grouped.agg({'tip_pct':['min','max','mean','std'],
'size':'sum'}) #分别聚合不同的统计函数
Out[50]:
In [51]:
# transform apply
df
Out[51]:
In [54]:
k1_means=df.groupby('key1').mean().add_prefix('mean_')
print(k1_means)
pd.merge(df,k1_means,left_on='key1',right_index=True)
Out[54]:
In [55]:
people
Out[55]:
In [57]:
key=['one','two','one','two','one']
print(people.groupby(key).mean())
people.groupby(key).transform(np.mean) #比上面一行的方法简单!
Out[57]:
In [60]:
def demean(arr):
return arr-arr.mean()
demeaned=people.groupby(key).transform(demean) #减去均值 零均值化
print(demeaned)
In [8]:
###### 最一般化的分组方法Apply !!!!!!
def top(df,n=5,column='tip_pct'):
return df.sort_index(by=column)[-n:]
top(tips,n=6)
Out[8]:
In [9]:
tips.groupby('smoker').apply(top) #先按是否吸烟分组 然后返回tip_pct最高的五个
Out[9]:
In [10]:
tips.groupby(['smoker','day']).apply(top,n=1,column='total_bill') #返回total_bill最大的
Out[10]:
In [12]:
result=tips.groupby('smoker')['tip_pct'].describe()
print(result)
result.unstack('smoker')
Out[12]:
In [13]:
tips.groupby('smoker',group_keys=False).apply(top) #禁止分组键
Out[13]:
In [14]:
#分位数和桶分析
frame=DataFrame({'data1':np.random.randn(1000),
'data2':np.random.randn(1000)})
factor=pd.cut(frame.data1,4)
factor[:10]
Out[14]:
In [20]:
def get_stats(group):
return {'min':group.min(),'max':group.max(),
'count':group.count(),'mean':group.mean()}
grouped=frame.data2.groupby(factor)
print(grouped.apply(get_stats))
grouped.apply(get_stats).unstack() #分桶
Out[20]:
In [22]:
grouping=pd.qcut(frame.data1,10) #分桶大小相同
grouped=frame.data2.groupby(grouping)
grouped.apply(get_stats).unstack()
Out[22]:
In [25]:
#用特定分组的值填充缺失值 这个比较常用!
s=Series(np.random.randn(6))
print(s)
s[::2]=np.nan
s
Out[25]:
In [26]:
s.fillna(s.mean())
Out[26]:
In [30]:
states=['Ohio','New York','Vermont','Florida',
'Oregon','Nevada','California','Idaho']
group_key=['East']*4+['West']*4
print(group_key)
data=Series(np.random.randn(8),index=states)
data[['Vermont','Nevada','Idaho']]=np.nan
data
Out[30]:
In [32]:
print(data.groupby(group_key).mean())
fill_mean=lambda g: g.fillna(g.mean())
data.groupby(group_key).apply(fill_mean) #按照东部西部各自分组填补缺失值
Out[32]:
In [33]:
fill_values={'East':0.5,'West':-1}
fill_func=lambda g:g.fillna(fill_values[g.name])
data.groupby(group_key).apply(fill_func) ##预定义填充值
Out[33]:
In [52]:
suits=['H','S','C','D']
card_val=([1,2,3,4,5,6,7,8,9,10]+[10]*3)*4
base_names=['A']+[2,3,4,5,6,7,8,9,10]+['J','K','Q']
cards=[]
for suit in suits:
cards.extend(str(num)+suit for num in base_names)
deck=Series(card_val,index=cards)
deck[:26]
Out[52]:
In [54]:
def draw(deck,n=5):
return deck.take(np.random.permutation(len(deck))[:n]) #permutation重排? 1到52范围内的52个数字
draw(deck) #抽出5张牌
Out[54]:
In [55]:
#根据牌的花色 在各个分组分别抽取两张牌
get_suit=lambda card:card[-1]
deck.groupby(get_suit).apply(draw,n=2)
Out[55]:
In [56]:
#分组加权平均数和相关系数
df=DataFrame({'category':['a','a','a','a','b','b','b','b'],
'data':np.random.randn(8),
'weights':np.random.rand(8)})
df
Out[56]:
In [57]:
grouped=df.groupby('category')
get_wavg=lambda g:np.average(g['data'],weights=g['weights']) #计算加权平均值
grouped.apply(get_wavg)
Out[57]:
In [60]:
close_px=pd.read_csv('stock_px.csv',parse_dates=True,index_col=0)
close_px.describe()
Out[60]:
In [68]:
# 透视表和交叉表 !!!
#分组平均数为pivot_table的默认聚合类型
tips.pivot_table(index=['sex','smoker']) #rows改为index col改为columns
Out[68]:
In [69]:
tips.pivot_table(['tip_pct','size'],index=['sex','day'],columns='smoker')
Out[69]:
In [70]:
#分项小计
tips.pivot_table(['tip_pct','size'],index=['sex','day'],
columns='smoker',margins=True)
Out[70]:
In [73]:
tips.pivot_table('tip_pct',index=['sex','smoker'],columns='day',
aggfunc=len,margins=True)
Out[73]:
In [76]:
tips.pivot_table('size',index=['time','sex','smoker'],columns='day',aggfunc='sum',
fill_value=0)
Out[76]:
In [81]:
#交叉表
from io import StringIO
data = """\
Sample Gender Handedness
1 Female Right-handed
2 Male Left-handed
3 Female Right-handed
4 Male Right-handed
5 Male Left-handed
6 Male Right-handed
7 Female Right-handed
8 Female Left-handed
9 Male Right-handed
10 Female Right-handed"""
data = pd.read_table(StringIO(data), sep='\s+')
data
Out[81]:
In [88]:
data.pivot_table(index=['Gender'],columns='Handedness',
aggfunc=len,margins=True)
Out[88]:
In [91]:
print(pd.crosstab(data.Gender,data.Handedness,margins=True)) #交叉表更方便?
pd.crosstab([tips.time,tips.day],tips.smoker,margins=True)
Out[91]: