本文实例总结了Python常见的pandas用法。分享给大家供大家参考,具体如下:
1
2
|
import numpy as np
import pandas as pd
|
1
2
3
|
s = pd.Series([ 1 , 3 , 6 , np.nan, 44 , 1 ]) #定义一个序列。 序列就是一列内容,每一行有一个index值
print (s)
print (s.index)
|
0 1.0
1 3.0
2 6.0
3 NaN
4 44.0
5 1.0
dtype: float64
RangeIndex(start=0, stop=6, step=1)
1
2
|
dates = pd.date_range( '20180101' , periods = 6 )
print (dates)
|
DatetimeIndex(['2018-01-01', '2018-01-02', '2018-01-03', '2018-01-04',
'2018-01-05', '2018-01-06'],
dtype='datetime64[ns]', freq='D')
1
2
|
df1 = pd.DataFrame(np.arange( 12 ).reshape( 3 , 4 )) #定义DataFrame,可以看作一个有index和colunms的矩阵
print (df)
|
0 1 2 3
0 0 1 2 3
1 4 5 6 7
2 8 9 10 11
1
2
|
df2 = pd.DataFrame(np.random.randn( 6 , 4 ), index = dates, columns = [ 'a' , 'b' , 'c' , 'd' ]) #np.random.randn(6,4)生成6行4列矩阵
print (df)
|
a b c d
2018-01-01 0.300675 1.769383 1.244406 -1.058294
2018-01-02 0.832666 2.216755 0.178716 -0.156828
2018-01-03 1.314190 -0.866199 0.836150 1.001026
2018-01-04 -1.671724 1.147406 -0.148676 -0.272555
2018-01-05 1.146664 2.022861 -1.833995 -0.627568
2018-01-06 -0.192242 1.517676 0.756707 0.058869
1
2
3
4
5
6
7
8
9
|
df = pd.DataFrame({ 'A' : 1.0 ,
'B' :pd.Timestamp( '20180101' ),
'C' :pd.Series( 1 , index = list ( range ( 4 )), dtype = 'float32' ),
'D' :np.array([ 3 ] * 4 , dtype = 'int32' ),
'E' :pd.Categorical([ 'test' , 'train' , 'test' , 'train' ]),
'F' : 'foo' }) #按照给出的逐列定义df
print (df)
print (df.dtypes)
|
A B C D E F
0 1.0 2018-01-01 1.0 3 test foo
1 1.0 2018-01-01 1.0 3 train foo
2 1.0 2018-01-01 1.0 3 test foo
3 1.0 2018-01-01 1.0 3 train foo
A float64
B datetime64[ns]
C float32
D int32
E category
F object
dtype: object
1
2
3
4
|
#df的行、列、值
print (df.index)
print (df.columns)
print (df.values)
|
Int64Index([0, 1, 2, 3], dtype='int64')
Index(['A', 'B', 'C', 'D', 'E', 'F'], dtype='object')
[[1.0 Timestamp('2018-01-01 00:00:00') 1.0 3 'test' 'foo']
[1.0 Timestamp('2018-01-01 00:00:00') 1.0 3 'train' 'foo']
[1.0 Timestamp('2018-01-01 00:00:00') 1.0 3 'test' 'foo']
[1.0 Timestamp('2018-01-01 00:00:00') 1.0 3 'train' 'foo']]
1
2
|
print (df.describe()) #统计
print (df.T) #转置
|
A C D
count 4.0 4.0 4.0
mean 1.0 1.0 3.0
std 0.0 0.0 0.0
min 1.0 1.0 3.0
25% 1.0 1.0 3.0
50% 1.0 1.0 3.0
75% 1.0 1.0 3.0
max 1.0 1.0 3.0
0 1 2 \
A 1 1 1
B 2018-01-01 00:00:00 2018-01-01 00:00:00 2018-01-01 00:00:00
C 1 1 1
D 3 3 3
E test train test
F foo foo foo
3
A 1
B 2018-01-01 00:00:00
C 1
D 3
E train
F foo
1
2
3
|
#df排序
print (df.sort_index(axis = 1 , ascending = False )) #根据索引值对各行进行排序(相当于重新排列各列的位置)
print (df.sort_values(by = 'E' )) #根据内容值对各列进行排序
|
F E D C B A
0 foo test 3 1.0 2018-01-01 1.0
1 foo train 3 1.0 2018-01-01 1.0
2 foo test 3 1.0 2018-01-01 1.0
3 foo train 3 1.0 2018-01-01 1.0
A B C D E F
0 1.0 2018-01-01 1.0 3 test foo
2 1.0 2018-01-01 1.0 3 test foo
1 1.0 2018-01-01 1.0 3 train foo
3 1.0 2018-01-01 1.0 3 train foo
1
2
3
4
5
6
7
8
|
indexes = pd.date_range( '20180101' , periods = 6 )
df3 = pd.DataFrame(np.arange( 24 ).reshape( 6 , 4 ), index = indexes, columns = [ 'A' , 'B' , 'C' , 'D' ])
print (df3)
print ()
#选择column
print (df3[ 'A' ])
print ()
print (df3.A)
|
A B C D
2018-01-01 0 1 2 3
2018-01-02 4 5 6 7
2018-01-03 8 9 10 11
2018-01-04 12 13 14 15
2018-01-05 16 17 18 19
2018-01-06 20 21 22 23
2018-01-01 0
2018-01-02 4
2018-01-03 8
2018-01-04 12
2018-01-05 16
2018-01-06 20
Freq: D, Name: A, dtype: int32
2018-01-01 0
2018-01-02 4
2018-01-03 8
2018-01-04 12
2018-01-05 16
2018-01-06 20
Freq: D, Name: A, dtype: int32
A B C D
2018-01-01 0 1 2 3
2018-01-02 4 5 6 7
2018-01-03 8 9 10 11
1
2
3
4
5
6
|
#选择行, 类似limit语句
print (df3[ 0 : 0 ])
print ()
print (df3[ 0 : 3 ])
print ()
print (df3[ '20180103' : '20180105' ])
|
Empty DataFrame
Columns: [A, B, C, D]
Index: []
A B C D
2018-01-01 0 1 2 3
2018-01-02 4 5 6 7
2018-01-03 8 9 10 11
A B C D
2018-01-03 8 9 10 11
2018-01-04 12 13 14 15
2018-01-05 16 17 18 19
1
|
print (df3.loc[ '20180102' ]) #返回指定行构成的序列
|
A 4
B 5
C 6
D 7
Name: 2018-01-02 00:00:00, dtype: int32
1
2
3
4
5
|
print (df3.loc[ '20180103' , [ 'A' , 'C' ]]) #列筛选
print ()
print (df3.loc[ '20180103' : '20180105' , [ 'A' , 'C' ]]) #子df,类似select A, C from df limit ...
print ()
print (df3.loc[:, [ 'A' , 'B' ]])
|
A 8
C 10
Name: 2018-01-03 00:00:00, dtype: int32
A C
2018-01-03 8 10
2018-01-04 12 14
2018-01-05 16 18
A B
2018-01-01 0 1
2018-01-02 4 5
2018-01-03 8 9
2018-01-04 12 13
2018-01-05 16 17
2018-01-06 20 21
1
2
3
4
5
6
|
print (df3); print ()
print (df3.iloc[ 1 ]); print ()
print (df3.iloc[ 1 , 1 ]); print ()
print (df3.iloc[:, 1 ]); print ()
print (df3.iloc[ 0 : 3 , 1 : 3 ]); print ()
print (df3.iloc[[ 1 , 3 , 5 ],[ 0 , 2 ]]) #行可以不连续,limit做不到
|
A B C D
2018-01-01 0 1 2 3
2018-01-02 4 5 6 7
2018-01-03 8 9 10 11
2018-01-04 12 13 14 15
2018-01-05 16 17 18 19
2018-01-06 20 21 22 23
A 4
B 5
C 6
D 7
Name: 2018-01-02 00:00:00, dtype: int32
5
2018-01-01 1
2018-01-02 5
2018-01-03 9
2018-01-04 13
2018-01-05 17
2018-01-06 21
Freq: D, Name: B, dtype: int32
B C
2018-01-01 1 2
2018-01-02 5 6
2018-01-03 9 10
A C
2018-01-02 4 6
2018-01-04 12 14
2018-01-06 20 22
1
2
3
|
# print(df3.ix[:3, ['A', 'C']])\
print (df3); print ()
print (df3[df3.A > = 8 ]) #根据值进行条件过滤,类似where A >= 8条件语句
|
A B C D
2018-01-01 0 1 2 3
2018-01-02 4 5 6 7
2018-01-03 8 9 10 11
2018-01-04 12 13 14 15
2018-01-05 16 17 18 19
2018-01-06 20 21 22 23
A B C D
2018-01-03 8 9 10 11
2018-01-04 12 13 14 15
2018-01-05 16 17 18 19
2018-01-06 20 21 22 23
1
2
3
4
5
6
7
8
9
10
11
12
13
14
|
indexes1 = pd.date_range( '20180101' , periods = 6 )
df4 = pd.DataFrame(np.arange( 24 ).reshape( 6 , 4 ), index = indexes1, columns = [ 'A' , 'B' , 'C' , 'D' ])
print (df4); print ()
#给某个元素赋值
df4.A[ 1 ] = 1111
df4.B[ '20180103' ] = 2222
df4.iloc[ 3 , 2 ] = 3333
df4.loc[ '20180105' , 'D' ] = 4444
print (df4); print ()
#范围赋值
df4.B[df4.A < 10 ] = - 1
print (df4); print ()
df4[df4.A < 10 ] = 0
print (df4); print ()
|
A B C D
2018-01-01 0 1 2 3
2018-01-02 4 5 6 7
2018-01-03 8 9 10 11
2018-01-04 12 13 14 15
2018-01-05 16 17 18 19
2018-01-06 20 21 22 23
A B C D
2018-01-01 0 1 2 3
2018-01-02 1111 5 6 7
2018-01-03 8 2222 10 11
2018-01-04 12 13 3333 15
2018-01-05 16 17 18 4444
2018-01-06 20 21 22 23
A B C D
2018-01-01 0 -1 2 3
2018-01-02 1111 5 6 7
2018-01-03 8 -1 10 11
2018-01-04 12 13 3333 15
2018-01-05 16 17 18 4444
2018-01-06 20 21 22 23
A B C D
2018-01-01 0 0 0 0
2018-01-02 1111 5 6 7
2018-01-03 0 0 0 0
2018-01-04 12 13 3333 15
2018-01-05 16 17 18 4444
2018-01-06 20 21 22 23
1
2
3
4
5
6
7
8
9
10
|
indexes1 = pd.date_range( '20180101' , periods = 6 )
df4 = pd.DataFrame(np.arange( 24 ).reshape( 6 , 4 ), index = indexes1, columns = [ 'A' , 'B' , 'C' , 'D' ])
print (df4); print ()
#添加一列
df4[ 'E' ] = np.NaN
print (df4); print ()
#由于index没对齐,原df没有的行默认为NaN,类型为float64,多出的行丢弃
df4[ 'F' ] = pd.Series([ 1 , 2 , 3 , 4 , 5 , 6 ], index = pd.date_range( '20180102' , periods = 6 ))
print (df4); print ()
print (df4.dtypes)
|
A B C D
2018-01-01 0 1 2 3
2018-01-02 4 5 6 7
2018-01-03 8 9 10 11
2018-01-04 12 13 14 15
2018-01-05 16 17 18 19
2018-01-06 20 21 22 23
A B C D E
2018-01-01 0 1 2 3 NaN
2018-01-02 4 5 6 7 NaN
2018-01-03 8 9 10 11 NaN
2018-01-04 12 13 14 15 NaN
2018-01-05 16 17 18 19 NaN
2018-01-06 20 21 22 23 NaN
A B C D E F
2018-01-01 0 1 2 3 NaN NaN
2018-01-02 4 5 6 7 NaN 1.0
2018-01-03 8 9 10 11 NaN 2.0
2018-01-04 12 13 14 15 NaN 3.0
2018-01-05 16 17 18 19 NaN 4.0
2018-01-06 20 21 22 23 NaN 5.0
A int32
B int32
C int32
D int32
E float64
F float64
dtype: object
1
2
3
4
5
6
7
8
9
10
11
12
|
df_t = pd.DataFrame(np.arange( 24 ).reshape( 6 , 4 ), index = [ 1 , 2 , 3 , 4 , 5 , 6 ], columns = [ 'A' , 'B' , 'C' , 'D' ])
df_t.iloc[ 0 , 1 ] = np.NaN
df_t.iloc[ 1 , 2 ] = np.NaN
df = df_t.copy()
print (df); print ()
print (df.dropna(axis = 0 , how = 'any' )); print ()
df = df_t.copy()
print (df.dropna(axis = 1 , how = 'any' )); print ()
df = df_t.copy()
df.C = np.NaN
print (df); print ()
print (df.dropna(axis = 1 , how = 'all' )); print ()
|
A B C D
1 0 NaN 2.0 3
2 4 5.0 NaN 7
3 8 9.0 10.0 11
4 12 13.0 14.0 15
5 16 17.0 18.0 19
6 20 21.0 22.0 23
A B C D
3 8 9.0 10.0 11
4 12 13.0 14.0 15
5 16 17.0 18.0 19
6 20 21.0 22.0 23
A D
1 0 3
2 4 7
3 8 11
4 12 15
5 16 19
6 20 23
A B C D
1 0 NaN NaN 3
2 4 5.0 NaN 7
3 8 9.0 NaN 11
4 12 13.0 NaN 15
5 16 17.0 NaN 19
6 20 21.0 NaN 23
A B D
1 0 NaN 3
2 4 5.0 7
3 8 9.0 11
4 12 13.0 15
5 16 17.0 19
6 20 21.0 23
1
2
3
4
5
6
7
|
df = df_t.copy()
print (df); print ()
print (df.isna()); print ()
print (df.isnull(). any ()); print () #isnull是isna别名,功能一样
print (df.isnull(). any (axis = 1 )); print ()
print (np. any (df.isna() = = True )); print ()
print (df.fillna(value = 0 )) #将NaN赋值
|
A B C D
1 0 NaN 2.0 3
2 4 5.0 NaN 7
3 8 9.0 10.0 11
4 12 13.0 14.0 15
5 16 17.0 18.0 19
6 20 21.0 22.0 23
A B C D
1 False True False False
2 False False True False
3 False False False False
4 False False False False
5 False False False False
6 False False False False
A False
B True
C True
D False
dtype: bool
1 True
2 True
3 False
4 False
5 False
6 False
dtype: bool
True
A B C D
1 0 0.0 2.0 3
2 4 5.0 0.0 7
3 8 9.0 10.0 11
4 12 13.0 14.0 15
5 16 17.0 18.0 19
6 20 21.0 22.0 23
1
2
3
|
data = pd.read_csv( 'D:/pythonwp/test/student.csv' )
print (data)
data.to_pickle( 'D:/pythonwp/test/student.pickle' )
|
id name age gender
0 1 牛帅 23 Male
1 2 gyb 89 Male
2 3 xxs 27 Male
3 4 hey 24 Female
4 5 奥莱利赫本 66 Female
5 6 Jackson 61 Male
6 7 牛帅 23 Male
1
2
3
4
5
6
7
8
9
10
|
df0 = pd.DataFrame(np.ones(( 3 , 4 )) * 0 , columns = [ 'A' , 'B' , 'C' , 'D' ])
df1 = pd.DataFrame(np.ones(( 3 , 4 )) * 1 , columns = [ 'A' , 'B' , 'C' , 'D' ])
df2 = pd.DataFrame(np.ones(( 3 , 4 )) * 2 , columns = [ 'A' , 'B' , 'C' , 'D' ])
print (df0); print ()
print (df1); print ()
print (df2); print ()
res = pd.concat([df0, df1, df2], axis = 0 )
print (res); print ()
res = pd.concat([df0, df1, df2], axis = 0 , ignore_index = True )
print (res)
|
A B C D
0 0.0 0.0 0.0 0.0
1 0.0 0.0 0.0 0.0
2 0.0 0.0 0.0 0.0
A B C D
0 1.0 1.0 1.0 1.0
1 1.0 1.0 1.0 1.0
2 1.0 1.0 1.0 1.0
A B C D
0 2.0 2.0 2.0 2.0
1 2.0 2.0 2.0 2.0
2 2.0 2.0 2.0 2.0
A B C D
0 0.0 0.0 0.0 0.0
1 0.0 0.0 0.0 0.0
2 0.0 0.0 0.0 0.0
0 1.0 1.0 1.0 1.0
1 1.0 1.0 1.0 1.0
2 1.0 1.0 1.0 1.0
0 2.0 2.0 2.0 2.0
1 2.0 2.0 2.0 2.0
2 2.0 2.0 2.0 2.0
A B C D
0 0.0 0.0 0.0 0.0
1 0.0 0.0 0.0 0.0
2 0.0 0.0 0.0 0.0
3 1.0 1.0 1.0 1.0
4 1.0 1.0 1.0 1.0
5 1.0 1.0 1.0 1.0
6 2.0 2.0 2.0 2.0
7 2.0 2.0 2.0 2.0
8 2.0 2.0 2.0 2.0
1
2
3
4
5
6
7
8
|
df0 = pd.DataFrame(np.ones(( 3 , 4 )) * 0 , columns = [ 'A' , 'B' , 'C' , 'D' ])
df1 = pd.DataFrame(np.ones(( 3 , 4 )) * 1 , columns = [ 'E' , 'F' , 'C' , 'D' ])
res = pd.concat([df0, df1], ignore_index = True )
print (res); print ()
res = pd.concat([df0, df1], join = 'outer' , ignore_index = True )
print (res); print ()
res = pd.concat([df0, df1], join = 'inner' ,ignore_index = True )
print (res); print ()
|
A B C D E F
0 0.0 0.0 0.0 0.0 NaN NaN
1 0.0 0.0 0.0 0.0 NaN NaN
2 0.0 0.0 0.0 0.0 NaN NaN
3 NaN NaN 1.0 1.0 1.0 1.0
4 NaN NaN 1.0 1.0 1.0 1.0
5 NaN NaN 1.0 1.0 1.0 1.0
A B C D E F
0 0.0 0.0 0.0 0.0 NaN NaN
1 0.0 0.0 0.0 0.0 NaN NaN
2 0.0 0.0 0.0 0.0 NaN NaN
3 NaN NaN 1.0 1.0 1.0 1.0
4 NaN NaN 1.0 1.0 1.0 1.0
5 NaN NaN 1.0 1.0 1.0 1.0
C D
0 0.0 0.0
1 0.0 0.0
2 0.0 0.0
3 1.0 1.0
4 1.0 1.0
5 1.0 1.0
1
2
3
4
5
6
7
8
9
10
11
|
#横向合并
df0 = pd.DataFrame(np.ones(( 3 , 4 )) * 0 , index = [ '1' , '2' , '3' ], columns = [ 'A' , 'B' , 'C' , 'D' ])
df1 = pd.DataFrame(np.ones(( 3 , 4 )) * 1 , index = [ '2' , '3' , '4' ], columns = [ 'A' , 'B' , 'C' , 'D' ])
print (df0); print ()
print (df1); print ()
res = pd.concat([df0, df1], axis = 1 )
print (res); print ()
res = pd.concat([df0, df1], axis = 1 , join = 'inner' , ignore_index = True )
print (res); print ()
res = pd.concat([df0, df1], axis = 1 , join_axes = [df0.index])
print (res); print ()
|
A B C D
1 0.0 0.0 0.0 0.0
2 0.0 0.0 0.0 0.0
3 0.0 0.0 0.0 0.0
A B C D
2 1.0 1.0 1.0 1.0
3 1.0 1.0 1.0 1.0
4 1.0 1.0 1.0 1.0
A B C D A B C D
1 0.0 0.0 0.0 0.0 NaN NaN NaN NaN
2 0.0 0.0 0.0 0.0 1.0 1.0 1.0 1.0
3 0.0 0.0 0.0 0.0 1.0 1.0 1.0 1.0
4 NaN NaN NaN NaN 1.0 1.0 1.0 1.0
0 1 2 3 4 5 6 7
2 0.0 0.0 0.0 0.0 1.0 1.0 1.0 1.0
3 0.0 0.0 0.0 0.0 1.0 1.0 1.0 1.0
A B C D A B C D
1 0.0 0.0 0.0 0.0 NaN NaN NaN NaN
2 0.0 0.0 0.0 0.0 1.0 1.0 1.0 1.0
3 0.0 0.0 0.0 0.0 1.0 1.0 1.0 1.0
1
2
3
4
5
6
7
8
|
df0 = pd.DataFrame(np.ones(( 3 , 4 )) * 0 , index = [ '1' , '2' , '3' ], columns = [ 'A' , 'B' , 'C' , 'D' ])
df1 = pd.DataFrame(np.ones(( 3 , 4 )) * 1 , index = [ '2' , '3' , '4' ], columns = [ 'A' , 'B' , 'C' , 'D' ])
print (df0); print ()
print (df1); print ()
res = df0.append([df1, df1], ignore_index = False )
print (res); print ()
s = pd.Series([ 1 , 2 , 3 , 4 ], index = [ 'A' , 'B' , 'C' , 'E' ])
print (df0.append(s, ignore_index = True ))
|
A B C D
1 0.0 0.0 0.0 0.0
2 0.0 0.0 0.0 0.0
3 0.0 0.0 0.0 0.0
A B C D
2 1.0 1.0 1.0 1.0
3 1.0 1.0 1.0 1.0
4 1.0 1.0 1.0 1.0
A B C D
1 0.0 0.0 0.0 0.0
2 0.0 0.0 0.0 0.0
3 0.0 0.0 0.0 0.0
2 1.0 1.0 1.0 1.0
3 1.0 1.0 1.0 1.0
4 1.0 1.0 1.0 1.0
2 1.0 1.0 1.0 1.0
3 1.0 1.0 1.0 1.0
4 1.0 1.0 1.0 1.0
A B C D E
0 0.0 0.0 0.0 0.0 NaN
1 0.0 0.0 0.0 0.0 NaN
2 0.0 0.0 0.0 0.0 NaN
3 1.0 2.0 3.0 NaN 4.0
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
|
df1 = pd.DataFrame({ 'key' :[ 'K0' , 'K1' , 'K2' ],
'A' :[ 'A0' , 'A1' , 'A2' ],
'B' :[ 'B0' , 'B1' , 'B2' ]})
df2 = pd.DataFrame({ 'key' :[ 'K3' , 'K1' , 'K2' ],
'C' :[ 'C3' , 'C1' , 'C2' ],
'D' :[ 'D3' , 'D1' , 'D2' ]})
print (df1); print ()
print (df2); print ()
res = pd.merge(df1, df2, on = 'key' )
print (res); print ()
res = pd.merge(df1, df2, on = 'key' , how = 'outer' )
print (res); print ()
res = pd.merge(df1, df2, on = 'key' , how = 'left' )
print (res); print ()
res = pd.merge(df1, df2, on = 'key' , how = 'right' )
print (res); print ()
|
A B key
0 A0 B0 K0
1 A1 B1 K1
2 A2 B2 K2
C D key
0 C3 D3 K3
1 C1 D1 K1
2 C2 D2 K2
A B key C D
0 A1 B1 K1 C1 D1
1 A2 B2 K2 C2 D2
A B key C D
0 A0 B0 K0 NaN NaN
1 A1 B1 K1 C1 D1
2 A2 B2 K2 C2 D2
3 NaN NaN K3 C3 D3
A B key C D
0 A0 B0 K0 NaN NaN
1 A1 B1 K1 C1 D1
2 A2 B2 K2 C2 D2
A B key C D
0 A1 B1 K1 C1 D1
1 A2 B2 K2 C2 D2
2 NaN NaN K3 C3 D3
1
2
3
4
5
6
7
8
9
10
11
12
13
14
|
df1 = pd.DataFrame({ 'key1' :[ 'K0' , 'K0' , 'K1' ],
'key2' :[ 'K0' , 'K1' , 'K1' ],
'A' :[ 'A0' , 'A1' , 'A2' ],
'B' :[ 'B0' , 'B1' , 'B2' ]})
df2 = pd.DataFrame({ 'key1' :[ 'K0' , 'K0' , 'K1' , 'K2' ],
'key2' :[ 'K0' , 'K0' , 'K1' , 'K2' ],
'C' :[ 'C3' , 'C1' , 'C2' , 'C4' ],
'D' :[ 'D3' , 'D1' , 'D2' , 'D4' ]})
print (df1); print ()
print (df2); print ()
res = pd.merge(df1, df2, on = [ 'key1' , 'key2' ])
print (res); print ()
res = pd.merge(df1, df2, on = [ 'key1' , 'key2' ], how = 'outer' , indicator = 'indi' )
print (res); print ()
|
A B key1 key2
0 A0 B0 K0 K0
1 A1 B1 K0 K1
2 A2 B2 K1 K1
C D key1 key2
0 C3 D3 K0 K0
1 C1 D1 K0 K0
2 C2 D2 K1 K1
3 C4 D4 K2 K2
A B key1 key2 C D
0 A0 B0 K0 K0 C3 D3
1 A0 B0 K0 K0 C1 D1
2 A2 B2 K1 K1 C2 D2
A B key1 key2 C D indi
0 A0 B0 K0 K0 C3 D3 both
1 A0 B0 K0 K0 C1 D1 both
2 A1 B1 K0 K1 NaN NaN left_only
3 A2 B2 K1 K1 C2 D2 both
4 NaN NaN K2 K2 C4 D4 right_only
1
2
3
4
5
6
7
8
9
10
11
12
13
|
#以上是根据值合并。下面根据index合并
df1 = pd.DataFrame({ 'A' :[ 'A0' , 'A1' , 'A2' ],
'B' :[ 'B0' , 'B1' , 'B2' ]},
index = [ 'index0' , 'index1' , 'index2' ])
df2 = pd.DataFrame({ 'A' :[ 'C3' , 'C1' , 'C2' ],
'D' :[ 'D3' , 'D1' , 'D2' ]},
index = [ 'index3' , 'index1' , 'index2' ])
print (df1); print ()
print (df2); print ()
res = pd.merge(df1, df2, left_index = True , right_index = True )
print (res); print ()
res = pd.merge(df1, df2, left_index = True , right_index = True , how = 'outer' , suffixes = [ '_b' , '_g' ])
print (res); print ()
|
A B
index0 A0 B0
index1 A1 B1
index2 A2 B2
A D
index3 C3 D3
index1 C1 D1
index2 C2 D2
A_x B A_y D
index1 A1 B1 C1 D1
index2 A2 B2 C2 D2
A_b B A_g D
index0 A0 B0 NaN NaN
index1 A1 B1 C1 D1
index2 A2 B2 C2 D2
index3 NaN NaN C3 D3
1
2
3
4
|
res = df1.join(df2, how = 'outer' , lsuffix = '_left' , rsuffix = '_right' ) #不用on默认用索引合并
print (res); print ()
res = df1.join(df2, on = 'B' , how = 'outer' , lsuffix = '_left' , rsuffix = '_right' ) #用on指定df1的某列和df2的索引合并
print (res); print ()
|
A_left B A_right D
index0 A0 B0 NaN NaN
index1 A1 B1 C1 D1
index2 A2 B2 C2 D2
index3 NaN NaN C3 D3
A_left B A_right D
index0 A0 B0 NaN NaN
index1 A1 B1 NaN NaN
index2 A2 B2 NaN NaN
index2 NaN index3 C3 D3
index2 NaN index1 C1 D1
index2 NaN index2 C2 D2
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
|
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt #画图模块
s = pd.Series(np.random.randn( 1000 ), index = np.arange( 1000 ))
s = s.cumsum()
#须在命令行执行, jupyter会报错
#s.plot()
#plt.show()
df = pd.DataFrame(np.random.randn( 1000 , 3 ), columns = [ 'A' , 'B' , 'C' ])
df = df.cumsum()
print (df.head()); print () #head默认显示前5行
#须在命令行执行, jupyter会报错
#s.plot()
#plt.show()
#须在命令行执行, jupyter会报错
#'bar', 'hist', 'box', 'kde', 'area', 'scatter', 'hexbin', 'pie'...
#class_B = df.plot.scatter(x='A', y='B', color='DarkBlue', label='Class B') #画图,scatter<散点图>
#df.plot.scatter(x='A', y='C', color='DarkRed', label='Class C', class_B=class_B)
#plt.show()
|
A B C
0 -0.399363 -1.004210 0.641141
1 -1.970009 -0.608482 -0.758504
2 -3.081640 -0.617352 -1.143872
3 -2.174627 -1.383785 -1.011411
4 -1.415515 -1.892226 -2.511739
希望本文所述对大家Python程序设计有所帮助。
原文链接:https://blog.csdn.net/xuejianbest/article/details/85159361