Pandas库基础操作

时间:2022-03-06 19:13:52

pandas主要用于数据的处理,在数据预处理方面用途比较多
如下为一些基本操作:

import pandas as pd
# Load the food nutrition table. The library is imported under the alias
# ``pd``, so the call must go through that alias; the bare name ``pandas``
# is never bound and would raise NameError.
food_info = pd.read_csv("food_info.csv")
print(type(food_info))  # read_csv returns a pandas DataFrame
print(food_info.dtypes)  # dtype inferred for each column


# First three rows of food_info (left disabled):
# print(food_info.head(3))

# Last four rows of food_info (left disabled):
# print(food_info.tail(4))

# Column labels.
print(food_info.columns)

# (number of rows, number of columns).
print(food_info.shape)

# .loc selects rows by index label; here the row labelled 0.
print(food_info.loc[0, :])

# Three ways of picking several rows with .loc.
# 1) A label slice -- both ends are included, so this covers rows 3, 4, 5, 6.
food_info.loc[3:6, :]
# 2) A list of labels held in a variable.
two_five_ten = [2, 5, 10]
food_info.loc[two_five_ten, :]
# 3) The same list of labels written inline.
food_info.loc[[2, 5, 10], :]

# Select a single column by its label; the result is a Series.
ndb_col = food_info["NDB_No"]

# Select two columns at once by passing a list of labels; the result is a
# DataFrame.
columns = ["Zinc_(mg)", "Copper_(mg)"]
zinc_copper = food_info[columns]

# Keep only the columns whose label ends with "(g)" (by the naming used in
# this table, presumably the ones measured in grams).
col_names = food_info.columns.tolist()
gram_columns = [c for c in col_names if c.endswith("(g)")]
gram_df = food_info[gram_columns]
print(gram_df.head(3))

# Arithmetic on a column is element-wise: every value is divided by 1000.
print(food_info["Iron_(mg)"])
div_1000 = food_info["Iron_(mg)"] / 1000
print(div_1000)

# Sorting by the "Sodium_(mg)" column.
# inplace=True reorders food_info itself; with inplace=False the original
# would stay untouched and a new, sorted DataFrame would be returned.
food_info.sort_values(by="Sodium_(mg)", ascending=True, inplace=True)
print(food_info["Sodium_(mg)"])  # ascending is the default order
# Same sort, but in descending order.
food_info.sort_values(by="Sodium_(mg)", ascending=False, inplace=True)
print(food_info["Sodium_(mg)"])

实例:
对泰坦尼克号数据的处理

import pandas as pd
import numpy as np
titanic_survival = pd.read_csv("titanic_train.csv")
# titanic_survival.head()

# Count how many passengers have no recorded age.
age = titanic_survival["Age"]  # the "Age" column as a Series
age_is_null = age.isnull()  # boolean mask: True where the age is missing
age_null_true = age.loc[age_is_null]  # only the missing entries
age_null_count = age_null_true.shape[0]
print(age_null_count)

# Naive mean without any preprocessing: the missing ages make the result NaN.
mean_age = sum(titanic_survival["Age"]) / len(titanic_survival["Age"])
print(mean_age)

# Manual fix: drop the missing ages first, then average what is left.
good_ages = titanic_survival["Age"][~age_is_null]
correct_mean_age = sum(good_ages) / len(good_ages)
print(correct_mean_age)

# pandas' own mean() skips missing values automatically.
correct_mean_age = titanic_survival["Age"].mean()
print(correct_mean_age)

# 1. Average fare for each passenger class.
passenger_classes = [1, 2, 3]
fares_by_class = {}
for this_class in passenger_classes:
    # Rows belonging to the current class, then the mean of their fares.
    pclass_rows = titanic_survival[titanic_survival["Pclass"] == this_class]
    pclass_fares = pclass_rows["Fare"]
    fare_for_class = pclass_fares.mean()
    fares_by_class[this_class] = fare_for_class
# Printed once, after all classes are filled in.
# NOTE(review): the original indentation was lost; confirm the print was
# meant to sit outside the loop.
print(fares_by_class)
# 2. The same kind of per-group statistic with pivot_table, which is much
# shorter than the manual loop: index= names the grouping column, values=
# the column(s) to aggregate and aggfunc= the aggregation.
# The aggregations are given by name ("mean", "sum"); passing the NumPy
# callables np.mean / np.sum is deprecated in recent pandas releases.

# Survival rate per passenger class.
passenger_survival = titanic_survival.pivot_table(
    index="Pclass", values="Survived", aggfunc="mean"
)
print(passenger_survival)

# Average age per passenger class (aggfunc defaults to the mean).
# NOTE: the variable name is reused from above although it now holds ages.
passenger_survival = titanic_survival.pivot_table(index="Pclass", values="Age")
print(passenger_survival)

# Total fare and number of survivors per port of embarkation.
port_stats = titanic_survival.pivot_table(
    index="Embarked", values=["Fare", "Survived"], aggfunc="sum"
)
print(port_stats)

# dropna removes missing data: along the column axis it drops every column
# that contains a missing value; along the row axis with subset= it drops
# the rows that are missing a value in any of the listed columns.
drop_na_columns = titanic_survival.dropna(axis="columns")
new_titanic_survival = titanic_survival.dropna(axis="index", subset=["Age", "Sex"])

# Look up single cells by (row label, column label).
row_index_83_age = titanic_survival.loc[83, "Age"]
# NOTE(review): the name says 1000 but the row label used is 766 -- confirm
# which one is intended.
row_index_1000_pclass = titanic_survival.loc[766, "Pclass"]
print(row_index_83_age)
print(row_index_1000_pclass)


# Custom function intended to be passed to DataFrame.apply, which calls it
# with one column (a Series) at a time.
def hundredth_row(column):
    """Return the entry of *column* stored under index label 99.

    With the default 0-based integer index this is the 100th row.

    Args:
        column: a pandas Series whose index contains the label 99.

    Returns:
        The value found at label 99.

    Raises:
        KeyError: if *column* has no index label 99.
    """
    hundredth_item = column.loc[99]
    return hundredth_item

# Apply the helper to every column of the table. The result is stored under
# its own name so that the ``hundredth_row`` function is not overwritten by
# its own output and can still be called afterwards.
hundredth_row_values = titanic_survival.apply(hundredth_row)
print(hundredth_row_values)
#
def not_null_count(column):
    """Return the number of missing (null/NaN) entries in *column*.

    NOTE: despite the name, this counts the entries that ARE null, not the
    ones that are not. The name is kept so existing callers keep working.

    Args:
        column: a pandas Series, e.g. one column handed over by
            DataFrame.apply.

    Returns:
        The count of null entries as a plain int (0 for an empty column).
    """
    column_null = pd.isnull(column)  # boolean mask, True where missing
    # True counts as 1, so summing the mask counts the missing entries.
    return int(column_null.sum())
# Run the counter on each column (axis=0, the default, hands over one
# column at a time) and show the per-column results.
column_null_count = titanic_survival.apply(not_null_count, axis=0)
print(column_null_count)

Series结构:
在pandas库中,DataFrame是一种二维的表格(类似矩阵)结构;Series则是一维的带索引数组。从DataFrame中取出某一行或某一列,得到的就是一个Series;取出多行或多列,得到的仍然是DataFrame。

import pandas as pd
import numpy as np
# Load the score comparison table and pull out two of its columns; a single
# column taken from a DataFrame is a Series.
fandango = pd.read_csv("fandango_score_comparison.csv")
series_film = fandango.loc[:, "FILM"]
series_rt = fandango.loc[:, "RottenTomatoes"]

from pandas import Series

# A Series can use strings as its index: here the film titles become the
# labels and the RottenTomatoes scores the values.
film_names = series_film.values
rt_scores = series_rt.values
series_custom = Series(rt_scores, index=film_names)
# Labels must match the titles exactly. The titles are presumably written as
# "Title (Year)" with a space before the year, as the later lookup in this
# file does; a label that is not in the index raises KeyError.
series_custom[["Minions (2015)", "Leviathan (2014)"]]

# Integer positions are also available on a string-labelled Series.
series_custom = Series(data=rt_scores, index=film_names)
series_custom.loc[['Minions (2015)', 'Leviathan (2014)']]
# Positions 5 up to (not including) 10.
fiveten = series_custom.iloc[5:10]
print(fiveten)

# Reorder the Series so that its labels appear in sorted order.
original_index = series_custom.index.tolist()
# print(original_index)
sorted_index = original_index.copy()
sorted_index.sort()
# reindex returns a new Series whose entries follow the given label order.
sorted_by_index = series_custom.reindex(index=sorted_index)
print(sorted_by_index)