001_python实现数据分析

时间:2023-03-10 04:53:43
001_python实现数据分析

一、

# coding:utf8
# !/usr/bin/python
# import numpy as np
import pandas as pd
import numpy as np


def example2():
    """Print summary statistics for a numeric ``Series``.

    Builds ``pd.Series([1, 2, 3])`` and prints the result of ``describe()``.

    Expected output::

        count    3.0
        mean     2.0
        std      1.0
        min      1.0
        25%      1.5
        50%      2.0
        75%      2.5
        max      3.0
        dtype: float64

    :return: None
    """
    s = pd.Series([1, 2, 3])
    # A parenthesised single argument prints identically on Python 2 and 3.
    print(s.describe())
def example3():
    """Print summary statistics for a categorical (string) ``Series``.

    Builds ``pd.Series(['a', 'a', 'b', 'c'])`` and prints the result of
    ``describe()``.

    Expected output::

        count     4
        unique    3
        top       a
        freq      2
        dtype: object

    :return: None
    """
    s = pd.Series(['a', 'a', 'b', 'c'])
    # A parenthesised single argument prints identically on Python 2 and 3.
    print(s.describe())
def example4():
    """Print summary statistics for a timestamp ``Series``.

    Builds a ``Series`` of three ``numpy.datetime64`` values (one in 2000,
    two identical ones in 2010) and prints the result of ``describe()``.

    Output recorded by the original author::

        count                       3
        unique                      2
        top       2010-01-01 00:00:00
        freq                        2
        first     2000-01-01 00:00:00
        last      2010-01-01 00:00:00
        dtype: object

    NOTE(review): the rows shown above depend on the installed pandas
    version; newer releases are expected to summarise datetimes like
    numeric data (mean/min/quantiles/max) instead -- confirm against the
    pandas version in use.

    :return: None
    """
    # Imported locally so this example is self-contained.
    import numpy as np

    s = pd.Series([
        np.datetime64("2000-01-01"),
        np.datetime64("2010-01-01"),
        np.datetime64("2010-01-01"),
    ])
    # A parenthesised single argument prints identically on Python 2 and 3.
    print(s.describe())
def example5():
    """Print summary statistics for a mixed-dtype ``DataFrame``.

    The frame has a categorical column, a numeric column and an object
    (string) column. Only ``df.describe()`` with default arguments is
    executed; by default only numeric fields are returned.

    Other variants noted by the original author (not executed here)::

        df.describe(include='all')          # all columns, any dtype
        df.numeric.describe()               # one column via attribute access
        df.describe(include=[np.number])    # only numeric columns
        df.describe(include=[object])       # only string (object) columns
        df.describe(include=['category'])   # only categorical columns
        df.describe(exclude=[np.number])    # everything except numeric
        df.describe(exclude=[object])       # everything except object

    The original notes spelled the object dtype as ``np.object``; that alias
    is no longer available in current NumPy, so the builtin ``object`` is
    shown instead.

    :return: None
    """
    df = pd.DataFrame({
        'categorical': pd.Categorical(['d', 'e', 'f']),
        'numeric': [1, 2, 3],
        'object': ['a', 'b', 'c'],
    })
    # A parenthesised single argument prints identically on Python 2 and 3.
    print(df.describe())
def example1():
    """Print aggregate statistics computed over a ``describe()`` summary.

    A nested dict is turned into a ``DataFrame`` whose columns are ``'000'``
    and ``'001'``. ``describe()`` of that frame yields this 8-row table::

               000  001
        count  3.0  3.0
        mean   2.0  5.0
        std    1.0  1.0
        min    1.0  4.0
        25%    1.5  4.5
        50%    2.0  5.0
        75%    2.5  5.5
        max    3.0  6.0

    Every statistic printed below is applied to that summary table, not to
    the raw data, which is why the reported count is 8 per column.

    :return: None
    """
    dic1 = {'000': {'a': 1, 'b': 2, 'c': 3}, '001': {'d': 4, 'e': 5, 'f': 6}}
    df2 = pd.DataFrame(dic1)
    # Compute the summary once instead of once per statistic.
    summary = df2.describe()

    # Number of non-NaN entries.
    print("返回非NAN数据项数量=>count()\n{count}\n".format(count=summary.count()))
    # Median (the 50th percentile).
    print("返回中位数,等价第50位百分位数的值=>median()\n{median}\n".format(median=summary.median()))
    # Mode(s): the most frequent value(s).
    print("返回数据的众值=>mode()\n{mode}\n".format(mode=summary.mode()))
    # Standard deviation (a measure of dispersion).
    print("返回数据的标准差(描述离散度)=>std()\n{std}\n".format(std=summary.std()))
    # Variance.
    print("返回方差=>var()\n{var}\n".format(var=summary.var()))
    # Skewness (how symmetric the distribution is).
    print("偏态系数(skewness,表示数据分布的对称程度)=>skew()\n{skew}\n".format(skew=summary.skew()))


def main():
    """Script entry point: runs ``example1``."""
    example1()


if __name__ == '__main__':
    main()

输出(运行 main(),即 example1() 的结果)=>

返回非NAN数据项数量=>count()
000 8
001 8
dtype: int64
返回中位数,等价第50位百分位数的值=>median()
000 2.00
001 4.75
dtype: float64
返回数据的众值=>mode()
000 001
0 1.0 5.0
1 2.0 NaN
2 3.0 NaN
返回数据的标准差(描述离散度)=>std()
000 0.801784
001 1.603567
dtype: float64
返回方差=>var()
000 0.642857
001 2.571429
dtype: float64
偏态系数(skewness,表示数据分布的对称程度)=>skew()
000 0.000000
001 -1.299187
dtype: float64