%matplotlib inline
import random
import numpy as np
import scipy as sp
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import statsmodels.api as sm
import statsmodels.formula.api as smf
sns.set_context("talk")
Anscombe’s quartet
Anscombe’s quartet comprises four datasets and is rather famous. Why? You’ll find out in this exercise.
# Load Anscombe's quartet and preview the first rows
anascombe = pd.read_csv('data/anscombe.csv')
anascombe.head()
|   | dataset | x    | y    |
| 0 | I       | 10.0 | 8.04 |
| 1 | I       | 8.0  | 6.95 |
| 2 | I       | 13.0 | 7.58 |
| 3 | I       | 9.0  | 8.81 |
| 4 | I       | 11.0 | 8.33 |
Part 1
For each of the four datasets…
- Compute the mean and variance of both x and y
- Compute the correlation coefficient between x and y
- Compute the linear regression line: $y = \beta_0 + \beta_1 x + \epsilon$
  (hint: use statsmodels and look at the Statsmodels notebook)
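Before stepping through each dataset separately (as the cells below do), it can help to sanity-check all four at once. The following is a minimal sketch, not part of the required per-dataset solution, that computes the per-dataset means, variances, and x-y correlations with a single groupby on the anascombe frame loaded above.

# Sketch: per-dataset summary statistics in one pass (assumes anascombe is loaded)
summary = anascombe.groupby('dataset').agg(
    x_mean=('x', 'mean'), x_var=('x', 'var'),
    y_mean=('y', 'mean'), y_var=('y', 'var'))
summary['corr_xy'] = anascombe.groupby('dataset').apply(lambda d: d['x'].corr(d['y']))
print(summary)

All four datasets come out nearly identical on these summaries, which is exactly what the per-dataset results below confirm.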
dataset = anascombe[anascombe.dataset == "I"]
print("dataset 'I':\n")
print(" The mean of x is %0.2f, and the variance is %0.2lf." % (dataset['x'].mean(), dataset['x'].var()))
print(" The mean of y is %0.2f, and the variance is %0.2lf.\n" % (dataset['y'].mean(), dataset['y'].var()))
a = np.array([dataset['x'], dataset['y']])
b = np.corrcoef(a)
print(" The correlation coefficient between x and y is %lf.\n" % b[0][1])
n = len(dataset)
is_train = np.random.rand(n) < 0.7
train = dataset[is_train].reset_index(drop=True)
test = dataset[~is_train].reset_index(drop=True)
lin_model = smf.ols('x ~ y', train).fit()
lin_model.summary()
dataset 'I':
The mean of x is 9.00, and the variance is 11.00.
The mean of y is 7.50, and the variance is 4.13.
The correlation coefficient between x and y is 0.816421.
OLS Regression Results
| Dep. Variable:    | x                | R-squared:          | 0.635     |
| Model:            | OLS              | Adj. R-squared:     | 0.574     |
| Method:           | Least Squares    | F-statistic:        | 10.43     |
| Date:             | Mon, 11 Jun 2018 | Prob (F-statistic): | 0.0179    |
| Time:             | 12:08:36         | Log-Likelihood:     | -16.703   |
| No. Observations: | 8                | AIC:                | 37.41     |
| Df Residuals:     | 6                | BIC:                | 37.57     |
| Df Model:         | 1                | Covariance Type:    | nonrobust |

|           | coef    | std err | t      | P>|t| | [0.025 | 0.975] |
| Intercept | -1.2102 | 3.335   | -0.363 | 0.729 | -9.371 | 6.951  |
| y         | 1.4174  | 0.439   | 3.230  | 0.018 | 0.344  | 2.491  |

| Omnibus:       | 0.024 | Durbin-Watson:    | 2.656 |
| Prob(Omnibus): | 0.988 | Jarque-Bera (JB): | 0.188 |
| Skew:          | 0.082 | Prob(JB):         | 0.910 |
| Kurtosis:      | 2.268 | Cond. No.:        | 32.3  |
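The held-out test split created in the cell above is never actually used. If you want to check the fit out of sample, something along these lines works with the statsmodels formula API; this is a sketch that reuses lin_model, train, and test from the previous cell and assumes the random split left at least one test row.

# Sketch: inspect the fitted coefficients and predict on the held-out rows
print(lin_model.params)               # intercept and slope from the training fit
pred = lin_model.predict(test)        # design matrix is built from the formula and `test`
rmse = np.sqrt(((test['x'] - pred) ** 2).mean())
print("held-out RMSE for x: %f" % rmse)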
dataset = anascombe[anascombe.dataset == "II"]
print("dataset 'II':\n")
print(" The mean of x is %0.2f, and the variance is %0.2f.\n" % (dataset['x'].mean(), dataset['x'].var()))
print(" The mean of y is %0.2f, and the variance is %0.2f.\n" % (dataset['y'].mean(), dataset['y'].var()))
a = np.array([dataset['x'], dataset['y']])
b = np.corrcoef(a)
print(" The correlation coefficient between x and y is %lf.\n" % b[0][1])
n = len(dataset)
is_train = np.random.rand(n) < 0.7
train = dataset[is_train].reset_index(drop=True)
test = dataset[~is_train].reset_index(drop=True)
lin_model = smf.ols('x ~ y', train).fit()
lin_model.summary()
dataset 'II':
The mean of x is 9.00, and the variance is 11.00.
The mean of y is 7.50, and the variance is 4.13.
The correlation coefficient between x and y is 0.816237.
OLS Regression Results
| Dep. Variable:    | x                | R-squared:          | 0.678     |
| Model:            | OLS              | Adj. R-squared:     | 0.638     |
| Method:           | Least Squares    | F-statistic:        | 16.85     |
| Date:             | Mon, 11 Jun 2018 | Prob (F-statistic): | 0.00341   |
| Time:             | 12:08:36         | Log-Likelihood:     | -20.461   |
| No. Observations: | 10               | AIC:                | 44.92     |
| Df Residuals:     | 8                | BIC:                | 45.53     |
| Df Model:         | 1                | Covariance Type:    | nonrobust |

|           | coef    | std err | t      | P>|t| | [0.025 | 0.975] |
| Intercept | -1.2852 | 2.568   | -0.501 | 0.630 | -7.206 | 4.636  |
| y         | 1.3882  | 0.338   | 4.105  | 0.003 | 0.608  | 2.168  |

| Omnibus:       | 2.361 | Durbin-Watson:    | 2.780 |
| Prob(Omnibus): | 0.307 | Jarque-Bera (JB): | 1.184 |
| Skew:          | 0.829 | Prob(JB):         | 0.553 |
| Kurtosis:      | 2.694 | Cond. No.:        | 29.9  |
dataset = anascombe[anascombe.dataset == "III"]
print("dataset 'III':\n")
print(" The mean of x is %0.2f, and the variance is %0.2f.\n" % (dataset['x'].mean(), dataset['x'].var()))
print(" The mean of y is %0.2f, and the variance is %0.2f.\n" % (dataset['y'].mean(), dataset['y'].var()))
a = np.array([dataset['x'], dataset['y']])
b = np.corrcoef(a)
print(" The correlation coefficient between x and y is %lf.\n" % b[0][1])
n = len(dataset)
is_train = np.random.rand(n) < 0.7
train = dataset[is_train].reset_index(drop=True)
test = dataset[~is_train].reset_index(drop=True)
lin_model = smf.ols('x ~ y', train).fit()
lin_model.summary()
dataset 'III':
The mean of x is 9.00, and the variance is 11.00.
The mean of y is 7.50, and the variance is 4.12.
The correlation coefficient between x and y is 0.816287.
OLS Regression Results
| Dep. Variable:    | x                | R-squared:          | 0.704     |
| Model:            | OLS              | Adj. R-squared:     | 0.655     |
| Method:           | Least Squares    | F-statistic:        | 14.27     |
| Date:             | Mon, 11 Jun 2018 | Prob (F-statistic): | 0.00921   |
| Time:             | 12:08:36         | Log-Likelihood:     | -16.338   |
| No. Observations: | 8                | AIC:                | 36.68     |
| Df Residuals:     | 6                | BIC:                | 36.84     |
| Df Model:         | 1                | Covariance Type:    | nonrobust |

|           | coef    | std err | t      | P>|t| | [0.025 | 0.975] |
| Intercept | -1.0892 | 2.651   | -0.411 | 0.695 | -7.575 | 5.396  |
| y         | 1.2835  | 0.340   | 3.777  | 0.009 | 0.452  | 2.115  |

| Omnibus:       | 1.577 | Durbin-Watson:    | 2.069 |
| Prob(Omnibus): | 0.455 | Jarque-Bera (JB): | 0.749 |
| Skew:          | 0.708 | Prob(JB):         | 0.688 |
| Kurtosis:      | 2.506 | Cond. No.:        | 27.6  |
dataset = anascombe[anascombe.dataset == "IV"]
print("dataset 'IV':\n")
print(" The mean of x is %0.2f, and the variance is %0.2f.\n" % (dataset['x'].mean(), dataset['x'].var()))
print(" The mean of y is %0.2f, and the variance is %0.2f.\n" % (dataset['y'].mean(), dataset['y'].var()))
a = np.array([dataset['x'], dataset['y']])
b = np.corrcoef(a)
print(" The correlation coefficient between x and y is %lf.\n" % b[0][1])
n = len(dataset)
is_train = np.random.rand(n) < 0.7
train = dataset[is_train].reset_index(drop=True)
test = dataset[~is_train].reset_index(drop=True)
lin_model = smf.ols('x ~ y', train).fit()
lin_model.summary()
dataset 'IV':
The mean of x is 9.00, and the variance is 11.00.
The mean of y is 7.50, and the variance is 4.12.
The correlation coefficient between x and y is 0.816521.
OLS Regression Results
| Dep. Variable:    | x                | R-squared:          | -inf      |
| Model:            | OLS              | Adj. R-squared:     | -inf      |
| Method:           | Least Squares    | F-statistic:        | -6.000    |
| Date:             | Mon, 11 Jun 2018 | Prob (F-statistic): | 1.00      |
| Time:             | 12:08:36         | Log-Likelihood:     | 253.03    |
| No. Observations: | 8                | AIC:                | -502.1    |
| Df Residuals:     | 6                | BIC:                | -501.9    |
| Df Model:         | 1                | Covariance Type:    | nonrobust |

|           | coef      | std err  | t        | P>|t| | [0.025    | 0.975]   |
| Intercept | 8.0000    | 1.13e-14 | 7.08e+14 | 0.000 | 8.000     | 8.000    |
| y         | -1.11e-16 | 1.54e-15 | -0.072   | 0.945 | -3.88e-15 | 3.66e-15 |

| Omnibus:       | 154.682 | Durbin-Watson:    | 0.000 |
| Prob(Omnibus): | 0.000   | Jarque-Bera (JB): | 3.000 |
| Skew:          | 0.000   | Prob(JB):         | 0.223 |
| Kurtosis:      | 0.000   | Cond. No.:        | 46.5  |
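The -inf R-squared and the 8.0000 intercept in this last summary suggest that the random split happened to keep only rows with x = 8 in the training subset (dataset IV is constant in x apart from a single point at x = 19), so the dependent variable x has zero variance there and the usual R-squared is undefined. A quick check along these lines (a sketch reusing the train frame from the cell above) makes that visible:

# Sketch: how many distinct x values survived the train split?
print(train['x'].value_counts())
print("distinct x values in train:", train['x'].nunique())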
Part 2
Using Seaborn, visualize all four datasets.
hint: use sns.FacetGrid combined with plt.scatter
# One scatter panel per dataset with FacetGrid
g = sns.FacetGrid(anascombe, col="dataset")
g.map(plt.scatter, "x", "y")
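Seaborn can also overlay the least-squares line of y on x in each panel, which makes the point of the quartet even more striking: the fits are nearly identical while the scatter patterns are completely different. A minimal sketch with sns.lmplot (an extra, not required by the hint above):

# Sketch: scatter plus the fitted regression line, one panel per dataset
sns.lmplot(data=anascombe, x="x", y="y", col="dataset", col_wrap=2)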