In [1]:
%%javascript
IPython.OutputArea.prototype._should_scroll = function(lines) {
return false;
}
In [2]:
import pandas as pd, numpy as np
import as plt
#Jupyter 设置
%matplotlib inline
In [3]:
df_fx_data = pd.read_csv('')
df_fx_data.head()
Out[3]:
In [4]:
type(df_fx_data['Date'][0])
Out[4]:
In [5]:
df_fx_data['Date'] = pd.to_datetime(df_fx_data['Date'])
df_fx_data.head()
Out[5]:
In [6]:
type(df_fx_data['Date'][0])
Out[6]:
In [7]:
indexed_df = df_fx_data.set_index('Date')
In [8]:
ts = indexed_df['Value']
ts.head(5)
Out[8]:
In [9]:
plt.plot(ts)
Out[9]:
In [10]:
ts_week = ts.resample('W').mean()
ts_week.plot()
Out[10]:
In [11]:
ts.resample('2A').sum().plot()
Out[11]:
In [12]:
from import plot_pacf, plot_acf
from import MaxNLocator
def acf_pacf(series, lags=40, title=None, figsize=(10,6)):
#ACF and PACF plots
# 求自相关和偏自相关函数
# series 输入的时间序列
# lags 自相关和偏自相关函数的滞后取值范围
fig = plt.figure(figsize=figsize)
ax1 = fig.add_subplot(211)
ax1.set_xlabel('lags')
ax1.set_ylabel('Autocorrelation')
# x坐标为整数
ax1.xaxis.set_major_locator(MaxNLocator(integer=True))
plot_acf(series.dropna(), lags=lags, ax=ax1, title='ACF of %s'%title)
ax2 = fig.add_subplot(212)
ax2.set_xlabel('lags')
ax2.set_ylabel('Partial Autocorrelation')
ax2.xaxis.set_major_locator(MaxNLocator(integer=True))
plot_pacf(series.dropna(), lags=lags, ax=ax2, title='PACF of %s'%title)
fig.tight_layout()
In [13]:
acf_pacf(ts_week, lags=20, title='Weekly Euro Dollar Exchange Rate')
In [14]:
from random import gauss
from random import seed
from pandas import Series
from import autocorrelation_plot
# seed random number generator
seed(1)
# create white noise series
whitenoise = Series([gauss(0.0, 1.0) for i in range(1000)])
acf_pacf(whitenoise, lags=40, title='White Noise Series')
In [15]:
acf_pacf(ts_week, lags=40, title='Weekly Euro Dollar Exchange Rate')
In [16]:
from import adfuller
def test_stationarity(timeseries, window=10):
# 求移动平均和移动标准差
# window为选取的时间窗口个数
rolmean = timeseries.rolling(window=window, center=False).mean()
rolstd = timeseries.rolling(window=window, center=False).std()
# 将以周重取样后的时间序列、移动平均和移动标准差制成折线图
orig = plt.plot(timeseries, color='blue', label='Original')
mean = plt.plot(rolmean, color='red', label='Rolling Mean')
std = plt.plot(rolstd, color='black', label='Rolling Std')
plt.legend(loc='best')
plt.title('Rolling Mean & Standard Deviation')
plt.show(block=False)
# 用Augmented Dickey-Fuller检验测试时间序列稳定性:
print('Results of Augmented Dickey-Fuller Test:')
# 使用减小AIC的办法估算ADF测试所需的滞后数
dftest = adfuller(timeseries, autolag='AIC')
# 将ADF测试结果、显著性概率、所用的滞后数和所用的观测数打印出来
dfoutput = pd.Series(dftest[0:4], index=['Test Statistic', 'p-value',
'Num Lags Used',
'Num Observations Used'])
for key, value in dftest[4].items():
dfoutput['Critical Value (%s)' % key] = value
print(dfoutput)
In [17]:
test_stationarity(ts_week)
In [18]:
from import seasonal_decompose
from import acorr_ljungbox
def decompose_plot(series, title=''):
decomposition = seasonal_decompose(series)
trend = decomposition.trend
seasonal = decomposition.seasonal
residual = decomposition.resid
fig = decomposition.plot()
fig.set_size_inches(10,6)
fig.suptitle(title)
fig.tight_layout()
fig2 = acf_pacf(residual, title='Residuals', figsize=(10,4))
# 每周汇率
decompose_plot(ts_week, title='ts_week')
In [19]:
# 使用第一差分去掉趋势
ts_week_diff = ts_week - ts_week.shift(1)
test_stationarity(ts_week_diff.dropna())
decompose_plot(ts_week_diff.dropna(), title='ts_week_diff')
In [20]:
# 对数转换, 使用第一差分去掉趋势
ts_week_log = np.log(ts_week)
ts_week_log_diff = ts_week_log - ts_week_log.shift(1)
test_stationarity(ts_week_log_diff.dropna())
decompose_plot(ts_week_log_diff.dropna(), title='ts_week_log_diff')
plt.show()
In [21]:
acf_pacf(ts_week_log, title='ts_week_log')
In [22]:
acf_pacf(ts_week_log_diff, title='ts_week_log_diff', lags=10)
In [23]:
from .arima_model import ARIMA
# ARIMA的参数: order = p, d, q
model = ARIMA(ts_week_log, order=(0, 1, 0))
# 已拟合的模型
results_ARIMA = model.fit()
# 拟合的数据和原始数据的残差
residuals = pd.DataFrame(results_ARIMA.resid)
acf_pacf(residuals, lags=10, title='Residuals, p=0, d=1, q=0')
In [24]:
a110=ARIMA(ts_week_log, order=(1, 1, 0)).fit()
a011=ARIMA(ts_week_log, order=(0, 1, 1)).fit()
a111=ARIMA(ts_week_log, order=(1, 1, 1)).fit()
acf_pacf(a110.resid, lags=10, title='Residuals, p=1, d=1, q=0, aic=%f'%a110.aic)
acf_pacf(a011.resid, lags=10, title='Residuals, p=1, d=1, q=1, aic=%f'%a011.aic)
acf_pacf(a111.resid, lags=10, title='Residuals, p=0, d=1, q=1, aic=%f'%a111.aic)
In [25]:
from .arima_model import ARIMA
# ARIMA的参数: order = p, d, q
model = ARIMA(ts_week_log, order=(1, 1, 1))
# 已拟合的模型
results_ARIMA = model.fit()
# ARIMA的结果
print(results_ARIMA.summary())
# 比较拟合的数据和原始数据,
# 注意fittedvalues返回的是d次差分后的序列, 因此和ts_week_log_diff比较
plt.plot(ts_week_log_diff)
plt.plot(results_ARIMA.fittedvalues, color='red')
# 拟合的数据和原始数据的残差
residuals = pd.DataFrame(results_ARIMA.resid)
# 计算拟残差平方和(RSS)
plt.title('RSS: %.4f'% sum((results_ARIMA.fittedvalues-ts_week_log_diff)**2))
acf_pacf(residuals, lags=10, title='Residuals')
# 残差的核密度估计
residuals.plot(kind='kde')
print('\n\nResiduals Summary:\n', residuals.describe())
plt.show()
In [26]:
acf_pacf(results_ARIMA.fittedvalues-ts_week_log_diff, title='Residuals')
In [27]:
predictions_ARIMA_diff = pd.Series(results_ARIMA.fittedvalues, copy=True)
predictions_ARIMA_diff.head()
Out[27]:
In [28]:
# 一阶差分的逆向转换为累计和
predictions_ARIMA_diff_cumsum = predictions_ARIMA_diff.cumsum()
# 对数函数的逆向转换为指数函数
predictions_ARIMA_log = pd.Series(ts_week_log.iloc [0], index=ts_week_log.index)
predictions_ARIMA_log = predictions_ARIMA_log.add(predictions_ARIMA_diff_cumsum,fill_value=0)
In [29]:
predictions_ARIMA = np.exp(predictions_ARIMA_log)
plt.plot(ts_week)
plt.plot(predictions_ARIMA)
plt.title('RMSE: %.4f'% np.sqrt(sum((predictions_ARIMA-ts_week)**2)/len(ts_week)))
Out[29]:
In [30]:
from import mean_squared_error
# 我们的ARIMA模型针对的是序列ts_week_log
# 因此使用的数据为ts_week_log
# 因此须将预测做指数转换
# 使用 n_test_obs 个点校验
n_test_obs = 50
size = int(len(ts_week_log) - n_test_obs)
# 切分数据集
train, test = ts_week_log[:size], ts_week_log[size:len(ts_week_log)]
# 历史值
history = [x for x in train]
# 预测值
predictions = list()
# 置信区间
confidence_intervals = list()
print('Predicted vs Expected Values...')
print('\n')
for t in range(len(test)):
model = ARIMA(history, order=(1,1,1))
model_fit = model.fit()
output = model_fit.forecast(steps=1)
yhat = output[0]
predictions.append(float(yhat))
confidence_intervals.append(output[2])
obs = test[t]
history.append(obs)
# 须将预测做指数转换
print('predicted=%f, expected=%f' % (np.exp(yhat), np.exp(obs)))
error = mean_squared_error(test, predictions)
print('\n')
predictions_series = pd.Series(predictions, index = test.index)
print('Test MSE: %.6f' % error)
In [31]:
predictions_series = pd.Series(predictions, index = test.index)
fig, ax = plt.subplots(figsize=(10, 6))
ax.set(title='Spot Exchange Rate, Euro into USD', xlabel='Date', ylabel='Euro into USD')
ax.plot(ts_week[-n_test_obs:], 'o', label='observed')
# 须将预测做指数转换
ax.plot(np.exp(predictions_series), 'g', label='rolling one-step out-of-sample forecast')
legend = ax.legend(loc='upper left')
In [32]:
%matplotlib inline
import pandas as pd
import numpy as np
import as plt
import datetime
from import relativedelta
import seaborn as sns
import as sm
from import acf
from import pacf
from import seasonal_decompose
from pandas import DatetimeIndex
In [33]:
# 读取CSV
df = pd.read_csv('')
print(df.head(), '\nindex type:\n', type(df.index))
# 按'%Y-%m'格式转换CSV的日期, 为了方便处理, 将时间段转为timestamp
df['Month'] = pd.to_datetime(df['Month'], format='%Y-%m')
# 索引并resample为月
indexed_df = df.set_index('Month')
ts = indexed_df['riders']
ts = ts.resample('M').sum()
print(ts.head(), '\nindex type:\n', type(ts.index))
In [34]:
ts.plot(title='Monthly Num. of Ridership')
test_stationarity(ts)
decomposition = seasonal_decompose(ts)
fig = plt.figure()
fig = decomposition.plot()
fig.set_size_inches(10, 6)
acf_pacf(decomposition.resid, title='Residuals')
In [35]:
ts_sdiff = ts - ts.shift(12)
test_stationarity(ts_sdiff.dropna())
decomposition = seasonal_decompose(ts_sdiff.dropna(), freq=12)
fig = plt.figure()
fig = decomposition.plot()
fig.set_size_inches(10, 6)
acf_pacf(decomposition.resid, title='Residuals')
In [36]:
ts_diff_of_sdiff = ts_sdiff - ts_sdiff.shift(1)
test_stationarity(ts_diff_of_sdiff.dropna())
decomposition = seasonal_decompose(ts_diff_of_sdiff.dropna(), freq=12)
fig = plt.figure()
fig = decomposition.plot()
fig.set_size_inches(10, 6)
acf_pacf(decomposition.resid, title='Residuals')
In [37]:
ts_log = np.log(ts)
ts_log_sdiff = ts_log - ts_log.shift(12)
ts_diff_of_log_sdiff = ts_log_sdiff - ts_log_sdiff.shift(1)
test_stationarity(ts_diff_of_log_sdiff.dropna())
decomposition = seasonal_decompose(ts_diff_of_log_sdiff.dropna(), freq=12)
fig = plt.figure()
fig = decomposition.plot()
fig.set_size_inches(10, 6)
acf_pacf(decomposition.resid, title='Residuals')
In [38]:
acf_pacf(ts_diff_of_sdiff, title='ts_diff_of_sdiff')
In [39]:
from import SARIMAX
# SARIMA的参数: order = p, d, q, seasonal_order=P, D, Q, season的周期=12
model = SARIMAX(ts, order=(0,1,0),seasonal_order=(0,1,1,12))
# 已拟合的模型
fitted = model.fit()
# ARIMA的结果
print(fitted.summary())
# 比较拟合的数据和原始数据,
ts.plot()
fitted.fittedvalues.plot(color='red')
# 拟合的数据和原始数据的残差
residuals = pd.DataFrame(fitted.resid)
acf_pacf(residuals, title='Residuals', figsize=(10,4))
# 计算拟残差平方和(RSS)
plt.title('RSS: %.4f'% sum((fitted.fittedvalues-ts)**2))
# 残差的核密度估计
residuals.plot(kind='kde')
print('\n\nResiduals Summary:\n', residuals.describe())
plt.show()
In [40]:
from import mean_squared_error
# 我们的ARIMA模型针对的是序列ts_week_log
# 因此使用的数据为ts_week_log
# 因此须将预测做指数转换
# 使用 n_test_obs 个点校验
n_test_obs = 20
size = int(len(ts) - n_test_obs)
# 切分数据集
train, test = ts[:size], ts[size:len(ts)]
# 历史值
history = [x for x in train]
# 预测值
predictions = list()
# 置信区间
confidence_intervals = list()
print('Predicted vs Expected Values...')
print('\n')
for t in range(len(test)):
model = SARIMAX(history, order=(0,1,0),seasonal_order=(0,1,1,12))
model_fit = model.fit()
output = model_fit.forecast(steps=1)
yhat = output[0]
predictions.append(float(yhat))
obs = test[t]
history.append(obs)
print('predicted=%f, expected=%f' % (yhat, obs))
error = mean_squared_error(test, predictions)
print('\n')
predictions_series = pd.Series(predictions, index = test.index)
print('Test MSE: %.6f' % error)
predictions_series = pd.Series(predictions, index = test.index)
fig, ax = plt.subplots(figsize=(10, 6))
ax.set(title='Bus Riders', xlabel='Month', ylabel='100 Bus Riders')
ax.plot(ts[-n_test_obs:], 'o', label='observed')
ax.plot(predictions_series, 'g', label='rolling one-step out-of-sample forecast')
legend = ax.legend(loc='upper left')
In [ ]: