备份。绿字为备注。包括数据读取、数据选取、数据拼接、按照日期聚合、目录替换、滚动预测、预测结果画图、计算rmse并显示
#packages 测试阶段用到的所有
import pandas as pd
import matplotlib.pyplot as plt
import statsmodels.api as sm
from statsmodels.tsa.stattools import adfuller
from itertools import product
import warnings
from statsmodels.tools.sm_exceptions import ConvergenceWarning
from keras.objectives import mean_squared_error
import numpy as np
import tensorflow as tf
from statsmodels.tsa.ar_model import AR
import math
##the data 用pandas读取csv数据
df1=pd.read_excel(\'D:\\job\\20200520_ xxxx项目\\data\\sq_df_yy.xlsx\')
df2=pd.read_excel(\'D:\\job\\20200520_ xxxx项目\\data\\sq_df_zx2018.xlsx\')
df3=pd.read_excel(\'D:\\job\\20200520_ xxxx项目\\data\\sq_df_zx2019p1.xlsx\')
df4=pd.read_excel(\'D:\\job\\20200520_ xxxx项目\\data\\sq_df_zx2019_p2.xlsx\')
##选择需要的字段
df1_1=df1[[\'x1\',\'x2\',\'x3\',\'x4\',\'x5\',\'slsj\',\'x6\',\'x7\',\'x8\',\'x9\',\'x10\',\'x11\',\'x12\']]
df2_1=df2[[\'x1\',\'x2\',\'x3\',\'x4\',\'x5\',\'slsj\',\'x6\',\'x7\',\'x8\',\'x9\',\'x10\',\'x11\',\'x12\']]
df3_1=df3[[\'x1\',\'x2\',\'x3\',\'x4\',\'x5\',\'slsj\',\'x6\',\'x7\',\'x8\',\'x9\',\'x10\',\'x11\',\'x12\']]
df4_1=df4[[\'x1\',\'x2\',\'x3\',\'x4\',\'x5\',\'slsj\',\'x6\',\'x7\',\'x8\',\'x9\',\'x10\',\'x11\',\'x12\']]
del df1,df2,df3,df4
##concat them 拼接同结构的数据
df_a=pd.concat([df1_1,df2_1,df3_1,df4_1])
del df1_1,df2_1,df3_1,df4_1
##splite the data as train(201801-201910) test (201911-201912) 切割数据
df_b=pd.DataFrame.drop_duplicates(df_a, subset=None, keep=\'first\', inplace=False)
df_b=df_b.sort_values(by=\'slsj\')
df_b.reset_index(drop=True, inplace=True)
i=pd.DataFrame(df_b[\'slsj\'])
train=df_b[0:2195040]
test =df_b[2195040:]
print(test.iloc[0,])
del df_a,i
print(df_b.iloc[0,])
df_c=df_b ##backup
#df_b=df_b.drop(\'timestamp\',axis=1)
##Aggregate data by Date 按照天进行聚合
df_b[\'count\']=1
df_b=df_b[[\'slsj\',\'count\']]
df_b[\'timestamp\']=pd.to_datetime(df_b[\'slsj\'],format=\'%d-%m-%Y %H:%M\')
df_b.index=df_b[\'timestamp\']
df_b.head()
df_b=df_b.resample(\'D\').sum()
##train
train[\'count\']=1
train=train[[\'slsj\',\'count\']]
train[\'timestamp\']=pd.to_datetime(train[\'slsj\'],format=\'%d-%m-%Y %H:%M\')
train.index=train[\'timestamp\']
train=train.resample(\'D\').sum()
train.head()
##test
test[\'count\']=1
test=test[[\'slsj\',\'count\']]
test[\'timestamp\']=pd.to_datetime(test[\'slsj\'],format=\'%d-%m-%Y %H:%M\')
test.index=test[\'timestamp\']
test=test.resample(\'D\').sum()
test.head()
##plot 看看图
plt.figure(figsize=(12,8))
plt.plot(train.index,train[\'count\'],label=\'Train\')
plt.plot(test.index,test[\'count\'],label=\'Test\')
plt.legend(loc=\'best\')
plt.title(\'Daily Counts\')
plt.show()
##一些检验
sm.tsa.seasonal_decompose(df_b[\'count\']).plot() ##季节性分解 /Seasonal decomposition
print(\'pvalue={}\'.format(adfuller(df_b[\'count\'])[1])) ##增强Dickey-Fuller单位根检验 pvalue
##model进行拟合和预测
X = df_b.values
predictions=pd.DataFrame()
original=pd.DataFrame()
for i in range(0,365):
train, test = X[0+i:365+i], X[365+i:366+i]
model=AR(train)
model_fit = model.fit()
p = model_fit.predict(start=len(train), end=len(train)+len(test)-1, dynamic=False)
predictions=predictions.append([p])
original=original.append([test[0,0]])
i=i+1
print(i,test,p)
predictions=predictions.reset_index(drop=True)
original=original.reset_index(drop=True)
del train,test,model,model_fit,p
##Change index to date 把INDEX换成日期,看图更直观
i=df_b.index
i2=i[365:730]
predictions.set_index(i2,inplace=True)
original.set_index(i2,inplace=True)
del i,i2
##PLOT 图
plt.figure(facecolor=\'white\')
plt.plot(predictions,color=\'red\', label=\'Predict\')
plt.plot(original,color=\'blue\', label=\'Original\')
plt.legend(loc=\'best\')
plt.title(\'predictions vs expected\')
plt.show()
##RMSE emmm
y_true=np.array(predictions)
y_true=y_true.tolist()
y_pred=np.array(original)
y_pred=y_pred.tolist()
mse=mean_squared_error(y_true,y_pred)
with tf.Session() as session:
session.run(tf.global_variables_initializer())
i=session.run(mse)
rmse=math.sqrt(i[0])