
LSTM-Based Time Series Prediction

2019-09-06 - Code cleanup

This is an electricity-consumption forecasting project. The dataset consists of dates and daily power usage, one reading per day, spanning a little over three years.

Environment: Windows 10, Python 3.7.2, Keras 2.2.5, TensorFlow 1.14.0.

The keras and tensorflow packages are required.

LSTM is an improved variant of the RNN (recurrent neural network) and is well suited to forecasting time-series data. It can work with a single variable or with multiple variables; a univariate approach is shown here.
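Before the full listing, here is a minimal sketch (my own illustration, not part of the project code) of how a univariate series is cut into supervised windows for an LSTM:

import numpy as np

series = np.array([10.0, 11.0, 12.0, 13.0, 14.0, 15.0])  # made-up daily usage values
seq_len = 3

# each window holds seq_len inputs plus the next value as the label
windows = np.array([series[i:i + seq_len + 1] for i in range(len(series) - seq_len)])
X = windows[:, :-1].reshape(-1, seq_len, 1)  # (samples, timesteps, features)
y = windows[:, -1]                           # the next-day value to predict

The load_data2 function in the listing below builds exactly this kind of window, adding per-window normalisation on top.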

 

Below is the source code. For the dataset, please contact the site owner, who will provide it free of charge, or wait until a later revision of this article puts the data on the server for download (honestly, I'm just feeling too lazy to do it right now, haha).

 

from __future__ import print_function

import time
import datetime
#from datetime import *
import warnings
import numpy as np
import pandas as pd
#import time
#import matplotlib.pyplot as plt
from numpy import newaxis
from keras.layers.core import Dense, Activation, Dropout
from keras.layers.recurrent import LSTM
from keras.models import Sequential
import sys

 

warnings.filterwarnings("ignore")

"""
Function: load_data2
INPUTS:  filename         - input time-series file name
         seq_len          - size of the training data window
         normalise_window - whether to normalise the data
         pred_length      - length of the test data held out when modelling

RETURNS: training data

REMARKS: Read the time series and turn it into rows of seq_len+1 values,
         each row shifted one step forward from the previous one,
         then normalise the data for training.
"""
def load_data2(filename, seq_len, normalise_window, pred_length):
    #f = open(filename, 'rb').read()
    f = open(filename, 'r').read()
    data = f.split('\n')

    print('data len:', len(data))
    print('sequence len:', seq_len)

    sequence_length = seq_len + 1
    result = []
    for index in range(len(data) - sequence_length):
        result.append(data[index: index + sequence_length])  # windows of seq_len+1 values; the last one is the label

    print('result len:', len(result))
    print('result shape:', np.array(result).shape)
    #print(result[:1])

    if normalise_window:
        result = normalise_windows(result)

    #print(result[:1])
    print('normalise_windows result shape:', np.array(result).shape)

    result = np.array(result)

    row = result.shape[0] - pred_length

    train = result[:row, :]
    np.random.shuffle(train)
    x_train = train[:, :-1]
    y_train = train[:, -1]

    x_train = np.reshape(x_train, (x_train.shape[0], x_train.shape[1], 1))
    return [x_train, y_train]
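# Illustration with made-up numbers: for seq_len = 3 and a file containing
# 10, 11, 12, 13, 14 (one value per line, ending with a newline), the loop
# above builds the overlapping windows [10, 11, 12, 13] and [11, 12, 13, 14];
# after normalisation the first three values of each window form x_train and
# the last one y_train, and x_train is reshaped to (samples, seq_len, 1) as
# the Keras LSTM layer expects.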

 

"""
Function: normalise_windows
INPUTS:  window_data - input data

RETURNS: normalised data

REMARKS: Normalise the data so that each window is expressed relative to
         its first value (the first value of every row becomes 0).
"""
def normalise_windows(window_data):
    normalised_data = []
    for window in window_data:  # window shape: (sequence_length,), i.e. (51,)
        normalised_window = [((float(p) / float(window[0])) - 1) for p in window]
        normalised_data.append(normalised_window)
    return normalised_data
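# Worked example (made-up values): the window [100, 110, 90] normalises to
# [0.0, 0.1, -0.1], since each value p maps to p / window[0] - 1. The
# prediction functions below invert this with p_denorm = (p_norm + 1) * base.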

 

"""
Function: build_model
INPUTS:  layers - network sizes (layer1, layer2, layer3, layer4)
         drop   - fraction of units dropped by the LSTM Dropout layers

RETURNS: the compiled model

REMARKS: Stack the LSTM layers, the output layer, etc.
"""
def build_model(layers, drop):  # e.g. layers = [1, 50, 100, 1]
    model = Sequential()

    # input_dim/output_dim are the deprecated Keras 1 names; this is the Keras 2 form
    model.add(LSTM(layers[1], input_shape=(None, layers[0]), return_sequences=True))
    model.add(Dropout(drop))

    model.add(LSTM(layers[2], return_sequences=False))
    model.add(Dropout(drop))

    model.add(Dense(layers[3]))
    model.add(Activation("linear"))

    start = time.time()
    model.compile(loss="mse", optimizer="rmsprop")
    print("Compilation Time : ", time.time() - start)
    return model
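# With the values used in the main block (layers = [1, 7, 14, 1], drop = 0.5)
# this builds: LSTM(7, return_sequences=True) -> Dropout(0.5) -> LSTM(14)
# -> Dropout(0.5) -> Dense(1) -> linear activation, trained with MSE loss
# and the RMSprop optimizer.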

 

"""
Function: predict_sequence_full1
INPUTS:  model       - the trained model
         data_n      - normalised data used for prediction
         data_r      - raw, un-normalised data used for prediction
         window_size - training data window size
         length      - length of the sequence to predict

RETURNS: the predicted sequence

REMARKS: Roll the forecast forward, shifting the window one step at a time.
"""
# Rolling forecast, method 1
def predict_sequence_full1(model, data_n, data_r, window_size, length):
    curr_frame = data_n  # shape (window_size, 1), e.g. (50, 1)
    buffer = data_r
    predicted = []
    predicted2 = []
    for i in range(0, length):
        # curr_frame[newaxis, :, :] has shape (1, window_size, 1)
        predicted.append(model.predict(curr_frame[newaxis, :, :])[0, 0])
        curr_frame = curr_frame[1:]
        curr_frame = np.insert(curr_frame, [window_size - 1], predicted[-1], axis=0)  # numpy.insert(arr, obj, values, axis=None)

    base = float(buffer[0][0])
    for j in range(len(predicted)):
        predicted2.append((predicted[j] + 1) * base)

    return predicted2
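# Note: method 1 denormalises every step against the same base, buffer[0][0]
# (the first raw value of the window), because the normalised predictions all
# stay relative to the start of the original window.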

 

"""
Function: predict_sequence_full3
INPUTS:  model       - the trained model
         data_n      - normalised data used for prediction
         data_r      - raw, un-normalised data used for prediction
         window_size - training data window size
         length      - length of the sequence to predict

RETURNS: the predicted sequence

REMARKS: Roll the forecast forward one step at a time, re-zeroing the first
         element of the window at every step (i.e. re-normalising).
"""
# Rolling forecast, method 2
def predict_sequence_full3(model, data_n, data_r, window_size, length):
    curr_frame = data_n  # shape (window_size, 1), e.g. (50, 1)
    buffer = data_r
    predicted = []
    predicted2 = []
    for i in range(0, length):
        predicted.append(model.predict(curr_frame[newaxis, :, :])[0, 0])
        curr_frame = curr_frame[1:]
        curr_frame = np.insert(curr_frame, [window_size - 1], predicted[-1], axis=0)

        # re-normalise the window so that its first element is 0 again
        next_frame = np.array([])
        for j in range(len(curr_frame)):
            if j == 0:
                next_frame = np.insert(next_frame, 0, 0, axis=0)
            else:
                next_frame = np.insert(next_frame, j, ((curr_frame[j] + 1) / (curr_frame[0] + 1) - 1), axis=0)

        next_frame = next_frame.reshape((window_size, 1))
        curr_frame = next_frame

    for j in range(len(predicted)):
        base = float(buffer[j][0])
        predicted2.append((predicted[j] + 1) * base)
        buffer = np.insert(buffer, [window_size + j], predicted2[-1], axis=0)

    return predicted2
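# Unlike method 1, each window here is re-anchored before the next step, so
# prediction j is relative to position j of the rolling series and is
# denormalised against buffer[j][0]; each denormalised value is appended to
# buffer so it can serve as the base for later steps.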

 

"""
Function: predict_cal_error
INPUTS:  model    - the trained model
         pred_len - prediction length
         seq_len  - time window length
         file     - input time-series file name

RETURNS: prediction error

REMARKS: Compute the LSTM's prediction error.
"""
def predict_cal_error(model, pred_len, seq_len, file):
    pred_lst_pos = -pred_len
    #print('predict_cal_error', filename)

    f = open(file, 'r').read()
    data = f.split('\n')
    data_orig = data.copy()

    # take the seq_len values immediately before the held-out test stretch
    data = data[(pred_lst_pos - seq_len - 1):pred_lst_pos - 1]
    data_r = data[0:seq_len]
    data_r = np.array(data_r)
    data_r = data_r.reshape((seq_len, 1))

    result = []
    result.append(data[0: 0 + seq_len])
    result = normalise_windows(result)
    result = np.array(result)
    result = result.reshape(seq_len, 1)

    full_predictions3 = predict_sequence_full3(model, result, data_r, seq_len, pred_len)
    full_predictions1 = predict_sequence_full1(model, result, data_r, seq_len, pred_len)

    #data_v = data[pred_lst_pos-1:-1]
    data_v = data_orig[pred_lst_pos - 1:-1]

    data_v = np.array(data_v, dtype=float)
    v = data_v.sum()
    full_predict = np.array(full_predictions3)
    pred = full_predict.sum()
    print("data_v: ", data_v)
    print("full_predict: ", full_predict)
    return (v - pred) / v
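# The return value is the signed relative error of the summed forecast,
# (actual_total - predicted_total) / actual_total, so err = 0.005 in the main
# block means the predicted total may deviate from the actual by at most 0.5%.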

"""
Function: curve_smooth
INPUTS:  fileinput  - input time-series data file
         fileTmpOut - file for the smoothed series
         rate1      - lower smoothing bound (as a fraction of the mean)
         rate2      - upper smoothing bound (as a fraction of the mean)

RETURNS: nothing (writes the smoothed series to fileTmpOut)

REMARKS: Smooth the input data: values below rate1 times the mean or above
         rate2 times the mean are replaced.
"""
def curve_smooth(fileinput, fileTmpOut, rate1, rate2):
    # data smoothing
    filename = fileinput

    f = open(filename, 'r').read()
    data = f.split('\n')
    data2 = []
    for i in range(len(data) - 1):
        data2.append(float(data[i]))
    data2 = pd.DataFrame(data2, dtype=float)
    avg = data2.mean().iloc[0]

    data3 = data2.copy()

    for i in range(len(data3)):
        data3.iloc[i, 0] = 0

    for i in range(len(data2)):
        if ((data2.iloc[i, 0] < rate1 * avg) | (data2.iloc[i, 0] > rate2 * avg)):
            # prefer the value 7 days earlier, then 28 days earlier,
            # then 7 days later, then 28 days later, then the mean
            if ((i >= 7) & (data2.iloc[i - 7, 0] >= rate1 * avg) & (data2.iloc[i - 7, 0] <= rate2 * avg)):
                data3.iloc[i, 0] = data2.iloc[i - 7, 0]
            elif ((i >= 28) & (data2.iloc[i - 28, 0] >= rate1 * avg) & (data2.iloc[i - 28, 0] <= rate2 * avg)):
                data3.iloc[i, 0] = data2.iloc[i - 28, 0]
            elif (i + 7 < len(data2)):
                if ((data2.iloc[i + 7, 0] >= rate1 * avg) & (data2.iloc[i + 7, 0] <= rate2 * avg)):
                    data3.iloc[i, 0] = data2.iloc[i + 7, 0]
                elif (i + 28 < len(data2)):
                    if ((data2.iloc[i + 28, 0] >= rate1 * avg) & (data2.iloc[i + 28, 0] <= rate2 * avg)):
                        data3.iloc[i, 0] = data2.iloc[i + 28, 0]
                    else:
                        data3.iloc[i, 0] = avg
                else:
                    data3.iloc[i, 0] = avg
            else:
                data3.iloc[i, 0] = avg
        else:
            data3.iloc[i, 0] = data2.iloc[i, 0]

    data3.to_csv(fileTmpOut, encoding='utf-8', index=False, header=False)

    return
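# Example (made-up numbers): with rate1 = 0.75 and rate2 = 1.3 (the values the
# main block passes in) and a series mean of 1000, a reading of 400 is outside
# [750, 1300] and is replaced by the value from 7 days earlier, provided that
# value itself lies inside the band; otherwise the fallbacks above apply.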

"""
Function: main_lstm_run_forecast
INPUTS:  epochs      - number of training epochs
         bsize       - batch size per training step
         pred_len    - sequence length used when testing the network
         seq_len     - window size
         pred_length - length of the sequence to forecast
         file_n      - input time-series file name
         layer1      - input units
         layer2      - units in the first LSTM layer
         layer3      - units in the second LSTM layer
         layer4      - output units
         drop        - dropout fraction, used against overfitting
         err         - target error rate when training the network

RETURNS: the forecast sequences

REMARKS: (Old version) build 5 LSTM models, check their accuracy, and forecast
         the requested sequence with the most accurate one.
         (New version) train until the cross-validation error is at most err
         (0.5% by default); if 10 training runs fail to reach that target,
         fall back to the most accurate model so far.
"""
def main_lstm_run_forecast(epochs, bsize, pred_len, seq_len, pred_length, file_n, layer1, layer2, layer3, layer4, drop, err):
    X_train, y_train = load_data2(file_n, seq_len, True, pred_len)

    error_bk = 10
    cnt = 0
    while True:
        model = build_model([layer1, layer2, layer3, layer4], drop)
        # 'nb_epoch' is the deprecated Keras 1 name; 'epochs' is correct for Keras 2
        model.fit(X_train, y_train, batch_size=bsize, epochs=epochs, validation_split=0.05, verbose=1)
        error = predict_cal_error(model, pred_len, seq_len, file_n)
        cnt = cnt + 1
        print("error:", error)
        print("count: ", cnt)

        # stop once the error is within tolerance (0.5% by default)
        if abs(error) <= err:
            break
        else:
            if abs(error) < error_bk:
                model_bk = model
                error_bk = abs(error)
            # train at most 10 times
            if cnt == 10:
                model = model_bk
                break

    f = open(file_n, 'r').read()
    data = f.split('\n')
    #data = data[(pred_lst_pos-seq_len-1):pred_lst_pos-1]
    data = data[(-seq_len - 1):-1]
    data_r = data[0:seq_len]
    data_r = np.array(data_r)
    data_r = data_r.reshape((seq_len, 1))

    result = []
    result.append(data[0: 0 + seq_len])
    result = normalise_windows(result)
    result = np.array(result)
    result = result.reshape(seq_len, 1)
    full_predictions3 = predict_sequence_full3(model, result, data_r, seq_len, pred_length)
    full_predictions4 = predict_sequence_full1(model, result, data_r, seq_len, pred_length)

    return [full_predictions3, full_predictions4]
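# Training note: each pass of the while loop rebuilds the model from scratch,
# so every one of the (at most 10) attempts restarts from fresh random
# weights; model_bk keeps whichever attempt scored the smallest |error|.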

 

if __name__ == '__main__':
    epochs = 100
    batchsize = 256
    seq_len = 40
    pred_len = 14     # length used to test; 7 or 14 recommended
    pred_length = 31  # length used to predict

    filename = 'D:\公司\绍兴电量预测项目\高压纺织业/aaa_1.csv'

    fileTmpOut = 'D:\公司\绍兴电量预测项目/temp/usage_data.csv'
    outputfile = 'D:\公司\绍兴电量预测项目/temp/outputlstm.csv'

    date_pred = "2019/03/20"
    layer1 = 1
    layer2 = 7
    layer3 = 14
    layer4 = 1
    drop = 0.5
    err = 0.005

    fileSP = u'D:\公司\绍兴电量预测项目\高压纺织业/SP_Festival.csv'

    smooth = 0  # 1 = smooth the input first, 0 = use it as-is

    # filename = sys.argv[1]
    # epochs = int(sys.argv[2])
    # batchsize = int(sys.argv[3])
    # seq_len = int(sys.argv[4])
    # pred_len = int(sys.argv[5])
    # pred_length = int(sys.argv[6])
    # outputfile = sys.argv[7]
    # date_pred = sys.argv[8]
    # fileTmpOut = sys.argv[9]
    # layer1 = int(sys.argv[10])
    # layer2 = int(sys.argv[11])
    # layer3 = int(sys.argv[12])
    # layer4 = int(sys.argv[13])
    # drop = float(sys.argv[14])
    # err = float(sys.argv[15])
    # fileSP = sys.argv[16]

    print(filename, epochs, seq_len, pred_len, pred_length)

    if smooth == 1:
        curve_smooth(filename, fileTmpOut, 0.75, 1.3)
    else:
        fileTmpOut = filename

    forecast1, forecast2 = main_lstm_run_forecast(epochs, batchsize, pred_len, seq_len, pred_length, fileTmpOut, layer1, layer2, layer3, layer4, drop, err)
    print(forecast1)

    f = open(fileSP, 'r').read()
    dataSP = f.split('\n')

    sp_yr = []
    sp_dt = []
    for i in range(len(dataSP)):
        sp_yr.append(dataSP[i][0:4])   # year
        sp_dt.append(dataSP[i][6:16])  # Spring Festival date for that year

    date_time = datetime.datetime.strptime(date_pred, "%Y/%m/%d")
    date_tm = date_time + datetime.timedelta(30)

    # initialised up front so the flag exists even when no festival check runs
    overlapped = False

    if 1 <= date_tm.month <= 3:  # the forecast window may cross the Spring Festival
        for i in range(len(sp_yr)):
            if str(date_tm.year) == sp_yr[i]:
                sp_date = sp_dt[i]
                sp_date_last = sp_dt[i - 1]
                sp_date_last2 = sp_dt[i - 2]

        # check whether the forecast interval (22nd to 21st) partly falls
        # inside the Spring Festival period
        dt_sp_curr = datetime.datetime.strptime(sp_date, "%Y/%m/%d")
        dt_sp_curr_start = dt_sp_curr - datetime.timedelta(13)
        dt_sp_curr_end = dt_sp_curr + datetime.timedelta(15)

        if date_time.month == 1:  # covers 1/22-2/21
            if dt_sp_curr_start >= datetime.datetime.strptime(str(date_time.year) + '/01/22', "%Y/%m/%d"):
                #dt_sp_found_start = dt_sp_curr_start
                overlapped = True
            else:
                #dt_sp_found_start = datetime.datetime.strptime(str(date_time.year) + '/01/22', "%Y/%m/%d")
                overlapped = True

            if dt_sp_curr_end >= datetime.datetime.strptime(str(date_time.year) + '/02/21', "%Y/%m/%d"):
                #dt_sp_found_end = datetime.datetime.strptime(str(date_time.year) + '/02/21', "%Y/%m/%d")
                overlapped = True
            else:
                #dt_sp_found_end = dt_sp_curr_end
                overlapped = True

        elif date_time.month == 2:  # covers 2/22-3/21
            if dt_sp_curr_end >= datetime.datetime.strptime(str(date_time.year) + '/02/22', "%Y/%m/%d"):
                #dt_sp_found_start = datetime.datetime.strptime(str(date_time.year) + '/02/22', "%Y/%m/%d")
                #dt_sp_found_end = dt_sp_curr_end
                overlapped = True
            else:
                overlapped = False

        elif date_tm.month == 1:  # covers 12/22-1/21
            if dt_sp_curr_start <= datetime.datetime.strptime(str(date_tm.year) + '/01/21', "%Y/%m/%d"):
                overlapped = True
            else:
                overlapped = False

    # read the original file and index it by date
    f = open(filename, 'r').read()
    data_b = f.split('\n')
    data_b2 = []
    for i in data_b:
        if i != '':
            data_b2.append(i)

    #dt1 = datetime.datetime.strptime('2015/01/01', "%Y/%m/%d")
    dt1 = datetime.datetime.strptime('2016/01/01', "%Y/%m/%d")  # the data starts on 2016/1/1
    #dt2 = dt1 + datetime.timedelta(len(data_b) - 1 - 1)
    dt2 = dt1 + datetime.timedelta(len(data_b2) - 1)

    rng = pd.period_range(dt1, dt2, freq='D')
    data_buffer = pd.DataFrame(data_b2)
    data_buffer.index = rng
    data_buffer.columns = ['usage']

    if overlapped == True:
        dt_sp_last = datetime.datetime.strptime(sp_date_last, "%Y/%m/%d")
        dt_sp_last_start = dt_sp_last - datetime.timedelta(13)
        dt_sp_last_end = dt_sp_last + datetime.timedelta(15)

        dt_sp_last2 = datetime.datetime.strptime(sp_date_last2, "%Y/%m/%d")
        dt_sp_last2_start = dt_sp_last2 - datetime.timedelta(13)
        dt_sp_last2_end = dt_sp_last2 + datetime.timedelta(15)

        rng1 = pd.period_range(dt_sp_last_start, dt_sp_last_end, freq='D')
        rng2 = pd.period_range(dt_sp_last2_start, dt_sp_last2_end, freq='D')

        # .loc replaces the long-deprecated .ix indexer used in the original
        sp_last_buffer = data_buffer.loc[rng1]
        sp_last2_buffer = data_buffer.loc[rng2]

        rng3 = pd.period_range(dt_sp_curr_start, dt_sp_curr_end, freq='D')
        sp_last_buffer.index = rng3
        sp_last2_buffer.index = rng3

        sp_join_buffer = pd.concat([sp_last_buffer, sp_last2_buffer], axis=1)
        sp_join_buffer.columns = ['usage1', 'usage2']

        sp_join_buffer['avg_usage'] = 0
        for i in range(len(sp_join_buffer)):
            sp_join_buffer['avg_usage'][i] = (float(sp_join_buffer['usage1'][i]) + float(sp_join_buffer['usage2'][i])) / 2

        # consumption over the 30 days that precede the festival window
        # (the window starts 13 days before the festival itself)
        date_time2 = date_time + datetime.timedelta(pred_length - 1)
        rng_forecast = pd.period_range(date_time, date_time2, freq='D')
        forecast = pd.DataFrame(forecast1)
        forecast.index = rng_forecast
        forecast.columns = ['usage']
        data_buffer_forecast = pd.concat([data_buffer, forecast], axis=0)

        dt_sp_curr_start_prior_s = dt_sp_curr - datetime.timedelta(13 + 30)
        dt_sp_curr_start_prior_e = dt_sp_curr - datetime.timedelta(14)
        rng = pd.period_range(dt_sp_curr_start_prior_s, dt_sp_curr_start_prior_e)
        sp_curr_prior_buffer = data_buffer_forecast.loc[rng]

        sp_curr_prior = 0
        for i in range(len(sp_curr_prior_buffer)):
            sp_curr_prior = sp_curr_prior + float(sp_curr_prior_buffer['usage'][i])

        dt_sp_last_start_prior_s = dt_sp_last - datetime.timedelta(13 + 30)
        dt_sp_last_start_prior_e = dt_sp_last - datetime.timedelta(14)
        rng = pd.period_range(dt_sp_last_start_prior_s, dt_sp_last_start_prior_e)
        sp_last_prior_buffer = data_buffer_forecast.loc[rng]

        sp_last_prior = 0
        for i in range(len(sp_last_prior_buffer)):
            sp_last_prior = sp_last_prior + float(sp_last_prior_buffer['usage'][i])

        dt_sp_last2_start_prior_s = dt_sp_last2 - datetime.timedelta(13 + 30)
        dt_sp_last2_start_prior_e = dt_sp_last2 - datetime.timedelta(14)
        rng = pd.period_range(dt_sp_last2_start_prior_s, dt_sp_last2_start_prior_e)
        sp_last2_prior_buffer = data_buffer_forecast.loc[rng]

        sp_last2_prior = 0
        for i in range(len(sp_last2_prior_buffer)):
            sp_last2_prior = sp_last2_prior + float(sp_last2_prior_buffer['usage'][i])

        usage_ratio = 2 * sp_curr_prior / (sp_last_prior + sp_last2_prior)
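        # usage_ratio scales last years' festival-period consumption to this
        # year's level: it compares the 30 days before this year's festival
        # window with the average of the same stretch in the two prior years.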

        forecast['usage2'] = 0
        for i in range(len(forecast)):
            try:
                if float(sp_join_buffer.loc[str(rng_forecast[i])]['avg_usage']) > 0:
                    forecast['usage2'][str(rng_forecast[i])] = sp_join_buffer.loc[str(rng_forecast[i])]['avg_usage'] * usage_ratio
            except KeyError:
                # dates outside the festival window keep the plain LSTM forecast
                forecast['usage2'][str(rng_forecast[i])] = forecast['usage'][str(rng_forecast[i])]

    else:
        date_time2 = date_time + datetime.timedelta(pred_length - 1)
        rng_forecast = pd.period_range(date_time, date_time2, freq='D')
        forecast = pd.DataFrame(forecast1)
        forecast.index = rng_forecast
        forecast.columns = ['usage2']

    forecast['usage2'].to_csv(outputfile, encoding='utf-8', index=True, header=False)
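A note on the input format, inferred from the loaders above: the usage file (aaa_1.csv here) is read by splitting on newlines and casting to float, so it should contain one numeric daily value per line, with no header or date column, for example (made-up numbers):

12345.6
13001.2
12874.9

The Spring Festival file is parsed with fixed character offsets (the year in characters 1-4 and the festival date in characters 7-16 of each line), so its exact layout matters; check your own copy against those offsets before running.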
