I am trying to do some vanilla pattern recognition with an LSTM using Keras to predict the next element in a sequence.
My data look like this:
where the label of the training sequence is the last element in the list: X_train['Sequence'][n][-1].
Because my Sequence column can have a variable number of elements, I believe an RNN is the best model to use. Below is my attempt to build an LSTM in Keras:
# Build the model
from keras.models import Sequential
from keras.layers import Embedding, LSTM, Dense, Activation

# A few arbitrary constants...
max_features = 20000
out_size = 128

# The max length should be the length of the longest sequence (minus one to account for the label)
max_length = X_train['Sequence'].apply(len).max() - 1

# Normal LSTM model construction with sigmoid activation
model = Sequential()
model.add(Embedding(max_features, out_size, input_length=max_length, dropout=0.2))
model.add(LSTM(128, dropout_W=0.2, dropout_U=0.2))
model.add(Dense(1))
model.add(Activation('sigmoid'))

# try using different optimizers and different optimizer configs
model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])
And here's how I attempt to train my model:
# Train the model
for seq in X_train['Sequence']:
    print("Length of training is {0}".format(len(seq[:-1])))
    print("Training set is {0}".format(seq[:-1]))
    model.fit(np.array([seq[:-1]]), [seq[-1]])
My output is this:
Length of training is 13
Training set is [1, 3, 13, 87, 1053, 28576, 2141733, 508147108, 402135275365, 1073376057490373, 9700385489355970183, 298434346895322960005291, 31479360095907908092817694945]
However, I get the following error:
Exception: Error when checking model input: expected embedding_input_1 to have shape (None, 347) but got array with shape (1, 13)
I believe my training step is set up correctly, so my model construction must be wrong. Note that 347 is max_length.
How can I correctly build a variable-length input LSTM in Keras? I'd prefer not to pad the data. Not sure if it's relevant, but I'm using the Theano backend.
3 Answers
#1
12
I am not clear about the embedding procedure, but here is a way to implement a variable-length input LSTM anyway: just do not specify the timespan dimension when building the LSTM.
import keras.backend as K
from keras.layers import LSTM, Input

I = Input(shape=(None, 200))  # unknown timespan, fixed feature size
lstm = LSTM(20)
f = K.function(inputs=[I], outputs=[lstm(I)])

import numpy as np
data1 = np.random.random(size=(1, 100, 200))  # batch_size = 1, timespan = 100
print(f([data1])[0].shape)
# (1, 20)

data2 = np.random.random(size=(1, 314, 200))  # batch_size = 1, timespan = 314
print(f([data2])[0].shape)
# (1, 20)
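The K.function above only runs a forward pass. The same shape=(None, 200) input also works for training in a regular functional-API model, provided each batch contains sequences of one common length (for example, one sequence per batch), so no padding is needed. A rough, untested sketch:

import numpy as np
from keras.layers import Input, LSTM, Dense
from keras.models import Model

inp = Input(shape=(None, 200))                       # timespan left unspecified
out = Dense(1, activation='sigmoid')(LSTM(20)(inp))
model = Model(inputs=inp, outputs=out)
model.compile(loss='binary_crossentropy', optimizer='adam')

# Each batch may use a different timespan, as long as it is uniform within the batch.
model.train_on_batch(np.random.random((1, 100, 200)), np.array([[1.0]]))
model.train_on_batch(np.random.random((1, 314, 200)), np.array([[0.0]]))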
#2
2
The trick to training and classifying sequences is to train with masking and to classify with a stateful network. Here's an example I made that classifies whether a variable-length sequence starts with zero or not.
import numpy as np
np.random.seed(1)
import tensorflow as tf
tf.set_random_seed(1)
from keras import models
from keras.layers import Dense, Masking, LSTM
import matplotlib.pyplot as plt
def stateful_model():
    hidden_units = 256

    model = models.Sequential()
    model.add(LSTM(hidden_units, batch_input_shape=(1, 1, 1), return_sequences=False, stateful=True))
    model.add(Dense(1, activation='relu', name='output'))
    model.compile(loss='binary_crossentropy', optimizer='rmsprop')
    return model


def train_rnn(x_train, y_train, max_len, mask):
    epochs = 10
    batch_size = 200
    vec_dims = 1
    hidden_units = 256
    in_shape = (max_len, vec_dims)

    model = models.Sequential()
    model.add(Masking(mask, name="in_layer", input_shape=in_shape))
    model.add(LSTM(hidden_units, return_sequences=False))
    model.add(Dense(1, activation='relu', name='output'))
    model.compile(loss='binary_crossentropy', optimizer='rmsprop')

    model.fit(x_train, y_train, batch_size=batch_size, epochs=epochs,
              validation_split=0.05)
    return model


def gen_train_sig_cls_pair(t_stops, num_examples, mask):
    x = []
    y = []
    max_t = int(np.max(t_stops))
    for t_stop in t_stops:
        one_indices = np.random.choice(a=num_examples, size=num_examples // 2, replace=False)

        sig = np.zeros((num_examples, max_t), dtype=np.int8)
        sig[one_indices, 0] = 1
        sig[:, t_stop:] = mask
        x.append(sig)

        cls = np.zeros(num_examples, dtype=np.bool)
        cls[one_indices] = 1
        y.append(cls)

    return np.concatenate(x, axis=0), np.concatenate(y, axis=0)


def gen_test_sig_cls_pair(t_stops, num_examples):
    x = []
    y = []
    for t_stop in t_stops:
        one_indices = np.random.choice(a=num_examples, size=num_examples // 2, replace=False)

        sig = np.zeros((num_examples, t_stop), dtype=np.bool)
        sig[one_indices, 0] = 1
        x.extend(list(sig))

        cls = np.zeros((num_examples, t_stop), dtype=np.bool)
        cls[one_indices] = 1
        y.extend(list(cls))

    return x, y


if __name__ == '__main__':
    noise_mag = 0.01
    mask_val = -10
    signal_lengths = (10, 15, 20)

    x_in, y_in = gen_train_sig_cls_pair(signal_lengths, 10, mask_val)
    mod = train_rnn(x_in[:, :, None], y_in, int(np.max(signal_lengths)), mask_val)

    testing_dat, expected = gen_test_sig_cls_pair(signal_lengths, 3)

    state_mod = stateful_model()
    state_mod.set_weights(mod.get_weights())

    res = []
    for s_i in range(len(testing_dat)):
        seq_in = list(testing_dat[s_i])
        seq_len = len(seq_in)

        for t_i in range(seq_len):
            res.extend(state_mod.predict(np.array([[[seq_in[t_i]]]])))

        state_mod.reset_states()

    fig, axes = plt.subplots(2)
    axes[0].plot(np.concatenate(testing_dat), label="input")
    axes[1].plot(res, "ro", label="result", alpha=0.2)
    axes[1].plot(np.concatenate(expected, axis=0), "bo", label="expected", alpha=0.2)
    axes[1].legend(bbox_to_anchor=(1.1, 1))
    plt.show()
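One more way to use the trained, masked model mod: a shorter sequence can also be scored directly by padding it up to the training length with the same mask value, so the Masking layer skips the padded timesteps. A rough sketch:

import numpy as np

max_len = 20    # longest training length used above
mask_val = -10  # same mask value the model was trained with

short_seq = [1, 0, 0, 0, 0]                                    # a length-5 test sequence
padded = np.full((1, max_len, 1), mask_val, dtype=np.float32)  # (batch, time, features)
padded[0, :len(short_seq), 0] = short_seq
print(mod.predict(padded))  # probability that the sequence starts with 1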
#3
0
Not sure how applicable recurrent networks are for your sequences, i.e. how strongly each element depends on its preceding sequence as opposed to other factors. That said (which of course doesn't help you one bit), if you don't want to pad your input with some sentinel value, a stateful model that processes a single timestep at a time is, IMHO, the only alternative for variable-length sequences. If you don't mind taking an alternative approach to encoding:
import numpy as np
import keras.models as kem
import keras.layers as kel
import keras.callbacks as kec
import sklearn.preprocessing as skprep
X_train, max_features = {'Sequence': [[1, 2, 4, 5, 8, 10, 16], [1, 2, 1, 5, 5, 1, 11, 16, 7]]}, 16
num_mem_units = 64
size_batch = 1
num_timesteps = 1
num_features = 1
num_targets = 1
num_epochs = 1500
model = kem.Sequential()
model.add(kel.LSTM(num_mem_units, stateful=True,
                   batch_input_shape=(size_batch, num_timesteps, num_features),
                   return_sequences=True))
model.add(kel.Dense(num_targets, activation='sigmoid'))
model.summary()
model.compile(loss='binary_crossentropy', optimizer='adam')
range_act = (0, 1) # sigmoid
range_features = np.array([0, max_features]).reshape(-1, 1)
normalizer = skprep.MinMaxScaler(feature_range=range_act)
normalizer.fit(range_features)
reset_state = kec.LambdaCallback(on_epoch_end=lambda *_ : model.reset_states())
# training
for seq in X_train['Sequence']:
    X = seq[:-1]
    y = seq[1:]  # predict next element
    X_norm = normalizer.transform(np.array(X).reshape(-1, 1)).reshape(-1, num_timesteps, num_features)
    y_norm = normalizer.transform(np.array(y).reshape(-1, 1)).reshape(-1, num_timesteps, num_targets)
    model.fit(X_norm, y_norm, epochs=num_epochs, batch_size=size_batch, shuffle=False,
              callbacks=[reset_state])

# prediction
for seq in X_train['Sequence']:
    model.reset_states()
    for istep in range(len(seq) - 1):  # feed the input up to, but not including, the last element
        val = seq[istep]
        X = np.array([val]).reshape(-1, 1)
        X_norm = normalizer.transform(X).reshape(-1, num_timesteps, num_features)
        y_norm = model.predict(X_norm)
        yhat = int(normalizer.inverse_transform(y_norm[0])[0, 0])
    y = seq[-1]  # last element is the expected value
    put = '{0} predicts {1:d}, expecting {2:d}'.format(', '.join(str(val) for val in seq[:-1]), yhat, y)
    print(put)
which produces something like:
1, 2, 4, 5, 8, 10 predicts 11, expecting 16
1, 2, 1, 5, 5, 1, 11, 16 predicts 7, expecting 7
with a ridiculously high loss, however.
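The high loss is probably unsurprising: binary_crossentropy is a classification loss, while predicting the next (normalized) element is really a regression task, so a loss such as mean squared error seems the more natural fit. A rough, untested variant of the compile step:

# Rough sketch: same stateful model as above, but compiled with a regression loss.
import keras.models as kem
import keras.layers as kel

model = kem.Sequential()
model.add(kel.LSTM(num_mem_units, stateful=True,
                   batch_input_shape=(size_batch, num_timesteps, num_features),
                   return_sequences=True))
model.add(kel.Dense(num_targets, activation='sigmoid'))
model.compile(loss='mse', optimizer='adam')  # mean squared error on the normalized targets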