A-3.RNN Encoder-Decoder with Attentionによる文章生成

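任意の長さのテキストをAttentionベースのRNN Encoder-Decoder(LSTM)に学習させるためのKerasのサンプルコード。「モデルの定義と訓練」と「保存したモデルによる文章生成」は別々に実行します。なお、文章生成のスクリプトは訓練用スクリプトから変数(maxlen, text, char_indices, indices_char)をimportするため、訓練用スクリプトは text_generation_for_aiandstory.py という名前で同じディレクトリに保存してください。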

サンプルデータ(要解凍)
https://aiandstory.net/sample.zip

モデルの定義と訓練

# -*- coding: utf-8 -*-

from __future__ import print_function
from tensorflow.keras.models import Sequential, Model
from tensorflow.keras.layers import Dense, LSTM, Embedding
from tensorflow.keras.layers import Reshape, RepeatVector, TimeDistributed, Activation
from tensorflow.keras.layers import add, concatenate, Flatten, dot, Lambda, Permute
from tensorflow.keras.optimizers import Adamax
from tensorflow.keras import Input

import numpy as np
import random
import sys
import warnings
warnings.filterwarnings('ignore')

# Data preparation. This part runs at import time as well, because the
# generation script imports maxlen, text, char_indices and indices_char
# from this module.

# Path to the UTF-8 encoded training corpus.
path = "sample.txt"

# Read the whole corpus in one go.
with open(path, encoding='utf-8') as f:
    text = f.read()
print('corpus length:', len(text))

# Strip every whitespace run (spaces, tabs, newlines) from the corpus.
tokens = text.split()
text = ''.join(tokens)

# Sorted vocabulary so that the character <-> index mapping is deterministic.
chars = sorted(set(text))
print('total chars:', len(chars))

# Character -> index and index -> character lookup tables.
char_indices = {c: i for i, c in enumerate(chars)}
indices_char = dict(enumerate(chars))

# Sliding-window training data.
maxlen = 30  # number of characters in each input window
gen_charlen = 1  # number of characters in each target (answer)
step = 1  # stride between consecutive windows
sentences = []  # input windows, as lists of character indices
next_chars = []  # target windows, as lists of character indices

# Encode the corpus once; every window below is then a cheap list slice
# instead of maxlen dictionary lookups per window.
encoded_text = [char_indices[char] for char in text]

# The last valid window starts at len(text) - maxlen - gen_charlen, so the
# range bound is that value + 1. Without the + 1 the final character of the
# corpus would never be used as a target. A corpus that is too short simply
# yields no samples.
last_start = len(encoded_text) - maxlen - gen_charlen
for i in range(0, last_start + 1, step):
    sentences.append(encoded_text[i: i + maxlen])
    next_chars.append(encoded_text[i + maxlen: i + maxlen + gen_charlen])
print('input data sequences:', len(sentences))

# build the model:

print('Build model...')

# The vocabulary size is the input dimension of the Embedding layer.
vcab_size = len(chars)

# Everything below is skipped when this file is imported from another script;
# only the data-preparation variables defined above are shared.
if __name__ == "__main__":  # guard required for importing

    # The inputs are character indices, hence int32. `shape` excludes the
    # sample (batch) axis, so each input is a vector of maxlen indices.
    text_input = Input(shape=(maxlen,), dtype='int32', name='text')
    # Embedding layer: maps each index to an output_dim-dimensional vector.
    embedded_text = Embedding(input_dim=vcab_size, output_dim=128)(text_input)

    hidden_unit = 128  # number of hidden units in each LSTM
    output = embedded_text

    # ----- Encoder: encodes the input sequence into state vectors -----
    # return_sequences=True is needed for attention: the output at every
    # time step is kept, not only the last one.
    output_enc = LSTM(hidden_unit, return_sequences=True, dropout=0.2, recurrent_dropout=0.5)(output)
    # Take only the last time step as the decoder input (stands in for
    # return_sequences=False). Per the Keras Lambda documentation,
    # output_shape must NOT include the batch axis, so it is (hidden_unit,)
    # rather than (None, hidden_unit); the latter declares a rank-3 output
    # where RepeatVector below requires rank 2.
    output_enc_last = Lambda(lambda x: x[:, -1, :], output_shape=(hidden_unit,))(output_enc)
    # --------------------

    # ----- Decoder: decodes the state vector into the output sequence -----
    # Repeat the encoder state once per character to be generated.
    output_dec = RepeatVector(gen_charlen)(output_enc_last)
    # Decoder LSTM layer.
    output_dec = LSTM(hidden_unit, return_sequences=True, dropout=0.2, recurrent_dropout=0.5)(output_dec)

    # ----- Attention -----
    # Implementation of (7) in [3]: dot-product scores between every decoder
    # step and every encoder step, normalised by softmax over the encoder
    # steps (the last axis), then used to weight the encoder outputs.
    attention = dot([output_dec, output_enc], axes=[2, 2])
    attention = Activation('softmax')(attention)
    context = dot([attention, output_enc], axes=[2, 1])
    decoder_combined_context = concatenate([context, output_dec])

    output_dec = decoder_combined_context

    # TimeDistributed applies the wrapped layer (Dense here) to each of the
    # RepeatVector steps.
    # Implementation of (5) in [3].
    output_dec = TimeDistributed(Dense(512, activation="tanh"))(output_dec)
    # ----- End of attention -----

    # Softmax over the vocabulary (vcab_size) for every output character.
    output_dec = TimeDistributed(Dense(vcab_size, activation='softmax'))(output_dec)
    # ----- End of decoder -----

    # Instantiate the model.
    model = Model(text_input, output_dec)

    # `learning_rate` is the supported argument name; the old `lr` alias is
    # deprecated and rejected by newer Keras releases.
    optimizer = Adamax(learning_rate=0.01)
    # Targets are integer class indices rather than one-hot vectors, so
    # sparse_categorical_crossentropy is used. This choice does not affect
    # the model's output shape, which is still one probability distribution
    # per output character.
    model.compile(loss='sparse_categorical_crossentropy', optimizer=optimizer, metrics=['accuracy'])
    model.summary()

    # Build the NumPy arrays for model.fit().
    x = np.array(sentences)
    y = np.array(next_chars)
    # Reshape the targets to (samples, gen_charlen, 1) to line up with the
    # per-step output used by sparse_categorical_crossentropy.
    y = np.reshape(y, (-1, gen_charlen, 1))

    # x: inputs, y: targets; the number of epochs (10) is arbitrary.
    model.fit(x, y,
              batch_size=128,
              epochs=10)
    # Save the trained model.
    model.save('text_generation_for_aiandstory')

保存したモデルによる文章生成

# -*- coding: utf-8 -*-

import numpy as np
import sys
from tensorflow.keras.models import load_model
from text_generation_for_aiandstory import maxlen, text, char_indices, indices_char

# Load the trained model written by the training script.
model = load_model('text_generation_for_aiandstory')

# Nothing has been generated yet, so this write emits an empty string.
generated = ''
sys.stdout.write(generated)

# Offset into the corpus (sample.txt) at which the seed sentence starts.
s_ind = 102
sentence = text[s_ind: s_ind + maxlen]
# To use a custom seed instead, assign it to `sentence` here. It must be
# maxlen characters long and every character must exist in the vocabulary.
#sentence = ""
print(sentence)

# Run 500 prediction steps, feeding each prediction back into the seed.
for _ in range(500):
    # A batch holding one sample: the seed encoded as character indices.
    encoded_seed = np.array([[char_indices[ch] for ch in sentence]])

    # Take the single sample's output: one probability row per generated
    # character, so targets longer than one character are handled as well.
    step_probs = model.predict(encoded_seed, verbose=0)[0]
    # Greedy decoding: pick the most probable character at every position.
    predicted = ''.join(indices_char[np.argmax(row)] for row in step_probs)

    # Slide the window: drop as many leading characters as were generated
    # and append the new ones, keeping the seed length constant.
    sentence = sentence[len(predicted):] + predicted

    sys.stdout.write(predicted)
    sys.stdout.flush()

参考文献

全般

[1] https://github.com/keras-team/keras/tree/master/examples

Attention

[2] wanasit : Attention-based Sequence-to-Sequence in Keras https://wanasit.github.io/attention-based-sequence-to-sequence-in-keras.html (2017)

[3] Minh-Thang Luong, Hieu Pham, Christopher D. Manning : “Effective Approaches to Attention-based Neural Machine Translation” arXiv preprint arXiv:1508.04025 (2015)