Keras LSTM の文章生成を単語単位でやってみる

今回は、Keras LSTM の文章生成のプログラムを改造して、単語単位の文章生成をやってみます。

こんちは cedro です。

以前、Keras LSTM のサンプルプログラムで文字単位の文章生成をしてみました。これはこれで、結構雰囲気が出て面白いのですが、やっぱり本格的にやるには、単語単位じゃないとねーと思っていました。

最近、文章カテゴリー分類やマルコフ連鎖による文章生成をやってみる中で、Python による辞書の取り扱いにも慣れて来たので、そろそろ単語単位の文章生成をやってみたくなりました。

ということで、今回は、Keras LSTM の文章生成のプログラムを改造して、単語単位の文章生成をやってみます。

テキストを準備します。

今回も、元となるテキストは推理小説にしてみます。青空文庫にあるエドガー・アラン・ポーの「モルグ街の殺人」（605_ruby_20933.zip）をダウンロードします。


import sys
import re
 
path = './morgue.txt'
bindata = open(path, "rb")
lines = bindata.readlines()
for line in lines:
    text = line.decode('Shift_JIS')    # Shift_JISで読み込み
    text = re.split(r'\r',text)[0]     # 改行削除
    text = text.replace('｜','')       # ルビ前記号削除
    text = re.sub(r'《.+?》','',text)    # ルビ削除
    text = re.sub(r'［＃.+?］','',text)  # 入力者注削除
    print(text)
    file = open('data.txt','a',encoding='utf-8').write(text)  # UTF-8に変換

import sys

import re

path = './morgue.txt'

bindata = open(path, "rb")

lines = bindata.readlines()

for line in lines:

text = line.decode('Shift_JIS') # Shift_JISで読み込み

text = re.split(r'\r',text)[0] # 改行削除

text = text.replace('｜','') # ルビ前記号削除

text = re.sub(r'《.+?》','',text) # ルビ削除

text = re.sub(r'［＃.+?］','',text) # 入力者注削除

print(text)

file = open('data.txt','a',encoding='utf-8').write(text) # UTF-8に変換

ダウンロードしたテキストファイルをプログラムに適切に読ませるために、前処理を行うプログラムです。実行すると、テキストファイルの形式（Shift-JIS）で読み込み、改行、ルビ、入力者注などを削除してから、 UTF-8に変換して、data.txt という名前で保存します。

あとはエディターを使い、文章の前後にある余分な部分や、文中に出て来る注の（１）〜（19）や、空白等を手動で削除します。容量は101KBでした。

プログラムを改造します

今回、改造するプログラムは、lstm_text_generation.py です。


from __future__ import print_function
from keras.callbacks import LambdaCallback
from keras.models import Sequential
from keras.layers import Dense
from keras.layers import LSTM
from keras.optimizers import RMSprop
from janome.tokenizer import Tokenizer  # 追加
import numpy as np
import random
import sys
import io

path = './data.txt'
with io.open(path, encoding='utf-8') as f:
    text = f.read().lower()
print('corpus length:', len(text))

#chars = sorted(list(set(text)))
#print('total chars:', len(chars))
#char_indices = dict((c, i) for i, c in enumerate(chars))
#indices_char = dict((i, c) for i, c in enumerate(chars))

text =Tokenizer().tokenize(text, wakati=True)  # 分かち書きする
chars = text
count = 0
char_indices = {}  # 辞書初期化
indices_char = {}  # 逆引き辞書初期化

for word in chars:
    if not word in char_indices:  # 未登録なら
       char_indices[word] = count  # 登録する      
       count +=1
       print(count,word)  # 登録した単語を表示
# 逆引き辞書を辞書から作成する
indices_char = dict([(value, key) for (key, value) in char_indices.items()])

# cut the text in semi-redundant sequences of maxlen characters
maxlen = 5
step = 1
sentences = []
next_chars = []
for i in range(0, len(text) - maxlen, step):
    sentences.append(text[i: i + maxlen])
    next_chars.append(text[i + maxlen])
print('nb sequences:', len(sentences))

from __future__ import print_function

from keras.callbacks import LambdaCallback

from keras.models import Sequential

from keras.layers import Dense

from keras.layers import LSTM

from keras.optimizers import RMSprop

from janome.tokenizer import Tokenizer # 追加

import numpy as np

import random

import sys

import io

path = './data.txt'

with io.open(path, encoding='utf-8') as f:

text = f.read().lower()

print('corpus length:', len(text))

#chars = sorted(list(set(text)))

#print('total chars:', len(chars))

#char_indices = dict((c, i) for i, c in enumerate(chars))

#indices_char = dict((i, c) for i, c in enumerate(chars))

text =Tokenizer().tokenize(text, wakati=True) # 分かち書きする

chars = text

count = 0

char_indices = {} # 辞書初期化

indices_char = {} # 逆引き辞書初期化

for word in chars:

if not word in char_indices: # 未登録なら

char_indices[word] = count # 登録する

count +=1

print(count,word) # 登録した単語を表示

# 逆引き辞書を辞書から作成する

indices_char = dict([(value, key) for (key, value) in char_indices.items()])

# cut the text in semi-redundant sequences of maxlen characters

maxlen = 5

step = 1

sentences = []

next_chars = []

for i in range(0, len(text) - maxlen, step):

sentences.append(text[i: i + maxlen])

next_chars.append(text[i + maxlen])

print('nb sequences:', len(sentences))

プログラムの冒頭〜データセット作成までの部分です。8行目に、形態素解析ライブラリー janome を追加します。19〜22行目をコメントアウトし、24〜36行目を追加します。まず、24行目でテキストを分かち書きをして下記の様に text をリスト形式にしています。

text = [ ‘分析’, ‘的’, ‘な’, ‘もの’, ‘として’, ‘論じ’, ‘られ’, ‘て’, ‘いる’, … ]

30行目から分かち書きされた単語を順次辞書（char_indices）に登録して行きます。登録が完了したら、36行目で、逆引き辞書（indices_char）を辞書から作成します。それそれの辞書は、下記の様なイメージになります。

char_indices = { ‘分析’: 0, ‘的’: 1, ‘な’: 2, ‘もの’: 3, ‘として’: 4, ‘論じ’: 5, ‘られ’: 6, ‘て’: 7, ‘いる’: 8, … }

indices_char = { 0: ‘分析’, 1: ‘的’, 2: ‘な’, 3: ‘もの’, 4: ‘として’, 5: ‘論じ’, 6: ‘られ’, 7: ‘て’, 8: ‘いる’, … }

39−40行目は、データセット作成の設定で、5つの連続した単語（maxlen = 5）から次の単語を予測し、１つづつ（step = 1 ）スライドしながらデータセットを作成する設定にします。43−45行目で、5つの連続した単語は sentences に、次の単語は next_char に順次格納して行きます。その結果以下の様なイメージになります、

sentences = [ [‘分析’, ‘的’, ‘な’, ‘もの’, ‘として’], [‘的’, ‘な’, ‘もの’, ‘として’, ‘論じ’], [‘な’, ‘もの’, ‘として’, ‘論じ’, ‘られ’], … ]

next_char = [ ‘論じ’, ‘られ’, ‘て’, ‘いる’, … ]

後は、文字単位の場合と一緒で、辞書を使ってワンホットベクトルに変換する部分に渡せばOKです。


def on_epoch_end(epoch, _):
    # Function invoked at end of each epoch. Prints generated text.
    print()
    print('----- Generating text after Epoch: %d' % epoch)

    start_index = random.randint(0, len(text) - maxlen - 1)
    start_index = 0  # テキストの最初からスタート
    for diversity in [0.2]:  # diversity は 0.2のみ使用 
        print('----- diversity:', diversity)

        generated = ''
        sentence = text[start_index: start_index + maxlen]
        # sentence はリストなので文字列へ変換して使用
        generated += "".join(sentence)
        print(sentence)
        
        # sentence はリストなので文字列へ変換して使用
        print('----- Generating with seed: "' + "".join(sentence)+ '"')
        sys.stdout.write(generated)


        for i in range(400):
            x_pred = np.zeros((1, maxlen, len(chars)))
            for t, char in enumerate(sentence):
                x_pred[0, t, char_indices[char]] = 1.

            preds = model.predict(x_pred, verbose=0)[0]
            next_index = sample(preds, diversity)
            next_char = indices_char[next_index]

            generated += next_char
            sentence = sentence[1:]
            # sentence はリストなので append で結合する
            sentence.append(next_char)  

            sys.stdout.write(next_char)
            sys.stdout.flush()
        print()

print_callback = LambdaCallback(on_epoch_end=on_epoch_end)

model.fit(x, y,
          batch_size=128,
          epochs=60,
          callbacks=[print_callback])

def on_epoch_end(epoch, _):

# Function invoked at end of each epoch. Prints generated text.

print()

print('----- Generating text after Epoch: %d' % epoch)

start_index = random.randint(0, len(text) - maxlen - 1)

start_index = 0 # テキストの最初からスタート

for diversity in [0.2]: # diversity は 0.2のみ使用

print('----- diversity:', diversity)

generated = ''

sentence = text[start_index: start_index + maxlen]

# sentence はリストなので文字列へ変換して使用

generated += "".join(sentence)

print(sentence)

# sentence はリストなので文字列へ変換して使用

print('----- Generating with seed: "' + "".join(sentence)+ '"')

sys.stdout.write(generated)

for i in range(400):

x_pred = np.zeros((1, maxlen, len(chars)))

for t, char in enumerate(sentence):

x_pred[0, t, char_indices[char]] = 1.

preds = model.predict(x_pred, verbose=0)[0]

next_index = sample(preds, diversity)

next_char = indices_char[next_index]

generated += next_char

sentence = sentence[1:]

# sentence はリストなので append で結合する

sentence.append(next_char)

sys.stdout.write(next_char)

sys.stdout.flush()

print()

print_callback = LambdaCallback(on_epoch_end=on_epoch_end)

model.fit(x, y,

batch_size=128,

epochs=60,

callbacks=[print_callback])

プログラムの中盤から最後までの部分です。8行目の start_index = 0 は、毎回テキストの最初の５単語から（ ‘ 分析的なものとして’ から）、文章生成をさせる設定です。9行目 、diversity は0.2 のみ使用することにしています。

15、19行目は、” ” . join(sentence) でリストを文字列に変換しています。元は、１文字単位の文章生成プログラムなので、 sentence も文字列だったわけですが、分かち書きをした影響で sentence がリストになったため、リストから文字列に戻す必要があるわけです。

35行目、sentence がリストなので、next_char の結合は append を使います。


from __future__ import print_function
from keras.callbacks import LambdaCallback
from keras.models import Sequential
from keras.layers import Dense
from keras.layers import LSTM
from keras.optimizers import RMSprop
from janome.tokenizer import Tokenizer
import numpy as np
import random
import sys
import io

path = './data.txt'
with io.open(path, encoding='utf-8') as f:
    text = f.read().lower()
print('corpus length:', len(text))

text =Tokenizer().tokenize(text, wakati=True)  # 分かち書きする
chars = text
count = 0
char_indices = {}  # 辞書初期化
indices_char = {}  # 逆引き辞書初期化

for word in chars:
    if not word in char_indices:  # 未登録なら
       char_indices[word] = count  # 登録する      
       count +=1
       print(count,word)  # 登録した単語を表示
# 逆引き辞書を辞書から作成する
indices_char = dict([(value, key) for (key, value) in char_indices.items()])

# cut the text in semi-redundant sequences of maxlen characters
maxlen = 5
step = 1
sentences = []
next_chars = []
for i in range(0, len(text) - maxlen, step):
    sentences.append(text[i: i + maxlen])
    next_chars.append(text[i + maxlen])
print('nb sequences:', len(sentences))

print('Vectorization...')
x = np.zeros((len(sentences), maxlen, len(chars)), dtype=np.bool)
y = np.zeros((len(sentences), len(chars)), dtype=np.bool)
for i, sentence in enumerate(sentences):
    for t, char in enumerate(sentence):
        x[i, t, char_indices[char]] = 1
    y[i, char_indices[next_chars[i]]] = 1


# build the model: a single LSTM
print('Build model...')
model = Sequential()
model.add(LSTM(128, input_shape=(maxlen, len(chars))))
model.add(Dense(len(chars), activation='softmax'))

optimizer = RMSprop(lr=0.01)
model.compile(loss='categorical_crossentropy', optimizer=optimizer)


def sample(preds, temperature=1.0):
    # helper function to sample an index from a probability array
    preds = np.asarray(preds).astype('float64')
    preds = np.log(preds) / temperature
    exp_preds = np.exp(preds)
    preds = exp_preds / np.sum(exp_preds)
    probas = np.random.multinomial(1, preds, 1)
    return np.argmax(probas)


def on_epoch_end(epoch, _):
    # Function invoked at end of each epoch. Prints generated text.
    print()
    print('----- Generating text after Epoch: %d' % epoch)

    start_index = random.randint(0, len(text) - maxlen - 1)
    start_index = 0  # テキストの最初からスタート
    for diversity in [0.2]:  # diversity は 0.2のみ使用 
        print('----- diversity:', diversity)

        generated = ''
        sentence = text[start_index: start_index + maxlen]
        # sentence はリストなので文字列へ変換して使用
        generated += "".join(sentence)
        print(sentence)
        
        # sentence はリストなので文字列へ変換して使用
        print('----- Generating with seed: "' + "".join(sentence)+ '"')
        sys.stdout.write(generated)


        for i in range(400):
            x_pred = np.zeros((1, maxlen, len(chars)))
            for t, char in enumerate(sentence):
                x_pred[0, t, char_indices[char]] = 1.

            preds = model.predict(x_pred, verbose=0)[0]
            next_index = sample(preds, diversity)
            next_char = indices_char[next_index]

            generated += next_char
            sentence = sentence[1:]
            # sentence はリストなので append で結合する
            sentence.append(next_char)  

            sys.stdout.write(next_char)
            sys.stdout.flush()
        print()

print_callback = LambdaCallback(on_epoch_end=on_epoch_end)

model.fit(x, y,
          batch_size=128,
          epochs=60,
          callbacks=[print_callback])

100

101

102

103

104

105

106

107

108

109

110

111

112

113

114

115

116

117

118

from __future__ import print_function

from keras.callbacks import LambdaCallback

from keras.models import Sequential

from keras.layers import Dense

from keras.layers import LSTM

from keras.optimizers import RMSprop

from janome.tokenizer import Tokenizer

import numpy as np

import random

import sys

import io

path = './data.txt'

with io.open(path, encoding='utf-8') as f:

text = f.read().lower()

print('corpus length:', len(text))

text =Tokenizer().tokenize(text, wakati=True) # 分かち書きする

chars = text

count = 0

char_indices = {} # 辞書初期化

indices_char = {} # 逆引き辞書初期化

for word in chars:

if not word in char_indices: # 未登録なら

char_indices[word] = count # 登録する

count +=1

print(count,word) # 登録した単語を表示

# 逆引き辞書を辞書から作成する

indices_char = dict([(value, key) for (key, value) in char_indices.items()])

# cut the text in semi-redundant sequences of maxlen characters

maxlen = 5

step = 1

sentences = []

next_chars = []

for i in range(0, len(text) - maxlen, step):

sentences.append(text[i: i + maxlen])

next_chars.append(text[i + maxlen])

print('nb sequences:', len(sentences))

print('Vectorization...')

x = np.zeros((len(sentences), maxlen, len(chars)), dtype=np.bool)

y = np.zeros((len(sentences), len(chars)), dtype=np.bool)

for i, sentence in enumerate(sentences):

for t, char in enumerate(sentence):

x[i, t, char_indices[char]] = 1

y[i, char_indices[next_chars[i]]] = 1

# build the model: a single LSTM

print('Build model...')

model = Sequential()

model.add(LSTM(128, input_shape=(maxlen, len(chars))))

model.add(Dense(len(chars), activation='softmax'))

optimizer = RMSprop(lr=0.01)

model.compile(loss='categorical_crossentropy', optimizer=optimizer)

def sample(preds, temperature=1.0):

# helper function to sample an index from a probability array

preds = np.asarray(preds).astype('float64')

preds = np.log(preds) / temperature

exp_preds = np.exp(preds)

preds = exp_preds / np.sum(exp_preds)

probas = np.random.multinomial(1, preds, 1)

return np.argmax(probas)

def on_epoch_end(epoch, _):

# Function invoked at end of each epoch. Prints generated text.

print()

print('----- Generating text after Epoch: %d' % epoch)

start_index = random.randint(0, len(text) - maxlen - 1)

start_index = 0 # テキストの最初からスタート

for diversity in [0.2]: # diversity は 0.2のみ使用

print('----- diversity:', diversity)

generated = ''

sentence = text[start_index: start_index + maxlen]

# sentence はリストなので文字列へ変換して使用

generated += "".join(sentence)

print(sentence)

# sentence はリストなので文字列へ変換して使用

print('----- Generating with seed: "' + "".join(sentence)+ '"')

sys.stdout.write(generated)

for i in range(400):

x_pred = np.zeros((1, maxlen, len(chars)))

for t, char in enumerate(sentence):

x_pred[0, t, char_indices[char]] = 1.

preds = model.predict(x_pred, verbose=0)[0]

next_index = sample(preds, diversity)

next_char = indices_char[next_index]

generated += next_char

sentence = sentence[1:]

# sentence はリストなので append で結合する

sentence.append(next_char)

sys.stdout.write(next_char)

sys.stdout.flush()

print()

print_callback = LambdaCallback(on_epoch_end=on_epoch_end)

model.fit(x, y,

batch_size=128,

epochs=60,

callbacks=[print_callback])

プログラム全体を載せておきます。

プログラムを動かします

コーパスが101KB程度なので、普通のノートパソコンでも動きます。私のMacbookAirでは、220sec / epoch で 60epoch 動かすのに約3.7時間掛かりました。プログラムを実行すると、スクリーン表示が以下の様に自動的に進んで行きます。