深度學習 - LSTM生成文本 - 唐詩 - 文本向量化
經過一番工夫將文本預處理後,接著就是要將文本向量化。
首先需要統計總共出現過哪些字,即不重複的字有幾個。
先看總共有幾個字:
計算一下總共有多少字
# Flatten every poem into one list of characters and report the total.
words = []
for poem in final_poetry:
    words.extend(poem)
print('總共有{}個字'.format(len(words)))
總共有2977881個字
接著利用 dict() 統計每個字出現的次數
# Tally how many times each character occurs across all poems.
counted_words = {}
for poem in final_poetry:
    for word in poem:
        # A character seen for the first time starts from a count of zero.
        counted_words[word] = counted_words.get(word, 0) + 1
print('不重複出現的字總共有{}個'.format(len(counted_words)))
不重複出現的字總共有7045個
將低頻的字去掉,先建立一個 erase list,再將低頻的字丟進 list 後,再依 list 中的元素逐個刪除。
# Collect the rare characters (seen at most twice) first; a dict must not
# change size while it is being iterated, so deletion happens afterwards.
erase = [key for key in counted_words if counted_words[key] <= 2]
for key in erase:
    del counted_words[key]
# Remaining (character, count) pairs, most frequent first. reverse=True keeps
# the sort stable, so equal counts stay in their original relative order.
wordPairs = sorted(counted_words.items(), key=lambda pair: pair[1], reverse=True)
print('低頻字去掉後剩下 {} 個不重複的字'.format(len(counted_words)))
低頻字去掉後剩下 5817 個不重複的字
文本清理得差不多了,最後就是將文字映射到向量。
# Split the (character, count) pairs; only the characters are needed.
words, _counts = zip(*wordPairs)
# Append a blank as the final vocabulary entry.
words = words + (" ",)
# Lookup tables: character -> index and index -> character.
word2num = {c: i for i, c in enumerate(words)}
num2word = dict(enumerate(words))
文本預處理的工作準備得差不多了,接著直接使用「造輪者」寫好的程式碼(有巨人的肩膀就站上去吧XD),來開始訓練模型吧!
其完整程式碼如下:
將其寫在 data_utils.py 存在根目錄底下。
# *-* coding:utf-8 *-*
def preprocess_file(Config):
    """Build the character vocabulary and lookup tables from the poetry file.

    Reads ``Config.poetry_file`` as UTF-8, one poem per line. A line is kept
    only when its sixth character is a comma; every kept line gets a trailing
    "]" appended as a separator. Characters occurring at most twice in the
    kept text are left out of the vocabulary.

    Args:
        Config: Any object exposing a ``poetry_file`` attribute that holds
            the path of the corpus file.

    Returns:
        A 4-tuple ``(word2numF, num2word, words, files_content)``:

        * ``word2numF``: callable mapping a character to its index; a
          character outside the vocabulary maps to the last index (the
          trailing " " entry).
        * ``num2word``: dict mapping index -> character.
        * ``words``: tuple of vocabulary characters, most frequent first
          (equal counts in ascending character order), with " " appended.
        * ``files_content``: the concatenated text of all kept lines.

        When no character is frequent enough (for example an empty file),
        ``words`` is ``(" ",)`` instead of the call failing.
    """
    from collections import Counter

    kept_lines = []
    with open(Config.poetry_file, 'r', encoding='UTF-8') as f:
        for line in f:
            # Terminate every poem with "]" so poems stay separable.
            x = line.strip() + "]"
            # Original author's note: the corpus is already filtered, so no
            # "title:" prefix has to be split off here.
            if len(x) <= 5:
                continue
            # Keep only lines whose first clause is five characters long
            # (five-character verse); everything else is ignored.
            # NOTE(review): this compares against an ASCII comma - confirm
            # the corpus does not use a full-width comma instead.
            if x[5] == ',':
                kept_lines.append(x)
    # Join once instead of growing a string inside the loop.
    files_content = ''.join(kept_lines)

    # Frequency of every character, ignoring those seen at most twice.
    counted_words = Counter(files_content)
    frequent = [(ch, n) for ch, n in counted_words.items() if n > 2]
    # Most frequent first; ties are broken by ascending character so the
    # resulting indices are deterministic.
    wordPairs = sorted(frequent, key=lambda pair: (-pair[1], pair[0]))
    # Building the tuple directly also works when wordPairs is empty.
    words = tuple(ch for ch, _ in wordPairs) + (" ",)

    # Lookup tables: character -> index and index -> character.
    word2num = {c: i for i, c in enumerate(words)}
    num2word = dict(enumerate(words))
    unknown_index = len(words) - 1

    def word2numF(x):
        """Return the index of character x, or the last index if unknown."""
        return word2num.get(x, unknown_index)

    return word2numF, num2word, words, files_content
其中,將 Config 寫成 class,存成 Config.py 放在根目錄,內容如下:
class Config:
    """File locations and training parameters gathered in one place."""

    # Path of the poetry corpus read by the preprocessing step.
    poetry_file = './data/final_poetry.txt'
    # Presumably where the model weights are stored - confirm with the
    # training code, which is not part of this file.
    weight_file = './data/poetry_model.h5'

    # Per the original author's note: the previous six characters are used
    # to predict the seventh.
    max_len = 6

    # Training parameters.
    batch_size = 32
    learning_rate = 0.001
將文本的路徑,與訓練參數寫在Config 中,方便調整參數。
首先需要統計總共出現過哪些字,即不重複的字有幾個。
計算一下總共有多少字
# Flatten every poem into one list of characters and report the total.
words = []
for poem in final_poetry:
    words.extend(poem)
print('總共有{}個字'.format(len(words)))
總共有2977881個字
# Tally how many times each character occurs across all poems.
counted_words = {}
for poem in final_poetry:
    for word in poem:
        # A character seen for the first time starts from a count of zero.
        counted_words[word] = counted_words.get(word, 0) + 1
print('不重複出現的字總共有{}個'.format(len(counted_words)))
不重複出現的字總共有7045個
# Collect the rare characters (seen at most twice) first; a dict must not
# change size while it is being iterated, so deletion happens afterwards.
erase = [key for key in counted_words if counted_words[key] <= 2]
for key in erase:
    del counted_words[key]
# Remaining (character, count) pairs, most frequent first. reverse=True keeps
# the sort stable, so equal counts stay in their original relative order.
wordPairs = sorted(counted_words.items(), key=lambda pair: pair[1], reverse=True)
print('低頻字去掉後剩下 {} 個不重複的字'.format(len(counted_words)))
低頻字去掉後剩下 5817 個不重複的字
# Split the (character, count) pairs; only the characters are needed.
words, _counts = zip(*wordPairs)
# Append a blank as the final vocabulary entry.
words = words + (" ",)
# Lookup tables: character -> index and index -> character.
word2num = {c: i for i, c in enumerate(words)}
num2word = dict(enumerate(words))
其完整程式碼如下:
# *-* coding:utf-8 *-*
def preprocess_file(Config):
    """Build the character vocabulary and lookup tables from the poetry file.

    Reads ``Config.poetry_file`` as UTF-8, one poem per line. A line is kept
    only when its sixth character is a comma; every kept line gets a trailing
    "]" appended as a separator. Characters occurring at most twice in the
    kept text are left out of the vocabulary.

    Args:
        Config: Any object exposing a ``poetry_file`` attribute that holds
            the path of the corpus file.

    Returns:
        A 4-tuple ``(word2numF, num2word, words, files_content)``:

        * ``word2numF``: callable mapping a character to its index; a
          character outside the vocabulary maps to the last index (the
          trailing " " entry).
        * ``num2word``: dict mapping index -> character.
        * ``words``: tuple of vocabulary characters, most frequent first
          (equal counts in ascending character order), with " " appended.
        * ``files_content``: the concatenated text of all kept lines.

        When no character is frequent enough (for example an empty file),
        ``words`` is ``(" ",)`` instead of the call failing.
    """
    from collections import Counter

    kept_lines = []
    with open(Config.poetry_file, 'r', encoding='UTF-8') as f:
        for line in f:
            # Terminate every poem with "]" so poems stay separable.
            x = line.strip() + "]"
            # Original author's note: the corpus is already filtered, so no
            # "title:" prefix has to be split off here.
            if len(x) <= 5:
                continue
            # Keep only lines whose first clause is five characters long
            # (five-character verse); everything else is ignored.
            # NOTE(review): this compares against an ASCII comma - confirm
            # the corpus does not use a full-width comma instead.
            if x[5] == ',':
                kept_lines.append(x)
    # Join once instead of growing a string inside the loop.
    files_content = ''.join(kept_lines)

    # Frequency of every character, ignoring those seen at most twice.
    counted_words = Counter(files_content)
    frequent = [(ch, n) for ch, n in counted_words.items() if n > 2]
    # Most frequent first; ties are broken by ascending character so the
    # resulting indices are deterministic.
    wordPairs = sorted(frequent, key=lambda pair: (-pair[1], pair[0]))
    # Building the tuple directly also works when wordPairs is empty.
    words = tuple(ch for ch, _ in wordPairs) + (" ",)

    # Lookup tables: character -> index and index -> character.
    word2num = {c: i for i, c in enumerate(words)}
    num2word = dict(enumerate(words))
    unknown_index = len(words) - 1

    def word2numF(x):
        """Return the index of character x, or the last index if unknown."""
        return word2num.get(x, unknown_index)

    return word2numF, num2word, words, files_content
class Config:
    """File locations and training parameters gathered in one place."""

    # Path of the poetry corpus read by the preprocessing step.
    poetry_file = './data/final_poetry.txt'
    # Presumably where the model weights are stored - confirm with the
    # training code, which is not part of this file.
    weight_file = './data/poetry_model.h5'

    # Per the original author's note: the previous six characters are used
    # to predict the seventh.
    max_len = 6

    # Training parameters.
    batch_size = 32
    learning_rate = 0.001
沒有留言:
張貼留言