基础版CNN
def get_model():
    """Build and compile the baseline 1-D CNN text classifier.

    The network embeds each of the 50 token ids into a 300-dimensional
    vector, applies three Conv1D layers (256, 128 and 64 filters) with max
    pooling after the first two, flattens the result, and finishes with a
    256-unit ReLU layer and a 3-way softmax.

    The vocabulary size is read from the module-level ``vocab`` mapping.

    Returns:
        The compiled Keras ``Sequential`` model.
    """
    K.clear_session()  # discard state left over from any previously built model
    model = Sequential()
    # The Embedding layer turns every token id into a word vector; one extra
    # row is allocated on top of len(vocab).
    model.add(Embedding(len(vocab) + 1, 300, input_length=50))
    model.add(Conv1D(256, 5, padding='same'))
    model.add(MaxPooling1D(3, 3, padding='same'))
    model.add(Conv1D(128, 5, padding='same'))
    model.add(MaxPooling1D(3, 3, padding='same'))
    model.add(Conv1D(64, 3, padding='same'))
    model.add(Flatten())
    model.add(Dropout(0.1))
    model.add(BatchNormalization())  # (batch) normalisation layer
    model.add(Dense(256, activation='relu'))
    model.add(Dropout(0.1))
    model.add(Dense(3, activation='softmax'))
    model.compile(loss='sparse_categorical_crossentropy', optimizer='adam', metrics=['accuracy'])
    # summary() prints the table itself and returns None, so wrapping it in
    # print() only appended a stray "None" line to the output.
    model.summary()
    return model
简单版TextCNN
def get_model(embedding_matrix=None):
    """Build and compile a simple TextCNN classifier.

    Three parallel Conv1D branches (window sizes 3, 4 and 5, 256 filters
    each) read the same embedded sequence. Each branch is max-pooled over
    the whole sequence, and the pooled features are concatenated and fed
    through dropout into a 3-way softmax.

    The vocabulary size is read from the module-level ``vocab`` mapping.

    Args:
        embedding_matrix: Optional array-like of shape
            ``(len(vocab) + 1, 300)`` holding pre-trained word vectors.
            When given, the embedding layer is initialised from it and
            frozen. When omitted, the embedding is trained together with
            the rest of the network; the previous code froze a randomly
            initialised table, which left the embedding untrained.

    Returns:
        The compiled Keras ``Model``.

    Raises:
        ValueError: Propagated from Keras when ``embedding_matrix`` does
            not match the embedding layer's weight shape.
    """
    seq_len = 50
    K.clear_session()  # discard state left over from any previously built model
    # Token ids are integers, so declare an integer input rather than float64.
    main_input = Input(shape=(seq_len,), dtype='int32')

    # Word embedding: frozen only when real pre-trained vectors are supplied.
    use_pretrained = embedding_matrix is not None
    embedder = Embedding(len(vocab) + 1, 300, input_length=seq_len,
                         trainable=not use_pretrained)
    embed = embedder(main_input)
    if use_pretrained:
        # The call above has built the layer, so its weights can be replaced.
        embedder.set_weights([np.asarray(embedding_matrix)])

    # Word-window sizes 3, 4 and 5.
    pooled = []
    for window in (3, 4, 5):
        conv = Conv1D(256, window, padding='same', strides=1, activation='relu')(embed)
        # padding='same' keeps all seq_len timesteps, so pool over the full
        # length. The former pool sizes 48/47/46 only fit 'valid' convolutions
        # and silently ignored the last 2-4 timesteps of every sequence.
        pooled.append(MaxPooling1D(pool_size=seq_len)(conv))

    # Merge the output vectors of the three branches.
    cnn = concatenate(pooled, axis=-1)
    flat = Flatten()(cnn)
    drop = Dropout(0.2)(flat)
    main_output = Dense(3, activation='softmax')(drop)
    model = Model(inputs=main_input, outputs=main_output)
    model.compile(loss='sparse_categorical_crossentropy', optimizer='adam', metrics=['accuracy'])
    # summary() prints the table itself and returns None; do not print() it.
    model.summary()
    return model
附录
全部源码
导包
import os
import random
from joblib import load, dump
from sklearn.model_selection import train_test_split
import pandas as pd
import jieba
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from tqdm import tqdm
import numpy as np
import pandas as pd
from keras.models import Sequential, Model
from keras.layers import Embedding, Conv1D, MaxPooling1D, Flatten, Dropout, BatchNormalization, Dense, Input, concatenate
from keras import backend as K
from keras.callbacks import EarlyStopping, ReduceLROnPlateau
构建文本迭代器
def get_text_label_iterator(data_path):
    """Yield ``(text, label)`` pairs from a tab-separated corpus file.

    Every usable line holds exactly two tab-separated fields once leading
    and trailing whitespace has been stripped. Blank lines are skipped
    silently; any other line that does not split into two fields is echoed
    to stdout and skipped.

    Args:
        data_path: Path of the UTF-8 encoded text file to read.

    Yields:
        Tuples ``(text, label)`` of strings, in file order.
    """
    with open(data_path, 'r', encoding='utf-8') as f:
        for line in f:
            stripped = line.strip()
            if not stripped:
                # Blank lines carry nothing worth reporting; echoing them only
                # spams stdout each time the file is re-read.
                continue
            fields = stripped.split('\t')
            if len(fields) != 2:
                print(line)
                continue
            text, label = fields
            yield text, label
# Smoke test: pull the first (text, label) pair from the training file.
it = get_text_label_iterator(r"data/keras_bert_train.txt")
next(it)
获得词汇表vocab
def get_segment_iterator(data_path):
    """Yield ``(tokens, label)`` pairs, with the text segmented by jieba.

    Args:
        data_path: Path of the corpus file, read through
            ``get_text_label_iterator``.

    Yields:
        Tuples of the token list produced by ``jieba.cut`` and the label
        string of the same line.
    """
    for sentence, label in get_text_label_iterator(data_path):
        tokens = list(jieba.cut(sentence))
        yield tokens, label
# Create a segment iterator over the training file; fetching a sample is left disabled.
it = get_segment_iterator(r"data/keras_bert_train.txt")
# next(it)
def get_only_segment_iterator(data_path):
    """Yield only the token lists of a corpus file, dropping the labels.

    Iteration is wrapped in ``tqdm`` so progress is displayed while the
    file is consumed.

    Args:
        data_path: Path of the corpus file, read through
            ``get_segment_iterator``.

    Yields:
        The token list of each sample.
    """
    for tokens, _unused_label in tqdm(get_segment_iterator(data_path)):
        yield tokens
# One-off tokenizer fitting, kept commented out: the fitted tokenizer is
# stored on disk and simply reloaded below.
# tokenizer=Tokenizer() # create a Tokenizer object
# # fit_on_texts assigns an id to every word of the input texts; ids follow word frequency, the more frequent the word the smaller its id
# tokenizer.fit_on_texts(get_only_segment_iterator(r"data/keras_bert_train.txt"))
# dump(tokenizer, r"data/keras_textcnn_tokenizer.bin")
# NOTE(review): joblib.load unpickles the file, so only load tokenizer files you trust.
tokenizer = load(r"data/keras_textcnn_tokenizer.bin")
vocab = tokenizer.word_index # mapping from each word to its integer id
获取样本个数
def get_sample_count(data_path):
    """Count the well-formed ``(text, label)`` samples in a corpus file.

    The file is streamed through ``get_text_label_iterator`` with a
    ``tqdm`` progress display, so lines that iterator skips are not counted.

    Args:
        data_path: Path of the corpus file.

    Returns:
        The number of samples as an int (0 for a file without samples).
    """
    samples = tqdm(get_text_label_iterator(data_path))
    return sum(1 for _sample in samples)
# Count the samples of both splits once up front; the counts are later
# turned into steps per epoch with get_step().
train_sample_count = get_sample_count(r"data/keras_bert_train.txt")
dev_sample_count = get_sample_count(r"data/keras_bert_dev.txt")
构建标签表
def read_category(data_path):
    """Return the category names under *data_path* and a name -> id mapping.

    Every directory entry of ``data_path`` is treated as one category. The
    names are sorted before ids are assigned because ``os.listdir`` returns
    entries in arbitrary order; without sorting, the label ids could differ
    between runs or machines and would no longer match a trained model.

    Args:
        data_path: Directory whose entries name the categories.

    Returns:
        A tuple ``(categories, cat_to_id)``: the sorted list of names and a
        dict mapping each name to its index in that list.
    """
    categories = sorted(os.listdir(data_path))
    cat_to_id = {name: index for index, name in enumerate(categories)}
    return categories, cat_to_id
# Build the label table from the entries of the THUCNews folder, then echo
# the name -> id mapping for inspection.
categories, cat_to_id = read_category("000_text_classifier_tensorflow_textcnn/THUCNews")
cat_to_id
构建输入数据迭代器
def get_data_iterator(data_path):
    """Endlessly yield ``(padded_token_ids, label_id)`` training samples.

    The corpus file is re-read from the start each time it is exhausted,
    so this generator never terminates on its own. It relies on the
    module-level ``tokenizer`` and ``cat_to_id``.

    Args:
        data_path: Path of the corpus file.

    Yields:
        Tuples of a length-50 array of token ids and the integer id of the
        sample's label.

    Raises:
        KeyError: If a label is missing from ``cat_to_id``.
    """
    while True:
        for tokens, label in get_segment_iterator(data_path):
            id_rows = tokenizer.texts_to_sequences([tokens])
            # Fix the length at 50: longer sequences are cut, shorter ones are
            # filled with 0 at the front.
            fixed_rows = pad_sequences(id_rows, maxlen=50)
            yield fixed_rows[0], cat_to_id[label]
# Smoke test: fetch the first encoded sample of the training file.
it = get_data_iterator(r"data/keras_bert_train.txt")
next(it)
Building prefix dict from the default dictionary …
Loading model from cache /tmp/jieba.cache
Loading model cost 1.039 seconds.
Prefix dict has been built succesfully.
(array([ 69, 2160, 57, 3010, 55, 828, 68, 1028,
456, 3712, 2130, 1, 36, 116604, 361, 7019,
377, 26, 8, 76, 539, 1, 346, 7323,
89885, 7019, 73, 7, 55, 84, 3, 33,
3199, 69, 579, 1366, 2, 1526, 26, 89,
456, 5741, 8256, 1, 6163, 7253, 10831, 14,
77404, 3], dtype=int32), 0)
def get_batch_data_iterator(data_path, batch_size=64, shuffle=True):
    """Endlessly yield ``(token_id_batch, label_batch)`` NumPy array pairs.

    Samples are drawn from ``get_data_iterator`` in file order,
    ``batch_size`` at a time. With ``shuffle`` enabled, only the order of
    the samples inside each batch is randomised; which samples share a
    batch is still decided by the file order.

    Args:
        data_path: Path of the corpus file.
        batch_size: Number of samples per batch.
        shuffle: Whether to shuffle the samples within each batch.

    Yields:
        Tuples ``(token_ids, labels)`` of NumPy arrays, each holding
        ``batch_size`` entries.
    """
    sample_stream = get_data_iterator(data_path)
    while True:
        batch = [next(sample_stream) for _ in range(batch_size)]
        if shuffle:
            random.shuffle(batch)
        token_rows = [token_ids.tolist() for token_ids, _ in batch]
        label_ids = [label_id for _, label_id in batch]
        yield np.array(token_rows), np.array(label_ids)
# Fetch a single-sample batch to inspect the batch output format.
it = get_batch_data_iterator(r"data/keras_bert_train.txt", batch_size=1)
next(it)
(array([[ 69, 2160, 57, 3010, 55, 828, 68, 1028,
456, 3712, 2130, 1, 36, 116604, 361, 7019,
377, 26, 8, 76, 539, 1, 346, 7323,
89885, 7019, 73, 7, 55, 84, 3, 33,
3199, 69, 579, 1366, 2, 1526, 26, 89,
456, 5741, 8256, 1, 6163, 7253, 10831, 14,
77404, 3]]),
array([0]))
# Fetch one batch of two samples to check batching; the in-batch shuffle
# (enabled by default) may reorder the pair relative to the file.
it = get_batch_data_iterator(r"data/keras_bert_train.txt", batch_size=2)
next(it)
(array([[ 5, 5013, 14313, 601, 15377, 23499, 13, 493,
1541, 247, 5, 35557, 21529, 15377, 5, 1764,
11, 2774, 15377, 5, 279, 1764, 430, 5,
4742, 36921, 24090, 6387, 23499, 13, 5013, 8319,
6387, 5, 2370, 1764, 6387, 5, 16122, 1764,
6387, 5, 14313, 3707, 6387, 5, 11, 2774,
247, 6387],
[ 69, 2160, 57, 3010, 55, 828, 68, 1028,
456, 3712, 2130, 1, 36, 116604, 361, 7019,
377, 26, 8, 76, 539, 1, 346, 7323,
89885, 7019, 73, 7, 55, 84, 3, 33,
3199, 69, 579, 1366, 2, 1526, 26, 89,
456, 5741, 8256, 1, 6163, 7253, 10831, 14,
77404, 3]]),
array([0, 0]))
定义 基础版CNN
def get_model():
    """Build and compile the baseline 1-D CNN text classifier.

    The network embeds each of the 50 token ids into a 300-dimensional
    vector, applies three Conv1D layers (256, 128 and 64 filters) with max
    pooling after the first two, flattens the result, and finishes with a
    256-unit ReLU layer and a 3-way softmax.

    The vocabulary size is read from the module-level ``vocab`` mapping.

    Returns:
        The compiled Keras ``Sequential`` model.
    """
    K.clear_session()  # discard state left over from any previously built model
    model = Sequential()
    # The Embedding layer turns every token id into a word vector; one extra
    # row is allocated on top of len(vocab).
    model.add(Embedding(len(vocab) + 1, 300, input_length=50))
    model.add(Conv1D(256, 5, padding='same'))
    model.add(MaxPooling1D(3, 3, padding='same'))
    model.add(Conv1D(128, 5, padding='same'))
    model.add(MaxPooling1D(3, 3, padding='same'))
    model.add(Conv1D(64, 3, padding='same'))
    model.add(Flatten())
    model.add(Dropout(0.1))
    model.add(BatchNormalization())  # (batch) normalisation layer
    model.add(Dense(256, activation='relu'))
    model.add(Dropout(0.1))
    model.add(Dense(3, activation='softmax'))
    model.compile(loss='sparse_categorical_crossentropy', optimizer='adam', metrics=['accuracy'])
    # summary() prints the table itself and returns None, so wrapping it in
    # print() only appended a stray "None" line to the output.
    model.summary()
    return model
# The models are compiled with metrics=['accuracy'], so Keras logs the
# validation metric as 'val_accuracy'. Monitoring the old 'val_acc' key
# matches nothing in the logs and leaves both callbacks inactive.
# Early stopping: halt training once validation accuracy has not improved
# for 3 epochs, to limit overfitting.
early_stopping = EarlyStopping(monitor='val_accuracy', mode='max', patience=3)
# Halve the learning rate when validation accuracy has stopped improving for 2 epochs.
plateau = ReduceLROnPlateau(monitor="val_accuracy", verbose=1, mode='max', factor=0.5, patience=2)
# Keep only the best model (disabled):
# checkpoint = ModelCheckpoint('trained_model/keras_bert_THUCNews.hdf5', monitor='val_accuracy',verbose=2, save_best_only=True, mode='max', save_weights_only=True)
def get_step(sample_count, batch_size):
    """Return how many batches are needed to cover ``sample_count`` samples.

    This is the ceiling of ``sample_count / batch_size``: a final partial
    batch counts as a full step.

    Args:
        sample_count: Total number of samples (non-negative int).
        batch_size: Number of samples per batch (positive int).

    Returns:
        The number of steps as an int; 0 when there are no samples.

    Raises:
        ValueError: If ``batch_size`` is not positive.
    """
    if batch_size <= 0:
        raise ValueError(f"batch_size must be positive, got {batch_size!r}")
    # Negating twice turns floor division into ceiling division.
    return -(-sample_count // batch_size)
# Training run for the baseline CNN.
batch_size = 8
# Steps per epoch: number of batches needed to cover each split once.
train_step = get_step(train_sample_count, batch_size)
dev_step = get_step(dev_sample_count, batch_size)
# Both generators are endless; the step counts passed to fit() bound each epoch.
train_dataset_iterator = get_batch_data_iterator(r"data/keras_bert_train.txt", batch_size)
dev_dataset_iterator = get_batch_data_iterator(r"data/keras_bert_dev.txt", batch_size)
model = get_model()
# Train the model
model.fit(
train_dataset_iterator,
steps_per_epoch=train_step,
epochs=10,
validation_data=dev_dataset_iterator,
validation_steps=dev_step,
callbacks=[early_stopping, plateau],
verbose=1
)
Model: “sequential”
Layer (type) Output Shape Param #
=================================================================
embedding (Embedding) (None, 50, 300) 454574700
conv1d (Conv1D) (None, 50, 256) 384256
max_pooling1d (MaxPooling1D) (None, 17, 256) 0
conv1d_1 (Conv1D) (None, 17, 128) 163968
max_pooling1d_1 (MaxPooling1 (None, 6, 128) 0
conv1d_2 (Conv1D) (None, 6, 64) 24640
flatten (Flatten) (None, 384) 0
dropout (Dropout) (None, 384) 0
batch_normalization (BatchNo (None, 384) 1536
dense (Dense) (None, 256) 98560
dropout_1 (Dropout) (None, 256) 0
dense_1 (Dense) (None, 3) 771
=================================================================
Total params: 455,248,431
Trainable params: 455,247,663
Non-trainable params: 768
None
Epoch 1/10
1/83608 […] - ETA: 3:28 - loss: 1.1427 - accuracy: 0.3750
定义 简单版TextCNN
def get_model(embedding_matrix=None):
    """Build and compile a simple TextCNN classifier.

    Three parallel Conv1D branches (window sizes 3, 4 and 5, 256 filters
    each) read the same embedded sequence. Each branch is max-pooled over
    the whole sequence, and the pooled features are concatenated and fed
    through dropout into a 3-way softmax.

    The vocabulary size is read from the module-level ``vocab`` mapping.

    Args:
        embedding_matrix: Optional array-like of shape
            ``(len(vocab) + 1, 300)`` holding pre-trained word vectors.
            When given, the embedding layer is initialised from it and
            frozen. When omitted, the embedding is trained together with
            the rest of the network; the previous code froze a randomly
            initialised table, which left the embedding untrained.

    Returns:
        The compiled Keras ``Model``.

    Raises:
        ValueError: Propagated from Keras when ``embedding_matrix`` does
            not match the embedding layer's weight shape.
    """
    seq_len = 50
    K.clear_session()  # discard state left over from any previously built model
    # Token ids are integers, so declare an integer input rather than float64.
    main_input = Input(shape=(seq_len,), dtype='int32')

    # Word embedding: frozen only when real pre-trained vectors are supplied.
    use_pretrained = embedding_matrix is not None
    embedder = Embedding(len(vocab) + 1, 300, input_length=seq_len,
                         trainable=not use_pretrained)
    embed = embedder(main_input)
    if use_pretrained:
        # The call above has built the layer, so its weights can be replaced.
        embedder.set_weights([np.asarray(embedding_matrix)])

    # Word-window sizes 3, 4 and 5.
    pooled = []
    for window in (3, 4, 5):
        conv = Conv1D(256, window, padding='same', strides=1, activation='relu')(embed)
        # padding='same' keeps all seq_len timesteps, so pool over the full
        # length. The former pool sizes 48/47/46 only fit 'valid' convolutions
        # and silently ignored the last 2-4 timesteps of every sequence.
        pooled.append(MaxPooling1D(pool_size=seq_len)(conv))

    # Merge the output vectors of the three branches.
    cnn = concatenate(pooled, axis=-1)
    flat = Flatten()(cnn)
    drop = Dropout(0.2)(flat)
    main_output = Dense(3, activation='softmax')(drop)
    model = Model(inputs=main_input, outputs=main_output)
    model.compile(loss='sparse_categorical_crossentropy', optimizer='adam', metrics=['accuracy'])
    # summary() prints the table itself and returns None; do not print() it.
    model.summary()
    return model
# Training run for the TextCNN model; get_model() now resolves to the
# most recent definition above.
batch_size = 8
# Steps per epoch: number of batches needed to cover each split once.
train_step = get_step(train_sample_count, batch_size)
dev_step = get_step(dev_sample_count, batch_size)
# Fresh endless generators, so this run starts again from the top of each file.
train_dataset_iterator = get_batch_data_iterator(r"data/keras_bert_train.txt", batch_size)
dev_dataset_iterator = get_batch_data_iterator(r"data/keras_bert_dev.txt", batch_size)
model = get_model()
# Train the model
model.fit(
train_dataset_iterator,
steps_per_epoch=train_step,
epochs=10,
validation_data=dev_dataset_iterator,
validation_steps=dev_step,
callbacks=[early_stopping, plateau],
verbose=1
)
Model: “functional_1”
Layer (type) Output Shape Param # Connected to
==================================================================================================
input_1 (InputLayer) [(None, 50)] 0
embedding (Embedding) (None, 50, 300) 454574700 input_1[0][0]
conv1d (Conv1D) (None, 50, 256) 230656 embedding[0][0]
conv1d_1 (Conv1D) (None, 50, 256) 307456 embedding[0][0]
conv1d_2 (Conv1D) (None, 50, 256) 384256 embedding[0][0]
max_pooling1d (MaxPooling1D) (None, 1, 256) 0 conv1d[0][0]
max_pooling1d_1 (MaxPooling1D) (None, 1, 256) 0 conv1d_1[0][0]
max_pooling1d_2 (MaxPooling1D) (None, 1, 256) 0 conv1d_2[0][0]
concatenate (Concatenate) (None, 1, 768) 0 max_pooling1d[0][0]
max_pooling1d_1[0][0]
max_pooling1d_2[0][0]
flatten (Flatten) (None, 768) 0 concatenate[0][0]
dropout (Dropout) (None, 768) 0 flatten[0][0]
dense (Dense) (None, 3) 2307 dropout[0][0]
==================================================================================================
Total params: 455,499,375
Trainable params: 924,675
Non-trainable params: 454,574,700
None
Epoch 1/10
238/83608 […] - ETA: 2:31:07 - loss: 0.0308 - accuracy: 0.9979