#! -*- coding: utf-8 -*-
# BERT for a Seq2Seq task, using the UNILM scheme
# Introduction: https://kexue.fm/archives/6933

from __future__ import print_function
import glob
import os
import numpy as np
from tqdm import tqdm
from bert4keras.backend import keras, K
from bert4keras.layers import Loss
from bert4keras.models import build_transformer_model
from bert4keras.tokenizers import Tokenizer, load_vocab
from bert4keras.optimizers import Adam
from bert4keras.snippets import sequence_padding, open
from bert4keras.snippets import DataGenerator, AutoRegressiveDecoder
from keras.models import Model
import tensorflow as tf
from rouge import Rouge
from nltk.translate.bleu_score import sentence_bleu, SmoothingFunction

# os.environ["TF_KERAS"] = "1"
os.environ["CUDA_VISIBLE_DEVICES"] = "0"

# This part differs from the reference example: configure a TF1 session with
# GPU memory growth instead of letting Keras grab all GPU memory at once.
from keras.backend.tensorflow_backend import set_session
config = tf.ConfigProto()
config.gpu_options.allow_growth = True
set_session(tf.Session(config=config))

# Basic parameters
maxlen = 256
batch_size = 8
steps_per_epoch = 20000
epochs = 10000

# BERT configuration
config_path = 'bert_config_dropout_0_3.json'
checkpoint_path = './chinese_roberta_wwm_ext_L-12_H-768_A-12/bert_model.ckpt'
dict_path = './chinese_roberta_wwm_ext_L-12_H-768_A-12/vocab.txt'

# Training corpus: one sample per line, tab-separated as
#   source_text \t <middle field, unused here> \t target_text
file = "data/train_yy.txt"
try:
    with open(file, 'r', encoding="utf-8") as f:
        lines = [x.strip() for x in f if x.strip() != '']
except UnicodeDecodeError:
    with open(file, 'r', encoding="gbk") as f:
        lines = [x.strip() for x in f if x.strip() != '']

# Load and trim the vocabulary, then build the tokenizer
token_dict, keep_tokens = load_vocab(
    dict_path=dict_path,
    simplified=True,
    startswith=['[PAD]', '[UNK]', '[CLS]', '[SEP]'],
)
tokenizer = Tokenizer(token_dict, do_lower_case=True)


class data_generator(DataGenerator):
    """Data generator."""
    def __iter__(self, random=False):
        batch_token_ids, batch_segment_ids = [], []
        for is_end, txt in self.sample(random):
            text = txt.split('\t')
            if len(text) == 3:
                content = text[0]    # source text
                content_g = text[2]  # target text
                token_ids, segment_ids = tokenizer.encode(
                    content, content_g, maxlen=maxlen
                )
                batch_token_ids.append(token_ids)
                batch_segment_ids.append(segment_ids)
                if len(batch_token_ids) == self.batch_size or is_end:
                    batch_token_ids = sequence_padding(batch_token_ids)
                    batch_segment_ids = sequence_padding(batch_segment_ids)
                    yield [batch_token_ids, batch_segment_ids], None
                    batch_token_ids, batch_segment_ids = [], []


class CrossEntropy(Loss):
    """Cross-entropy loss, with the input (source) part masked out."""
    def compute_loss(self, inputs, mask=None):
        y_true, y_mask, y_pred = inputs
        y_true = y_true[:, 1:]   # target token_ids
        y_mask = y_mask[:, 1:]   # segment_ids, which mark exactly the positions to predict
        y_pred = y_pred[:, :-1]  # predicted sequence, shifted by one position
        loss = K.sparse_categorical_crossentropy(y_true, y_pred)
        loss = K.sum(loss * y_mask) / K.sum(y_mask)
        return loss


model = build_transformer_model(
    config_path,
    checkpoint_path,
    application='unilm',
    keep_tokens=keep_tokens,  # keep only the tokens in keep_tokens, trimming the original vocab
    ignore_invalid_weights=True
)
model.summary()

output = CrossEntropy(2)(model.inputs + model.outputs)

model = Model(model.inputs, output)
model.compile(optimizer=Adam(1e-5))


class AutoTitle(AutoRegressiveDecoder):
    """Seq2seq decoder."""
    @AutoRegressiveDecoder.wraps(default_rtype='probas')
    def predict(self, inputs, output_ids, states):
        token_ids, segment_ids = inputs
        token_ids = np.concatenate([token_ids, output_ids], 1)
        segment_ids = np.concatenate([segment_ids, np.ones_like(output_ids)], 1)
        return self.last_token(model).predict([token_ids, segment_ids])

    def generate(self, text, topk=1):
        max_c_len = maxlen - self.maxlen
        token_ids, segment_ids = tokenizer.encode(text, maxlen=max_c_len)
        output_ids = self.beam_search([token_ids, segment_ids], topk=topk)  # beam search
        return tokenizer.decode(output_ids)
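# A hedged illustration of the UNILM input layout (inferred from bert4keras's
# text-pair encoding and the masking in CrossEntropy above); the call below is
# illustrative only and not executed:
#
#   t_ids, s_ids = tokenizer.encode(u'source text', u'target text')
#   # t_ids ~ [CLS] src_1 ... src_m [SEP] tgt_1 ... tgt_n [SEP]
#   # s_ids ~   0     0   ...   0     0     1   ...   1     1
#
# segment_ids therefore doubles as the loss mask (only target positions
# contribute to the loss), and AutoTitle.predict above extends segment_ids
# with ones for each newly generated token.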
autotitle = AutoTitle(start_id=None, end_id=tokenizer._token_end_id, maxlen=60)


def just_show():
    # Sample Chinese source sentence for a quick generation demo
    s2 = u'尽管是有些疑惑,但大家也只敢是脸上带着笑意,慢慢地从苏溪的嘴里面套一些话出来。'
    for s in [s2]:
        print(u'Generated title:', autotitle.generate(s))
    print()


# Earlier version of Evaluator (kept commented out for reference): saved the
# weights whenever the training loss reached a new low, then printed a demo.
# class Evaluator(keras.callbacks.Callback):
#     """Evaluate and save."""
#     def __init__(self):
#         self.lowest = 1e10
#
#     def on_epoch_end(self, epoch, logs=None):
#         # save the best model
#         if logs['loss'] <= self.lowest:
#             self.lowest = logs['loss']
#             model.save_weights('./output/best_model_quan_reversal.weights')
#         # show a demo
#         just_show()


# NOTE: `valid_data` is used by Evaluator below but was never defined in the
# original script. As a placeholder (assumption), build (target, source) pairs
# from the last 100 training lines; replace this with a proper held-out
# validation set.
valid_data = [
    (t[2], t[0]) for t in (l.split('\t') for l in lines[-100:]) if len(t) == 3
]


class Evaluator(keras.callbacks.Callback):
    """Evaluate and save."""
    def __init__(self):
        self.rouge = Rouge()
        self.smooth = SmoothingFunction().method1
        self.best_bleu = 0.

    def on_epoch_end(self, epoch, logs=None):
        metrics = self.evaluate(valid_data)  # evaluate the model
        if metrics['bleu'] > self.best_bleu:
            self.best_bleu = metrics['bleu']
            model.save_weights('./best_model.weights')  # save the model
        metrics['best_bleu'] = self.best_bleu
        print('valid_data:', metrics)

    def evaluate(self, data, topk=1):
        total = 0
        rouge_1, rouge_2, rouge_l, bleu = 0, 0, 0, 0
        for title, content in tqdm(data):
            total += 1
            title = ' '.join(title).lower()
            pred_title = ' '.join(autotitle.generate(content, topk)).lower()
            if pred_title.strip():
                scores = self.rouge.get_scores(hyps=pred_title, refs=title)
                rouge_1 += scores[0]['rouge-1']['f']
                rouge_2 += scores[0]['rouge-2']['f']
                rouge_l += scores[0]['rouge-l']['f']
                bleu += sentence_bleu(
                    references=[title.split(' ')],
                    hypothesis=pred_title.split(' '),
                    smoothing_function=self.smooth
                )
        rouge_1 /= total
        rouge_2 /= total
        rouge_l /= total
        bleu /= total
        return {
            'rouge-1': rouge_1,
            'rouge-2': rouge_2,
            'rouge-l': rouge_l,
            'bleu': bleu,
        }


if __name__ == '__main__':
    evaluator = Evaluator()
    train_generator = data_generator(lines, batch_size)

    model.fit(
        train_generator.forfit(),
        steps_per_epoch=steps_per_epoch,
        epochs=epochs,
        callbacks=[evaluator]
    )

else:
    model.load_weights('./best_model.weights')
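# ---------------------------------------------------------------------------
# Usage sketch (not part of the original script): a minimal, hedged example of
# inference-only use. It assumes this file is importable as a module (the
# module name `seq2seq_unilm` is hypothetical) so that the `else:` branch
# above loads './best_model.weights':
#
#     from seq2seq_unilm import autotitle
#     print(autotitle.generate(u'...source text to generate a title for...', topk=3))
# ---------------------------------------------------------------------------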