|
@ -9,9 +9,6 @@ |
|
|
""" |
|
|
""" |
|
|
#! -*- coding: utf-8 -*- |
|
|
#! -*- coding: utf-8 -*- |
|
|
|
|
|
|
|
|
import os |
|
|
|
|
|
# os.environ["TF_KERAS"] = "1" |
|
|
|
|
|
os.environ["CUDA_VISIBLE_DEVICES"] = "1" |
|
|
|
|
|
import glob |
|
|
import glob |
|
|
from numpy import random |
|
|
from numpy import random |
|
|
random.seed(1001) |
|
|
random.seed(1001) |
|
@ -19,7 +16,6 @@ from tqdm import tqdm |
|
|
import numpy as np |
|
|
import numpy as np |
|
|
import pandas as pd |
|
|
import pandas as pd |
|
|
import json |
|
|
import json |
|
|
import numpy as np |
|
|
|
|
|
from tqdm import tqdm |
|
|
from tqdm import tqdm |
|
|
from bert4keras.backend import keras, K |
|
|
from bert4keras.backend import keras, K |
|
|
from bert4keras.layers import Loss |
|
|
from bert4keras.layers import Loss |
|
@ -32,12 +28,10 @@ from keras.models import Model |
|
|
# from rouge import Rouge # pip install rouge |
|
|
# from rouge import Rouge # pip install rouge |
|
|
# from nltk.translate.bleu_score import sentence_bleu, SmoothingFunction |
|
|
# from nltk.translate.bleu_score import sentence_bleu, SmoothingFunction |
|
|
import tensorflow as tf |
|
|
import tensorflow as tf |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
from keras.backend import set_session |
|
|
from keras.backend import set_session |
|
|
config = tf.ConfigProto() |
|
|
tfconfig = tf.ConfigProto() |
|
|
config.gpu_options.allow_growth = True |
|
|
tfconfig.gpu_options.allow_growth = True |
|
|
set_session(tf.Session(config=config)) # 此处不同 |
|
|
set_session(tf.Session(config=tfconfig)) # 此处不同 |
|
|
global graph |
|
|
global graph |
|
|
graph = tf.get_default_graph() |
|
|
graph = tf.get_default_graph() |
|
|
sess = tf.Session(graph=graph) |
|
|
sess = tf.Session(graph=graph) |
|
@ -52,14 +46,13 @@ set_session(sess) |
|
|
# 基本参数 |
|
|
# 基本参数 |
|
|
|
|
|
|
|
|
class GenerateModel(object): |
|
|
class GenerateModel(object): |
|
|
def __init__(self): |
|
|
def __init__(self, config_path, checkpoint_path, spm_path, keep_tokens_path, maxlen, savemodel_path): |
|
|
|
|
|
self.config_path = config_path |
|
|
self.epoch_acc_vel = 0 |
|
|
self.checkpoint_path = checkpoint_path |
|
|
self.config_path = 'mt5/mt5_base/mt5_base_config.json' |
|
|
self.spm_path = spm_path |
|
|
self.checkpoint_path = 'mt5/mt5_base/model.ckpt-1000000' |
|
|
self.keep_tokens_path = keep_tokens_path |
|
|
self.spm_path = 'mt5/mt5_base/sentencepiece_cn.model' |
|
|
self.maxlen = maxlen |
|
|
self.keep_tokens_path = 'mt5/mt5_base/sentencepiece_cn_keep_tokens.json' |
|
|
self.savemodel_path = savemodel_path |
|
|
self.maxlen = 256 |
|
|
|
|
|
|
|
|
|
|
|
def device_setup(self): |
|
|
def device_setup(self): |
|
|
tokenizer = SpTokenizer(self.spm_path, token_start=None, token_end='</s>') |
|
|
tokenizer = SpTokenizer(self.spm_path, token_start=None, token_end='</s>') |
|
@ -85,7 +78,7 @@ class GenerateModel(object): |
|
|
output = CrossEntropy(1)([model.inputs[1], model.outputs[0]]) |
|
|
output = CrossEntropy(1)([model.inputs[1], model.outputs[0]]) |
|
|
|
|
|
|
|
|
model = Model(model.inputs, output) |
|
|
model = Model(model.inputs, output) |
|
|
path_model = "output_t5/best_model_t5.weights" |
|
|
path_model = self.savemodel_path |
|
|
model.load_weights(path_model) |
|
|
model.load_weights(path_model) |
|
|
|
|
|
|
|
|
return encoder, decoder, model, tokenizer |
|
|
return encoder, decoder, model, tokenizer |
|
@ -131,7 +124,7 @@ class Beamdataone(object): |
|
|
self.inputs_vector = 0 |
|
|
self.inputs_vector = 0 |
|
|
|
|
|
|
|
|
def text_2_textids(self,text): |
|
|
def text_2_textids(self,text): |
|
|
token_ids, segment_ids = self.tokenizer.encode(text[0], maxlen=120) |
|
|
token_ids, segment_ids = self.tokenizer.encode(text[0], maxlen=self.maxlen) |
|
|
self.text_ids = [token_ids] |
|
|
self.text_ids = [token_ids] |
|
|
|
|
|
|
|
|
def add_data(self, step, output_scores): |
|
|
def add_data(self, step, output_scores): |
|
@ -217,6 +210,11 @@ class AutoTitle(AutoRegressiveDecoder): |
|
|
self.end_id = end_id |
|
|
self.end_id = end_id |
|
|
self.minlen = minlen |
|
|
self.minlen = minlen |
|
|
self.models = {} |
|
|
self.models = {} |
|
|
|
|
|
self.chinese_sign = { |
|
|
|
|
|
",":",", |
|
|
|
|
|
":": ":", |
|
|
|
|
|
";": ";", |
|
|
|
|
|
} |
|
|
if start_id is None: |
|
|
if start_id is None: |
|
|
self.first_output_ids = np.empty((1, 0), dtype=int) |
|
|
self.first_output_ids = np.empty((1, 0), dtype=int) |
|
|
else: |
|
|
else: |
|
@ -246,7 +244,7 @@ class AutoTitle(AutoRegressiveDecoder): |
|
|
c_encoded = inputs[0] |
|
|
c_encoded = inputs[0] |
|
|
with graph.as_default(): |
|
|
with graph.as_default(): |
|
|
K.set_session(sess) |
|
|
K.set_session(sess) |
|
|
nodes = self.last_token(decoder).predict([c_encoded, output_ids]) |
|
|
nodes = self.last_token(self.decoder).predict([c_encoded, output_ids]) |
|
|
return nodes |
|
|
return nodes |
|
|
|
|
|
|
|
|
def predict_batch(self, inputs): |
|
|
def predict_batch(self, inputs): |
|
@ -259,14 +257,6 @@ class AutoTitle(AutoRegressiveDecoder): |
|
|
nodes = self.decoder.predict([token_ids, output_ids]) |
|
|
nodes = self.decoder.predict([token_ids, output_ids]) |
|
|
return nodes |
|
|
return nodes |
|
|
|
|
|
|
|
|
def data_generator(self, token_ids, output_ids): |
|
|
|
|
|
|
|
|
|
|
|
batch_token_ids = [] |
|
|
|
|
|
for i,j in zip(token_ids, output_ids): |
|
|
|
|
|
|
|
|
|
|
|
batch_token_ids = sequence_padding(token_ids) |
|
|
|
|
|
batch_segment_ids = sequence_padding(output_ids) |
|
|
|
|
|
return batch_token_ids, batch_segment_ids |
|
|
|
|
|
|
|
|
|
|
|
def beam_search_batch( |
|
|
def beam_search_batch( |
|
|
self, |
|
|
self, |
|
@ -395,15 +385,17 @@ class AutoTitle(AutoRegressiveDecoder): |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
def generate(self, text, topk=5): |
|
|
def generate(self, text, topk=5): |
|
|
c_token_ids, _ = tokenizer.encode(text, maxlen=120) |
|
|
c_token_ids, _ = self.tokenizer.encode(text, maxlen=self.maxlen) |
|
|
with graph.as_default(): |
|
|
with graph.as_default(): |
|
|
K.set_session(sess) |
|
|
K.set_session(sess) |
|
|
c_encoded = encoder.predict(np.array([c_token_ids]))[0] |
|
|
c_encoded = self.encoder.predict(np.array([c_token_ids]))[0] |
|
|
output_ids = self.beam_search([c_encoded], topk=topk) # 基于beam search |
|
|
output_ids = self.beam_search([c_encoded], topk=topk) # 基于beam search |
|
|
return tokenizer.decode([int(i) for i in output_ids]) |
|
|
return_text = self.tokenizer.decode([int(i) for i in output_ids]) |
|
|
|
|
|
return_text = "".join([self.chinese_sign[i] if i in self.chinese_sign else i for i in return_text]) |
|
|
|
|
|
return return_text |
|
|
|
|
|
|
|
|
def generate_random(self, text, n=30, topp=0.9): |
|
|
def generate_random(self, text, n=30, topp=0.9): |
|
|
c_token_ids, _ = self.tokenizer.encode(text, maxlen=120) |
|
|
c_token_ids, _ = self.tokenizer.encode(text, maxlen=self.maxlen) |
|
|
with graph.as_default(): |
|
|
with graph.as_default(): |
|
|
K.set_session(sess) |
|
|
K.set_session(sess) |
|
|
c_encoded = self.encoder.predict(np.array([c_token_ids]))[0] |
|
|
c_encoded = self.encoder.predict(np.array([c_token_ids]))[0] |
|
@ -418,13 +410,6 @@ class AutoTitle(AutoRegressiveDecoder): |
|
|
return output_str |
|
|
return output_str |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
generatemodel = GenerateModel() |
|
|
|
|
|
encoder, decoder, model, tokenizer = generatemodel.device_setup() |
|
|
|
|
|
autotitle = AutoTitle(encoder, decoder, model, tokenizer, start_id=0, end_id=tokenizer._token_end_id, maxlen=120) |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
def just_show_sentence(file): |
|
|
def just_show_sentence(file): |
|
|
""" |
|
|
""" |
|
|
@param file:list |
|
|
@param file:list |
|
@ -441,6 +426,18 @@ def just_show_sentence_batch(file: list) -> object: |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
if __name__ == '__main__': |
|
|
if __name__ == '__main__': |
|
|
|
|
|
import os |
|
|
|
|
|
from config.predict_t5_config import DropT5Config |
|
|
|
|
|
config = DropT5Config() |
|
|
|
|
|
os.environ["CUDA_VISIBLE_DEVICES"] = config.cuda_id |
|
|
|
|
|
generatemodel = GenerateModel(config.config_path, |
|
|
|
|
|
config.checkpoint_path, |
|
|
|
|
|
config.spm_path, |
|
|
|
|
|
config.keep_tokens_path, |
|
|
|
|
|
config.maxlen, |
|
|
|
|
|
config.savemodel_path) |
|
|
|
|
|
encoder, decoder, model, tokenizer = generatemodel.device_setup() |
|
|
|
|
|
autotitle = AutoTitle(encoder, decoder, model, tokenizer, start_id=0, end_id=tokenizer._token_end_id, maxlen=256) |
|
|
# file = "train_2842.txt" |
|
|
# file = "train_2842.txt" |
|
|
# just_show(file) |
|
|
# just_show(file) |
|
|
# text = ["历史和当下都证明,创新是民族生存、发展的不竭源泉,是自身发展的必然选择,是时代对于青年们的深切呼唤"] |
|
|
# text = ["历史和当下都证明,创新是民族生存、发展的不竭源泉,是自身发展的必然选择,是时代对于青年们的深切呼唤"] |
|
@ -493,16 +490,20 @@ if __name__ == '__main__': |
|
|
|
|
|
|
|
|
#++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++ |
|
|
#++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++ |
|
|
|
|
|
|
|
|
text = ["'李正旺你真是傻逼讪笑,挥手道:“不不不,你千万别误会", |
|
|
# text = ["'李正旺你真是傻逼讪笑,挥手道:“不不不,你千万别误会", |
|
|
"历史和当下都证明,创新是民族生存、“发展的不竭源泉”,是是自身发展的必然选择", |
|
|
# "历史和当下都证明,创新是民族生存、“发展的不竭源泉”,是是自身发展的必然选择", |
|
|
"自身发展的必然选择", |
|
|
# "自身发展的必然选择", |
|
|
"强调轻资产经营, 更加重视经营风险的规避", |
|
|
# "强调轻资产经营, 更加重视经营风险的规避", |
|
|
"历史和当下都证明,创新是民族生存、发展的不竭源泉,是是自身发展的必然选择", |
|
|
# "历史和当下都证明,创新是民族生存、发展的不竭源泉,是是自身发展的必然选择", |
|
|
"是时代对于青年们的深切呼唤"] |
|
|
# "是时代对于青年们的深切呼唤"] |
|
|
# text = ["基本消除“热桥”影响。"] |
|
|
text = ["随着经济的发展,人们生活水平的提高,环境:问题也日益突出。", |
|
|
|
|
|
"环境问题中的化学污染是影响我国居民生活质量不可忽视的重要因素,而仪器分析作为化工专业课程中必不可少的一门课程也不例外。", |
|
|
|
|
|
"所以对学生对应用仪器分析解决实际问题的能力要求很高。", |
|
|
|
|
|
"随着经济的发展,人们生活水平的提高,环境问题也日益突出。"] |
|
|
print(just_show_sentence(text)) |
|
|
print(just_show_sentence(text)) |
|
|
# print(just_show_sentence_top(text)) |
|
|
# print(just_show_sentence_top(text)) |
|
|
# print(just_show_chachong_random(text)) |
|
|
# print(just_show_chachong_random(text)) |
|
|
|
|
|
|
|
|
# print(tokenizer.encode("\"", maxlen=120)) |
|
|
# print(tokenizer.encode("\"", maxlen=120)) |
|
|
# print(just_show_sentence_batch(text)) |
|
|
# print(just_show_sentence_batch(text)) |
|
|
|
|
|
# myout.flask_predict_no_batch_t5.logs |