普通版降重 (standard-version paraphrasing / duplicate-rate reduction)


# -*- coding: utf-8 -*-
"""
@Time : 2023/1/16 14:59
@Author :
@FileName:
@Software:
@Describe:
"""
import os

from config.predict_t5_config import MultipleResultsDropT5Config

config = MultipleResultsDropT5Config()
# Pick the GPU before TensorFlow/Keras are imported below.
os.environ["CUDA_VISIBLE_DEVICES"] = config.cuda_id

import glob
import json

import numpy as np
import pandas as pd
from numpy import random
from tqdm import tqdm

random.seed(1001)
from bert4keras.backend import keras, K
from bert4keras.layers import Loss
from bert4keras.models import build_transformer_model
from bert4keras.tokenizers import SpTokenizer
from bert4keras.optimizers import Adam
from bert4keras.snippets import sequence_padding, open
from bert4keras.snippets import DataGenerator, AutoRegressiveDecoder
from keras.models import Model
# from rouge import Rouge # pip install rouge
# from nltk.translate.bleu_score import sentence_bleu, SmoothingFunction
import tensorflow as tf
from keras.backend import set_session
tfconfig = tf.ConfigProto()
tfconfig.gpu_options.allow_growth = True
set_session(tf.Session(config=tfconfig))  # note: this part differs

global graph
graph = tf.get_default_graph()
sess = tf.Session(graph=graph)
set_session(sess)
# global graph, model
# graph = tf.get_default_graph()
# sess = tf.Session(graph=graph)
# K.set_session(sess)
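# Note on the globals above: the default graph and session are stored at module
# level and re-entered (graph.as_default() + K.set_session(sess)) inside every
# predict call below -- the usual TF1/Keras pattern for calling a model from a
# thread other than the one that built it. How this script is actually served is
# not shown in this file, so treat that motivation as an assumption.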
# Basic parameters
class GenerateModel(object):
    def __init__(self):
        self.config_path = config.config_path
        self.checkpoint_path = config.checkpoint_path
        self.spm_path = config.spm_path
        self.keep_tokens_path = config.keep_tokens_path
        self.maxlen = config.maxlen

    def device_setup(self):
        tokenizer = SpTokenizer(self.spm_path, token_start=None, token_end='</s>')
        keep_tokens = json.load(open(self.keep_tokens_path))

        t5 = build_transformer_model(
            config_path=self.config_path,
            checkpoint_path=self.checkpoint_path,
            keep_tokens=keep_tokens,
            model='mt5.1.1',
            return_keras_model=False,
            name='T5',
        )
        # output = CrossEntropy(2)(model.inputs + model.outputs)
        #
        # model = Model(model.inputs, output)
        encoder = t5.encoder
        decoder = t5.decoder
        model = t5.model
        model.summary()

        # Rebuild the training graph (loss layer on top of the decoder output) so
        # the fine-tuned weights can be loaded; encoder/decoder are kept for inference.
        output = CrossEntropy(1)([model.inputs[1], model.outputs[0]])
        model = Model(model.inputs, output)
        path_model = config.savemodel_path
        model.load_weights(path_model)

        return encoder, decoder, model, tokenizer
class CrossEntropy(Loss):
    """Cross-entropy loss with the input (prompt) positions masked out."""
    def compute_loss(self, inputs, mask=None):
        y_true, y_pred = inputs
        y_true = y_true[:, 1:]  # target token_ids
        y_mask = K.cast(mask[1], K.floatx())[:, 1:]  # mask provided by the decoder
        y_pred = y_pred[:, :-1]  # predictions, shifted by one position
        loss = K.sparse_categorical_crossentropy(y_true, y_pred)
        loss = K.sum(loss * y_mask) / K.sum(y_mask)
        return loss
class Beamdataone(object):
    def __init__(self, num_beams, batch_id, text, end_id, minlen, min_ends, tokenizer, output_ids):
        """
        Initialize the n-best list of hypotheses for one input sentence.
        """
        self.num_beams = num_beams
        self.batch_id = batch_id
        self.beams = []
        self.minlen = minlen
        self.min_ends = min_ends
        self.end_id = end_id
        self.text = text
        self.output_scores = np.zeros(1)
        self.output_ids = [output_ids]
        self.return_str = ""
        self.over = False
        self.tokenizer = tokenizer
        # self.data()
        self.output_str = ""
        self.text_2_textids(self.text)
        self.scores = np.zeros(1)
        self.inputs_vector = 0

    def text_2_textids(self, text):
        token_ids, segment_ids = self.tokenizer.encode(text[0], maxlen=120)
        self.text_ids = [token_ids]

    def add_data(self, step, output_scores):
        """Fold the current step's token scores into this sample's surviving beams.

        @param step: decoding step index
        @param output_scores: per-beam token scores for this step
                              ((num_beams, vocab), or (vocab,) at step 0)
        """
        # inputs = [np.array([i]) for i in inputs]
        # output_ids, output_scores = self.first_output_ids, np.zeros(1)
        #
        # scores, states = self.predict(
        #     inputs, output_ids, states, temperature, 'logits'
        # )  # compute current scores
        # if step == 0:  # after the first step, repeat the inputs topk times
        #     inputs = [np.repeat(i, self.num_beams, axis=0) for i in self.inputs]
        # inputs = [self.token_ids, self.segment_ids]
        # inputs = [np.array([i]) for i in inputs]
        self.output_ids = np.array(self.output_ids)
        if step == 0:  # after the first step the input is repeated num_beams times
            self.text_ids = [np.repeat(i, self.num_beams, axis=0) for i in self.text_ids]
        # output_scores must keep its (num_beams, vocab) layout here so that the
        # row/column arithmetic on the flattened top-k indices below stays valid.
        scores = output_scores + self.scores
        # scores = output_probas
        scores = self.output_scores.reshape((-1, 1)) + scores  # accumulated beam scores
        indices = scores.argpartition(-self.num_beams, axis=None)[-self.num_beams:]  # keep only the top-k
        indices_1 = indices // scores.shape[1]  # row (beam) indices
        indices_2 = (indices % scores.shape[1]).reshape((-1, 1))  # column (token) indices
        self.output_ids = np.concatenate([self.output_ids[indices_1], indices_2], 1)  # updated outputs
        self.output_scores = np.take_along_axis(scores, indices, axis=None)  # updated scores
        is_end = self.output_ids[:, -1] == self.end_id  # beams that just emitted the end token
        self.end_counts = (self.output_ids == self.end_id).sum(1)  # number of end tokens per beam
        if self.output_ids.shape[1] >= self.minlen:  # minimum length reached
            best = self.output_scores.argmax()  # best-scoring beam
            if is_end[best] and self.end_counts[best] >= self.min_ends:  # best beam is finished
                # return output_ids[best]  # output it directly
                self.return_str_main(self.output_ids, best)
                self.over = True
            else:  # otherwise keep only the unfinished beams
                flag = ~is_end | (self.end_counts < self.min_ends)  # mark unfinished sequences
                if not flag.all():  # some beams have finished
                    self.output_ids = self.output_ids[flag]  # drop finished sequences
                    self.output_scores = self.output_scores[flag]  # drop their scores
                    self.end_counts = self.end_counts[flag]  # drop their end counts
                    self.num_beams = flag.sum()  # shrink the beam count accordingly
        self.output_ids = self.output_ids.tolist()
        self.output_str = [self.tokenizer.decode(ids) for ids in self.output_ids]
        self.text_ids = [self.text_ids[0] for i in range(len(self.output_ids))]
        # # output directly once the maximum length is reached
        # return output_ids[output_scores.argmax()]

    # def data(self):
    #     token_ids, segment_ids = self.tokenizer.encode(self.text, maxlen=256)
    #     self.token_ids = token_ids
    #     self.segment_ids = segment_ids
    #     input_str = [text for i in range(self.num_beams)]
    #     output_str = self.output_str
    #     return input_str, output_str

    def return_str_main(self, output_ids, best):
        output_ids_best = output_ids[best]
        self.return_str = self.tokenizer.decode(output_ids_best)
class AutoTitle(AutoRegressiveDecoder):
    """seq2seq decoder"""
    def __init__(self, encoder, decoder, model, tokenizer, start_id, end_id, maxlen, minlen=1):
        super(AutoTitle, self).__init__(start_id, end_id, maxlen, minlen)
        self.encoder = encoder
        self.decoder = decoder
        self.model = model
        self.tokenizer = tokenizer
        self.start_id = start_id
        self.end_id = end_id
        self.minlen = minlen
        self.models = {}
        if start_id is None:
            self.first_output_ids = np.empty((1, 0), dtype=int)
        else:
            self.first_output_ids = np.array([[self.start_id]])

    # @AutoRegressiveDecoder.wraps(default_rtype='probas')
    # def predict(self, inputs, output_ids, states):
    #     token_ids, segment_ids = inputs
    #     token_ids = np.concatenate([token_ids, output_ids], 1)
    #     segment_ids = np.concatenate([segment_ids, np.ones_like(output_ids)], 1)
    #     with graph.as_default():
    #         K.set_session(sess)
    #         nodes = self.last_token(self.model).predict([token_ids, segment_ids])
    #     return nodes
    #     # return self.last_token(self.model).predict([token_ids, segment_ids])

    # @AutoRegressiveDecoder.wraps(default_rtype='probas')
    # def predict(self, inputs, output_ids, states):
    #     c_encoded = inputs[0]
    #     with graph.as_default():
    #         K.set_session(sess)
    #         nodes = self.last_token(self.decoder).predict([c_encoded, output_ids])
    #     return nodes

    @AutoRegressiveDecoder.wraps(default_rtype='probas')
    def predict(self, inputs, output_ids, states):
        c_encoded = inputs[0]
        with graph.as_default():
            K.set_session(sess)
            nodes = self.last_token(self.decoder).predict([c_encoded, output_ids])
        return nodes

    def predict_batch(self, inputs):
        # inputs, output_ids, states, temperature, 'probas'
        token_ids, output_ids = inputs
        # token_ids = np.concatenate([token_ids, output_ids], 1)
        # segment_ids = np.concatenate([segment_ids, np.ones_like(output_ids)], 1)
        with graph.as_default():
            K.set_session(sess)
            nodes = self.decoder.predict([token_ids, output_ids])
        return nodes

    def data_generator(self, token_ids, output_ids):
        # Pad token_ids and output_ids into rectangular batches.
        batch_token_ids = []
        for i, j in zip(token_ids, output_ids):
            batch_token_ids = sequence_padding(token_ids)
            batch_segment_ids = sequence_padding(output_ids)
        return batch_token_ids, batch_segment_ids
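    # How the batched beam search below proceeds (a summary of this code, not an
    # official bert4keras API description):
    #   1. every input string gets its own Beamdataone object holding its beams and scores;
    #   2. all inputs are encoded once with the shared encoder;
    #   3. at each step the live beams of all samples are decoded in a single
    #      decoder.predict call, and the probabilities are sliced back out per
    #      sample and handed to Beamdataone.add_data;
    #   4. samples whose best beam has emitted the end token are removed, and their
    #      decoded text is written into return_str_batch at their original position.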
    def beam_search_batch(
        self,
        inputs_str,
        states=None,
        temperature=1,
        min_ends=1,
        num_beam=3
    ):
        """Batched beam search.

        Decodes every string in inputs_str in parallel, keeping num_beam hypotheses
        per input, and returns a list with one decoded string per input.
        """
        output_str = []
        # token_ids, segment_ids = self.data_generator(inputs, output_ids)
        batch_nums = len(inputs_str)
        return_str_batch = [0] * batch_nums
        # output_ids = np.empty((batch_nums, 0), dtype=int)
        output_ids = np.array([self.start_id])
        generated = [
            Beamdataone(num_beam, i, [inputs_str[i]], self.end_id, self.minlen, min_ends, self.tokenizer, output_ids)
            for i in range(batch_nums)
        ]
        # index_data = [i for i in range(batch_nums)]

        # Encode all inputs in a single batch.
        c_token_ids = []
        for i in generated:
            text_ids = i.text_ids
            c_token_ids.extend(text_ids)
        c_token_ids = sequence_padding(c_token_ids)
        c_encoded = self.encoder.predict(np.array(c_token_ids))
        # probas_bool = np.array(token_ids, dtype=bool)
        # # np.array(np.where(probas_bool == True))
        # for i, sentence in enumerate(probas_bool):
        #     lie = np.array(np.where(sentence == True))[0]
        #     probas_new.append(probas[i, lie[-1]])

        # Keep only the non-padded positions of each encoder output.
        for i in range(len(generated)):
            probas_bool = np.array(generated[i].text_ids[0], dtype=bool)
            lie = np.array(np.where(probas_bool == True))[0]
            # c_encoded_dan = c_encoded[i, lie[-1]]
            c_encoded_dan = c_encoded[np.ix_([i], lie)]
            generated[i].inputs_vector = c_encoded_dan[0]

        for step in range(self.maxlen):
            # if step == 0:
            #     token_ids, segment_ids = self.data_generator(inputs_str, output_str)
            # else:
            #     inputs_str, output_str = [], []
            inputs_vector_batch, output_ids_batch = [], []
            batch_input_num_beam_num = []
            for i in generated:
                inputs_vector = i.inputs_vector
                # if step != 0:
                #     output_ids_batch.extend(i.output_ids)
                #     text_ids_batch.extend(text_ids)
                # else:
                inputs_vector_batch.append(inputs_vector)
                output_ids_batch.extend(i.output_ids)
                if step != 0:
                    batch_input_num_beam_num.append(i.num_beams)
            # token_ids, output_ids_batch = self.data_generator(inputs_vector_batch, output_ids_batch)
            # token_ids_batch = sequence_padding(token_ids_batch)
            # segment_ids_batch = sequence_padding(segment_ids_batch)
            # output_ids_batch = np.array(output_ids_batch)
            # if step == 0:
            inputs = [inputs_vector_batch, output_ids_batch]
            probas = self.predict_batch(inputs)  # probabilities for the current step

            # For every sequence keep only the prediction at its last real (non-padded) position.
            probas_new = []
            probas_bool = np.array(inputs_vector_batch, dtype=bool)
            # np.array(np.where(probas_bool == True))
            for i, sentence in enumerate(probas_bool):
                lie = np.array(np.where(sentence == True))[0]
                probas_new.append(probas[i, lie[-1]])
            probas = np.array(probas_new)

            # Hand each sample its own slice of the batched probabilities.
            if step != 0:
                num = 0
                if len(generated) > 1:
                    index = 0
                    for index in range(len(batch_input_num_beam_num) - 1):
                        cc = num
                        num += batch_input_num_beam_num[index]
                        generated[index].add_data(step, probas[cc:num, :])
                    generated[index + 1].add_data(step, probas[num:, :])
                else:
                    generated[0].add_data(step, probas[:, :])
            else:
                for index in range(len(generated)):
                    generated[index].add_data(step, probas[index, :])

            # i = 0
            # while True:
            #     bool_ = generated[i].over
            #     if bool_ == True:
            #         one_sentence = generated.pop(i)
            #         return_str_batch[i] = one_sentence.return_str
            #         if i > len(generated) - 1:
            #             break
            #     else:
            #         i += 1
            #         if i > len(generated) - 1:
            #             break

            # Collect finished samples and keep decoding the rest.
            generated_new = []
            for i in range(len(generated)):
                bool_ = generated[i].over
                if bool_ == False:
                    generated_new.append(generated[i])
                else:
                    return_str_batch[generated[i].batch_id] = generated[i].return_str
            generated = generated_new
            if generated == []:
                return return_str_batch

        return return_str_batch
    def generate(self, text, topk=5):
        c_token_ids, _ = self.tokenizer.encode(text, maxlen=120)
        with graph.as_default():
            K.set_session(sess)
            c_encoded = self.encoder.predict(np.array([c_token_ids]))[0]
            output_ids = self.beam_search([c_encoded], topk=topk)  # beam search decoding
        return_text = self.tokenizer.decode([int(i) for i in output_ids])
        return_text = return_text.replace(",", "")
        return return_text

    def generate_random(self, text, n=30, topp=0.9):
        c_token_ids, _ = self.tokenizer.encode(text, maxlen=120)
        with graph.as_default():
            K.set_session(sess)
            c_encoded = self.encoder.predict(np.array([c_token_ids]))[0]
            output_ids = self.random_sample([c_encoded], n, topp=topp)  # random sampling
        text = []
        for ids in output_ids:
            text.append(self.tokenizer.decode([int(i) for i in ids]))
        return text

    def generate_beam_search_batch(self, text):
        output_str = self.beam_search_batch(text)  # batched beam search
        return output_str
generatemodel = GenerateModel()
encoder, decoder, model, tokenizer = generatemodel.device_setup()
autotitle = AutoTitle(encoder, decoder, model, tokenizer, start_id=0, end_id=tokenizer._token_end_id, maxlen=120)


def just_show_sentence(file):
    """Rewrite a single sentence.

    @param file: list whose first element is the sentence to rewrite
    """
    text = file[0]
    pre = autotitle.generate(text)
    return pre


def just_show_sentence_batch(file: list) -> object:
    text = file
    pre = autotitle.generate_beam_search_batch(text)
    return pre
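# Minimal usage sketch (illustrative only: the sentences are placeholders, and the
# module-level setup above must succeed, i.e. the paths in MultipleResultsDropT5Config
# must point to a valid mT5 checkpoint):
#
#   single = just_show_sentence(["这里是一句需要改写的句子。"])
#   several = just_show_sentence_batch(["第一句。", "第二句。"])
#   print(single, several)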
if __name__ == '__main__':
    # file = "train_2842.txt"
    # just_show(file)
    # text = ["历史和当下都证明,创新是民族生存、发展的不竭源泉,是自身发展的必然选择,是时代对于青年们的深切呼唤"]
    # a = just_show_sentence(text)
    # print(a)
    # print(type(a))
    # ++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
    # is_novel = False
    # path = "./data/700条论文测试.xlsx"
    # df_list = pd.read_excel(path).values.tolist()
    #
    # df_list_new = []
    # print(len(df_list))
    # for i in tqdm(df_list):
    #     pre = just_show_sentence([i[0]])
    #     df_list_new.append([i[0], i[1], pre])
    #
    # df = pd.DataFrame(df_list_new, columns=["原文", "yy降重", "t5模型"])
    # df.to_excel("./data/700条论文测试_7.xlsx", index=None)
    # ++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++

    file = "./data/11篇汇总txt_new.txt"
    file_t5 = "./data/11篇汇总txt_new_predict_t5.txt"
    file_t5_0724 = "./data/11篇汇总txt_new_predict_t5_0724.txt"

    # Read the input, falling back to GBK if the file is not UTF-8.
    try:
        with open(file, 'r', encoding="utf-8") as f:
            lines = [x.strip() for x in f if x.strip() != '']
    except UnicodeDecodeError:
        with open(file, 'r', encoding="gbk") as f:
            lines = [x.strip() for x in f if x.strip() != '']

    zishu = 0  # running character count
    data = []
    for i in tqdm(lines):
        zishu += len(i)
        pre = just_show_sentence([i])
        data.append([i, pre])

    with open(file_t5_0724, "w", encoding='utf-8') as out_f:
        for i in data:
            out_f.write("\t".join(i) + '\n')

    print(zishu)
    # ++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
    # text = ["'李正旺你真是傻逼讪笑,挥手道:“不不不,你千万别误会",
    #         "历史和当下都证明,创新是民族生存、“发展的不竭源泉”,是是自身发展的必然选择",
    #         "自身发展的必然选择",
    #         "强调轻资产经营, 更加重视经营风险的规避",
    #         "历史和当下都证明,创新是民族生存、发展的不竭源泉,是是自身发展的必然选择",
    #         "是时代对于青年们的深切呼唤"]
    # text = ["随着经济的发展,人们生活水平的提高,环境问题也日益突出。",
    #         "环境问题中的化学污染是影响我国居民生活质量不可忽视的重要因素,而仪器分析作为化工专业课程中必不可少的一门课程也不例外。",
    #         "所以对学生对应用仪器分析解决实际问题的能力要求很高。",
    #         "随着经济的发展,人们生活水平的提高,环境问题也日益突出。"]
    # print(just_show_sentence(text))
    # print(just_show_sentence_top(text))
    # print(just_show_chachong_random(text))
    # print(tokenizer.encode("\"", maxlen=120))
    # print(just_show_sentence_batch(text))