Standard-version paraphrasing ("降重", duplicate reduction)
#! -*- coding: utf-8 -*-
import os
os.environ["TF_KERAS"] = "1"
os.environ["CUDA_VISIBLE_DEVICES"] = "1"
import glob
from numpy import random
random.seed(1001)
from tqdm import tqdm
import numpy as np
import pandas as pd
from bert4keras.backend import keras, K
from bert4keras.layers import Loss
from bert4keras.models import build_transformer_model
from bert4keras.tokenizers import Tokenizer, load_vocab
from bert4keras.optimizers import Adam
from bert4keras.snippets import sequence_padding, open
from bert4keras.snippets import DataGenerator, AutoRegressiveDecoder
# TF_KERAS=1 makes bert4keras build on tf.keras, so use tf.keras here as well;
# mixing in the standalone keras package would break the model wrapping below.
from tensorflow.keras.models import Model
import tensorflow as tf
from tensorflow.keras.backend import set_session

# Create a single TF1 session with GPU memory growth enabled, and keep the
# graph/session as globals so the predict() calls below can rebind them
# (note: this differs from the default Keras session setup).
config = tf.ConfigProto()
config.gpu_options.allow_growth = True
global graph
graph = tf.get_default_graph()
sess = tf.Session(graph=graph, config=config)
set_session(sess)
# Basic parameters
class GenerateModel(object):
    def __init__(self):
        self.epoch_acc_vel = 0
        self.config_path = r'./chinese_roberta_wwm_ext_L-12_H-768_A-12/bert_config.json'
        self.checkpoint_path = r'./chinese_roberta_wwm_ext_L-12_H-768_A-12/bert_model.ckpt'
        self.dict_path = r'./chinese_roberta_wwm_ext_L-12_H-768_A-12/vocab.txt'
        self.maxlen = 120

    def device_setup(self):
        token_dict, keep_tokens = load_vocab(
            dict_path=self.dict_path,
            simplified=True,
            startswith=['[PAD]', '[UNK]', '[CLS]', '[SEP]'],
        )
        tokenizer = Tokenizer(token_dict, do_lower_case=True)
        model = build_transformer_model(
            self.config_path,
            self.checkpoint_path,
            application='unilm',
            keep_tokens=keep_tokens,  # keep only the tokens in keep_tokens, shrinking the vocabulary
        )
        # output = CrossEntropy(2)(model.inputs + model.outputs)
        # model = Model(model.inputs, output)
        model = Model(model.inputs, model.outputs)
        path_model = './output_quan/best_model_20wan_1.weights'
        model.load_weights(path_model)
        return model, tokenizer
class CrossEntropy(Loss):
    """Cross entropy as the loss, with the input (source) part masked out."""
    def compute_loss(self, inputs, mask=None):
        y_true, y_mask, y_pred = inputs
        y_true = y_true[:, 1:]   # target token_ids
        y_mask = y_mask[:, 1:]   # segment_ids, which happen to mark the part to predict
        y_pred = y_pred[:, :-1]  # predictions, shifted by one position
        loss = K.sparse_categorical_crossentropy(y_true, y_pred)
        loss = K.sum(loss * y_mask) / K.sum(y_mask)
        return loss
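
# The shift-by-one alignment in CrossEntropy is easiest to see on a toy
# example. The helper below is illustrative only: it is not used anywhere in
# this script, and the token ids in it are made up.
def _unilm_loss_alignment_demo():
    """Show which positions the UniLM loss actually scores."""
    y_true = np.array([[101, 5, 6, 102, 7, 8, 102]])  # [CLS] A B [SEP] t1 t2 [SEP]
    y_mask = np.array([[0, 0, 0, 0, 1, 1, 1]])        # segment ids: 1 marks the target part
    # Prediction at position i is scored against token i + 1, and only where
    # the shifted segment id is 1, i.e. only on the target tokens t1, t2, [SEP].
    return y_true[:, 1:], y_mask[:, 1:]
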
class AutoTitle(AutoRegressiveDecoder):
    """seq2seq decoder"""
    def __init__(self, model, tokenizer, start_id, end_id, maxlen, minlen=1):
        super(AutoTitle, self).__init__(start_id, end_id, maxlen, minlen)
        self.model = model
        self.tokenizer = tokenizer
        self.start_id = start_id
        self.end_id = end_id
        self.minlen = minlen
        self.models = {}
        if start_id is None:
            self.first_output_ids = np.empty((1, 0), dtype=int)
        else:
            self.first_output_ids = np.array([[self.start_id]])

    def data_generator(self, inputs, output_ids):
        batch_token_ids, batch_segment_ids = [], []
        if output_ids == []:
            for txt in inputs:
                token_ids, segment_ids = self.tokenizer.encode(txt, maxlen=120)
                batch_token_ids.append(token_ids)
                batch_segment_ids.append(segment_ids)
        else:
            for txt, output_id in zip(inputs, output_ids):
                token_ids, segment_ids = self.tokenizer.encode(txt, output_id)
                batch_token_ids.append(token_ids[:-1])    # drop the trailing [SEP]
                batch_segment_ids.append(segment_ids[:-1])
        batch_token_ids = sequence_padding(batch_token_ids)
        batch_segment_ids = sequence_padding(batch_segment_ids)
        return batch_token_ids, batch_segment_ids
    def random_sample_batch(
        self,
        inputs,
        n,
        topk=None,
        topp=None,
        states=None,
        temperature=1,
        min_ends=1
    ):
        """Randomly sample n results for a batch of inputs.
        A non-None topk restricts each step to the topk highest-probability
        tokens; a non-None topp restricts each step to the smallest set of
        highest-probability tokens whose probabilities just reach topp.
        Returns: a list of n decoded id sequences.
        """
        # Original line was `np.array([i for j in i])`, which stacks len(i)
        # copies of each sequence; assuming the batch is already padded to a
        # common length, converting each input directly is the intended behavior.
        inputs = [np.array(i) for i in inputs]
        output_ids = self.first_output_ids
        results = []
        for step in range(self.maxlen):
            probas, states = self.predict(
                inputs, output_ids, states, temperature, 'probas'
            )  # probabilities for the current step
            probas /= probas.sum(axis=1, keepdims=True)  # make sure they are normalized
            if step == 0:  # after the first step, replicate everything n times
                probas = np.repeat(probas, n, axis=0)
                inputs = [np.repeat(i, n, axis=0) for i in inputs]
                output_ids = np.repeat(output_ids, n, axis=0)
            if topk is not None:
                k_indices = probas.argpartition(-topk,
                                                axis=1)[:, -topk:]  # keep only the topk
                probas = np.take_along_axis(probas, k_indices, axis=1)  # topk probabilities
                probas /= probas.sum(axis=1, keepdims=True)  # renormalize
            if topp is not None:
                p_indices = probas.argsort(axis=1)[:, ::-1]  # sort high to low
                probas = np.take_along_axis(probas, p_indices, axis=1)  # sorted probabilities
                cumsum_probas = np.cumsum(probas, axis=1)  # cumulative probabilities
                flag = np.roll(cumsum_probas >= topp, 1, axis=1)  # mark the part beyond topp
                flag[:, 0] = False  # with np.roll above, this shifts the mask by one
                probas[flag] = 0  # zero out everything past the cutoff
                probas /= probas.sum(axis=1, keepdims=True)  # renormalize
            sample_func = lambda p: np.random.choice(len(p), p=p)  # sample according to p
            sample_ids = np.apply_along_axis(sample_func, 1, probas)  # draw samples
            sample_ids = sample_ids.reshape((-1, 1))  # align shapes
            if topp is not None:
                sample_ids = np.take_along_axis(
                    p_indices, sample_ids, axis=1
                )  # map back to original ids
            if topk is not None:
                sample_ids = np.take_along_axis(
                    k_indices, sample_ids, axis=1
                )  # map back to original ids
            output_ids = np.concatenate([output_ids, sample_ids], 1)  # append to outputs
            is_end = output_ids[:, -1] == self.end_id  # does the sequence end with the end token?
            end_counts = (output_ids == self.end_id).sum(1)  # count end tokens so far
            if output_ids.shape[1] >= self.minlen:  # minimum-length check
                flag = is_end & (end_counts >= min_ends)  # mark finished sequences
                if flag.any():  # if any sequence is finished
                    for ids in output_ids[flag]:  # store the finished sequences
                        results.append(ids)
                    flag = (flag == False)  # mark the unfinished ones
                    inputs = [i[flag] for i in inputs]  # keep only unfinished inputs
                    output_ids = output_ids[flag]  # keep only unfinished candidates
                    end_counts = end_counts[flag]  # keep only unfinished end counts
                    if len(output_ids) == 0:
                        break
        # any sequences still unfinished go straight into the results
        for ids in output_ids:
            results.append(ids)
        # return the results
        return results
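
    # A self-contained sketch (illustrative only, not called anywhere) of the
    # top-k / top-p filtering performed inside the sampling loop above,
    # applied to a single probability row:
    @staticmethod
    def _topk_topp_filter_demo(p, topk=None, topp=None):
        p = p / p.sum()
        if topk is not None:
            k_idx = np.argpartition(p, -topk)[-topk:]   # indices of the topk entries
            mask = np.zeros_like(p, dtype=bool)
            mask[k_idx] = True
            p = np.where(mask, p, 0.0)
            p /= p.sum()                                # renormalize
        if topp is not None:
            order = np.argsort(p)[::-1]                 # sort high to low
            csum = np.cumsum(p[order])                  # cumulative probabilities
            cut = np.roll(csum >= topp, 1)              # mark everything past topp ...
            cut[0] = False                              # ... shifted one step, keeping the best token
            p[order[cut]] = 0.0
            p /= p.sum()                                # renormalize
        return p
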
    def random_sample_and_beam_search(
        self,
        inputs,
        n,
        topk=None,
        topp=None,
        states=None,
        temperature=1,
        min_ends=1
    ):
        """Hybrid decoding: run random sampling (n candidates) and beam search
        (beam size topk) side by side in one batch. topp is accepted but not
        used here. Returns (random-sampling results, beam-search results).
        """
        whether_end_b = False
        results_r = []
        results_b = []
        index_r = np.arange(n)
        index_b = np.arange(topk)
        inputs = [np.array([i]) for i in inputs]
        output_ids, output_scores = self.first_output_ids, np.zeros(1)
        results = []
        for step in range(self.maxlen):
            beam_n = len(index_b)
            probas, states = self.predict(
                inputs, output_ids, states, temperature, 'probas'
            )  # probabilities for the current step
            probas = probas / probas.sum(axis=1, keepdims=True)  # make sure they are normalized
            if step == 0:  # after the first step, replicate: n sampling rows + topk beam rows
                probas = np.repeat(probas, n + topk, axis=0)
                inputs_r = [np.repeat(i, n, axis=0) for i in inputs]
                output_ids = np.repeat(output_ids, n + topk, axis=0)
                inputs_b = [np.repeat(i, topk, axis=0) for i in inputs]
                probas_b = probas[0, :]
                probas_r = probas[:-beam_n, :]
                output_ids_r = output_ids[:-beam_n, :]
                output_ids_b = output_ids[-beam_n:, :]
            else:
                probas_b = probas[-beam_n:, :]
                if whether_end_b == False:
                    inputs_r = [i[:-beam_n, :] for i in inputs]
                    inputs_b = [i[-beam_n:, :] for i in inputs]
                    probas_r = probas[:-beam_n, :]
                    output_ids_r = output_ids[:-beam_n, :]
                    output_ids_b = output_ids[-beam_n:, :]
                else:
                    inputs_r = inputs
                    probas_r = probas
                    output_ids_r = output_ids
            k_indices = probas_r.argpartition(-topk,
                                              axis=1)[:, -topk:]  # keep only the topk
            probas_r = np.take_along_axis(probas_r, k_indices, axis=1)  # topk probabilities
            probas_r /= probas_r.sum(axis=1, keepdims=True)  # renormalize
            if whether_end_b == False:
                scores = output_scores.reshape((-1, 1)) + probas_b  # accumulated beam scores
                indices = scores.argpartition(-topk, axis=None)[-topk:]  # keep only the topk
                indices_1 = indices // scores.shape[1]  # row indices
                indices_2 = (indices % scores.shape[1]).reshape((-1, 1))  # column indices
                output_ids_b = np.concatenate([output_ids_b[indices_1], indices_2],
                                              1)  # update beam outputs
                output_scores = np.take_along_axis(
                    scores, indices, axis=None
                )  # update beam scores
            sample_func = lambda p: np.random.choice(len(p), p=p)  # sample according to p
            sample_ids = np.apply_along_axis(sample_func, 1, probas_r)  # draw samples
            sample_ids = sample_ids.reshape((-1, 1))  # align shapes
            if topk is not None:
                sample_ids = np.take_along_axis(
                    k_indices, sample_ids, axis=1
                )  # map back to original ids
            output_ids_r = np.concatenate([output_ids_r, sample_ids], 1)  # update sampling outputs
            if whether_end_b == False:
                is_end_r = output_ids_r[:, -1] == self.end_id  # sampling rows ending with end token
                is_end_b = output_ids_b[:, -1] == self.end_id  # beam rows ending with end token
                end_counts_r = (output_ids_r == self.end_id).sum(1)  # end-token counts (sampling)
                end_counts_b = (output_ids_b == self.end_id).sum(1)  # end-token counts (beam)
            else:
                is_end_r = output_ids_r[:, -1] == self.end_id
                end_counts_r = (output_ids_r == self.end_id).sum(1)
            # random-search part
            if output_ids_r.shape[1] >= self.minlen:  # minimum-length check
                flag = is_end_r & (end_counts_r >= min_ends)  # mark finished sequences
                if flag.any():  # if any sequence is finished
                    for ids in output_ids_r[flag]:  # store the finished sequences
                        results_r.append(ids)
                    flag = (flag == False)  # mark the unfinished ones
                    index_r = index_r[flag]
                    inputs_r = [i[flag] for i in inputs_r]  # keep only unfinished inputs
                    output_ids_r = output_ids_r[flag]  # keep only unfinished candidates
                    end_counts_r = end_counts_r[flag]  # keep only unfinished end counts
            # beam-search part
            if whether_end_b == False:
                if output_ids_b.shape[1] >= self.minlen:  # minimum-length check
                    best = output_scores.argmax()  # the highest-scoring beam
                    if is_end_b[best] and end_counts_b[best] >= min_ends:  # if it has terminated
                        results_b.append(output_ids_b[best])  # output it directly
                        whether_end_b = True
                    else:  # otherwise keep only the unfinished beams
                        flag_b = ~is_end_b | (end_counts_b < min_ends)  # mark unfinished beams
                        if not flag_b.all():  # if some beams have finished
                            index_b = index_b[flag_b]
                            inputs_b = [i[flag_b] for i in inputs_b]  # drop finished beams
                            output_ids_b = output_ids_b[flag_b]  # drop finished beams
                            output_scores = output_scores[flag_b]  # drop finished scores
                            end_counts_b = end_counts_b[flag_b]  # drop finished end counts
                            topk = flag_b.sum()  # shrink topk accordingly
            if whether_end_b == False and len(output_ids_r) != 0:
                token_r = inputs_r[0]
                sample_ids_r = inputs_r[1]
                token_b = inputs_b[0]
                sample_ids_b = inputs_b[1]
                token = np.concatenate([token_r, token_b], 0)
                sample_ids = np.concatenate([sample_ids_r, sample_ids_b], 0)
                inputs = [token, sample_ids]
                output_ids = np.concatenate([output_ids_r, output_ids_b], 0)
            elif whether_end_b == True and len(output_ids_r) != 0:
                inputs = inputs_r
                output_ids = output_ids_r
            elif whether_end_b == False and len(output_ids_r) == 0:
                inputs = inputs_b
                output_ids = output_ids_b
            else:
                break
        # any sequences still unfinished are collected here (note: `results`
        # itself is not returned)
        for ids in output_ids:
            results.append(ids)
        # return the results
        return results_r, results_b
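
    # Hypothetical usage sketch for the hybrid decoder above (`autotitle` and
    # `tokenizer` are created near the end of this script; illustrative only):
    #   token_ids, segment_ids = tokenizer.encode(u"...", maxlen=120)
    #   cands_r, cands_b = autotitle.random_sample_and_beam_search(
    #       [token_ids, segment_ids], n=20, topk=5)
    #   # cands_r: n sampled paraphrases; cands_b: the single best beam result
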
    def random_sample_and_beam_search_batch(
        self,
        inputs,
        n,
        topk=None,
        topp=None,
        states=None,
        temperature=1,
        min_ends=1
    ):
        """The original body of this method was a verbatim copy of
        `random_sample_and_beam_search`; it is kept as a thin wrapper so that
        existing callers continue to work.
        """
        return self.random_sample_and_beam_search(
            inputs, n, topk=topk, topp=topp, states=states,
            temperature=temperature, min_ends=min_ends
        )
    def random_sample_seed(
        self,
        inputs,
        n,
        topk=None,
        topp=None,
        states=None,
        temperature=1,
        min_ends=1
    ):
        """Randomly sample n results, reseeding the RNG before every draw so
        that decoding is reproducible. topk / topp behave as in
        `random_sample_batch`. Returns: a list of n decoded id sequences.
        """
        inputs = [np.array([i]) for i in inputs]
        output_ids = self.first_output_ids
        results = []
        for step in range(self.maxlen):
            probas, states = self.predict(
                inputs, output_ids, states, temperature, 'probas'
            )  # probabilities for the current step
            probas /= probas.sum(axis=1, keepdims=True)  # make sure they are normalized
            if step == 0:  # after the first step, replicate everything n times
                probas = np.repeat(probas, n, axis=0)
                inputs = [np.repeat(i, n, axis=0) for i in inputs]
                output_ids = np.repeat(output_ids, n, axis=0)
            if topk is not None:
                k_indices = probas.argpartition(-topk,
                                                axis=1)[:, -topk:]  # keep only the topk
                probas = np.take_along_axis(probas, k_indices, axis=1)  # topk probabilities
                probas /= probas.sum(axis=1, keepdims=True)  # renormalize
            if topp is not None:
                p_indices = probas.argsort(axis=1)[:, ::-1]  # sort high to low
                probas = np.take_along_axis(probas, p_indices, axis=1)  # sorted probabilities
                cumsum_probas = np.cumsum(probas, axis=1)  # cumulative probabilities
                flag = np.roll(cumsum_probas >= topp, 1, axis=1)  # mark the part beyond topp
                flag[:, 0] = False  # with np.roll above, this shifts the mask by one
                probas[flag] = 0  # zero out everything past the cutoff
                probas /= probas.sum(axis=1, keepdims=True)  # renormalize
            random.seed(1001)  # reseed so each step's draw is deterministic
            sample_func = lambda p: np.random.choice(len(p), p=p)  # sample according to p
            sample_ids = np.apply_along_axis(sample_func, 1, probas)  # draw samples
            sample_ids = sample_ids.reshape((-1, 1))  # align shapes
            if topp is not None:
                sample_ids = np.take_along_axis(
                    p_indices, sample_ids, axis=1
                )  # map back to original ids
            if topk is not None:
                sample_ids = np.take_along_axis(
                    k_indices, sample_ids, axis=1
                )  # map back to original ids
            output_ids = np.concatenate([output_ids, sample_ids], 1)  # append to outputs
            is_end = output_ids[:, -1] == self.end_id  # does the sequence end with the end token?
            end_counts = (output_ids == self.end_id).sum(1)  # count end tokens so far
            if output_ids.shape[1] >= self.minlen:  # minimum-length check
                flag = is_end & (end_counts >= min_ends)  # mark finished sequences
                if flag.any():  # if any sequence is finished
                    for ids in output_ids[flag]:  # store the finished sequences
                        results.append(ids)
                    flag = (flag == False)  # mark the unfinished ones
                    inputs = [i[flag] for i in inputs]  # keep only unfinished inputs
                    output_ids = output_ids[flag]  # keep only unfinished candidates
                    end_counts = end_counts[flag]  # keep only unfinished end counts
                    if len(output_ids) == 0:
                        break
        # any sequences still unfinished go straight into the results
        for ids in output_ids:
            results.append(ids)
        # return the results
        return results
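
    # `random_sample_seed` reseeds NumPy's global RNG before every draw, so
    # given identical probabilities each step's np.random.choice result is
    # reproducible. A minimal sketch of that effect (illustrative only):
    @staticmethod
    def _seeded_choice_demo(p=np.array([0.2, 0.5, 0.3])):
        random.seed(1001)
        a = np.random.choice(len(p), p=p)
        random.seed(1001)
        b = np.random.choice(len(p), p=p)
        return a == b  # always True: same seed, same distribution, same draw
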
    def random_sample_topp_gentle(
        self,
        inputs,
        n,
        topk=None,
        topp=None,
        states=None,
        temperature=1,
        min_ends=1
    ):
        """The original body here duplicated the generic top-k/top-p sampling
        loop (the same as `random_sample_seed` above, minus the reseeding) and
        carried an unfinished TODO; delegate to the parent-class
        implementation, which is the same algorithm.
        """
        return super(AutoTitle, self).random_sample(
            inputs, n, topk=topk, topp=topp, states=states,
            temperature=temperature, min_ends=min_ends
        )
    def batch(
        self,
        inputs_str,
        temperature=1,
        min_ends=1
    ):
        """Greedy batch decoding: at every step, re-encode source + partial
        output, take the argmax token for each sequence, and stop a sequence
        once it has emitted the end token. Returns one id sequence per input.
        """
        output_str = []
        batch_nums = len(inputs_str)
        output_ids = self.first_output_ids_batch = np.empty((batch_nums, 0), dtype=int)
        results = [[] for i in range(batch_nums)]
        index_data = [i for i in range(batch_nums)]
        for step in range(self.maxlen):
            token_ids, segment_ids = self.data_generator(inputs_str, output_str)
            inputs = [token_ids, segment_ids]
            probas = self.predict_batch(
                inputs
            )  # probabilities for the current step
            probas_new = []
            probas_bool = np.array(token_ids, dtype=bool)
            for i, sentence in enumerate(probas_bool):
                lie = np.array(np.where(sentence == True))[0]
                probas_new.append(probas[i, lie[-1]])  # distribution at the last real (non-pad) token
            probas = np.array(probas_new)
            k_indices = np.argmax(probas, axis=1)  # greedy: pick the argmax token
            k_indices = k_indices.reshape(-1, 1)
            sample_ids = k_indices
            output_ids = np.concatenate([output_ids, sample_ids], 1)  # append to outputs
            is_end = output_ids[:, -1] == self.end_id  # does the sequence end with the end token?
            end_counts = (output_ids == self.end_id).sum(1)  # count end tokens so far
            if output_ids.shape[1] >= self.minlen:  # minimum-length check
                flag = is_end & (end_counts >= min_ends)  # mark finished sequences
                if flag.any():  # if any sequence is finished
                    index = np.array(np.where(flag == True))[0]
                    pop_index = []
                    for i in index:
                        results[index_data[i]] = output_ids[i]  # store under the original position
                        pop_index.append(index_data[i])
                    for i in pop_index:
                        index_data.remove(i)
                    flag = (flag == False)  # mark the unfinished ones
                    inputs_str = [inputs_str[i] for i in index_data]  # keep only unfinished inputs
                    output_ids = output_ids[flag]  # keep only unfinished candidates
                    if len(output_ids) == 0:
                        break
                    else:
                        output_str = [self.tokenizer.decode(ids) for ids in output_ids]
                else:
                    output_str = [self.tokenizer.decode(ids) for ids in output_ids]
        # return the results
        return results
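
    # Sketch (with made-up toy ids) of the "last real token" lookup used in
    # `batch` above: for zero-padded token ids, the index of the final
    # non-zero entry in each row is the position whose distribution predicts
    # the next token. Illustrative only, not called anywhere.
    @staticmethod
    def _last_token_index_demo():
        toy = np.array([[101, 5, 6, 102, 0, 0],
                        [101, 7, 102, 0, 0, 0]])
        return [np.where(row != 0)[0][-1] for row in toy]  # -> [3, 2]
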
    @AutoRegressiveDecoder.wraps(default_rtype='probas')
    def predict(self, inputs, output_ids, states):
        token_ids, segment_ids = inputs
        token_ids = np.concatenate([token_ids, output_ids], 1)
        segment_ids = np.concatenate([segment_ids, np.ones_like(output_ids)], 1)
        # Rebind the global graph/session so prediction also works when called
        # from another thread (e.g. behind a web service).
        with graph.as_default():
            K.set_session(sess)
            nodes = self.last_token(self.model).predict([token_ids, segment_ids])
        return nodes

    def predict_batch(self, inputs):
        token_ids, segment_ids = inputs
        with graph.as_default():
            K.set_session(sess)
            nodes = self.model.predict([token_ids, segment_ids])
        return nodes
    def generate(self, text, topk=5):
        text = text[0]
        token_ids, segment_ids = self.tokenizer.encode(text, maxlen=120)
        output_ids = self.beam_search([token_ids, segment_ids],
                                      topk=topk)  # beam search
        return self.tokenizer.decode(output_ids)

    def generate_random(self, text, n=20, topp=0.9):
        token_ids, segment_ids = self.tokenizer.encode(text, maxlen=120)
        output_ids = self.random_sample([token_ids, segment_ids], n, topp=topp)  # random sampling
        return [self.tokenizer.decode(ids) for ids in output_ids]

    def generate_random_sample_topp_gentle(self, text, n=20, topp=0.9):
        token_ids, segment_ids = self.tokenizer.encode(text, maxlen=120)
        output_ids = self.random_sample_topp_gentle([token_ids, segment_ids], n, topp=topp)  # random sampling
        return [self.tokenizer.decode(ids) for ids in output_ids]

    def generate_random_shortest(self, text, n=20, topk=5):
        token_ids, segment_ids = self.tokenizer.encode(text, maxlen=120)
        output_ids = self.random_sample_seed([token_ids, segment_ids], n, topk)  # seeded random sampling
        return_str = [self.tokenizer.decode(ids) for ids in output_ids][0]
        return return_str

    def generate_top(self, text):
        output_ids = self.batch(text)  # greedy batch decoding
        return [self.tokenizer.decode(ids) for ids in output_ids]

    def generate_random_sample_and_beam_search(self, text, n=20, topk=5):
        text = text[0]
        token_ids, segment_ids = self.tokenizer.encode(text, maxlen=120)
        output_ids_r, output_ids_b = self.random_sample_and_beam_search([token_ids, segment_ids], n=n,
                                                                        topk=topk)  # hybrid decoding
        output_str_r = [self.tokenizer.decode(ids) for ids in output_ids_r]
        output_str_b = [self.tokenizer.decode(ids) for ids in output_ids_b]
        return output_str_r, output_str_b
generatemodel = GenerateModel()
model, tokenizer = generatemodel.device_setup()
autotitle = AutoTitle(model, tokenizer, start_id=None, end_id=tokenizer._token_end_id, maxlen=60)
def just_show(file):
    data = []
    try:
        with open(file, 'r', encoding="utf-8") as f:
            lines = [x.strip() for x in f if x.strip() != '']
    except UnicodeDecodeError:  # fall back to GBK-encoded files
        with open(file, 'r', encoding="gbk") as f:
            lines = [x.strip() for x in f if x.strip() != '']
    # lines = pd.read_csv(file, encoding="gbk").values.tolist()
    # random.shuffle(lines)
    # lines = lines[:20]
    for s in tqdm(lines[:2]):
        print(s)
        pre = autotitle.generate_random(s)
        print(s)
        print(pre)
        # data.append([s, pre])
    # pd.DataFrame(data, columns=["source text", "generated text"]).to_csv(
    #     "data/text_测试一万字_unilm_修正数据_小说预训练_全部数据_epoch72_反向训练.csv")
def just_show_sentence(file):
    """
    @param file: list of sentences
    """
    # Alternative: sample repeatedly with the gentle top-p sampler:
    # for i in range(100):
    #     pre = autotitle.generate_random_sample_topp_gentle(file)
    #     print(pre)
    pre = autotitle.generate(file)
    print(pre)
def just_show_csv(file):
    data_new = []
    data = pd.read_csv(file).values.tolist()
    for sentence in tqdm(data):
        sentence = sentence[1]
        print(sentence)
        data_new_dan = []
        data_new_dan.extend([sentence, len(sentence)])
        pre = autotitle.generate_random(sentence)
        for i in pre:
            data_new_dan.extend([i, len(i)])
        data_new.append(data_new_dan)
    pd.DataFrame(data_new).to_csv("data/###第3章 非常尴尬_generate_random.csv")
    # return pre
if __name__ == '__main__':
    # file = "train_2842.txt"
    # just_show(file)
    text = ["迈向新时代,当代青年要立鸿鹄之志,做马克思主义的坚定信仰者。"]
    just_show_sentence(text)
    # "简言之,她不好过,李四也别想好过!"
    # s = "张三的对话"
    # print(autotitle.generate(s))
    # file = "data/###第3章 非常尴尬.csv"
    # just_show_csv(file)