
Complete the batch prediction version

master
majiahui@haimaqingfan.com 2 years ago
commit 2836c9e0d7
1. data_do/11篇txt合并.py (41 lines changed)
2. data_do/11篇合并之后继续处理.py (30 lines changed)
3. evaluate_test.py (34 lines changed)
4. flask_predict.py (11 lines changed)
5. predict_batch.py (126 lines changed)
6. predict_sim.py (10 lines changed)
7. request_drop.py (59 lines changed)

data_do/11篇txt合并.py (41 lines changed)

@@ -0,0 +1,41 @@
# -*- coding: utf-8 -*-
"""
@Time : 2023/2/21 11:28
@Author :
@FileName:
@Software:
@Describe:
"""
import os


def read_text(file):
    # Read a text file as UTF-8, falling back to GBK if decoding fails.
    try:
        with open(file, 'r', encoding="utf-8") as f:
            lines = [x.strip() for x in f if x.strip() != '']
    except UnicodeDecodeError:
        with open(file, 'r', encoding="gbk") as f:
            lines = [x.strip() for x in f if x.strip() != '']
    return lines


if __name__ == '__main__':
    path = '../data/11篇txt'
    path_new = '../data/11篇汇总txt.txt'
    path_list = []
    data = []
    for file_name in os.listdir(path):
        path_list.append(file_name)
    for docx_name in path_list:
        df_list_new = []
        with open(path + "/" + docx_name, 'r', encoding="utf-8") as f:
            lines = [x.strip() for x in f if x.strip() != '']
        data.extend(lines)
        # Append a separator line between documents.
        data.append("+++++++++++++++++++++++++++++++++++++++++++++++++@@@@@@@@@@@@@@@@@@@")
    with open(path_new, "w", encoding='utf-8') as file:
        for i in data:
            file.write(i + '\n')
    file.close()

data_do/11篇合并之后继续处理.py (30 lines changed)

@@ -0,0 +1,30 @@
# -*- coding: utf-8 -*-
"""
@Time : 2023/2/21 11:51
@Author :
@FileName:
@Software:
@Describe:
"""
path = '../data/11篇汇总txt.txt'
path_new = '../data/11篇汇总txt_new.txt'

with open(path, 'r', encoding="utf-8") as f:
    lines = [x.strip() for x in f if x.strip() != '']

data = []
for i in lines:
    dan = i.strip()
    print(dan)
    if dan == "":
        continue
    elif dan == "+++++++++++++++++++++++++++++++++++++++++++++++++@@@@@@@@@@@@@@@@@@@":
        # Replace the document separator with a placeholder line.
        data.append("嚯嚯嚯嚯嚯嚯嚯嚯嚯嚯嚯嚯嚯嚯嚯嚯嚯嚯嚯嚯嚯嚯嚯嚯嚯嚯嚯嚯嚯嚯嚯嚯嚯嚯嚯嚯嚯嚯嚯嚯")
    else:
        data.append(dan)

with open(path_new, "w", encoding='utf-8') as file:
    for i in data:
        file.write(i + '\n')
file.close()

evaluate_test.py (34 lines changed)

@@ -96,10 +96,10 @@ eval_class = Evaluator()
 # print(eval_class.evaluate_t("星 辰 的 话","星 辰 的 话 :"))
 path = "data/700条效果对比.xlsx"
-path_out = "data/700条效果对比测评结果_12.csv"
+path_out = "data/700条效果对比测评结果_14.csv"
 data = pd.read_excel(path).values.tolist()
-list_class = [0 for i in range(10)]
+list_class = [0 for i in range(13)]
 # print(list_class)
 data_new = {"rouge_1": list_class.copy(),
             "rouge_2": list_class.copy(),
@@ -110,14 +110,17 @@ total = len(data)
 print(len(data))
 for i in data:
-    dan_list = [i[1], i[2], i[3], i[4], i[5], i[6], i[7], i[8], i[9], i[-1]]
+    dan_list = [i[1], i[2], i[3], i[4], i[5], i[6], i[7], i[8], i[9], i[10], i[11], i[12], i[-1]]
     for j in range(len(dan_list)):
         eval_list = eval_class.evaluate_t(dan_list[j], i[0])
-        data_new["rouge_1"][j] += eval_list[0]
-        data_new["rouge_2"][j] += eval_list[1]
-        data_new["rouge_l"][j] += eval_list[2]
-        data_new["bleu"][j] += eval_list[3]
-        data_new["str_sim"][j] += eval_list[4]
+        try:
+            data_new["rouge_1"][j] += eval_list[0]
+            data_new["rouge_2"][j] += eval_list[1]
+            data_new["rouge_l"][j] += eval_list[2]
+            data_new["bleu"][j] += eval_list[3]
+            data_new["str_sim"][j] += eval_list[4]
+        except:
+            pass
 data = {}
@@ -127,5 +130,18 @@ for i in data_new:
     data[i] = list(map(fune, data_new[i]))
 pd.DataFrame(data,
-             index=["simbert_5day","simbert_simdata4day", "simbert_random20_5day","simsim模型", "dropout_sim_03模型", "dropout_sim_04模型", "t5", "t5_dropout", "小说模型", "yy"]).to_csv(
+             index=["simbert_5day",
+                    "simbert_simdata4day",
+                    "simbert_simdata5day",
+                    "simbert_random20_5day",
+                    "simbert_simdata4day_yinhao",
+                    "simbert_simdata4day_yinhao_dropout",
+                    "simsim模型",
+                    "dropout_sim_03模型",
+                    "dropout_sim_04模型",
+                    "t5",
+                    "t5_dropout",
+                    "小说模型",
+                    "yy"]
+             ).to_csv(
     path_out)

flask_predict.py (11 lines changed)

@@ -116,7 +116,7 @@ def batch_data_process(text_list):
     for sentence in text_list:
         sentence_batch_length += len(sentence[0])
         sentence_batch_one.append(sentence)
-        if sentence_batch_length > 500:
+        if sentence_batch_length > 10000:
             sentence_batch_length = 0
             sentence_ = sentence_batch_one.pop(-1)
             sentence_batch_list.append(sentence_batch_one)
@@ -132,8 +132,8 @@ def batch_predict(batch_data_list):
     for i in batch_data_list:
         batch_data_text_list.append(i[0])
         batch_data_snetence_id_list.append(i[1:])
-    # batch_pre_data_list = autotitle.generate_beam_search_batch(batch_data_text_list)
-    batch_pre_data_list = batch_data_text_list
+    batch_pre_data_list = autotitle.generate_beam_search_batch(batch_data_text_list)
+    # batch_pre_data_list = batch_data_text_list
     for text,sentence_id in zip(batch_pre_data_list,batch_data_snetence_id_list):
         batch_data_list_new.append([text] + sentence_id)
@@ -170,13 +170,15 @@ def main(text:list):
     batch_data = batch_data_process(text_list)
     text_list = []
     for i in batch_data:
-        text_list.extend(i)
+        pre = batch_predict(i)
+        text_list.extend(pre)
     return_list = predict_data_post_processing(text_list)
     return return_list


 @app.route('/droprepeat/', methods=['POST'])
 def sentence():
+    print(request.remote_addr)
     texts = request.json["texts"]
     text_type = request.json["text_type"]
     print("原始语句" + str(texts))
@@ -200,7 +202,6 @@ def sentence():
         texts_list = main(texts)
-        return_text = {"texts": texts_list, "probabilities": None, "status_code": True}
         return_text = {"texts":texts_list, "probabilities": None, "status_code":True}
     else:
         return_text = {"texts":"输入格式应该为list", "probabilities": None, "status_code":False}
     return jsonify(return_text)

predict_batch.py (126 lines changed)

@@ -2,7 +2,7 @@
 import os
 # os.environ["TF_KERAS"] = "1"
-os.environ["CUDA_VISIBLE_DEVICES"] = "0"
+os.environ["CUDA_VISIBLE_DEVICES"] = "1"
 import glob
 import random
 from tqdm import tqdm
@@ -156,24 +156,46 @@ class AutoTitle(AutoRegressiveDecoder):
         else:
             self.first_output_ids = np.array([[self.start_id]])

-    def data_generator(self, inputs, output_ids):
-        try:
-            batch_token_ids, batch_segment_ids = [], []
-            if output_ids == []:
-                for txt in inputs:
-                    token_ids, segment_ids = self.tokenizer.encode(txt, maxlen=120)
-                    batch_token_ids.append(token_ids)
-                    batch_segment_ids.append(segment_ids)
-            else:
-                for txt,output_id in zip(inputs, output_ids):
-                    token_ids, segment_ids = self.tokenizer.encode(txt, output_id)
-                    batch_token_ids.append(token_ids[:-1])
-                    batch_segment_ids.append(segment_ids[:-1])
-            batch_token_ids = sequence_padding(batch_token_ids)
-            batch_segment_ids = sequence_padding(batch_segment_ids)
-        except:
-            print(inputs,output_ids)
+    @AutoRegressiveDecoder.wraps(default_rtype='probas')
+    def predict(self, inputs, output_ids, states):
+        token_ids, segment_ids = inputs
+        token_ids = np.concatenate([token_ids, output_ids], 1)
+        segment_ids = np.concatenate([segment_ids, np.ones_like(output_ids)], 1)
+        with graph.as_default():
+            K.set_session(sess)
+            nodes = self.last_token(self.model).predict([token_ids, segment_ids])
+        return nodes
+        # return self.last_token(self.model).predict([token_ids, segment_ids])
+
+    def predict_batch(self, inputs):
+        # inputs, output_ids, states, temperature, 'probas'
+        token_ids, segment_ids = inputs
+        # token_ids = np.concatenate([token_ids, output_ids], 1)
+        # segment_ids = np.concatenate([segment_ids, np.ones_like(output_ids)], 1)
+        with graph.as_default():
+            K.set_session(sess)
+            nodes = self.model.predict([token_ids, segment_ids])
+        return nodes
+        # return self.last_token(self.model).predict([token_ids, segment_ids])
+
+    def data_generator(self, token_ids, output_ids):
+        batch_token_ids, batch_segment_ids = [], []
+        if output_ids == []:
+            for token_id in token_ids:
+                segment_ids = np.zeros_like(token_id)
+                batch_token_ids.append(token_id)
+                batch_segment_ids.append(segment_ids)
+        else:
+            output_ids = np.array(output_ids)
+            for token_id,output_id in zip(token_ids, output_ids):
+                token_id = np.array(token_id)
+                tokens_id = np.concatenate([token_id, output_id])
+                segment_ids = np.concatenate([np.zeros_like(token_id), np.ones_like(output_id)])
+                batch_token_ids.append(tokens_id)
+                batch_segment_ids.append(segment_ids)
+        batch_token_ids = sequence_padding(batch_token_ids)
+        batch_segment_ids = sequence_padding(batch_segment_ids)
+        return batch_token_ids, batch_segment_ids

     def beam_search_batch_(self, inputs, topk, states=None, temperature=1, min_ends=1):
@@ -581,17 +603,17 @@ class AutoTitle(AutoRegressiveDecoder):
         # token_ids, segment_ids = self.data_generator(inputs_str, output_str)
         # else:
         #     inputs_str, output_str = [], []
-        text_batch, output_str_batch = [], []
+        text_ids_batch, output_ids_batch = [], []
         batch_input_num_beam_num = []
         for i in generated:
-            text = i.text
-            text_batch.extend(text)
-            if i.output_str != "":
-                output_str_batch.extend(i.output_str)
+            text_ids = i.text_ids
+            text_ids_batch.extend(text_ids)
+            if i.output_ids != []:
+                output_ids_batch.extend(i.output_ids)
             if step != 0:
                 batch_input_num_beam_num.append(i.num_beams)
-        token_ids, segment_ids = self.data_generator(text_batch, output_str_batch)
+        token_ids, segment_ids = self.data_generator(text_ids_batch, output_ids_batch)

         # token_ids_batch = sequence_padding(token_ids_batch)
         # segment_ids_batch = sequence_padding(segment_ids_batch)
@@ -615,7 +637,6 @@ class AutoTitle(AutoRegressiveDecoder):
         if step != 0:
             num = 0
-            print()
             if len(generated) > 1:
                 index = 0
                 for index in range(len(batch_input_num_beam_num)-1):
@ -656,10 +677,6 @@ class AutoTitle(AutoRegressiveDecoder):
return return_str_batch
return return_str_batch
def top_batch(
self,
inputs_str,
@@ -729,29 +746,6 @@ class AutoTitle(AutoRegressiveDecoder):
         return results

-    @AutoRegressiveDecoder.wraps(default_rtype='probas')
-    def predict(self, inputs, output_ids, states):
-        token_ids, segment_ids = inputs
-        token_ids = np.concatenate([token_ids, output_ids], 1)
-        segment_ids = np.concatenate([segment_ids, np.ones_like(output_ids)], 1)
-        with graph.as_default():
-            K.set_session(sess)
-            nodes = self.last_token(self.model).predict([token_ids, segment_ids])
-        return nodes
-        # return self.last_token(self.model).predict([token_ids, segment_ids])
-
-    def predict_batch(self, inputs):
-        # inputs, output_ids, states, temperature, 'probas'
-        token_ids, segment_ids = inputs
-        # token_ids = np.concatenate([token_ids, output_ids], 1)
-        # segment_ids = np.concatenate([segment_ids, np.ones_like(output_ids)], 1)
-        with graph.as_default():
-            K.set_session(sess)
-            nodes = self.model.predict([token_ids, segment_ids])
-        return nodes
-        # return self.last_token(self.model).predict([token_ids, segment_ids])
-
     def generate(self, text, topk=1):
         token_ids, segment_ids = self.tokenizer.encode(text, maxlen=256)
         output_ids = self.beam_search([token_ids, segment_ids],
@@ -916,7 +910,13 @@ class Beamdataone(object):
         self.tokenizer = tokenizer
         # self.data()
         self.output_str = ""
+        self.text_2_textids(
+            self.text
+        )
+
+    def text_2_textids(self,text):
+        token_ids, segment_ids = self.tokenizer.encode(text[0], maxlen=120)
+        self.text_ids = [token_ids]

     def add_data(self, step, output_probas):
         '''
@@ -934,6 +934,7 @@ class Beamdataone(object):
         # inputs = [np.repeat(i, self.num_beams, axis=0) for i in self.inputs]
         # inputs = [self.token_ids, self.segment_ids]
         # inputs = [np.array([i]) for i in inputs]
+        self.output_ids = np.array(self.output_ids)
         scores = output_probas
         scores = self.output_scores.reshape((-1, 1)) + scores  # combine cumulative scores
         indices = scores.argpartition(-self.num_beams, axis=None)[-self.num_beams:]  # keep only the top-k
@@ -960,8 +961,9 @@ class Beamdataone(object):
         self.output_scores = self.output_scores[flag]  # discard finished sequences
         self.end_counts = self.end_counts[flag]  # discard end counts of finished sequences
         self.num_beams = flag.sum()  # shrink top-k accordingly
+        self.output_ids = self.output_ids.tolist()
         self.output_str = [tokenizer.decode(ids) for ids in self.output_ids]
-        self.text = [self.text[0] for i in range(len(self.output_ids))]
+        self.text_ids = [self.text_ids[0] for i in range(len(self.output_ids))]

         # # output directly once the max length is reached
@@ -1132,18 +1134,18 @@ def paragraph_test(text, text_new):
 if __name__ == '__main__':
-    # text = ["历史和当下都证明,创新是民族生存、“发展的不竭源泉”,是是自身发展的必然选择",
-    #         "自身发展的必然选择",
-    #         "强调轻资产经营, 更加重视经营风险的规避",
-    #         "历史和当下都证明,创新是民族生存、发展的不竭源泉,是是自身发展的必然选择",
-    #         "是时代对于青年们的深切呼唤"]
-    text = ["基本消除“热桥”影响。"]
-    print(just_show_sentence(text))
+    text = ["历史和当下都证明,创新是民族生存、“发展的不竭源泉”,是是自身发展的必然选择",
+            "自身发展的必然选择",
+            "强调轻资产经营, 更加重视经营风险的规避",
+            "历史和当下都证明,创新是民族生存、发展的不竭源泉,是是自身发展的必然选择",
+            "是时代对于青年们的深切呼唤"]
+    # text = ["基本消除“热桥”影响。"]
+    # print(just_show_sentence(text))
     # print(just_show_sentence_top(text))
     # print(just_show_chachong_random(text))
     # print(tokenizer.encode("\"", maxlen=120))
-    # print(just_show_sentence_batch(text))
+    print(just_show_sentence_batch(text))

     # path = "./data/700条论文测试.xlsx"

predict_sim.py (10 lines changed)

@@ -83,7 +83,7 @@ class GenerateModel(object):
         self.epoch_acc_vel = 0
         self.config_path = r'./chinese_roberta_wwm_ext_L-12_H-768_A-12/bert_config.json'
         self.checkpoint_path = r'./chinese_roberta_wwm_ext_L-12_H-768_A-12/bert_model.ckpt'
-        self.dict_path = r'./chinese_roberta_wwm_ext_L-12_H-768_A-12/vocab.txt'
+        self.dict_path = r'./chinese_roberta_wwm_ext_L-12_H-768_A-12/vocab_drop.txt'
         self.maxlen = 120
         self.novel_maxlen = 60
@@ -122,7 +122,7 @@ class GenerateModel(object):
         outputs = TotalLoss([2, 3])(bert.model.inputs + bert.model.outputs)
         model = keras.models.Model(bert.model.inputs, outputs)
-        path_model = './output_simbert_yy/best_simbertmodel_datasim.weights'
+        path_model = './output_simbert_yy/best_simbertmodel_datasim_yinhao.weights'
         model.load_weights(path_model)

         return encoder,seq2seq, tokenizer
@@ -708,7 +708,9 @@ def just_show_csv_beam(file):
 if __name__ == '__main__':
     # text = ["强调轻资产经营, 更加重视经营风险的规避", "历史和当下都证明,创新是民族生存、发展的不竭源泉,是是自身发展的必然选择", "是时代对于青年们的深切呼唤"]
+    # text = ["强调轻资产“经营”, 更加重视“营风险”的规避", "历史和当下都证明,创新是民族生存、发展的不竭源泉,是是自身发展的必然选择", "是时代对于青年们的深切呼唤"]
     # print(just_show_sentence(text))
+    #
     # print(just_show_sentence_batch(text))
     # print(type(just_show_sentence_batch(text)))
@@ -726,7 +728,7 @@ if __name__ == '__main__':
             print(i[0])
             continue
     df = pd.DataFrame(df_list_new)
-    df.to_excel("./data/700条论文测试_18.xlsx", index=None)
+    df.to_excel("./data/700条论文测试_19.xlsx", index=None)

request_drop.py (59 lines changed)

@@ -0,0 +1,59 @@
# -*- coding: utf-8 -*-
"""
@Time : 2022/12/29 1:14
@Author :
@FileName:
@Software:
@Describe:
"""
# coding:utf-8
import requests
from time import time


def dialog_line_parse(url, text):
    """
    Send the data to the model for analysis and return the result.
    :param url: model URL
    :param text: payload sent to the model
    :return: the model's response
    """
    response = requests.post(
        url,
        json=text,
        timeout=1000
    )
    if response.status_code == 200:
        return response.json()
    else:
        # logger.error(
        #     "【{}】 Failed to get a proper response from remote "
        #     "server. Status Code: {}. Response: {}"
        #     "".format(url, response.status_code, response.text)
        # )
        print("【{}】 Failed to get a proper response from remote "
              "server. Status Code: {}. Response: {}"
              "".format(url, response.status_code, response.text))
        print(text)
        return []


ceshi_1 = [
    "李正旺你真是傻逼讪笑,挥手道:“不不不,你千万别误会。关于这件事,校长特别交代过了,我也非常认同。你这是见义勇为,是勇斗歹徒、义救同学的英雄,我们清江一中决不让英雄流血又流泪!”。",
    "李正旺你真是傻逼讪笑,挥手道:“不不不,你千万别误会。关于这件事,校长特别交代过了,我也非常认同。",
    "李正旺你真是傻逼讪笑,挥手道:“不不不,你千万别误会。关于这件事,校长特别交代过了,我也非常认同。"
    "" * 110
]

# Count the total number of characters in the test inputs.
jishu = 0
for i in ceshi_1:
    for j in i:
        jishu += 1
print(jishu)

t1 = time()
print(dialog_line_parse("http://114.116.25.228:14000/droprepeat/", {"texts": ceshi_1, "text_type": "focus"}))
t2 = time()
print(t2 - t1)