
Complete the batch prediction version

master
majiahui@haimaqingfan.com 2 years ago
commit 2836c9e0d7
1. data_do/11篇txt合并.py (41 lines changed)
2. data_do/11篇合并之后继续处理.py (30 lines changed)
3. evaluate_test.py (34 lines changed)
4. flask_predict.py (11 lines changed)
5. predict_batch.py (126 lines changed)
6. predict_sim.py (10 lines changed)
7. request_drop.py (59 lines changed)

data_do/11篇txt合并.py (41 lines changed)

@@ -0,0 +1,41 @@
# -*- coding: utf-8 -*-
"""
@Time : 2023/2/21 11:28
@Author :
@FileName:
@Software:
@Describe:
"""
import os


def read_text(file):
    # Read a text file as UTF-8, falling back to GBK if decoding fails.
    try:
        with open(file, 'r', encoding="utf-8") as f:
            lines = [x.strip() for x in f if x.strip() != '']
    except UnicodeDecodeError:
        with open(file, 'r', encoding="gbk") as f:
            lines = [x.strip() for x in f if x.strip() != '']
    return lines


if __name__ == '__main__':
    path = '../data/11篇txt'
    path_new = '../data/11篇汇总txt.txt'
    path_list = []
    data = []
    for file_name in os.listdir(path):
        path_list.append(file_name)
    for docx_name in path_list:
        df_list_new = []
        with open(path + "/" + docx_name, 'r', encoding="utf-8") as f:
            lines = [x.strip() for x in f if x.strip() != '']
        data.extend(lines)
        # Append a separator line between documents.
        data.append("+++++++++++++++++++++++++++++++++++++++++++++++++@@@@@@@@@@@@@@@@@@@")
    with open(path_new, "w", encoding='utf-8') as file:
        for i in data:
            file.write(i + '\n')
    file.close()

data_do/11篇合并之后继续处理.py (30 lines changed)

@@ -0,0 +1,30 @@
# -*- coding: utf-8 -*-
"""
@Time : 2023/2/21 11:51
@Author :
@FileName:
@Software:
@Describe:
"""
path = '../data/11篇汇总txt.txt'
path_new = '../data/11篇汇总txt_new.txt'

with open(path, 'r', encoding="utf-8") as f:
    lines = [x.strip() for x in f if x.strip() != '']

data = []
for i in lines:
    dan = i.strip()
    print(dan)
    if dan == "":
        continue
    elif dan == "+++++++++++++++++++++++++++++++++++++++++++++++++@@@@@@@@@@@@@@@@@@@":
        # Replace the document separator with a placeholder line.
        data.append("嚯嚯嚯嚯嚯嚯嚯嚯嚯嚯嚯嚯嚯嚯嚯嚯嚯嚯嚯嚯嚯嚯嚯嚯嚯嚯嚯嚯嚯嚯嚯嚯嚯嚯嚯嚯嚯嚯嚯嚯")
    else:
        data.append(dan)

with open(path_new, "w", encoding='utf-8') as file:
    for i in data:
        file.write(i + '\n')
file.close()

evaluate_test.py (34 lines changed)

@@ -96,10 +96,10 @@ eval_class = Evaluator()
 # print(eval_class.evaluate_t("星 辰 的 话","星 辰 的 话 :"))
 path = "data/700条效果对比.xlsx"
-path_out = "data/700条效果对比测评结果_12.csv"
+path_out = "data/700条效果对比测评结果_14.csv"
 data = pd.read_excel(path).values.tolist()
-list_class = [0 for i in range(10)]
+list_class = [0 for i in range(13)]
 # print(list_class)
 data_new = {"rouge_1": list_class.copy(),
             "rouge_2": list_class.copy(),
@@ -110,14 +110,17 @@ total = len(data)
 print(len(data))
 for i in data:
-    dan_list = [i[1], i[2], i[3], i[4], i[5], i[6], i[7], i[8], i[9], i[-1]]
+    dan_list = [i[1], i[2], i[3], i[4], i[5], i[6], i[7], i[8], i[9], i[10], i[11], i[12], i[-1]]
     for j in range(len(dan_list)):
         eval_list = eval_class.evaluate_t(dan_list[j], i[0])
-        data_new["rouge_1"][j] += eval_list[0]
-        data_new["rouge_2"][j] += eval_list[1]
-        data_new["rouge_l"][j] += eval_list[2]
-        data_new["bleu"][j] += eval_list[3]
-        data_new["str_sim"][j] += eval_list[4]
+        try:
+            data_new["rouge_1"][j] += eval_list[0]
+            data_new["rouge_2"][j] += eval_list[1]
+            data_new["rouge_l"][j] += eval_list[2]
+            data_new["bleu"][j] += eval_list[3]
+            data_new["str_sim"][j] += eval_list[4]
+        except:
+            pass
 data = {}
@@ -127,5 +130,18 @@ for i in data_new:
     data[i] = list(map(fune, data_new[i]))
 pd.DataFrame(data,
-             index=["simbert_5day","simbert_simdata4day", "simbert_random20_5day","simsim模型", "dropout_sim_03模型", "dropout_sim_04模型", "t5", "t5_dropout", "小说模型", "yy"]).to_csv(
+             index=["simbert_5day",
+                    "simbert_simdata4day",
+                    "simbert_simdata5day",
+                    "simbert_random20_5day",
+                    "simbert_simdata4day_yinhao",
+                    "simbert_simdata4day_yinhao_dropout",
+                    "simsim模型",
+                    "dropout_sim_03模型",
+                    "dropout_sim_04模型",
+                    "t5",
+                    "t5_dropout",
+                    "小说模型",
+                    "yy"]
+             ).to_csv(
     path_out)

flask_predict.py (11 lines changed)

@@ -116,7 +116,7 @@ def batch_data_process(text_list):
     for sentence in text_list:
         sentence_batch_length += len(sentence[0])
         sentence_batch_one.append(sentence)
-        if sentence_batch_length > 500:
+        if sentence_batch_length > 10000:
             sentence_batch_length = 0
             sentence_ = sentence_batch_one.pop(-1)
             sentence_batch_list.append(sentence_batch_one)
@@ -132,8 +132,8 @@ def batch_predict(batch_data_list):
     for i in batch_data_list:
         batch_data_text_list.append(i[0])
         batch_data_snetence_id_list.append(i[1:])
-    # batch_pre_data_list = autotitle.generate_beam_search_batch(batch_data_text_list)
-    batch_pre_data_list = batch_data_text_list
+    batch_pre_data_list = autotitle.generate_beam_search_batch(batch_data_text_list)
+    # batch_pre_data_list = batch_data_text_list
     for text,sentence_id in zip(batch_pre_data_list,batch_data_snetence_id_list):
         batch_data_list_new.append([text] + sentence_id)
@@ -170,13 +170,15 @@ def main(text:list):
     batch_data = batch_data_process(text_list)
     text_list = []
     for i in batch_data:
-        text_list.extend(i)
+        pre = batch_predict(i)
+        text_list.extend(pre)
     return_list = predict_data_post_processing(text_list)
     return return_list


 @app.route('/droprepeat/', methods=['POST'])
 def sentence():
+    print(request.remote_addr)
     texts = request.json["texts"]
     text_type = request.json["text_type"]
     print("原始语句" + str(texts))
@@ -200,7 +202,6 @@ def sentence():
         texts_list = main(texts)
-        return_text = {"texts": texts_list, "probabilities": None, "status_code": True}
         return_text = {"texts":texts_list, "probabilities": None, "status_code":True}
     else:
         return_text = {"texts":"输入格式应该为list", "probabilities": None, "status_code":False}
     return jsonify(return_text)

predict_batch.py (126 lines changed)

@@ -2,7 +2,7 @@
 import os
 # os.environ["TF_KERAS"] = "1"
-os.environ["CUDA_VISIBLE_DEVICES"] = "0"
+os.environ["CUDA_VISIBLE_DEVICES"] = "1"
 import glob
 import random
 from tqdm import tqdm
@@ -156,24 +156,46 @@ class AutoTitle(AutoRegressiveDecoder):
         else:
             self.first_output_ids = np.array([[self.start_id]])

-    def data_generator(self, inputs, output_ids):
-        try:
-            batch_token_ids, batch_segment_ids = [], []
-            if output_ids == []:
-                for txt in inputs:
-                    token_ids, segment_ids = self.tokenizer.encode(txt, maxlen=120)
-                    batch_token_ids.append(token_ids)
-                    batch_segment_ids.append(segment_ids)
-            else:
-                for txt,output_id in zip(inputs, output_ids):
-                    token_ids, segment_ids = self.tokenizer.encode(txt, output_id)
-                    batch_token_ids.append(token_ids[:-1])
-                    batch_segment_ids.append(segment_ids[:-1])
-            batch_token_ids = sequence_padding(batch_token_ids)
-            batch_segment_ids = sequence_padding(batch_segment_ids)
-        except:
-            print(inputs,output_ids)
+    @AutoRegressiveDecoder.wraps(default_rtype='probas')
+    def predict(self, inputs, output_ids, states):
+        token_ids, segment_ids = inputs
+        token_ids = np.concatenate([token_ids, output_ids], 1)
+        segment_ids = np.concatenate([segment_ids, np.ones_like(output_ids)], 1)
+        with graph.as_default():
+            K.set_session(sess)
+            nodes = self.last_token(self.model).predict([token_ids, segment_ids])
+        return nodes
+        # return self.last_token(self.model).predict([token_ids, segment_ids])
+
+    def predict_batch(self, inputs):
+        # inputs, output_ids, states, temperature, 'probas'
+        token_ids, segment_ids = inputs
+        # token_ids = np.concatenate([token_ids, output_ids], 1)
+        # segment_ids = np.concatenate([segment_ids, np.ones_like(output_ids)], 1)
+        with graph.as_default():
+            K.set_session(sess)
+            nodes = self.model.predict([token_ids, segment_ids])
+        return nodes
+        # return self.last_token(self.model).predict([token_ids, segment_ids])
+
+    def data_generator(self, token_ids, output_ids):
+        batch_token_ids, batch_segment_ids = [], []
+        if output_ids == []:
+            for token_id in token_ids:
+                segment_ids = np.zeros_like(token_id)
+                batch_token_ids.append(token_id)
+                batch_segment_ids.append(segment_ids)
+        else:
+            output_ids = np.array(output_ids)
+            for token_id,output_id in zip(token_ids, output_ids):
+                token_id = np.array(token_id)
+                tokens_id = np.concatenate([token_id, output_id])
+                segment_ids = np.concatenate([np.zeros_like(token_id), np.ones_like(output_id)])
+                batch_token_ids.append(tokens_id)
+                batch_segment_ids.append(segment_ids)
+        batch_token_ids = sequence_padding(batch_token_ids)
+        batch_segment_ids = sequence_padding(batch_segment_ids)
+        return batch_token_ids, batch_segment_ids

     def beam_search_batch_(self, inputs, topk, states=None, temperature=1, min_ends=1):
@@ -581,17 +603,17 @@ class AutoTitle(AutoRegressiveDecoder):
         # token_ids, segment_ids = self.data_generator(inputs_str, output_str)
         # else:
         #     inputs_str, output_str = [], []
-        text_batch, output_str_batch = [], []
+        text_ids_batch, output_ids_batch = [], []
         batch_input_num_beam_num = []
         for i in generated:
-            text = i.text
-            text_batch.extend(text)
-            if i.output_str != "":
-                output_str_batch.extend(i.output_str)
+            text_ids = i.text_ids
+            text_ids_batch.extend(text_ids)
+            if i.output_ids != []:
+                output_ids_batch.extend(i.output_ids)
             if step != 0:
                 batch_input_num_beam_num.append(i.num_beams)
-        token_ids, segment_ids = self.data_generator(text_batch, output_str_batch)
+        token_ids, segment_ids = self.data_generator(text_ids_batch, output_ids_batch)

         # token_ids_batch = sequence_padding(token_ids_batch)
         # segment_ids_batch = sequence_padding(segment_ids_batch)
@@ -615,7 +637,6 @@ class AutoTitle(AutoRegressiveDecoder):
         if step != 0:
             num = 0
-            print()
             if len(generated) > 1:
                 index = 0
                 for index in range(len(batch_input_num_beam_num)-1):
@ -656,10 +677,6 @@ class AutoTitle(AutoRegressiveDecoder):
return return_str_batch
return return_str_batch
def top_batch(
self,
inputs_str,
@@ -729,29 +746,6 @@ class AutoTitle(AutoRegressiveDecoder):
         return results

-    @AutoRegressiveDecoder.wraps(default_rtype='probas')
-    def predict(self, inputs, output_ids, states):
-        token_ids, segment_ids = inputs
-        token_ids = np.concatenate([token_ids, output_ids], 1)
-        segment_ids = np.concatenate([segment_ids, np.ones_like(output_ids)], 1)
-        with graph.as_default():
-            K.set_session(sess)
-            nodes = self.last_token(self.model).predict([token_ids, segment_ids])
-        return nodes
-        # return self.last_token(self.model).predict([token_ids, segment_ids])
-
-    def predict_batch(self, inputs):
-        # inputs, output_ids, states, temperature, 'probas'
-        token_ids, segment_ids = inputs
-        # token_ids = np.concatenate([token_ids, output_ids], 1)
-        # segment_ids = np.concatenate([segment_ids, np.ones_like(output_ids)], 1)
-        with graph.as_default():
-            K.set_session(sess)
-            nodes = self.model.predict([token_ids, segment_ids])
-        return nodes
-        # return self.last_token(self.model).predict([token_ids, segment_ids])
-
     def generate(self, text, topk=1):
         token_ids, segment_ids = self.tokenizer.encode(text, maxlen=256)
         output_ids = self.beam_search([token_ids, segment_ids],
@@ -916,7 +910,13 @@ class Beamdataone(object):
         self.tokenizer = tokenizer
         # self.data()
         self.output_str = ""
+        self.text_2_textids(
+            self.text
+        )
+
+    def text_2_textids(self,text):
+        token_ids, segment_ids = self.tokenizer.encode(text[0], maxlen=120)
+        self.text_ids = [token_ids]

     def add_data(self, step, output_probas):
         '''
@@ -934,6 +934,7 @@ class Beamdataone(object):
         # inputs = [np.repeat(i, self.num_beams, axis=0) for i in self.inputs]
         # inputs = [self.token_ids, self.segment_ids]
         # inputs = [np.array([i]) for i in inputs]
+        self.output_ids = np.array(self.output_ids)
         scores = output_probas
         scores = self.output_scores.reshape((-1, 1)) + scores  # combine cumulative scores
         indices = scores.argpartition(-self.num_beams, axis=None)[-self.num_beams:]  # keep only the top-k
@@ -960,8 +961,9 @@ class Beamdataone(object):
         self.output_scores = self.output_scores[flag]  # discard finished sequences
         self.end_counts = self.end_counts[flag]  # discard end counts of finished sequences
         self.num_beams = flag.sum()  # shrink top-k accordingly
+        self.output_ids = self.output_ids.tolist()
         self.output_str = [tokenizer.decode(ids) for ids in self.output_ids]
-        self.text = [self.text[0] for i in range(len(self.output_ids))]
+        self.text_ids = [self.text_ids[0] for i in range(len(self.output_ids))]

         # # output directly once the max length is reached
@@ -1132,18 +1134,18 @@ def paragraph_test(text, text_new):
 if __name__ == '__main__':
-    # text = ["历史和当下都证明,创新是民族生存、“发展的不竭源泉”,是是自身发展的必然选择",
-    #         "自身发展的必然选择",
-    #         "强调轻资产经营, 更加重视经营风险的规避",
-    #         "历史和当下都证明,创新是民族生存、发展的不竭源泉,是是自身发展的必然选择",
-    #         "是时代对于青年们的深切呼唤"]
-    text = ["基本消除“热桥”影响。"]
-    print(just_show_sentence(text))
+    text = ["历史和当下都证明,创新是民族生存、“发展的不竭源泉”,是是自身发展的必然选择",
+            "自身发展的必然选择",
+            "强调轻资产经营, 更加重视经营风险的规避",
+            "历史和当下都证明,创新是民族生存、发展的不竭源泉,是是自身发展的必然选择",
+            "是时代对于青年们的深切呼唤"]
+    # text = ["基本消除“热桥”影响。"]
+    # print(just_show_sentence(text))
     # print(just_show_sentence_top(text))
     # print(just_show_chachong_random(text))
     # print(tokenizer.encode("\"", maxlen=120))
-    # print(just_show_sentence_batch(text))
+    print(just_show_sentence_batch(text))

     # path = "./data/700条论文测试.xlsx"

predict_sim.py (10 lines changed)

@@ -83,7 +83,7 @@ class GenerateModel(object):
         self.epoch_acc_vel = 0
         self.config_path = r'./chinese_roberta_wwm_ext_L-12_H-768_A-12/bert_config.json'
         self.checkpoint_path = r'./chinese_roberta_wwm_ext_L-12_H-768_A-12/bert_model.ckpt'
-        self.dict_path = r'./chinese_roberta_wwm_ext_L-12_H-768_A-12/vocab.txt'
+        self.dict_path = r'./chinese_roberta_wwm_ext_L-12_H-768_A-12/vocab_drop.txt'
         self.maxlen = 120
         self.novel_maxlen = 60
@@ -122,7 +122,7 @@ class GenerateModel(object):
         outputs = TotalLoss([2, 3])(bert.model.inputs + bert.model.outputs)
         model = keras.models.Model(bert.model.inputs, outputs)
-        path_model = './output_simbert_yy/best_simbertmodel_datasim.weights'
+        path_model = './output_simbert_yy/best_simbertmodel_datasim_yinhao.weights'
         model.load_weights(path_model)

         return encoder,seq2seq, tokenizer
@@ -708,7 +708,9 @@ def just_show_csv_beam(file):
 if __name__ == '__main__':
     # text = ["强调轻资产经营, 更加重视经营风险的规避", "历史和当下都证明,创新是民族生存、发展的不竭源泉,是是自身发展的必然选择", "是时代对于青年们的深切呼唤"]
+    # text = ["强调轻资产“经营”, 更加重视“营风险”的规避", "历史和当下都证明,创新是民族生存、发展的不竭源泉,是是自身发展的必然选择", "是时代对于青年们的深切呼唤"]
     # print(just_show_sentence(text))
+    #
     # print(just_show_sentence_batch(text))
     # print(type(just_show_sentence_batch(text)))
@@ -726,7 +728,7 @@ if __name__ == '__main__':
             print(i[0])
             continue
     df = pd.DataFrame(df_list_new)
-    df.to_excel("./data/700条论文测试_18.xlsx", index=None)
+    df.to_excel("./data/700条论文测试_19.xlsx", index=None)

request_drop.py (59 lines changed)

@@ -0,0 +1,59 @@
# -*- coding: utf-8 -*-
"""
@Time : 2022/12/29 1:14
@Author :
@FileName:
@Software:
@Describe:
"""
# coding:utf-8
import requests
from time import time


def dialog_line_parse(url, text):
    """
    Send the data to the model for analysis and return the result.
    :param url: model URL
    :param text: payload sent to the model
    :return: the model's response
    """
    response = requests.post(
        url,
        json=text,
        timeout=1000
    )
    if response.status_code == 200:
        return response.json()
    else:
        # logger.error(
        #     "【{}】 Failed to get a proper response from remote "
        #     "server. Status Code: {}. Response: {}"
        #     "".format(url, response.status_code, response.text)
        # )
        print("【{}】 Failed to get a proper response from remote "
              "server. Status Code: {}. Response: {}"
              "".format(url, response.status_code, response.text))
        print(text)
        return []


ceshi_1 = [
    "李正旺你真是傻逼讪笑,挥手道:“不不不,你千万别误会。关于这件事,校长特别交代过了,我也非常认同。你这是见义勇为,是勇斗歹徒、义救同学的英雄,我们清江一中决不让英雄流血又流泪!”。",
    "李正旺你真是傻逼讪笑,挥手道:“不不不,你千万别误会。关于这件事,校长特别交代过了,我也非常认同。",
    "李正旺你真是傻逼讪笑,挥手道:“不不不,你千万别误会。关于这件事,校长特别交代过了,我也非常认同。"
    "" * 110
]

# Count the total number of characters in the test inputs.
jishu = 0
for i in ceshi_1:
    for j in i:
        jishu += 1
print(jishu)

t1 = time()
print(dialog_line_parse("http://114.116.25.228:14000/droprepeat/", {"texts": ceshi_1, "text_type": "focus"}))
t2 = time()
print(t2 - t1)