@@ -4,7 +4,7 @@ from numpy.linalg import norm
 import pandas as pd
 # from rouge import Rouge
 from rouge_chinese import Rouge
-from Rouge_w import Rouge_w,Rouge_l
+from Rouge_w import Rouge_w, Rouge_l
 import json
 import pymysql
 import re

@@ -15,6 +15,8 @@ import uuid
 import time
 import redis
 from threading import Thread
+
+from multiprocessing import Pool
 
 app = Flask(__name__)
 app.config["JSON_AS_ASCII"] = False

@@ -25,7 +27,7 @@ db_key_query = 'query'
 db_key_querying = 'querying'
 db_key_queryset = 'queryset'
 
-nums_cpus = 16
+nums_cpus = 24
 rouge = Rouge()
 rouge_model = Rouge_w()
 rouge_l_model = Rouge_l()

@@ -65,7 +67,6 @@ def bert_check(text, recall_data_list):
     return return_list
 
 
-
 def rouge_value_self(data_1, data_2):
     data_1 = [' '.join(i) for i in data_1]
     data_2 = [' '.join(i) for i in data_2]

@@ -81,7 +82,6 @@ def rouge_value_self(data_1, data_2):
 
 
-
 def rouge_pre(text, df_train_nuoche):
     return_list = []
     index_rouge_list = []
     text_list = [text] * len(df_train_nuoche)

@@ -100,12 +100,307 @@ def rouge_pre(text, df_train_nuoche):
     return return_list
 
 
+def rouge_pre_m(text, df_train_nuoche):
+    return_list = []
+    index_rouge_list = []
+    text_list = [text] * len(df_train_nuoche)
+
+    data_list = []
+    for data_dan in df_train_nuoche:
+        data_list.append(data_dan[0])
+    rouge_1, rouge_2, rouge_l = rouge_value_self(text_list, data_list)
+    index_rouge_list.extend(rouge_l)
+
+    # keep (candidate_index, rouge_l_score) pairs, best match first
+    re1 = [(i[0], i[1]) for i in sorted(list(enumerate(index_rouge_list)), key=lambda x: x[1], reverse=True)]
+    return_list.extend(re1)
+
+    return return_list
+
+
+# Using a single section as an example
+def similar_content_func():
+    '''
+    Template for one duplicated-source entry
+    :return:
+    '''
+    return [{
+        "content": "重复的内容标红",
+        "thesis_info": "论文标题 + 论文作者 + 来源 + 年份日期--敞开式奶牛舍环境控制系统的设计 李晓红,刘晓丽,余泳昌 - 商丘工学院机械工程学院 - 2015-04-01",
+        "title": "标题",
+        "year": "日期",
+        "degree": "来源",
+        "author": "作者"
+    }]
+
+
+def original_text_contrast_func(data_sentence_dan, paper_dict):
+    '''
+    Detailed comparison info for one duplicated passage
+    :param similar_content:
+    :return:
+    '''
+    original_text = ""
+    start = len(data_sentence_dan[0][1])
+    end = 0
+    similar_content = []
+    for i in data_sentence_dan:  # there may be several sources; handled one by one
+        similar_content_dan = {
+            "paper_red_len_word": "",
+            "content": "重复的内容标红",
+            "thesis_info": "论文标题 + 论文作者 + 来源 + 年份日期--敞开式奶牛舍环境控制系统的设计 李晓红,刘晓丽,余泳昌 - 商丘工学院机械工程学院 - 2015-04-01",
+            "title": "标题",
+            "year": "日期",
+            "degree": "来源",
+            "author": "作者",
+            "paper_len_word": ""
+        }
+
+        sentence_0_bool, sentence_0_dan_red = original_text_marked_red(
+            i[1], paper_dict[i[0]][0], paper_dict[i[0]][1])  # text_original, bert_text, bert_text_pre
+        sentence_1_bool, sentence_1_dan_red = original_text_marked_red(
+            i[2], paper_dict[i[0]][2], paper_dict[i[0]][3])  # text_original, bert_text, bert_text_pre
+
+        start_dan = sentence_0_dan_red.index("<red>")
+        end_dan = sentence_0_dan_red.index("</red>") - len("<red>")
+
+        if start_dan < start:
+            start = start_dan
+        if end_dan > end:
+            end = end_dan
+
+        if sentence_0_bool == False or sentence_1_bool == False:
+            continue
+
+        similar_content_dan["content"] = sentence_1_dan_red
+        similar_content_dan["title"] = i[3]["title"]
+        similar_content_dan["author"] = i[3]["author"]
+        similar_content_dan["degree"] = i[3]["degree"]
+        similar_content_dan["year"] = i[3]["year"]
+        similar_content_dan["paper_len_word"] = i[3]["paper_len_word"]
+        similar_content_dan["paper_red_len_word"] = len(paper_dict[i[0]][3])
+
+        thesis_info = " ".join(
+            [similar_content_dan["title"], similar_content_dan["author"], similar_content_dan["degree"],
+             similar_content_dan["year"]])
+        similar_content_dan["thesis_info"] = thesis_info
+
+        similar_content.append(similar_content_dan)
+
+    original_text_list = list(data_sentence_dan[0][1])
+    original_text_list.insert(end, "</red>")
+    original_text_list.insert(start, "<red>")
+    original_text = "".join(original_text_list)
+
+    return_info = {
+        "original_text": original_text,
+        "dan_sentence_word_nums": end - start,
+        "similar_content": similar_content
+    }
+    return return_info
+
+
+def repeat_quote_info_func(original_text_contrast):
+    '''
+    Aggregated info about the duplicated (quoted) source papers
+    :return:
+    '''
+    chongfuwendang = {}
+
+    for sentence_dan in original_text_contrast:
+        for i in sentence_dan["similar_content"]:
+            thesis_info = i["thesis_info"]
+            if thesis_info not in chongfuwendang:
+                chongfuwendang[thesis_info] = {
+                    "quote": False,
+                    "thesis_author": i["author"],
+                    "thesis_date": i["year"],
+                    "thesis_info": thesis_info,
+                    "thesis_repeat_rate": (i["paper_red_len_word"] / i["paper_len_word"]) * 100,
+                    # round(repetition_rate, 3) * 100
+                    "thesis_title": i["title"],
+                    "thesis_link": "",
+                    "thesis_publish": i["degree"],
+                    "thesis_repeat_word": i["paper_red_len_word"],
+                    "thesis_teacher": "",
+                    "paper_len_word": i["paper_len_word"]
+                }
+            else:
+                chongfuwendang[thesis_info]["thesis_repeat_word"] += i["paper_red_len_word"]
+                chongfuwendang[thesis_info]["thesis_repeat_rate"] = (chongfuwendang[thesis_info]["thesis_repeat_word"] /
+                                                                     chongfuwendang[thesis_info]["paper_len_word"]) * 100
+
+    chongfuwendang = sorted(chongfuwendang.items(),
+                            key=lambda x: x[1]["thesis_repeat_rate"], reverse=False)
+    chongfuwendang_list = [i[1] for i in chongfuwendang]
+
+    return chongfuwendang_list
+
+
+def total_data_func(section_data_list):
+    '''
+    Overall statistics across all sections
+    :return:
+    '''
+    # expected keys of each element:
+    # "end_page_index": 0,
+    # "name": "第1部分",
+    # "repeat_rate": repeat_rate,
+    # "repeat_words": repeat_words,
+    # "start_page_index": 0,
+    # "words": section_words,
+    # "original_text": original_text,
+    # "original_text_oneself": original_text,
+    # "original_text_contrast" (detailed duplicate comparison): original_text_contrast,
+    # "repeat_quote_info" (duplicated-source info): repeat_quote_info
+
+    repeat_words = 0
+    words = 0
+
+    for i in section_data_list:
+        repeat_words += i["repeat_words"]
+        words += i["words"]
+
+    exclude_personal_rate = str(repeat_words / words * 100) + "%"
+    exclude_quote_rate = str(repeat_words / words * 100) + "%"
+    single_max_rate = section_data_list[0]["repeat_quote_info"][0]["thesis_repeat_rate"]
+    single_max_repeat_words = section_data_list[0]["repeat_quote_info"][0]["thesis_repeat_word"]
+    total_repeat_rate = str(repeat_words / words * 100) + "%"
+    total_repeat_words = repeat_words
+    total_words = words
+
+    return {
+        "back_repeat_words": "",
+        "exclude_personal_rate": exclude_personal_rate,
+        "exclude_quote_rate": exclude_quote_rate,
+        "front_repeat_words": "",
+        "single_max_rate": single_max_rate,
+        "single_max_repeat_words": single_max_repeat_words,
+        "suspected_paragraph": "",
+        "suspected_paragraph_max_repeat_words": "",
+        "suspected_paragraph_min_repeat_words": "",
+        "total_paragraph": "",
+        "total_repeat_rate": total_repeat_rate,
+        "total_repeat_words": total_repeat_words,
+        "total_words": total_words,
+        "tables": 0
+    }
+
+
+def section_data_func_dan():
+    '''
+    Single-section info (empty template)
+    :return:
+    '''
+    # {
+    #     "section_name": section name,
+    #     "section_repeat_rate": repeat rate,
+    #     "section_repeat_words": repeated word count,
+    #     "section_words": section word count,
+    #     "oneself_repeat_words": repeated words excluding the author's own papers,
+    #     "reference_repeat_words": repeated words excluding quotations,
+    #     "section_oneself_rate": repeat rate excluding the author's own papers
+    # }
+    return {
+        "section_name": "",
+        "section_repeat_rate": "",
+        "section_repeat_words": "",
+        "section_words": "",
+        "oneself_repeat_words": "",
+        "reference_repeat_words": "",
+        "section_oneself_rate": ""
+    }
+
+
+def section_data_func(section_details):
+    '''
+    Section info derived from the section details
+    :return:
+    '''
+    # input keys:
+    # "end_page_index": 0,
+    # "name": "第1部分",
+    # "repeat_rate": repeat_rate,
+    # "repeat_words": repeat_words,
+    # "start_page_index": 0,
+    # "words": section_words,
+    # "original_text": original_text,
+    # "original_text_oneself": original_text,
+    # "original_text_contrast" (detailed duplicate comparison): original_text_contrast,
+    # "repeat_quote_info" (duplicated-source info): repeat_quote_info
+
+    section_name = section_details["name"]
+    section_repeat_rate = section_details["repeat_rate"]
+    section_repeat_words = section_details["repeat_words"]
+    section_words = section_details["words"]
+    oneself_repeat_words = section_details["repeat_words"]
+    reference_repeat_words = section_details["repeat_words"]
+    section_oneself_rate = section_details["repeat_rate"]
+
+    return {
+        "section_name": section_name,
+        "section_repeat_rate": section_repeat_rate,
+        "section_repeat_words": section_repeat_words,
+        "section_words": section_words,
+        "oneself_repeat_words": oneself_repeat_words,
+        "reference_repeat_words": reference_repeat_words,
+        "section_oneself_rate": section_oneself_rate
+    }
+
+
+def section_details_func(data_section_dan, paper_dict):
+    '''
+    Detailed info for one section
+    :param original_text_contrast:
+    :param repeat_quote_info:
+    :return:
+    '''
+    original_text_contrast = []
+    section_repeat_rate = ""
+    repeat_words = 0
+    section_words = 0
+    oneself_repeat_words = ""
+    reference_repeat_words = ""
+    section_oneself_rate = ""
+    original_text_list = []
+
+    for sentence_dan in data_section_dan:
+        original_text_contrast_dan = original_text_contrast_func(sentence_dan, paper_dict)
+        original_text_contrast.append(original_text_contrast_dan)
+        repeat_words += original_text_contrast_dan["dan_sentence_word_nums"]
+        original_text_list.append(original_text_contrast_dan["original_text"])
+        section_words += len(sentence_dan[0][1])
+
+    original_text = "。".join(original_text_list)
+    repeat_rate = repeat_words / section_words
+
+    repeat_quote_info = repeat_quote_info_func(original_text_contrast)
+
+    return {
+        "end_page_index": 0,
+        "name": "第1部分",
+        "repeat_rate": repeat_rate,
+        "repeat_words": repeat_words,
+        "start_page_index": 0,
+        "words": section_words,
+        "original_text": original_text,
+        "original_text_oneself": original_text,
+        "original_text_contrast": original_text_contrast,
+        "repeat_quote_info": repeat_quote_info
+    }
+
+
 def accurate_check_rouge(
         title,
         author,
         text_paper,
         recall_data_list
 ):
     '''
     Exact duplicate check: find similar sentences
     :param text:

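Note on the new helpers: rouge_pre_m returns every candidate as a (candidate_index, rouge_l_score) pair, sorted best first, instead of the flat scores rouge_pre produced, and the threshold loop later in this file indexes into those pairs. A minimal sketch of the character-level ROUGE comparison that rouge_value_self is built on (rouge_chinese expects space-separated tokens; the strings here are made up):

    from rouge_chinese import Rouge

    rouge = Rouge()
    hyp = ' '.join("大型商业建筑人员疏散设计")      # split into single characters
    ref = ' '.join("大型商业建筑的人员疏散设计研究")
    scores = rouge.get_scores([hyp], [ref])        # one score dict per pair
    rouge_l = [s['rouge-l']['f'] for s in scores]  # F-measure, as used for ranking
    print(rouge_l)
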
@@ -120,9 +415,23 @@ def accurate_check_rouge(
     sentence_word_nums = 0
 
     # ROUGE-based duplicate check
-    for text in centent_list:
-        rouge_pre_list = rouge_pre(text, recall_data_list)
-        data_zong.append(rouge_pre_list)
+    rst = []
+    p = Pool(nums_cpus)  # process pool with nums_cpus worker processes
+    print("centent_list", centent_list)
+
+    for i in range(len(centent_list)):
+        text = centent_list[i]
+        a = p.apply_async(rouge_pre_m, args=(text, recall_data_list,))
+        rst.append(a)
+    p.close()
+    p.join()  # wait for all workers to finish; close() must precede join(), and no new tasks can be submitted after close()
+
+    rst = [i.get() for i in rst]
+
+    for i in range(len(rst)):
+        print(rst[i])
+        data_zong.append(rst[i])
 
     t0 = time.time()
     # BERT-based duplicate check

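The serial rouge_pre loop is replaced by a fan-out over a process pool. A self-contained sketch of the same apply_async pattern, under the assumption that the worker is a module-level function (a pickling requirement); worker here is a toy stand-in, not the real rouge_pre_m:

    from multiprocessing import Pool

    def worker(text, candidates):
        # toy scorer standing in for rouge_pre_m
        return [(i, len(set(text) & set(c))) for i, c in enumerate(candidates)]

    if __name__ == "__main__":
        texts = ["第一句", "第二句"]
        candidates = ["第一句完全相同", "毫不相关"]
        p = Pool(4)
        handles = [p.apply_async(worker, args=(t, candidates)) for t in texts]
        p.close()   # no further tasks may be submitted
        p.join()    # blocks until all workers finish
        results = [h.get() for h in handles]  # get() re-raises worker exceptions
        print(results)
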
@@ -132,7 +441,6 @@ def accurate_check_rouge(
     t1 = time.time()
     original_dict = []
 
-
     # find the indices of similar sentences
     bool_check_sentense = []
     # BERT method

@@ -142,12 +450,14 @@ def accurate_check_rouge(
 
     # ROUGE method
     for i in range(len(data_zong)):
-        if data_zong[i][0] > 0.47:
-            bool_check_sentense.append([i,data_zong[i][1]])
-    biao_red = biaohong(bool_check_sentense, data_zong, recall_data_list)  # [[[0, 1, 2], [479, 480, 481]], [[3, 4, 5], [481, 482, 483]], [[6, 7, 8], [484, 485, 486]]]
+        for j in range(len(data_zong[i])):
+            if data_zong[i][j][1] > 0.47:
+                bool_check_sentense.append([i, data_zong[i][j][0]])
 
-    print("bert精确查重时间", t1-t0)
+    biao_red = biaohong(bool_check_sentense, data_zong,
+                        recall_data_list)  # [[[0, 1, 2], [479, 480, 481]], [[3, 4, 5], [481, 482, 483]], [[6, 7, 8], [484, 485, 486]]]
+
+    print("bert精确查重时间", t1 - t0)
+    print(biao_red)
 
     sentence_0_list = []
     sentence_1_list = []

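Because rouge_pre_m returns a sorted list of (candidate_index, score) pairs, each entry of data_zong is now a list of pairs rather than a single score, which is why the threshold check gained the inner j loop. A shape sketch with invented numbers:

    data_zong = [
        [(17, 0.83), (4, 0.31)],  # sentence 0: candidate 17 scores 0.83 on ROUGE-L
        [(4, 0.52), (17, 0.12)],  # sentence 1
    ]
    bool_check_sentense = []
    for i in range(len(data_zong)):
        for j in range(len(data_zong[i])):
            if data_zong[i][j][1] > 0.47:  # same threshold as in the hunk above
                bool_check_sentense.append([i, data_zong[i][j][0]])
    print(bool_check_sentense)  # [[0, 17], [1, 4]]
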
@@ -156,7 +466,8 @@ def accurate_check_rouge(
     for i in biao_red:
         if recall_data_list[i[1][0]][1] == recall_data_list[i[1][1]][1] == recall_data_list[i[1][2]][1]:
             sentence_0_list.append("。".join([centent_list[i[0][0]], centent_list[i[0][1]], centent_list[i[0][2]]]))
-            sentence_1_list.append("".join([recall_data_list[i[1][0]][0], recall_data_list[i[1][1]][0], recall_data_list[i[1][2]][0]]))
+            sentence_1_list.append(
+                "".join([recall_data_list[i[1][0]][0], recall_data_list[i[1][1]][0], recall_data_list[i[1][2]][0]]))
             sim_paper_name.append(recall_data_list[i[1][0]][1])
         else:
             continue

@@ -164,7 +475,6 @@ def accurate_check_rouge(
     sentence_0_list_new = []
     sentence_1_list_new = []
 
-
     for i in zip(sentence_0_list, sentence_1_list):
         if len(i[0]) + len(i[1]) < 1200:
             sentence_0_list_new.append(i[0])

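The len(i[0]) + len(i[1]) < 1200 guard caps the combined size of each sentence pair before it is sent to the BERT service, presumably to stay inside the model's input window (a standard 512-token BERT truncates longer inputs). A quick check with made-up pairs:

    sentence_0_list = ["短句。", "长" * 900]
    sentence_1_list = ["另一个短句。", "句" * 400]
    kept = [(a, b) for a, b in zip(sentence_0_list, sentence_1_list)
            if len(a) + len(b) < 1200]
    print(len(kept))  # 1, the 1300-character pair is dropped
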
@@ -183,141 +493,59 @@ def accurate_check_rouge(
     chongfuwendang = {}
 
-    for paper_dict_dan_id, sentence_0_dan, sentence_1_dan, sim_paper_name_dan in zip(range(len(paper_dict)), sentence_0_list_new, sentence_1_list_new, sim_paper_name):
-        print([sentence_0_dan, sentence_1_dan])
-        original_text_contrast_dict = {
-            "original_text": "",
-            "similar_content": [
-                {
-                    "content": "",
-                    "thesis_info": "",
-                    "title": "",
-                    "year": "",
-                    "degree": "",
-                    "author": "",
-                }
-            ]
-        }
-        try:
-            sentence_0_bool, sentence_0_dan_red = original_text_marked_red(sentence_0_dan, paper_dict[paper_dict_dan_id][0], paper_dict[paper_dict_dan_id][1])  # text_original, bert_text, bert_text_pre
-        except:
-            print("报错", [sentence_0_dan, paper_dict[paper_dict_dan_id][0], paper_dict[paper_dict_dan_id][1]])
-            continue
-        # 9/0
-        sentence_1_bool, sentence_1_dan_red = original_text_marked_red(sentence_1_dan, paper_dict[paper_dict_dan_id][2], paper_dict[paper_dict_dan_id][3])  # text_original, bert_text, bert_text_pre
-
-        if sentence_0_bool == False or sentence_1_bool == False:
-            continue
-
-        dan_sentence_word_nums = len(paper_dict[paper_dict_dan_id][1])
-        sentence_word_nums += dan_sentence_word_nums
-
-        original_text.append(sentence_0_dan_red)
-        original_text_contrast_dict["original_text"] = "此处有 {} 字相似\n".format(dan_sentence_word_nums) + sentence_0_dan_red
-
-        thesis_info = " ".join([sim_paper_name_dan["title"], sim_paper_name_dan["author"], sim_paper_name_dan["degree"], sim_paper_name_dan["year"]])
-        original_text_contrast_dict["similar_content"][0]["content"] = sentence_1_dan_red
-        original_text_contrast_dict["similar_content"][0]["title"] = sim_paper_name_dan["title"]
-        original_text_contrast_dict["similar_content"][0]["author"] = sim_paper_name_dan["author"]
-        original_text_contrast_dict["similar_content"][0]["degree"] = sim_paper_name_dan["degree"]
-        original_text_contrast_dict["similar_content"][0]["year"] = sim_paper_name_dan["year"]
-        original_text_contrast_dict["similar_content"][0]["thesis_info"] = thesis_info
-
-        original_text_contrast.append(original_text_contrast_dict)
-
-        # for i in repeat_quote_info:
-        #     if
-
-        if thesis_info not in chongfuwendang:
-            chongfuwendang[thesis_info] = {
-                "quote": False,
-                "thesis_author": sim_paper_name_dan["author"],
-                "thesis_date" : sim_paper_name_dan["year"],
-                "thesis_info" : thesis_info,
-                "thesis_repeat_rate": (dan_sentence_word_nums/sim_paper_name_dan["paper_len_word"]) * 100, #round(repetition_rate, 3) * 100
-                "thesis_title": sim_paper_name_dan["title"],
-                "thesis_link": "",
-                "thesis_publish": sim_paper_name_dan["degree"],
-                "thesis_repeat_word": dan_sentence_word_nums,
-                "thesis_teacher": "",
-                "paper_len_word": sim_paper_name_dan["paper_len_word"]
-            }
-        else:
-            chongfuwendang[thesis_info]["thesis_repeat_word"] += dan_sentence_word_nums
-            chongfuwendang[thesis_info]["thesis_repeat_rate"] = (chongfuwendang[thesis_info]["thesis_repeat_word"]/chongfuwendang[thesis_info]["paper_len_word"]) * 100
-
-    chongfuwendang = sorted(chongfuwendang.items(), key=lambda x: x[1]["thesis_repeat_rate"], reverse=False)
-
-    for i in range(len(chongfuwendang)):
-        repeat_paper_one_info_dict = chongfuwendang[i][1]
-        repeat_paper_one_info_dict.pop("paper_len_word")
-        repeat_paper_one_info_dict["thesis_repeat_rate"] = str(round(repeat_paper_one_info_dict["thesis_repeat_rate"], 1)) + "%"
-        repeat_quote_info.append(repeat_paper_one_info_dict)
-
-    original_text = "。".join(original_text)
-
-    repetition_rate = sentence_word_nums/len(text_paper)
-    repetition_rate = round(repetition_rate, 3) * 100
+    print("paper_dict", paper_dict)
+    print("sentence_0_list_new", sentence_0_list_new)
+    print("sentence_1_list_new", sentence_1_list_new)
+    print("sim_paper_name", sim_paper_name)
+    similar_content_control = [[]]
+
+    with open("data/10235513_大型商业建筑人员疏散设计研究_沈福禹/paper_dict.json", "w") as f:
+        json.dump(paper_dict, f, ensure_ascii=False)
+
+    sentence_0_list_new_cursor = sentence_0_list_new[0]
+    for paper_dict_dan_id, sentence_0_dan, sentence_1_dan, sim_paper_name_dan in zip(range(len(paper_dict)),
+                                                                                     sentence_0_list_new,
+                                                                                     sentence_1_list_new,
+                                                                                     sim_paper_name):
+        if sentence_0_list_new_cursor != sentence_0_dan:
+            similar_content_control.append([[paper_dict_dan_id, sentence_0_dan, sentence_1_dan, sim_paper_name_dan]])
+            sentence_0_list_new_cursor = sentence_0_dan  # advance the cursor so further sources of the same sentence join its group
+        else:
+            similar_content_control[-1].append([paper_dict_dan_id, sentence_0_dan, sentence_1_dan, sim_paper_name_dan])
+
+    data = [similar_content_control]
+
+    # simulate multiple sections
+    section_details_list = []
+    for data_dan in data:
+        data_section_dan = data_dan
+
+        # per-section detailed info
+        section_details = section_details_func(data_section_dan, paper_dict)
+        section_details_list.append(section_details)
+
+    # simulate multiple sections
+    section_data_list = []
+    for section_details in section_details_list:
+        section_data = section_data_func(section_details)
+        section_data_list.append(section_data)  # collect per-section stats for "section_data"
+
+    total_data = total_data_func(section_details_list)
 
     format = '%Y-%m-%d %H:%M:%S'
     value = time.localtime(int(time.time()))
     dt = time.strftime(format, value)
 
-    return {
+    paper_data = {
         "author": author,
         "check_time": dt,
-        "title": title,
         "time_range": "1900-01-01至2023-08-08",
-        "section_data": [
-            {
-                "oneself_repeat_words": sentence_word_nums,
-                "reference_repeat_words": sentence_word_nums,
-                "section_name": "第1部分",
-                "section_oneself_rate": "{}%".format(repetition_rate),
-                "section_repeat_rate": "{}%".format(repetition_rate),
-                "section_repeat_words": sentence_word_nums,
-                "section_words": len(text_paper)
-            }
-        ],
-        "section_details": [
-            {
-                "end_page_index": 0,
-                "name": "",
-                "repeat_rate": "",
-                "repeat_words": "",
-                "words": "",
-                "original_text": original_text,
-                "original_text_oneself": original_text,
-                "original_text_contrast": original_text_contrast,
-                "repeat_quote_info": repeat_quote_info
-            }
-        ],
-        "total_data": {
-            "back_repeat_words": "",
-            "exclude_personal_rate": "{}%".format(repetition_rate),
-            "exclude_quote_rate": "{}%".format(repetition_rate),
-            "foot_end_note": "0",
-            "front_repeat_words": "",
-            "single_max_rate": "",
-            "single_max_repeat_words": "",
-            "suspected_paragraph": "1",
-            "suspected_paragraph_max_repeat_words": "",
-            "suspected_paragraph_min_repeat_words": "",
-            "tables": "0",
-            "total_paragraph": "1",
-            "total_repeat_rate": "{}%".format(repetition_rate),
-            "total_repeat_words": sentence_word_nums,
-            "total_words": len(text_paper)
-        }
+        "title": title,
+        "total_data": total_data,
+        "section_data": section_data_list,
+        "section_details": section_details_list
     }
+
+    return paper_data
 
 
 def biaohong(bool_check_sentense, data_zong, df_train_nuoche):

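similar_content_control groups consecutive zip entries that share the same original sentence, so one original sentence can accumulate several similar sources; the grouping only works if the cursor advances whenever the sentence changes (noted with a comment in the hunk above). A runnable sketch of the intended grouping with toy tuples:

    items = [(0, "甲句", "来源A", {}), (1, "甲句", "来源B", {}), (2, "乙句", "来源C", {})]
    groups = [[]]
    cursor = items[0][1]
    for item in items:
        if item[1] != cursor:        # a new original sentence starts a new group
            groups.append([item])
            cursor = item[1]
        else:
            groups[-1].append(item)  # same sentence, additional source
    print([len(g) for g in groups])  # [2, 1]
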
@@ -331,15 +559,16 @@ def biaohong(bool_check_sentense, data_zong, df_train_nuoche):
     start = -1
     end = -1
     while True:
-        if i >= len(bool_check_sentense) or bool_check_sentense[i][0] +1 >= len(data_zong) or bool_check_sentense[i][1]+1 >= len(df_train_nuoche):
+        if i >= len(bool_check_sentense) or bool_check_sentense[i][0] + 1 >= len(data_zong) or bool_check_sentense[i][1] \
+                + 1 >= len(df_train_nuoche):
             break
-        elif bool_check_sentense[i][0]-1 == start:
+        elif bool_check_sentense[i][0] - 1 == start:
             i += 1
             continue
         elif bool_check_sentense[i][0] == end:
             i += 1
             continue
-        elif bool_check_sentense[i][0]-1 == end:
+        elif bool_check_sentense[i][0] - 1 == end:
             i += 1
             continue
         else:

@@ -347,10 +576,10 @@ def biaohong(bool_check_sentense, data_zong, df_train_nuoche):
             biao_red_dan.append([bool_check_sentense[i][0] - 1, bool_check_sentense[i][1] - 1])
             biao_red_dan.append([bool_check_sentense[i][0], bool_check_sentense[i][1]])
             biao_red_dan.append([bool_check_sentense[i][0] + 1, bool_check_sentense[i][1] + 1])
-            biao_red.append([[bool_check_sentense[i][0]-1, bool_check_sentense[i][0], bool_check_sentense[i][0]+1],
-                             [bool_check_sentense[i][1]-1, bool_check_sentense[i][1], bool_check_sentense[i][1]+1]])
-            start = bool_check_sentense[i][0]-1
-            end = bool_check_sentense[i][0]+1
+            biao_red.append([[bool_check_sentense[i][0] - 1, bool_check_sentense[i][0], bool_check_sentense[i][0] + 1],
+                             [bool_check_sentense[i][1] - 1, bool_check_sentense[i][1], bool_check_sentense[i][1] + 1]])
+            start = bool_check_sentense[i][0] - 1
+            end = bool_check_sentense[i][0] + 1
             i += 1
 
     return biao_red

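biaohong widens every flagged sentence into a window of three consecutive sentence indices on both the paper side and the candidate side (the -1/+1 bookkeeping), while start/end suppress windows overlapping one already emitted. The inline comment in an earlier hunk shows the resulting shape:

    biao_red = [
        [[0, 1, 2], [479, 480, 481]],  # paper sentences 0-2 vs. candidate sentences 479-481
        [[3, 4, 5], [481, 482, 483]],
        [[6, 7, 8], [484, 485, 486]],
    ]
    # downstream, each index triple is joined into one text block and the two
    # blocks of a pair are compared against each other
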
@@ -378,8 +607,8 @@ def dialog_line_parse(url, text):
         # "".format(url, response.status_code, response.text)
         # )
         print("【{}】 Failed to get a proper response from remote "
               "server. Status Code: {}. Response: {}"
               "".format(url, response.status_code, response.text))
         print(text)
         return {}

@@ -410,7 +639,7 @@ def original_text_marked_red(text_original, bert_text, bert_text_pre):
     while True:
         if down_pointer >= len(bert_text_pre):
             break
-        elif down_pointer == len(bert_text_pre)-1:
+        elif down_pointer == len(bert_text_pre) - 1:
             if bert_text[up_pointer] == bert_text_pre[down_pointer]:
                 pointer_list.append(up_pointer)
                 break

@@ -428,7 +657,7 @@ def original_text_marked_red(text_original, bert_text, bert_text_pre):
             up_pointer += 1
             down_pointer += 1
         else:
-            if bert_text_pre[down_pointer:down_pointer+5] == "[UNK]":
+            if bert_text_pre[down_pointer:down_pointer + 5] == "[UNK]":
                 up_pointer += 1
                 down_pointer += 5
                 pointer_list.append(up_pointer)

@@ -441,12 +670,11 @@ def original_text_marked_red(text_original, bert_text, bert_text_pre):
             down_pointer = 0
             pointer_list = []
 
-
     start = pointer_list[0]
     end = pointer_list[-1]
     bert_text_list = list(bert_text)
     bert_text_list.insert(start, "<red>")
-    bert_text_list.insert(end + 2 , "</red>")
+    bert_text_list.insert(end + 2, "</red>")
 
     text_original_list = list(text_original)

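original_text_marked_red walks bert_text_pre (the red span, which may contain [UNK] placeholders) along bert_text to recover the character range, then wraps it in <red> tags. A minimal sketch of the tag-insertion step with an invented span:

    bert_text = "大型商业建筑的人员疏散设计"
    start, end = 2, 7                         # first and last matched character indices
    bert_text_list = list(bert_text)
    bert_text_list.insert(start, "<red>")
    bert_text_list.insert(end + 2, "</red>")  # +2 skips the tag inserted just above
    print("".join(bert_text_list))
    # 大型<red>商业建筑的人</red>员疏散设计
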
|
|
|
|
|
|
@ -482,30 +710,14 @@ def biaohong_bert_predict(sentence_0_list, sentence_1_list): |
|
|
:return: |
|
|
:return: |
|
|
''' |
|
|
''' |
|
|
|
|
|
|
|
|
# sentence_0_list = [] |
|
|
paper_dict = \ |
|
|
# sentence_1_list = [] |
|
|
dialog_line_parse("http://192.168.31.74:16003/", |
|
|
# sim_paper_name = [] |
|
|
{"sentence_0": sentence_0_list, "sentence_1": sentence_1_list})[ |
|
|
# |
|
|
"resilt"] |
|
|
# for i in biaohong_list: |
|
|
|
|
|
# sentence_0_list.append("。".join([paper_list[i[0][0]], paper_list[i[0][1]], paper_list[i[0][2]]])) |
|
|
|
|
|
# sentence_1_list.append("。".join([recall_data_list[i[1][1]], recall_data_list[i[1][1]], recall_data_list[i[1][2]]])) |
|
|
|
|
|
|
|
|
|
|
|
paper_dict = dialog_line_parse("http://192.168.31.74:16003/", {"sentence_0": sentence_0_list, "sentence_1": sentence_1_list})["resilt"] |
|
|
|
|
|
|
|
|
|
|
|
# paper_dict |
|
|
|
|
|
# print("原文:".format(i), paper_dict[i][0]) |
|
|
|
|
|
# print("原文标红:".format(i), paper_dict[i][1]) |
|
|
|
|
|
# print("相似:".format(i), paper_dict[i][2]) |
|
|
|
|
|
# print("相似标红:".format(i), paper_dict[i][3]) |
|
|
|
|
|
|
|
|
|
|
|
# original_text |
|
|
|
|
|
# |
|
|
|
|
|
# |
|
|
|
|
|
# for paper_dict_dan, sentence_0_dan, sentence_1_dan in zip(paper_dict, sentence_0_list, sentence_1_list): |
|
|
|
|
|
# original_text_marked_red |
|
|
|
|
|
|
|
|
|
|
|
return paper_dict |
|
|
return paper_dict |
|
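biaohong_bert_predict delegates the pairwise comparison to the service at 192.168.31.74:16003; note that "resilt" is the key the service actually returns, so the misspelling must stay as-is. A hedged sketch of what dialog_line_parse is assumed to boil down to (the real helper also logs failures, as an earlier hunk shows):

    import requests

    def post_json(url, payload):
        response = requests.post(url, json=payload)  # assumption: the service accepts JSON
        if response.status_code == 200:
            return response.json()
        return {}

    # paper_dict = post_json("http://192.168.31.74:16003/",
    #                        {"sentence_0": [...], "sentence_1": [...]})["resilt"]
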
 def ulit_text(title, text):
     data = []
     try:
@@ -520,6 +732,7 @@ def ulit_text(title, text):
         data.append([i, title])
     return data
 
+
 def run_query(conn, sql, params):
     with conn.cursor() as cursor:
         cursor.execute(sql, params)

@@ -587,9 +800,8 @@ def ulit_recall_paper(recall_data_list_dict):
     # data.append([sentence, filename])
     # return data
 
-
     data = []
-    for i in list(recall_data_list_dict.items())[:5]:
+    for i in list(recall_data_list_dict.items())[:10]:
         data_one = processing_one_text(i[0])
         data.extend(data_one)

@@ -652,18 +864,18 @@ def uilt_content(content):
             key_word_bool = True
             break
 
-    if zhaiyao_bool== True and zhaiyao_en_bool == True:
-        pantten_zhaiyao = "{}(.*?){}".format(zhaiyao_str,zhaiyao_en_str)
+    if zhaiyao_bool == True and zhaiyao_en_bool == True:
+        pantten_zhaiyao = "{}(.*?){}".format(zhaiyao_str, zhaiyao_en_str)
         result_biaoti_list = re.findall(pantten_zhaiyao, content)
         zhaiyao_text = result_biaoti_list[0]
 
     elif zhaiyao_bool == True and key_word_bool == True:
-        pantten_zhaiyao = "{}(.*?){}".format(zhaiyao_str,key_word_str )
+        pantten_zhaiyao = "{}(.*?){}".format(zhaiyao_str, key_word_str)
         result_biaoti_list = re.findall(pantten_zhaiyao, content)
         zhaiyao_text = result_biaoti_list[0]
 
     elif zhaiyao_bool == True and mulu_bool == True:
-        pantten_zhaiyao = "{}(.*?){}".format(zhaiyao_str,mulu_str)
+        pantten_zhaiyao = "{}(.*?){}".format(zhaiyao_str, mulu_str)
         result_biaoti_list = re.findall(pantten_zhaiyao, content)
         zhaiyao_text = result_biaoti_list[0]

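A caveat on the abstract extraction above: the "{}(.*?){}" patterns are used with re.findall and no flags, and "." does not match newlines by default, so an abstract that spans lines is only found if the content was flattened first. Illustration with a hypothetical two-line document:

    import re

    content = "摘要 第一行\n第二行 Abstract"
    print(re.findall("摘要(.*?)Abstract", content))        # []
    print(re.findall("摘要(.*?)Abstract", content, re.S))  # [' 第一行\n第二行 ']
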
@@ -688,7 +900,6 @@ def ulit_request_file(file):
     return abst_zh, content
 
 
-
 # @app.route("/", methods=["POST"])
 # def handle_query():
 #     print(request.remote_addr)

@@ -761,7 +972,6 @@ def classify():  # invoke the model, set the maximum batch_size
     goodsId = data_dict['goodsId']
     callbackUrl = data_dict['callbackUrl']
 
-
     # call Yupeng's recall service for the ten most similar papers
     # recall_data_list_dict = recall_10(title, abst_zh, content)

@@ -805,10 +1015,10 @@ def handle_query():
     # request.form.get('prompt')
     dataBases = request.form.get("dataBases")
     minSimilarity = request.form.get("minSimilarity")  # txt
     minWords = request.form.get("minWords")
     title = request.form.get("title")
     author = request.form.get("author")  # txt
     file = request.files.get('file')
     token = request.form.get("token")
     account = request.form.get("account")

@@ -833,7 +1043,7 @@ def handle_query():
         'account': account,
         'goodsId': goodsId,
         'callbackUrl': callbackUrl
     }
 
     # bind the text to its query id
     print(d)

@@ -860,8 +1070,9 @@ def handle_query():
     return_text = {'code': 1}
     return jsonify(return_text)  # return the result
 
 
+
 t = Thread(target=classify)
 t.start()
 
 if __name__ == "__main__":
-    app.run(host="0.0.0.0", port=16001, threaded=True, debug=True, use_reloader=False)
+    app.run(host="0.0.0.0", port=16001, threaded=True, debug=True)
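
Removing use_reloader=False while debug=True deserves a second look: Werkzeug's debug reloader runs the module twice (a monitor process plus the serving child), so the module-level classify thread would be started in both and both copies would drain the Redis queue. Keeping use_reloader=False is the simplest option; if the reloader is wanted, a common guard (a sketch, assuming the reloader is active) is:

    import os

    if os.environ.get("WERKZEUG_RUN_MAIN") == "true":
        # only the reloader's serving child starts the background worker
        t = Thread(target=classify)
        t.start()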
|