From 7493ebf9afcd3388487ff6a2bfa1578e18960077 Mon Sep 17 00:00:00 2001
From: "majiahui@haimaqingfan.com"
Date: Fri, 15 Sep 2023 18:09:18 +0800
Subject: [PATCH] Complete version v1.0
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

---
 flask_check_bert_test.py | 156 ++++++++++++++++++++++++++++++++++------------
 1 file changed, 114 insertions(+), 42 deletions(-)

diff --git a/flask_check_bert_test.py b/flask_check_bert_test.py
index 9c9bd8e..c12e800 100644
--- a/flask_check_bert_test.py
+++ b/flask_check_bert_test.py
@@ -174,8 +174,8 @@ def original_text_contrast_func(data_sentence_dan, paper_dict):
                                                                        paper_dict[i[0]][4][1])  # text_original, bert_text, bert_text_pre
         sentence_1_bool, sentence_1_dan_red = original_text_marked_red(i[2], paper_dict[i[0]][2],
-                                                                       paper_dict[i[0]][4][0],
-                                                                       paper_dict[i[0]][4][1])  # text_original, bert_text, bert_text_pre
+                                                                       paper_dict[i[0]][4][2],
+                                                                       paper_dict[i[0]][4][3])  # text_original, bert_text, bert_text_pre

         if sentence_0_bool == False or sentence_1_bool == False:
             continue
@@ -194,7 +194,7 @@ def original_text_contrast_func(data_sentence_dan, paper_dict):
             similar_content_dan["degree"] = i[3]["degree"]
             similar_content_dan["year"] = i[3]["year"]
             similar_content_dan["paper_len_word"] = i[3]["paper_len_word"]
-            similar_content_dan["paper_red_len_word"] = len(paper_dict[i[0]][3])
+            similar_content_dan["paper_red_len_word"] = end_dan - start_dan

             thesis_info = " ".join(
                 [similar_content_dan["title"], similar_content_dan["author"], similar_content_dan["degree"],
@@ -206,7 +206,7 @@ def original_text_contrast_func(data_sentence_dan, paper_dict):
         original_text_list = list(data_sentence_dan[0][1])
         original_text_list.insert(end, "")
         original_text_list.insert(start, "")
-        original_text = "".join(original_text_list)
+        original_text = "此处有 {} 字相似\n".format(str(end - start)) + "".join(original_text_list)

         return_info = {
             "original_text": original_text,
@@ -216,7 +216,7 @@ def original_text_contrast_func(data_sentence_dan, paper_dict):
     return return_info


-def repeat_quote_info_func(original_text_contrast):
+def repeat_quote_info_func(original_text_contrast, section_words):
     '''
     Repeated-citation information
     :return:
@@ -232,7 +232,7 @@ def repeat_quote_info_func(original_text_contrast):
                 "thesis_author": i["author"],
                 "thesis_date": i["year"],
                 "thesis_info": thesis_info,
-                "thesis_repeat_rate": (i["paper_red_len_word"] / i["paper_len_word"]) * 100,
+                "thesis_repeat_rate": (i["paper_red_len_word"] / section_words) * 100,
                 # str(round(repeat_rate, 1)) + "%" # round(repetition_rate, 3) * 100
                 "thesis_title": i["title"],
                 "thesis_link": "",
@@ -244,11 +244,19 @@ def repeat_quote_info_func(original_text_contrast):
         else:
             chongfuwendang[thesis_info]["thesis_repeat_word"] += i["paper_red_len_word"]
             chongfuwendang[thesis_info]["thesis_repeat_rate"] = (chongfuwendang[thesis_info]["thesis_repeat_word"] /
-                                                                 chongfuwendang[thesis_info][
-                                                                     "paper_len_word"]) * 100
+                                                                 section_words) * 100

     chongfuwendang = sorted(chongfuwendang.items(), key=lambda x: x[1]["thesis_repeat_rate"], reverse=False)
-    chongfuwendang_list = [i[1] for i in chongfuwendang]
+
+
+    chongfuwendang_list = []
+
+    for i in chongfuwendang:
+        chongfuwendang_dan = i[1]
+        print(chongfuwendang_dan)
+        chongfuwendang_dan["thesis_repeat_rate"] = str(round(chongfuwendang_dan["thesis_repeat_rate"], 1)) + "%"
+        chongfuwendang_list.append(chongfuwendang_dan)
+
     return chongfuwendang_list
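A minimal, self-contained sketch (not code from the patch) of the aggregation that repeat_quote_info_func performs after these hunks: repeated words are summed per thesis_info key, the rate is now normalised by the section word count (section_words) instead of each source paper's own length, and the sorted result is formatted as a percentage string. aggregate_repeat_info and the sample records below are hypothetical.

def aggregate_repeat_info(matches, section_words):
    per_thesis = {}
    for m in matches:
        key = m["thesis_info"]
        entry = per_thesis.setdefault(key, {"thesis_info": key, "thesis_repeat_word": 0})
        entry["thesis_repeat_word"] += m["paper_red_len_word"]
        # Rate is relative to the whole section, mirroring the new section_words divisor
        entry["thesis_repeat_rate"] = entry["thesis_repeat_word"] / section_words * 100
    ranked = sorted(per_thesis.values(), key=lambda e: e["thesis_repeat_rate"])
    for entry in ranked:
        entry["thesis_repeat_rate"] = str(round(entry["thesis_repeat_rate"], 1)) + "%"
    return ranked

# Two matches against the same thesis in a 200-word section -> "15.0%"
print(aggregate_repeat_info(
    [{"thesis_info": "T1 A 2020", "paper_red_len_word": 10},
     {"thesis_info": "T1 A 2020", "paper_red_len_word": 20}],
    section_words=200))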
@@ -383,7 +391,6 @@ def section_details_func(data_section_dan, paper_dict, num_words):
     original_text_list = []
     for sentence_dan in data_section_dan:
-        print("sentence_dan", sentence_dan)
         original_text_contrast_dan = original_text_contrast_func(sentence_dan, paper_dict)
         original_text_contrast.append(original_text_contrast_dan)
         repeat_words += original_text_contrast_dan["dan_sentence_word_nums"]
@@ -393,7 +400,7 @@ def section_details_func(data_section_dan, paper_dict, num_words):
     repeat_rate = (repeat_words / section_words)* 100
     repeat_rate = str(round(repeat_rate, 1)) + "%"

-    repeat_quote_info = repeat_quote_info_func(original_text_contrast)
+    repeat_quote_info = repeat_quote_info_func(original_text_contrast, section_words)

     return {
         "end_page_index": 0,
@@ -531,7 +538,10 @@ def accurate_check_rouge(
     data_zong = []
     sentence_word_nums = 0

+    # =============================================================================================
+    # Multiprocess algorithm
     # ROUGE-based duplicate check
+    t1_0 = time.time()
     rst = []
     p = Pool(nums_cpus)  # process pool holding n child processes
@@ -552,8 +562,26 @@ def accurate_check_rouge(
     p.close()
     p.join()  # Wait for all child processes to finish. close() must be called before join(); after close() no new Process can be added.
+    print("筛选句子完成")
     rst = [i.get() for i in rst]

+    t2_0 = time.time()
+    print(t2_0- t1_0)
+    # =========================================================================================================
+
+    # rst = []
+    # num_words = 0
+    # centent_list = []
+    # for i in centent_list_old:
+    #     num_words += len(i)
+    #     if len(i) < 300:
+    #         centent_list.append(i)
+    # for i in range(len(centent_list)):
+    #     text = centent_list[i]
+    #     rst.append(rouge_pre_m(text, recall_data_list))
+
+    # ========================================================================================================
+
     for i in range(len(rst)):
         # print(rst[i])
         data_zong.append(rst[i])
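The two hunks above wrap the ROUGE pre-filter in a multiprocessing pool and time it; the task-submission loop itself sits outside the shown context, so the snippet below is only a generic sketch of the apply_async / get pattern that the new timing lines bracket. rouge_like_score, the pool size, and the sample data are stand-ins, not the patch's code.

import time
from multiprocessing import Pool

def rouge_like_score(text, candidates):
    # Stand-in for the patch's rouge_pre_m: score one sentence against every
    # recall candidate and return (candidate_index, score) pairs.
    return [(idx, 0.0) for idx, _ in enumerate(candidates)]

if __name__ == "__main__":
    sentences = ["sentence one", "sentence two"]   # invented sample input
    candidates = ["candidate a", "candidate b"]

    t1_0 = time.time()
    pool = Pool(processes=4)                       # the patch sizes this with nums_cpus
    rst = [pool.apply_async(rouge_like_score, (s, candidates)) for s in sentences]
    pool.close()
    pool.join()   # close() before join(); no new tasks can be submitted afterwards
    results = [r.get() for r in rst]
    print("pre-filter took", time.time() - t1_0, "seconds")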
@@ -567,7 +595,8 @@ def accurate_check_rouge(
     original_dict = []

     # Find the indices of similar sentences
-    bool_check_sentense = []
+    bool_check_sentense = []  # [[[1, 223],[1, 226], [1, 562]],[[2, 243],[2, 226], [2, 561]]]
+
     # BERT algorithm
     # for i in range(len(data_zong)):
     #     if data_zong[i][0] == 1:
@@ -575,29 +604,35 @@ def accurate_check_rouge(
     # ROUGE algorithm
     for i in range(len(data_zong)):
+        bool_check_sentense_dan = []  # [[1, 223],[1, 226], [1, 562]]
         for j in range(len(data_zong[i])):
-            if data_zong[i][j][1] > 0.35:
-                bool_check_sentense.append([i, data_zong[i][j][0]])
-
+            if data_zong[i][j][1] > 0.3:
+                # print("data_zong[{}][{}]".format(i,j), data_zong[i][j][0])
+                bool_check_sentense_dan.append([i, data_zong[i][j][0]])
+        if bool_check_sentense_dan != []:
+            bool_check_sentense.append(bool_check_sentense_dan)
+
+    print("bool_check_sentense", bool_check_sentense)
+    print("找出相似的句子序号完成")
     biao_red = biaohong(bool_check_sentense, data_zong,
-                        recall_data_list)  # [[[0, 1, 2], [479, 480, 481]], [[3, 4, 5], [481, 482, 483]], [[6, 7, 8], [484, 485, 486]]]
-
-    print("bert精确查重时间", t1 - t0)
-    print(biao_red)
+                        recall_data_list)  # [[[[0, 1, 2], [479, 480, 481]],[[0, 1, 2], [471, 472, 473]]], [[[3, 4, 5], [481, 482, 483]], [[3, 4, 5], [461, 462, 463]]]]
+    print("biao_red", str(biao_red))

     sentence_0_list = []
     sentence_1_list = []
     sim_paper_name = []

-    for i in biao_red:
-        if recall_data_list[i[1][0]][1] == recall_data_list[i[1][1]][1] == recall_data_list[i[1][2]][1]:
-            sentence_0_list.append("。".join([centent_list[i[0][0]], centent_list[i[0][1]], centent_list[i[0][2]]]))
-            sentence_1_list.append(
-                "".join([recall_data_list[i[1][0]][0], recall_data_list[i[1][1]][0], recall_data_list[i[1][2]][0]]))
-            sim_paper_name.append(recall_data_list[i[1][0]][1])
-        else:
-            continue
+    for i in range(len(biao_red)):
+        for j in range(len(biao_red[i])):
+            if recall_data_list[biao_red[i][j][1][0]][1] == recall_data_list[biao_red[i][j][1][1]][1] == recall_data_list[biao_red[i][j][1][2]][1]:
+                sentence_0_list.append("。".join([centent_list[biao_red[i][j][0][0]], centent_list[biao_red[i][j][0][1]], centent_list[biao_red[i][j][0][2]]]))
+                sentence_1_list.append(
+                    "".join([recall_data_list[biao_red[i][j][1][0]][0], recall_data_list[biao_red[i][j][1][1]][0], recall_data_list[biao_red[i][j][1][2]][0]]))
+                sim_paper_name.append(recall_data_list[biao_red[i][j][1][0]][1])
+            else:
+                continue

+    print("待标红句子筛选完成")
     sentence_0_list_new = []
     sentence_1_list_new = []
@@ -610,12 +645,16 @@ def accurate_check_rouge(
             continue

     t2 = time.time()
+    print()
+    for i in sentence_0_list_new:
+        print("sentence_0_list_new", i)
     if sentence_0_list_new == sentence_1_list_new == []:
         paper_dict = []
     else:
         paper_dict = biaohong_bert_predict(sentence_0_list_new, sentence_1_list_new)
     t3 = time.time()
+    print("标红完成")
     print("标红时间", t3 - t2)
     original_text = []
     original_text_contrast = []
@@ -631,6 +670,7 @@ def accurate_check_rouge(

     # with open("data/10235513_大型商业建筑人员疏散设计研究_沈福禹/paper_dict.json", "w") as f:
     #     json.dump(paper_dict, f, ensure_ascii=False)
+    sentence_0_list_new_cursor = sentence_0_list_new[0]
     for paper_dict_dan_id, sentence_0_dan, sentence_1_dan, sim_paper_name_dan in zip(range(len(paper_dict)),
                                                                                      sentence_0_list_new,
@@ -639,6 +679,7 @@ def accurate_check_rouge(
         if sentence_0_list_new_cursor != sentence_0_dan:
             similar_content_control.append([[paper_dict_dan_id, sentence_0_dan, sentence_1_dan, sim_paper_name_dan]])
+            sentence_0_list_new_cursor = sentence_0_dan
         else:
             similar_content_control[-1].append([paper_dict_dan_id, sentence_0_dan, sentence_1_dan, sim_paper_name_dan])
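A small self-contained sketch (not code from the patch) of the new grouping step above: instead of appending flat [sentence, candidate] pairs, each source sentence now keeps its own list of every recall candidate whose ROUGE score clears the lowered 0.3 threshold, and sentences without any hit are skipped. group_candidates and the sample scores are illustrative only.

def group_candidates(data_zong, threshold=0.3):
    # data_zong[i] holds (candidate_index, score) pairs for source sentence i
    grouped = []
    for i, candidates in enumerate(data_zong):
        per_sentence = [[i, cand_idx] for cand_idx, score in candidates if score > threshold]
        if per_sentence:            # sentences with no hit are dropped entirely
            grouped.append(per_sentence)
    return grouped

# Sentence 0 matches candidates 223 and 226, sentence 1 matches nothing
print(group_candidates([[(223, 0.41), (226, 0.35), (562, 0.12)], [(480, 0.1)]]))
# -> [[[0, 223], [0, 226]]]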
@@ -682,38 +723,69 @@ def accurate_check_rouge(
 def biaohong(bool_check_sentense, data_zong, df_train_nuoche):
     '''
     Indices to be marked red  [[0,1,2],[3,4,5]]
-    :param bool_check_sentense:
-    :return: list
+    :param bool_check_sentense: # [[[1, 223],[1, 226], [1, 562]],[[2, 243],[2, 226], [2, 561]]]
+    :return: list # [[[[0, 1, 2], [479, 480, 481]],[[0, 1, 2], [471, 472, 473]]], [[[3, 4, 5], [481, 482, 483]], [[3, 4, 5], [461, 462, 463]]]]
     '''
+
+    # print("bool_check_sentense", bool_check_sentense)
     biao_red = []
     i = 0
     start = -1
     end = -1
+    tiaochu = False
     while True:
-        if i >= len(bool_check_sentense) or bool_check_sentense[i][0] + 1 >= len(data_zong) or bool_check_sentense[i][1] \
-                + 1 >= len(df_train_nuoche):
+        # if i >= len(bool_check_sentense) or bool_check_sentense[i][0] + 1 >= len(data_zong) or bool_check_sentense[i][1] \
+        #         + 1 >= len(df_train_nuoche):
+        #     break
+
+        if i >= len(bool_check_sentense):
             break
-        elif bool_check_sentense[i][0] - 1 == start:
+
+        for j in bool_check_sentense[i]:
+            # print("j", j)
+            if j[0] + 1 >= len(data_zong):
+                tiaochu = True
+                break
+
+        for j in bool_check_sentense[i]:
+            if j[1] + 1 >= len(df_train_nuoche):
+                tiaochu = True
+                break
+
+        if tiaochu == True:
+            break
+
+        elif bool_check_sentense[i][0][0] - 1 == start:
             i += 1
             continue
-        elif bool_check_sentense[i][0] == end:
+        elif bool_check_sentense[i][0][0] == end:
            i += 1
            continue
-        elif bool_check_sentense[i][0] - 1 == end:
+        elif bool_check_sentense[i][0][0] - 1 == end:
            i += 1
            continue
         else:
             biao_red_dan = []
-            biao_red_dan.append([bool_check_sentense[i][0] - 1, bool_check_sentense[i][1] - 1])
-            biao_red_dan.append([bool_check_sentense[i][0], bool_check_sentense[i][1]])
-            biao_red_dan.append([bool_check_sentense[i][0] + 1, bool_check_sentense[i][1] + 1])
-            biao_red.append([[bool_check_sentense[i][0] - 1, bool_check_sentense[i][0], bool_check_sentense[i][0] + 1],
-                             [bool_check_sentense[i][1] - 1, bool_check_sentense[i][1], bool_check_sentense[i][1] + 1]])
-            start = bool_check_sentense[i][0] - 1
-            end = bool_check_sentense[i][0] + 1
+            for j in range(len(bool_check_sentense[i])):  # bool_check_sentense: [[[1, 223],[1, 226], [1, 562]],[[2, 243],[2, 226], [2, 561]]]
+                # biao_red_dan.append([bool_check_sentense[i][j][0] - 1, bool_check_sentense[i][j][1] - 1])
+                # biao_red_dan.append([bool_check_sentense[i][j][0], bool_check_sentense[i][j][1]])
+                # biao_red_dan.append([bool_check_sentense[i][j][0] + 1, bool_check_sentense[i][j][1] + 1])
+                # biao_red.append([[bool_check_sentense[i][0] - 1, bool_check_sentense[i][0], bool_check_sentense[i][0] + 1],
+                #                  [bool_check_sentense[i][1] - 1, bool_check_sentense[i][1], bool_check_sentense[i][1] + 1]])
+                # start = bool_check_sentense[i][0] - 1
+                # end = bool_check_sentense[i][0] + 1
+                # i += 1
+                # print("i:{}, j:{}".format(i, j), )
+                # print(bool_check_sentense)
+                # print([bool_check_sentense[i][j][0] - 1, bool_check_sentense[i][j][0], bool_check_sentense[i][j][0] + 1])
+                biao_red_dan.append([[bool_check_sentense[i][j][0] - 1, bool_check_sentense[i][j][0], bool_check_sentense[i][j][0] + 1],
+                                     [bool_check_sentense[i][j][1] - 1, bool_check_sentense[i][j][1], bool_check_sentense[i][j][1] + 1]])
+            start = bool_check_sentense[i][0][0] - 1
+            end = bool_check_sentense[i][0][0] + 1
             i += 1
+            biao_red.append(biao_red_dan)

-    return biao_red
+    return biao_red  # [[[[0, 1, 2], [479, 480, 481]],[[0, 1, 2], [471, 472, 473]]], [[[3, 4, 5], [481, 482, 483]], [[3, 4, 5], [461, 462, 463]]]]


 def dialog_line_parse(url, text):
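A compact sketch (not the patch's function) of the windowing that biaohong now applies per group of matches: every [sentence, candidate] pair is expanded into a [previous, current, next] index window on both sides, so that three consecutive source sentences can later be compared against three consecutive recall sentences. The real function additionally skips groups whose lead sentence index falls inside the previously emitted window (the start / end bookkeeping) and stops when a window would run past the end of either list; expand_windows below omits that bookkeeping.

def expand_windows(grouped_matches):
    # grouped_matches: one inner list per source sentence, each holding
    # [sentence_index, candidate_index] pairs, e.g. [[[1, 223], [1, 226]]]
    expanded = []
    for group in grouped_matches:
        windows = []
        for sent_idx, cand_idx in group:
            windows.append([[sent_idx - 1, sent_idx, sent_idx + 1],
                            [cand_idx - 1, cand_idx, cand_idx + 1]])
        expanded.append(windows)
    return expanded

print(expand_windows([[[1, 223], [1, 226]]]))
# -> [[[[0, 1, 2], [222, 223, 224]], [[0, 1, 2], [225, 226, 227]]]]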