|
|
@@ -174,8 +174,8 @@ def original_text_contrast_func(data_sentence_dan, paper_dict):
                                                                         paper_dict[i[0]][4][1])  # text_original, bert_text, bert_text_pre

         sentence_1_bool, sentence_1_dan_red = original_text_marked_red(i[2], paper_dict[i[0]][2],
-                                                                       paper_dict[i[0]][4][0],
-                                                                       paper_dict[i[0]][4][1])  # text_original, bert_text, bert_text_pre
+                                                                       paper_dict[i[0]][4][2],
+                                                                       paper_dict[i[0]][4][3])  # text_original, bert_text, bert_text_pre

         if sentence_0_bool == False or sentence_1_bool == False:
             continue
@@ -194,7 +194,7 @@ def original_text_contrast_func(data_sentence_dan, paper_dict):
         similar_content_dan["degree"] = i[3]["degree"]
         similar_content_dan["year"] = i[3]["year"]
         similar_content_dan["paper_len_word"] = i[3]["paper_len_word"]
-        similar_content_dan["paper_red_len_word"] = len(paper_dict[i[0]][3])
+        similar_content_dan["paper_red_len_word"] = end_dan - start_dan

         thesis_info = " ".join(
             [similar_content_dan["title"], similar_content_dan["author"], similar_content_dan["degree"],
@@ -206,7 +206,7 @@ def original_text_contrast_func(data_sentence_dan, paper_dict):
     original_text_list = list(data_sentence_dan[0][1])
     original_text_list.insert(end, "</red>")
     original_text_list.insert(start, "<red>")
-    original_text = "".join(original_text_list)
+    original_text = "此处有 {} 字相似\n".format(str(end - start)) + "".join(original_text_list)

     return_info = {
         "original_text": original_text,
@@ -216,7 +216,7 @@ def original_text_contrast_func(data_sentence_dan, paper_dict):
     return return_info


-def repeat_quote_info_func(original_text_contrast):
+def repeat_quote_info_func(original_text_contrast, section_words):
     '''
     Repeated citation information
     :return:
@@ -232,7 +232,7 @@ def repeat_quote_info_func(original_text_contrast):
                 "thesis_author": i["author"],
                 "thesis_date": i["year"],
                 "thesis_info": thesis_info,
-                "thesis_repeat_rate": (i["paper_red_len_word"] / i["paper_len_word"]) * 100,
+                "thesis_repeat_rate": (i["paper_red_len_word"] / section_words) * 100,  # str(round(repeat_rate, 1)) + "%"
                 # round(repetition_rate, 3) * 100
                 "thesis_title": i["title"],
                 "thesis_link": "",
@@ -244,11 +244,19 @@ def repeat_quote_info_func(original_text_contrast):
         else:
             chongfuwendang[thesis_info]["thesis_repeat_word"] += i["paper_red_len_word"]
             chongfuwendang[thesis_info]["thesis_repeat_rate"] = (chongfuwendang[thesis_info]["thesis_repeat_word"] /
-                                                                 chongfuwendang[thesis_info][
-                                                                     "paper_len_word"]) * 100
+                                                                 section_words) * 100

     chongfuwendang = sorted(chongfuwendang.items(),
                             key=lambda x: x[1]["thesis_repeat_rate"], reverse=False)
-    chongfuwendang_list = [i[1] for i in chongfuwendang]
+
+    chongfuwendang_list = []
+    for i in chongfuwendang:
+        chongfuwendang_dan = i[1]
+        print(chongfuwendang_dan)
+        chongfuwendang_dan["thesis_repeat_rate"] = str(round(chongfuwendang_dan["thesis_repeat_rate"], 1)) + "%"
+        chongfuwendang_list.append(chongfuwendang_dan)
+
     return chongfuwendang_list
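Note on the rate change above: the per-document repeat rate is now measured against the total word count of the checked section (`section_words`) rather than the source paper's own length, and it is only turned into a percentage string after all documents have been accumulated and sorted. A minimal sketch of that calculation, with made-up numbers:

```python
# Hypothetical values, for illustration only.
section_words = 5000                      # total words in the checked section
doc = {"thesis_repeat_word": 350}         # overlapping words accumulated for one source document

doc["thesis_repeat_rate"] = (doc["thesis_repeat_word"] / section_words) * 100
doc["thesis_repeat_rate"] = str(round(doc["thesis_repeat_rate"], 1)) + "%"
print(doc["thesis_repeat_rate"])          # -> 7.0%
```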
@@ -383,7 +391,6 @@ def section_details_func(data_section_dan, paper_dict, num_words):
     original_text_list = []

     for sentence_dan in data_section_dan:
-        print("sentence_dan", sentence_dan)
         original_text_contrast_dan = original_text_contrast_func(sentence_dan, paper_dict)
         original_text_contrast.append(original_text_contrast_dan)
         repeat_words += original_text_contrast_dan["dan_sentence_word_nums"]
@@ -393,7 +400,7 @@ def section_details_func(data_section_dan, paper_dict, num_words):
     repeat_rate = (repeat_words / section_words)* 100
     repeat_rate = str(round(repeat_rate, 1)) + "%"

-    repeat_quote_info = repeat_quote_info_func(original_text_contrast)
+    repeat_quote_info = repeat_quote_info_func(original_text_contrast, section_words)

     return {
         "end_page_index": 0,
@@ -531,7 +538,10 @@ def accurate_check_rouge(
     data_zong = []
     sentence_word_nums = 0

+    # =============================================================================================
+    # multiprocessing version
     # duplicate check with the ROUGE algorithm
+    t1_0 = time.time()
     rst = []
     p = Pool(nums_cpus)  # the process pool holds n child processes
@@ -552,8 +562,26 @@ def accurate_check_rouge(
     p.close()
     p.join()  # wait for all child processes to finish; close() must be called before join(), and no new Process can be added after close()

+    print("筛选句子完成")
     rst = [i.get() for i in rst]

+    t2_0 = time.time()
+    print(t2_0- t1_0)
+    # =========================================================================================================
+
+    # rst = []
+    # num_words = 0
+    # centent_list = []
+    # for i in centent_list_old:
+    #     num_words += len(i)
+    #     if len(i) < 300:
+    #         centent_list.append(i)
+    # for i in range(len(centent_list)):
+    #     text = centent_list[i]
+    #     rst.append(rouge_pre_m(text, recall_data_list))
+
+    # ========================================================================================================
+
     for i in range(len(rst)):
         # print(rst[i])
         data_zong.append(rst[i])
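For readers unfamiliar with the multiprocessing pattern this hunk relies on: the items collected in `rst` are `AsyncResult` handles, so `.get()` is called only after `close()` and `join()`. A standalone sketch of that pattern, assuming the tasks were submitted with `apply_async` (which is what the `.get()` calls suggest) and using a placeholder worker and placeholder data instead of the real `rouge_pre_m` inputs:

```python
from multiprocessing import Pool

def rouge_pre_m(text, recall_data_list):
    # placeholder worker: the real function scores `text` against the recalled sentences
    return [(0, 0.42)]

if __name__ == "__main__":
    centent_list = ["第一句", "第二句"]      # sentences to check (placeholder)
    recall_data_list = []                    # recalled candidate sentences (placeholder)

    p = Pool(4)                              # pool of 4 worker processes
    rst = [p.apply_async(rouge_pre_m, (text, recall_data_list)) for text in centent_list]
    p.close()                                # no more tasks may be submitted
    p.join()                                 # wait for all workers to finish
    rst = [r.get() for r in rst]             # unwrap the AsyncResult objects
```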
@@ -567,7 +595,8 @@ def accurate_check_rouge(
     original_dict = []

     # find the indices of similar sentences
-    bool_check_sentense = []
+    bool_check_sentense = []  # [[[1, 223],[1, 226], [1, 562]],[[2, 243],[2, 226], [2, 561]]]
+
     # BERT version
     # for i in range(len(data_zong)):
     #     if data_zong[i][0] == 1:
@@ -575,29 +604,35 @@ def accurate_check_rouge(
     # ROUGE version
     for i in range(len(data_zong)):
+        bool_check_sentense_dan = []  # [[1, 223],[1, 226], [1, 562]]
         for j in range(len(data_zong[i])):
-            if data_zong[i][j][1] > 0.35:
-                bool_check_sentense.append([i, data_zong[i][j][0]])
+            if data_zong[i][j][1] > 0.3:
+                # print("data_zong[{}][{}]".format(i,j), data_zong[i][j][0])
+                bool_check_sentense_dan.append([i, data_zong[i][j][0]])
+        if bool_check_sentense_dan != []:
+            bool_check_sentense.append(bool_check_sentense_dan)
+
+    print("bool_check_sentense", bool_check_sentense)
     print("找出相似的句子序号完成")
     biao_red = biaohong(bool_check_sentense, data_zong,
-                        recall_data_list)  # [[[0, 1, 2], [479, 480, 481]], [[3, 4, 5], [481, 482, 483]], [[6, 7, 8], [484, 485, 486]]]
+                        recall_data_list)  # [[[[0, 1, 2], [479, 480, 481]],[[0, 1, 2], [471, 472, 473]]], [[[3, 4, 5], [481, 482, 483]], [[3, 4, 5], [461, 462, 463]]]]

-    print("bert精确查重时间", t1 - t0)
-    print(biao_red)
+    print("biao_red", str(biao_red))
     sentence_0_list = []
     sentence_1_list = []
     sim_paper_name = []

-    for i in biao_red:
-        if recall_data_list[i[1][0]][1] == recall_data_list[i[1][1]][1] == recall_data_list[i[1][2]][1]:
-            sentence_0_list.append("。".join([centent_list[i[0][0]], centent_list[i[0][1]], centent_list[i[0][2]]]))
+    for i in range(len(biao_red)):
+        for j in range(len(biao_red[i])):
+            if recall_data_list[biao_red[i][j][1][0]][1] == recall_data_list[biao_red[i][j][1][1]][1] == recall_data_list[biao_red[i][j][1][2]][1]:
+                sentence_0_list.append("。".join([centent_list[biao_red[i][j][0][0]], centent_list[biao_red[i][j][0][1]], centent_list[biao_red[i][j][0][2]]]))
                 sentence_1_list.append(
-                    "".join([recall_data_list[i[1][0]][0], recall_data_list[i[1][1]][0], recall_data_list[i[1][2]][0]]))
-                sim_paper_name.append(recall_data_list[i[1][0]][1])
+                    "".join([recall_data_list[biao_red[i][j][1][0]][0], recall_data_list[biao_red[i][j][1][1]][0], recall_data_list[biao_red[i][j][1][2]][0]]))
+                sim_paper_name.append(recall_data_list[biao_red[i][j][1][0]][1])
             else:
                 continue

     print("待标红句子筛选完成")
     sentence_0_list_new = []
     sentence_1_list_new = []
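The loop above does two things: it lowers the similarity threshold from 0.35 to 0.3, and it groups all matches of one checked sentence into `bool_check_sentense_dan` instead of appending them individually. A small illustration with invented scores, assuming each `data_zong[i][j]` is `[candidate_index, rouge_score]` as the indexing above suggests:

```python
# Invented ROUGE scores for checked sentence 0 against recalled candidates.
data_zong_i = [[223, 0.41], [226, 0.32], [562, 0.12]]

bool_check_sentense = []
bool_check_sentense_dan = []
for candidate_index, score in data_zong_i:
    if score > 0.3:                              # lowered threshold (previously 0.35)
        bool_check_sentense_dan.append([0, candidate_index])
if bool_check_sentense_dan != []:
    bool_check_sentense.append(bool_check_sentense_dan)

print(bool_check_sentense)                       # -> [[[0, 223], [0, 226]]]
```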
@@ -610,12 +645,16 @@ def accurate_check_rouge(
             continue
     t2 = time.time()

+    print()
+    for i in sentence_0_list_new:
+        print("sentence_0_list_new", i)
+    if sentence_0_list_new == sentence_1_list_new == []:
+        paper_dict = []
+    else:
+        paper_dict = biaohong_bert_predict(sentence_0_list_new, sentence_1_list_new)

     t3 = time.time()
     print("标红完成")
     print("标红时间", t3 - t2)
     original_text = []
     original_text_contrast = []
@@ -631,6 +670,7 @@ def accurate_check_rouge(
     # with open("data/10235513_大型商业建筑人员疏散设计研究_沈福禹/paper_dict.json", "w") as f:
     #     json.dump(paper_dict, f, ensure_ascii=False)

+    sentence_0_list_new_cursor = sentence_0_list_new[0]
     for paper_dict_dan_id, sentence_0_dan, sentence_1_dan, sim_paper_name_dan in zip(range(len(paper_dict)),
                                                                                      sentence_0_list_new,
@@ -639,6 +679,7 @@ def accurate_check_rouge(

         if sentence_0_list_new_cursor != sentence_0_dan:
             similar_content_control.append([[paper_dict_dan_id, sentence_0_dan, sentence_1_dan, sim_paper_name_dan]])
+            sentence_0_list_new_cursor = sentence_0_dan
         else:
             similar_content_control[-1].append([paper_dict_dan_id, sentence_0_dan, sentence_1_dan, sim_paper_name_dan])
@@ -682,38 +723,69 @@ def accurate_check_rouge(
 def biaohong(bool_check_sentense, data_zong, df_train_nuoche):
     '''
     Indices to mark red [[0,1,2],[3,4,5]]
-    :param bool_check_sentense:
-    :return: list
+    :param bool_check_sentense: # [[[1, 223],[1, 226], [1, 562]],[[2, 243],[2, 226], [2, 561]]]
+    :return: list # [[[[0, 1, 2], [479, 480, 481]],[[0, 1, 2], [471, 472, 473]]], [[[3, 4, 5], [481, 482, 483]], [[3, 4, 5], [461, 462, 463]]]]
     '''

+    # print("bool_check_sentense", bool_check_sentense)
     biao_red = []
     i = 0
     start = -1
     end = -1
+    tiaochu = False
     while True:
-        if i >= len(bool_check_sentense) or bool_check_sentense[i][0] + 1 >= len(data_zong) or bool_check_sentense[i][1] \
-                + 1 >= len(df_train_nuoche):
+        # if i >= len(bool_check_sentense) or bool_check_sentense[i][0] + 1 >= len(data_zong) or bool_check_sentense[i][1] \
+        #         + 1 >= len(df_train_nuoche):
+        #     break
+
+        if i >= len(bool_check_sentense):
             break
+
+        for j in bool_check_sentense[i]:
+            # print("j", j)
+            if j[0] + 1 >= len(data_zong):
+                tiaochu = True
+                break
+
+        for j in bool_check_sentense[i]:
+            if j[1] + 1 >= len(df_train_nuoche):
+                tiaochu = True
+                break
+
-        elif bool_check_sentense[i][0] - 1 == start:
+        if tiaochu == True:
+            break
+
+        elif bool_check_sentense[i][0][0] - 1 == start:
             i += 1
             continue
-        elif bool_check_sentense[i][0] == end:
+        elif bool_check_sentense[i][0][0] == end:
             i += 1
             continue
-        elif bool_check_sentense[i][0] - 1 == end:
+        elif bool_check_sentense[i][0][0] - 1 == end:
             i += 1
             continue
         else:
             biao_red_dan = []
-            biao_red_dan.append([bool_check_sentense[i][0] - 1, bool_check_sentense[i][1] - 1])
-            biao_red_dan.append([bool_check_sentense[i][0], bool_check_sentense[i][1]])
-            biao_red_dan.append([bool_check_sentense[i][0] + 1, bool_check_sentense[i][1] + 1])
-            biao_red.append([[bool_check_sentense[i][0] - 1, bool_check_sentense[i][0], bool_check_sentense[i][0] + 1],
-                             [bool_check_sentense[i][1] - 1, bool_check_sentense[i][1], bool_check_sentense[i][1] + 1]])
-            start = bool_check_sentense[i][0] - 1
-            end = bool_check_sentense[i][0] + 1
+            for j in range(len(bool_check_sentense[i])):  # bool_check_sentense: [[[1, 223],[1, 226], [1, 562]],[[2, 243],[2, 226], [2, 561]]]
+                # biao_red_dan.append([bool_check_sentense[i][j][0] - 1, bool_check_sentense[i][j][1] - 1])
+                # biao_red_dan.append([bool_check_sentense[i][j][0], bool_check_sentense[i][j][1]])
+                # biao_red_dan.append([bool_check_sentense[i][j][0] + 1, bool_check_sentense[i][j][1] + 1])
+                # biao_red.append([[bool_check_sentense[i][0] - 1, bool_check_sentense[i][0], bool_check_sentense[i][0] + 1],
+                #                  [bool_check_sentense[i][1] - 1, bool_check_sentense[i][1], bool_check_sentense[i][1] + 1]])
+                # start = bool_check_sentense[i][0] - 1
+                # end = bool_check_sentense[i][0] + 1
+                # i += 1
+                # print("i:{}, j:{}".format(i, j), )
+                # print(bool_check_sentense)
+                # print([bool_check_sentense[i][j][0] - 1, bool_check_sentense[i][j][0], bool_check_sentense[i][j][0] + 1])
+                biao_red_dan.append([[bool_check_sentense[i][j][0] - 1, bool_check_sentense[i][j][0], bool_check_sentense[i][j][0] + 1],
+                                     [bool_check_sentense[i][j][1] - 1, bool_check_sentense[i][j][1], bool_check_sentense[i][j][1] + 1]])
+            start = bool_check_sentense[i][0][0] - 1
+            end = bool_check_sentense[i][0][0] + 1
             i += 1
+            biao_red.append(biao_red_dan)

-    return biao_red
+    return biao_red  # [[[[0, 1, 2], [479, 480, 481]],[[0, 1, 2], [471, 472, 473]]], [[[3, 4, 5], [481, 482, 483]], [[3, 4, 5], [461, 462, 463]]]]
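The revised docstring and return comment describe a deeper nesting than before: instead of one `[checked indices, candidate indices]` pair per match, `biaohong` now returns one group per checked sentence, each holding all of that sentence's window pairs. A small sketch of consuming that structure, using the example value from the docstring:

```python
# Example value copied from the docstring above.
biao_red = [
    [[[0, 1, 2], [479, 480, 481]], [[0, 1, 2], [471, 472, 473]]],
    [[[3, 4, 5], [481, 482, 483]], [[3, 4, 5], [461, 462, 463]]],
]

for group in biao_red:                           # one group per checked sentence
    for checked_window, candidate_window in group:
        print(checked_window, "->", candidate_window)
```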


def dialog_line_parse(url, text):