From 7493ebf9afcd3388487ff6a2bfa1578e18960077 Mon Sep 17 00:00:00 2001
From: "majiahui@haimaqingfan.com"
Date: Fri, 15 Sep 2023 18:09:18 +0800
Subject: [PATCH] Complete version v1.0
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

---
 flask_check_bert_test.py | 156 ++++++++++++++++++++++++++++++++++------------
 1 file changed, 114 insertions(+), 42 deletions(-)

diff --git a/flask_check_bert_test.py b/flask_check_bert_test.py
index 9c9bd8e..c12e800 100644
--- a/flask_check_bert_test.py
+++ b/flask_check_bert_test.py
@@ -174,8 +174,8 @@ def original_text_contrast_func(data_sentence_dan, paper_dict):
                                                                        paper_dict[i[0]][4][1])  # text_original, bert_text, bert_text_pre
         sentence_1_bool, sentence_1_dan_red = original_text_marked_red(i[2], paper_dict[i[0]][2],
-                                                                       paper_dict[i[0]][4][0],
-                                                                       paper_dict[i[0]][4][1])  # text_original, bert_text, bert_text_pre
+                                                                       paper_dict[i[0]][4][2],
+                                                                       paper_dict[i[0]][4][3])  # text_original, bert_text, bert_text_pre

         if sentence_0_bool == False or sentence_1_bool == False:
             continue
@@ -194,7 +194,7 @@ def original_text_contrast_func(data_sentence_dan, paper_dict):
             similar_content_dan["degree"] = i[3]["degree"]
             similar_content_dan["year"] = i[3]["year"]
             similar_content_dan["paper_len_word"] = i[3]["paper_len_word"]
-            similar_content_dan["paper_red_len_word"] = len(paper_dict[i[0]][3])
+            similar_content_dan["paper_red_len_word"] = end_dan - start_dan

             thesis_info = " ".join(
                 [similar_content_dan["title"], similar_content_dan["author"], similar_content_dan["degree"],
@@ -206,7 +206,7 @@ def original_text_contrast_func(data_sentence_dan, paper_dict):
         original_text_list = list(data_sentence_dan[0][1])
         original_text_list.insert(end, "")
         original_text_list.insert(start, "")
-        original_text = "".join(original_text_list)
+        original_text = "此处有 {} 字相似\n".format(str(end - start)) + "".join(original_text_list)

         return_info = {
             "original_text": original_text,
@@ -216,7 +216,7 @@ def original_text_contrast_func(data_sentence_dan, paper_dict):
     return return_info


-def repeat_quote_info_func(original_text_contrast):
+def repeat_quote_info_func(original_text_contrast, section_words):
     '''
     Repeated-citation information
     :return:
@@ -232,7 +232,7 @@ def repeat_quote_info_func(original_text_contrast):
                 "thesis_author": i["author"],
                 "thesis_date": i["year"],
                 "thesis_info": thesis_info,
-                "thesis_repeat_rate": (i["paper_red_len_word"] / i["paper_len_word"]) * 100,
+                "thesis_repeat_rate": (i["paper_red_len_word"] / section_words) * 100,
                 # str(round(repeat_rate, 1)) + "%" # round(repetition_rate, 3) * 100
                 "thesis_title": i["title"],
                 "thesis_link": "",
@@ -244,11 +244,19 @@ def repeat_quote_info_func(original_text_contrast):
         else:
             chongfuwendang[thesis_info]["thesis_repeat_word"] += i["paper_red_len_word"]
             chongfuwendang[thesis_info]["thesis_repeat_rate"] = (chongfuwendang[thesis_info]["thesis_repeat_word"] /
-                                                                 chongfuwendang[thesis_info][
-                                                                     "paper_len_word"]) * 100
+                                                                 section_words) * 100

     chongfuwendang = sorted(chongfuwendang.items(), key=lambda x: x[1]["thesis_repeat_rate"], reverse=False)
-    chongfuwendang_list = [i[1] for i in chongfuwendang]
+
+
+    chongfuwendang_list = []
+
+    for i in chongfuwendang:
+        chongfuwendang_dan = i[1]
+        print(chongfuwendang_dan)
+        chongfuwendang_dan["thesis_repeat_rate"] = str(round(chongfuwendang_dan["thesis_repeat_rate"], 1)) + "%"
+        chongfuwendang_list.append(chongfuwendang_dan)
+
     return chongfuwendang_list
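A minimal, self-contained sketch (not code from the patch) of the aggregation that repeat_quote_info_func performs after these hunks: repeated words are summed per thesis_info key, the rate is now normalised by the section word count (section_words) instead of each source paper's own length, and the sorted result is formatted as a percentage string. aggregate_repeat_info and the sample records below are hypothetical.

def aggregate_repeat_info(matches, section_words):
    per_thesis = {}
    for m in matches:
        key = m["thesis_info"]
        entry = per_thesis.setdefault(key, {"thesis_info": key, "thesis_repeat_word": 0})
        entry["thesis_repeat_word"] += m["paper_red_len_word"]
        # Rate is relative to the whole section, mirroring the new section_words divisor
        entry["thesis_repeat_rate"] = entry["thesis_repeat_word"] / section_words * 100
    ranked = sorted(per_thesis.values(), key=lambda e: e["thesis_repeat_rate"])
    for entry in ranked:
        entry["thesis_repeat_rate"] = str(round(entry["thesis_repeat_rate"], 1)) + "%"
    return ranked

# Two matches against the same thesis in a 200-word section -> "15.0%"
print(aggregate_repeat_info(
    [{"thesis_info": "T1 A 2020", "paper_red_len_word": 10},
     {"thesis_info": "T1 A 2020", "paper_red_len_word": 20}],
    section_words=200))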
@@ -383,7 +391,6 @@ def section_details_func(data_section_dan, paper_dict, num_words):
     original_text_list = []
     for sentence_dan in data_section_dan:
-        print("sentence_dan", sentence_dan)
         original_text_contrast_dan = original_text_contrast_func(sentence_dan, paper_dict)
         original_text_contrast.append(original_text_contrast_dan)
         repeat_words += original_text_contrast_dan["dan_sentence_word_nums"]
@@ -393,7 +400,7 @@ def section_details_func(data_section_dan, paper_dict, num_words):
     repeat_rate = (repeat_words / section_words)* 100
     repeat_rate = str(round(repeat_rate, 1)) + "%"

-    repeat_quote_info = repeat_quote_info_func(original_text_contrast)
+    repeat_quote_info = repeat_quote_info_func(original_text_contrast, section_words)

     return {
         "end_page_index": 0,
@@ -531,7 +538,10 @@ def accurate_check_rouge(
     data_zong = []
     sentence_word_nums = 0

+    # =============================================================================================
+    # Multiprocess algorithm
     # ROUGE-based duplicate check
+    t1_0 = time.time()
     rst = []
     p = Pool(nums_cpus)  # process pool holding n child processes
@@ -552,8 +562,26 @@ def accurate_check_rouge(
     p.close()
     p.join()  # Wait for all child processes to finish. close() must be called before join(); after close() no new Process can be added.
+    print("筛选句子完成")
     rst = [i.get() for i in rst]

+    t2_0 = time.time()
+    print(t2_0- t1_0)
+    # =========================================================================================================
+
+    # rst = []
+    # num_words = 0
+    # centent_list = []
+    # for i in centent_list_old:
+    #     num_words += len(i)
+    #     if len(i) < 300:
+    #         centent_list.append(i)
+    # for i in range(len(centent_list)):
+    #     text = centent_list[i]
+    #     rst.append(rouge_pre_m(text, recall_data_list))
+
+    # ========================================================================================================
+
     for i in range(len(rst)):
         # print(rst[i])
         data_zong.append(rst[i])
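The two hunks above wrap the ROUGE pre-filter in a multiprocessing pool and time it; the task-submission loop itself sits outside the shown context, so the snippet below is only a generic sketch of the apply_async / get pattern that the new timing lines bracket. rouge_like_score, the pool size, and the sample data are stand-ins, not the patch's code.

import time
from multiprocessing import Pool

def rouge_like_score(text, candidates):
    # Stand-in for the patch's rouge_pre_m: score one sentence against every
    # recall candidate and return (candidate_index, score) pairs.
    return [(idx, 0.0) for idx, _ in enumerate(candidates)]

if __name__ == "__main__":
    sentences = ["sentence one", "sentence two"]   # invented sample input
    candidates = ["candidate a", "candidate b"]

    t1_0 = time.time()
    pool = Pool(processes=4)                       # the patch sizes this with nums_cpus
    rst = [pool.apply_async(rouge_like_score, (s, candidates)) for s in sentences]
    pool.close()
    pool.join()   # close() before join(); no new tasks can be submitted afterwards
    results = [r.get() for r in rst]
    print("pre-filter took", time.time() - t1_0, "seconds")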
@@ -567,7 +595,8 @@ def accurate_check_rouge(
     original_dict = []

     # Find the indices of similar sentences
-    bool_check_sentense = []
+    bool_check_sentense = []  # [[[1, 223],[1, 226], [1, 562]],[[2, 243],[2, 226], [2, 561]]]
+
     # BERT algorithm
     # for i in range(len(data_zong)):
     #     if data_zong[i][0] == 1:
@@ -575,29 +604,35 @@ def accurate_check_rouge(
     # ROUGE algorithm
     for i in range(len(data_zong)):
+        bool_check_sentense_dan = []  # [[1, 223],[1, 226], [1, 562]]
         for j in range(len(data_zong[i])):
-            if data_zong[i][j][1] > 0.35:
-                bool_check_sentense.append([i, data_zong[i][j][0]])
-
+            if data_zong[i][j][1] > 0.3:
+                # print("data_zong[{}][{}]".format(i,j), data_zong[i][j][0])
+                bool_check_sentense_dan.append([i, data_zong[i][j][0]])
+        if bool_check_sentense_dan != []:
+            bool_check_sentense.append(bool_check_sentense_dan)
+
+    print("bool_check_sentense", bool_check_sentense)
+    print("找出相似的句子序号完成")
     biao_red = biaohong(bool_check_sentense, data_zong,
-                        recall_data_list)  # [[[0, 1, 2], [479, 480, 481]], [[3, 4, 5], [481, 482, 483]], [[6, 7, 8], [484, 485, 486]]]
-
-    print("bert精确查重时间", t1 - t0)
-    print(biao_red)
+                        recall_data_list)  # [[[[0, 1, 2], [479, 480, 481]],[[0, 1, 2], [471, 472, 473]]], [[[3, 4, 5], [481, 482, 483]], [[3, 4, 5], [461, 462, 463]]]]
+    print("biao_red", str(biao_red))

     sentence_0_list = []
     sentence_1_list = []
     sim_paper_name = []

-    for i in biao_red:
-        if recall_data_list[i[1][0]][1] == recall_data_list[i[1][1]][1] == recall_data_list[i[1][2]][1]:
-            sentence_0_list.append("。".join([centent_list[i[0][0]], centent_list[i[0][1]], centent_list[i[0][2]]]))
-            sentence_1_list.append(
-                "".join([recall_data_list[i[1][0]][0], recall_data_list[i[1][1]][0], recall_data_list[i[1][2]][0]]))
-            sim_paper_name.append(recall_data_list[i[1][0]][1])
-        else:
-            continue
+    for i in range(len(biao_red)):
+        for j in range(len(biao_red[i])):
+            if recall_data_list[biao_red[i][j][1][0]][1] == recall_data_list[biao_red[i][j][1][1]][1] == recall_data_list[biao_red[i][j][1][2]][1]:
+                sentence_0_list.append("。".join([centent_list[biao_red[i][j][0][0]], centent_list[biao_red[i][j][0][1]], centent_list[biao_red[i][j][0][2]]]))
+                sentence_1_list.append(
+                    "".join([recall_data_list[biao_red[i][j][1][0]][0], recall_data_list[biao_red[i][j][1][1]][0], recall_data_list[biao_red[i][j][1][2]][0]]))
+                sim_paper_name.append(recall_data_list[biao_red[i][j][1][0]][1])
+            else:
+                continue

+    print("待标红句子筛选完成")
     sentence_0_list_new = []
     sentence_1_list_new = []
@@ -610,12 +645,16 @@ def accurate_check_rouge(
             continue

     t2 = time.time()
+    print()
+    for i in sentence_0_list_new:
+        print("sentence_0_list_new", i)
     if sentence_0_list_new == sentence_1_list_new == []:
         paper_dict = []
     else:
         paper_dict = biaohong_bert_predict(sentence_0_list_new, sentence_1_list_new)
     t3 = time.time()
+    print("标红完成")
     print("标红时间", t3 - t2)
     original_text = []
     original_text_contrast = []
@@ -631,6 +670,7 @@ def accurate_check_rouge(

     # with open("data/10235513_大型商业建筑人员疏散设计研究_沈福禹/paper_dict.json", "w") as f:
     #     json.dump(paper_dict, f, ensure_ascii=False)
+    sentence_0_list_new_cursor = sentence_0_list_new[0]
     for paper_dict_dan_id, sentence_0_dan, sentence_1_dan, sim_paper_name_dan in zip(range(len(paper_dict)),
                                                                                      sentence_0_list_new,
@@ -639,6 +679,7 @@ def accurate_check_rouge(
         if sentence_0_list_new_cursor != sentence_0_dan:
             similar_content_control.append([[paper_dict_dan_id, sentence_0_dan, sentence_1_dan, sim_paper_name_dan]])
+            sentence_0_list_new_cursor = sentence_0_dan
         else:
             similar_content_control[-1].append([paper_dict_dan_id, sentence_0_dan, sentence_1_dan, sim_paper_name_dan])
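A small self-contained sketch (not code from the patch) of the new grouping step above: instead of appending flat [sentence, candidate] pairs, each source sentence now keeps its own list of every recall candidate whose ROUGE score clears the lowered 0.3 threshold, and sentences without any hit are skipped. group_candidates and the sample scores are illustrative only.

def group_candidates(data_zong, threshold=0.3):
    # data_zong[i] holds (candidate_index, score) pairs for source sentence i
    grouped = []
    for i, candidates in enumerate(data_zong):
        per_sentence = [[i, cand_idx] for cand_idx, score in candidates if score > threshold]
        if per_sentence:            # sentences with no hit are dropped entirely
            grouped.append(per_sentence)
    return grouped

# Sentence 0 matches candidates 223 and 226, sentence 1 matches nothing
print(group_candidates([[(223, 0.41), (226, 0.35), (562, 0.12)], [(480, 0.1)]]))
# -> [[[0, 223], [0, 226]]]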
@@ -682,38 +723,69 @@ def accurate_check_rouge(
 def biaohong(bool_check_sentense, data_zong, df_train_nuoche):
     '''
     Indices to be marked red  [[0,1,2],[3,4,5]]
-    :param bool_check_sentense:
-    :return: list
+    :param bool_check_sentense: # [[[1, 223],[1, 226], [1, 562]],[[2, 243],[2, 226], [2, 561]]]
+    :return: list # [[[[0, 1, 2], [479, 480, 481]],[[0, 1, 2], [471, 472, 473]]], [[[3, 4, 5], [481, 482, 483]], [[3, 4, 5], [461, 462, 463]]]]
     '''
+
+    # print("bool_check_sentense", bool_check_sentense)
     biao_red = []
     i = 0
     start = -1
     end = -1
+    tiaochu = False
     while True:
-        if i >= len(bool_check_sentense) or bool_check_sentense[i][0] + 1 >= len(data_zong) or bool_check_sentense[i][1] \
-                + 1 >= len(df_train_nuoche):
+        # if i >= len(bool_check_sentense) or bool_check_sentense[i][0] + 1 >= len(data_zong) or bool_check_sentense[i][1] \
+        #         + 1 >= len(df_train_nuoche):
+        #     break
+
+        if i >= len(bool_check_sentense):
             break
-        elif bool_check_sentense[i][0] - 1 == start:
+
+        for j in bool_check_sentense[i]:
+            # print("j", j)
+            if j[0] + 1 >= len(data_zong):
+                tiaochu = True
+                break
+
+        for j in bool_check_sentense[i]:
+            if j[1] + 1 >= len(df_train_nuoche):
+                tiaochu = True
+                break
+
+        if tiaochu == True:
+            break
+
+        elif bool_check_sentense[i][0][0] - 1 == start:
             i += 1
             continue
-        elif bool_check_sentense[i][0] == end:
+        elif bool_check_sentense[i][0][0] == end:
            i += 1
            continue
-        elif bool_check_sentense[i][0] - 1 == end:
+        elif bool_check_sentense[i][0][0] - 1 == end:
            i += 1
            continue
         else:
             biao_red_dan = []
-            biao_red_dan.append([bool_check_sentense[i][0] - 1, bool_check_sentense[i][1] - 1])
-            biao_red_dan.append([bool_check_sentense[i][0], bool_check_sentense[i][1]])
-            biao_red_dan.append([bool_check_sentense[i][0] + 1, bool_check_sentense[i][1] + 1])
-            biao_red.append([[bool_check_sentense[i][0] - 1, bool_check_sentense[i][0], bool_check_sentense[i][0] + 1],
-                             [bool_check_sentense[i][1] - 1, bool_check_sentense[i][1], bool_check_sentense[i][1] + 1]])
-            start = bool_check_sentense[i][0] - 1
-            end = bool_check_sentense[i][0] + 1
+            for j in range(len(bool_check_sentense[i])):  # bool_check_sentense: [[[1, 223],[1, 226], [1, 562]],[[2, 243],[2, 226], [2, 561]]]
+                # biao_red_dan.append([bool_check_sentense[i][j][0] - 1, bool_check_sentense[i][j][1] - 1])
+                # biao_red_dan.append([bool_check_sentense[i][j][0], bool_check_sentense[i][j][1]])
+                # biao_red_dan.append([bool_check_sentense[i][j][0] + 1, bool_check_sentense[i][j][1] + 1])
+                # biao_red.append([[bool_check_sentense[i][0] - 1, bool_check_sentense[i][0], bool_check_sentense[i][0] + 1],
+                #                  [bool_check_sentense[i][1] - 1, bool_check_sentense[i][1], bool_check_sentense[i][1] + 1]])
+                # start = bool_check_sentense[i][0] - 1
+                # end = bool_check_sentense[i][0] + 1
+                # i += 1
+                # print("i:{}, j:{}".format(i, j), )
+                # print(bool_check_sentense)
+                # print([bool_check_sentense[i][j][0] - 1, bool_check_sentense[i][j][0], bool_check_sentense[i][j][0] + 1])
+                biao_red_dan.append([[bool_check_sentense[i][j][0] - 1, bool_check_sentense[i][j][0], bool_check_sentense[i][j][0] + 1],
+                                     [bool_check_sentense[i][j][1] - 1, bool_check_sentense[i][j][1], bool_check_sentense[i][j][1] + 1]])
+            start = bool_check_sentense[i][0][0] - 1
+            end = bool_check_sentense[i][0][0] + 1
             i += 1
+            biao_red.append(biao_red_dan)

-    return biao_red
+    return biao_red  # [[[[0, 1, 2], [479, 480, 481]],[[0, 1, 2], [471, 472, 473]]], [[[3, 4, 5], [481, 482, 483]], [[3, 4, 5], [461, 462, 463]]]]


 def dialog_line_parse(url, text):
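A compact sketch (not the patch's function) of the windowing that biaohong now applies per group of matches: every [sentence, candidate] pair is expanded into a [previous, current, next] index window on both sides, so that three consecutive source sentences can later be compared against three consecutive recall sentences. The real function additionally skips groups whose lead sentence index falls inside the previously emitted window (the start / end bookkeeping) and stops when a window would run past the end of either list; expand_windows below omits that bookkeeping.

def expand_windows(grouped_matches):
    # grouped_matches: one inner list per source sentence, each holding
    # [sentence_index, candidate_index] pairs, e.g. [[[1, 223], [1, 226]]]
    expanded = []
    for group in grouped_matches:
        windows = []
        for sent_idx, cand_idx in group:
            windows.append([[sent_idx - 1, sent_idx, sent_idx + 1],
                            [cand_idx - 1, cand_idx, cand_idx + 1]])
        expanded.append(windows)
    return expanded

print(expand_windows([[[1, 223], [1, 226]]]))
# -> [[[[0, 1, 2], [222, 223, 224]], [[0, 1, 2], [225, 226, 227]]]]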