diff --git a/flask_check_bert_test.py b/flask_check_bert_test.py
index dc76ced..18f909a 100644
--- a/flask_check_bert_test.py
+++ b/flask_check_bert_test.py
@@ -248,15 +248,16 @@ def original_text_contrast_func(data_sentence_dan, paper_dict, centent_list):
         original_text_start = "".join(original_text_list[:start])
         original_text_end = "".join(original_text_list[end:])
 
+        print(data_sentence_dan)
         if data_sentence_dan[0][4][0]-1 < 0:
             start_sen = ""
         else:
             start_sen = centent_list[data_sentence_dan[0][4][0]-1]
 
-        if data_sentence_dan[0][4][0]+1 > len(centent_list) -1:
+        if data_sentence_dan[0][4][-1]+1 >= len(centent_list):
             end_sen = ""
         else:
-            end_sen = centent_list[data_sentence_dan[0][4][2]+1]
+            end_sen = centent_list[data_sentence_dan[0][4][-1]+1]
 
         start_sen = start_sen + original_text_start
         end_sen = original_text_end + end_sen
@@ -428,7 +429,7 @@ def section_data_func(section_details):
     }
 
 
-def section_details_func(data_section_dan, paper_dict, num_words, centent_list):
+def section_details_func(data_section_dan, paper_dict, num_words, centent_list, index_centent_list_dan):
     '''
     Detailed information for one section
     :param original_text_contrast:
@@ -458,7 +459,7 @@ def section_details_func(data_section_dan, paper_dict, num_words, centent_list):
 
     return {
         "end_page_index": 0,
-        "name": "第1部分",
+        "name": "第{}部分".format(str(index_centent_list_dan)),
         "repeat_rate": repeat_rate,
         "repeat_words": repeat_words,
         "start_page_index": 0,
@@ -470,7 +471,8 @@ def section_details_func(data_section_dan, paper_dict, num_words, centent_list):
     }
 
 
-def check_dict(similar_content_control, paper_dict, num_words, title, author, centent_list):
+def check_dict(similar_content_control_zong, paper_dict_zong, num_words_zong, title, author, chapter_data, index_centent_list):
+    # previous signature: similar_content_control, paper_dict, num_words, title, author, centent_list
     '''
    Build the response dictionary
     :param similar_content_control:
@@ -480,16 +482,14 @@ def check_dict(similar_content_control, paper_dict, num_words, title, author, ce
     :param author:
     :return:
     '''
-    if paper_dict != []:
-        data = [similar_content_control]
+    if paper_dict_zong != []:
 
         # simulate multiple chapters
         section_details_list = []
-        for data_dan in data:
-            data_section_dan = data_dan
+        for data_section_dan, paper_dict, num_words, centent_list, index_centent_list_dan in zip(similar_content_control_zong, paper_dict_zong, num_words_zong, chapter_data, index_centent_list):
 
             # detailed info for this section
-            section_details = section_details_func(data_section_dan, paper_dict, num_words, centent_list)
+            section_details = section_details_func(data_section_dan, paper_dict, num_words, centent_list, index_centent_list_dan)
             section_details_list.append(section_details)
 
     # simulate multiple chapters
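For orientation, here is a toy sketch of the calling contract the reworked `check_dict()` now expects: five parallel per-chapter lists that get zipped together. All values below are made-up placeholders, not real pipeline output; only the shapes matter.

```python
# Toy illustration of the parallel per-chapter lists zipped inside check_dict().
similar_content_control_zong = [[["group-a"]], [["group-b"]]]  # per-chapter similarity groups
paper_dict_zong = [[{"rouge": 0.71}], [{"rouge": 0.42}]]       # per-chapter paper_dict results
num_words_zong = [2031, 1988]                                  # per-chapter character counts
chapter_data = [["句子一。", "句子二。"], ["句子三。"]]            # per-chapter sentence lists
index_centent_list = [0, 1]                                    # chapter indices (used in the section name)

for data_section_dan, paper_dict, num_words, centent_list, index_dan in zip(
        similar_content_control_zong, paper_dict_zong, num_words_zong,
        chapter_data, index_centent_list):
    # each tuple feeds one section_details_func(...) call in check_dict()
    print(index_dan, num_words, len(centent_list))
```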
@@ -572,26 +572,37 @@ def check_dict(similar_content_control, paper_dict, num_words, title, author, ce
     }
     return paper_data
 
-def accurate_check_rouge(
-        title,
-        author,
-        text_paper,
-        recall_data_list
-):
+
+def split_chapter(centent_list):
     '''
-    Precise duplicate check: find the similar sentences
-    :param text:
-    :param recall_data_list: list [[sentence, filename],[sentence, filename],[sentence, filename]]
-    :return:
+    Split the sentence list into chapters of roughly 2000 characters
+    :param centent_list:
+    :return: [[[sentence, sentence, ... sentence], 2000], [[sentence, sentence, ... sentence], 2000]]
     '''
-    # text preprocessing
-    # centent_list = []
-    print("text_paper", len(text_paper))
-    text_paper = str(text_paper).replace("。\n", "。")
-    centent_list_old = text_paper.split("。")
-    data_zong = []
-    sentence_word_nums = 0
+    centent_list_new = []
+    zishu = 2000        # target chapter size in characters
+    dangqianzishu = 0   # running character count of the current chapter
+
+    i = 0
+    centent_list_dan = []
+    while True:
+        if i >= len(centent_list):
+            if centent_list_dan != []:
+                centent_list_new.append([centent_list_dan, dangqianzishu])
+            break
+        centent_list_dan.append(centent_list[i])
+        dangqianzishu += len(centent_list[i])
+        if dangqianzishu > zishu:
+            centent_list_new.append([centent_list_dan, dangqianzishu])
+            dangqianzishu = 0
+            centent_list_dan = []
+        i += 1
+
+    return centent_list_new
+
+
+def chapter_check(dan_chapter_data, recall_data_list):
+    # run the duplicate check for a single chapter
     # =============================================================================================
     # multiprocessing version
     # duplicate check with the ROUGE algorithm
@@ -599,15 +610,6 @@ def accurate_check_rouge(
     # rst = []
     # p = Pool(nums_cpus)  # process pool with n worker processes
     #
-    # # print("centent_list", centent_list)
-    #
-    # num_words = 0
-    # centent_list = []
-    # for i in centent_list_old:
-    #     num_words += len(i)
-    #     if len(i) < 300:
-    #         centent_list.append(i)
-    #
     # print("num_words", num_words)
     # for i in range(len(centent_list)):
     #     text = centent_list[i]
@@ -624,20 +626,13 @@
     # =========================================================================================================
     rst = []
-    num_words = 0
-    centent_list = []
-    for i in centent_list_old:
-        num_words += len(i)
-        if len(i) < 300:
-            centent_list.append(i + "。")
-        if i == "":
-            continue
-    for i in range(len(centent_list)):
-        text = centent_list[i]
+    for i in range(len(dan_chapter_data)):
+        text = dan_chapter_data[i]
         rst.append(rouge_pre_m(text, recall_data_list))
 
     # ========================================================================================================
+    data_zong = []
     for i in range(len(rst)):
         # print(rst[i])
         data_zong.append(rst[i])
@@ -651,7 +646,7 @@
     original_dict = []
 
     # find the indices of similar sentences
-    bool_check_sentense = []  # [[[1, 223],[1, 226], [1, 562]],[[2, 243],[2, 226], [2, 561]]]
+    bool_check_sentense = []    # [[[1, 223],[1, 226], [1, 562]],[[2, 243],[2, 226], [2, 561]]]
 
     # BERT-based variant
     # for i in range(len(data_zong)):
@@ -670,7 +665,8 @@
 
     # keep filtering with the ROUGE method
-    bool_check_sentense = rouge_pre_m_1(bool_check_sentense, centent_list, recall_data_list)  # [[[1, 223],[1, 226], [1, 562]],[[2, 243],[2, 226], [2, 561]]]
+    bool_check_sentense = rouge_pre_m_1(bool_check_sentense, dan_chapter_data,
+                                        recall_data_list)  # [[[1, 223],[1, 226], [1, 562]],[[2, 243],[2, 226], [2, 561]]]
     print("bool_check_sentense", bool_check_sentense)
     print("找出相似的句子序号完成")
@@ -692,6 +688,7 @@
 
     for i in range(len(biao_red)):
         for j in range(len(biao_red[i])):
+            print("i,j", i, j)
             # if recall_data_list[biao_red[i][j][1][0]][1] == recall_data_list[biao_red[i][j][1][1]][1] == recall_data_list[biao_red[i][j][1][2]][1]:
             #     sentence_0_list.append("".join([centent_list[biao_red[i][j][0][0]], centent_list[biao_red[i][j][0][1]], centent_list[biao_red[i][j][0][2]]]))
             #     sentence_1_list.append(
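The `split_chapter()` added above packs sentences greedily until the running character count passes 2000. A self-contained behavioral restatement (my sketch, not code from this diff) makes the boundary behavior easy to check in isolation:

```python
# Behavioral equivalent of split_chapter(): greedily pack sentences until the
# running character count exceeds the limit, then start a new chapter; a
# trailing partial chapter is kept (mirrors the while-loop's exit branch).
def split_chapter_equiv(sentences, limit=2000):
    chapters, current, current_len = [], [], 0
    for sentence in sentences:
        current.append(sentence)
        current_len += len(sentence)
        if current_len > limit:  # chapter closes only after passing the limit
            chapters.append([current, current_len])
            current, current_len = [], 0
    if current:
        chapters.append([current, current_len])
    return chapters

# four 700-char sentences -> one chapter of 3 (2100 chars), one of 1 (700 chars)
print([(len(c[0]), c[1]) for c in split_chapter_equiv(["甲" * 700] * 4)])
```

Note the design consequence: a chapter's reported size is the count at the moment the threshold is crossed, so it always lands somewhat above 2000 rather than at it.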
@@ -700,26 +697,30 @@
             # else:
             #     continue
-
             file_name = recall_data_list[biao_red[i][j][1][1]][1]
             sentence_0_list_dan = []
             sentence_1_list_dan = []
             sentence_0_list_dan_index = []
-            houxuna_file_list = [
-                [recall_data_list[biao_red[i][j][1][0]][1], centent_list[biao_red[i][j][0][0]], recall_data_list[biao_red[i][j][1][0]][0]],
-                [recall_data_list[biao_red[i][j][1][1]][1], centent_list[biao_red[i][j][0][1]], recall_data_list[biao_red[i][j][1][1]][0]],
-                [recall_data_list[biao_red[i][j][1][2]][1], centent_list[biao_red[i][j][0][2]], recall_data_list[biao_red[i][j][1][2]][0]]
-            ]
+            # houxuna_file_list = [
+            #     [recall_data_list[biao_red[i][j][1][0]][1], dan_chapter_data[biao_red[i][j][0][0]],
+            #      recall_data_list[biao_red[i][j][1][0]][0]],
+            #     [recall_data_list[biao_red[i][j][1][1]][1], dan_chapter_data[biao_red[i][j][0][1]],
+            #      recall_data_list[biao_red[i][j][1][1]][0]],
+            #     [recall_data_list[biao_red[i][j][1][2]][1], dan_chapter_data[biao_red[i][j][0][2]],
+            #      recall_data_list[biao_red[i][j][1][2]][0]]
+            # ]
+
+            sentence_0_list_dan = [dan_chapter_data[biao_red[i][j][0][index_simsentence]] for index_simsentence in range(len(biao_red[i][j][0]))]
+            houxuna_file_list = [[recall_data_list[biao_red[i][j][1][index_simsentence]][1], recall_data_list[biao_red[i][j][1][index_simsentence]][0]] for index_simsentence in range(len(biao_red[i][j][0]))]
+
             for dan_sen_info in houxuna_file_list:
-                sentence_0_list_dan.append(dan_sen_info[1])
                 if dan_sen_info[0] == file_name:
-                    sentence_1_list_dan.append(dan_sen_info[2])
-
-            sentence_0_list.append("".join(sentence_0_list_dan))
-            sentence_1_list.append("".join(sentence_1_list_dan))
-            original_sentence_index.append(biao_red[i][j][0])
-            sim_paper_name.append(recall_data_list[biao_red[i][j][1][1]][1])
-
+                    sentence_1_list_dan.append(dan_sen_info[1])
+            if sentence_0_list_dan != [] and sentence_1_list_dan != []:
+                sentence_0_list.append("".join(sentence_0_list_dan))
+                sentence_1_list.append("".join(sentence_1_list_dan))
+                original_sentence_index.append(biao_red[i][j][0])
+                sim_paper_name.append(recall_data_list[biao_red[i][j][1][1]][1])
     print("待标红句子筛选完成")
 
     sentence_0_list_new = []
@@ -759,21 +760,73 @@
 
     # with open("data/10235513_大型商业建筑人员疏散设计研究_沈福禹/paper_dict.json", "w") as f:
     #     json.dump(paper_dict, f, ensure_ascii=False)
 
-    sentence_0_list_new_cursor = sentence_0_list_new[0]
-    for paper_dict_dan_id, sentence_0_dan, sentence_1_dan, sim_paper_name_dan, original_sentence_index_dan in zip(range(len(paper_dict)),
-                                                                       sentence_0_list_new,
-                                                                       sentence_1_list_new,
-                                                                       sim_paper_name,
-                                                                       original_sentence_index):
+    sentence_0_list_new_cursor = None  # None guarantees the first row opens a new group
+    for paper_dict_dan_id, sentence_0_dan, sentence_1_dan, sim_paper_name_dan, original_sentence_index_dan in zip(
+            range(len(paper_dict)),
+            sentence_0_list_new,
+            sentence_1_list_new,
+            sim_paper_name,
+            original_sentence_index):
         if sentence_0_list_new_cursor != sentence_0_dan:
-            similar_content_control.append([[paper_dict_dan_id, sentence_0_dan, sentence_1_dan, sim_paper_name_dan, original_sentence_index_dan]])
+            similar_content_control.append(
+                [[paper_dict_dan_id, sentence_0_dan, sentence_1_dan, sim_paper_name_dan, original_sentence_index_dan]])
             sentence_0_list_new_cursor = sentence_0_dan
         else:
-            similar_content_control[-1].append([paper_dict_dan_id, sentence_0_dan, sentence_1_dan, sim_paper_name_dan, original_sentence_index_dan])
+            similar_content_control[-1].append(
+                [paper_dict_dan_id, sentence_0_dan, sentence_1_dan, sim_paper_name_dan, original_sentence_index_dan])
+    return similar_content_control, paper_dict
+
+
+def accurate_check_rouge(
+        title,
+        author,
+        text_paper,
+        recall_data_list
+):
+    '''
+    Precise duplicate check: find the similar sentences
+    :param text:
+    :param recall_data_list: list [[sentence, filename],[sentence, filename],[sentence, filename]]
+    :return:
+    '''
+    # text preprocessing
+    # centent_list = []
+    print("text_paper", len(text_paper))
+    text_paper = str(text_paper).replace("。\n", "。")
+    centent_list_old = text_paper.split("。")
+
+    sentence_word_nums = 0
+
+    centent_list = []
+    for i in centent_list_old:
+        if i == "":  # skip empty fragments before appending
+            continue
+        if len(i) < 300:
+            centent_list.append(i + "。")
+
+    centent_list_zong = split_chapter(centent_list)
+
+    # split into chapters
+
+    similar_content_control_zong = []
+    paper_dict_zong = []
+    num_words_zong = []
+    chapter_data = []
+    index_centent_list = []
+
+    for index_centent_list_zong in range(len(centent_list_zong)):
+        dan_chapter_data, dan_chapter_num_words = centent_list_zong[index_centent_list_zong][0], centent_list_zong[index_centent_list_zong][1]
 
-    paper_data = check_dict(similar_content_control, paper_dict, num_words, title, author, centent_list)
+        similar_content_control, paper_dict = chapter_check(dan_chapter_data, recall_data_list)
+        similar_content_control_zong.append(similar_content_control)
+        paper_dict_zong.append(paper_dict)
+        num_words_zong.append(dan_chapter_num_words)
+        chapter_data.append(dan_chapter_data)
+        index_centent_list.append(index_centent_list_zong)
+
+    paper_data = check_dict(similar_content_control_zong, paper_dict_zong, num_words_zong, title, author, chapter_data, index_centent_list)
 
     # data = [similar_content_control]
     #
     # # simulate multiple chapters
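The cursor-based grouping loop in `chapter_check` above merges consecutive rows that share the same `sentence_0` text into one `similar_content_control` group. A minimal standalone equivalent (my restatement; row layout follows the five-element rows built in the diff) shows why the cursor must start at a sentinel such as `None` rather than being left undefined:

```python
# Minimal equivalent of the cursor-based grouping: consecutive rows with the
# same sentence_0 are merged into one group. Starting the cursor at None makes
# the first row open a new group instead of raising a NameError.
def group_rows(rows):
    groups, cursor = [], None
    for row in rows:
        sentence_0 = row[1]  # row = [id, sentence_0, sentence_1, sim_name, index]
        if sentence_0 != cursor:
            groups.append([row])      # new sentence -> open a new group
            cursor = sentence_0
        else:
            groups[-1].append(row)    # same sentence -> extend the last group
    return groups

print(group_rows([[0, "A", "x", "f", 0], [1, "A", "y", "g", 1], [2, "B", "z", "f", 2]]))
# -> [[[0, 'A', ...], [1, 'A', ...]], [[2, 'B', ...]]]
```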
@@ -833,7 +886,7 @@ def biaohong(bool_check_sentense, data_zong, df_train_nuoche):
 
         for j in bool_check_sentense[i]:
             # print("j", j)
-            if j[0] + 1 >= len(data_zong):
+            if j[0] + 1 > len(data_zong):
                 tiaochu = True
                 break
 
@@ -864,6 +917,36 @@ def biaohong(bool_check_sentense, data_zong, df_train_nuoche):
             #                          [bool_check_sentense[i+1][j][1] - 1, bool_check_sentense[i+1][j][1], bool_check_sentense[i+1][j][1] + 1]])
             # biao_red.append(biao_red_dan)
 
+        elif i == len(bool_check_sentense) - 1:
+            # last element: build the index window without running past data_zong
+            if end == bool_check_sentense[i][0][0]:
+                i += 1
+                break
+            elif bool_check_sentense[i][0][0] - 1 == end + 1 and bool_check_sentense[i][0][0] == len(data_zong) - 1:
+                index_list = [ii for ii in range(bool_check_sentense[i][0][0] - 1, bool_check_sentense[i][0][0] + 1)]
+            elif bool_check_sentense[i][0][0] - 1 == end and bool_check_sentense[i][0][0] == len(data_zong) - 1:
+                index_list = [ii for ii in range(bool_check_sentense[i][0][0], bool_check_sentense[i][0][0] + 1)]
+            elif bool_check_sentense[i][0][0] - 1 > end + 1 and bool_check_sentense[i][0][0] == len(data_zong) - 1:
+                index_list = [ii for ii in range(bool_check_sentense[i][0][0] - 1, bool_check_sentense[i][0][0] + 1)]
+            else:
+                index_list = [ii for ii in range(bool_check_sentense[i][0][0] - 1, bool_check_sentense[i][0][0] + 2)]
+
+            biaohongset = set()
+            biao_red_dan = []
+            for j in range(len(bool_check_sentense[i])):  # bool_check_sentense: [[[1, 223],[1, 226], [1, 562]],[[2, 243],[2, 226], [2, 561]]]
+                if bool_check_sentense[i][j][1] not in biaohongset:
+                    biao_red_dan.append([index_list,
+                                         [bool_check_sentense[i][j][1] - 1, bool_check_sentense[i][j][1],
+                                          bool_check_sentense[i][j][1] + 1]])
+                    biaohongset.add(bool_check_sentense[i][j][1] - 1)
+                    biaohongset.add(bool_check_sentense[i][j][1])
+                    biaohongset.add(bool_check_sentense[i][j][1] + 1)
+                else:
+                    continue
+
+            i += 1
+            biao_red.append(biao_red_dan)
+            break
 
         elif bool_check_sentense[i][0][0] - 1 == start:
             i += 1
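The new last-element branch in `biaohong()` exists to clamp the red-mark index window so it never reaches past the end of `data_zong`. A simplified sketch of just that clamping idea (it deliberately ignores the overlap-with-the-previous-group bookkeeping via `end` that the real branch also handles):

```python
# Simplified view of the window built around a matched sentence i: normally
# [i-1, i, i+1], but clamped at both ends of the document.
def neighbor_window(i, n):
    lo = max(i - 1, 0)
    hi = min(i + 1, n - 1)
    return list(range(lo, hi + 1))

print(neighbor_window(9, 10))  # last sentence  -> [8, 9]
print(neighbor_window(0, 10))  # first sentence -> [0, 1]
print(neighbor_window(5, 10))  # interior       -> [4, 5, 6]
```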
open("data/rell_json.txt") as f: + # recall_data_list_dict = eval(f.read()) + # recall_data_list = ulit_recall_paper(recall_data_list_dict) - # recall_data_list = pd.read_csv("data/10235513_大型商业建筑人员疏散设计研究_沈福禹/查重.csv", encoding="utf-8").values.tolist() + print("文章格式转化完成") - # 进入精确查重系统 - print("进入精确查重系统") + # recall_data_list = pd.read_csv("data/10235513_大型商业建筑人员疏散设计研究_沈福禹/查重.csv", encoding="utf-8").values.tolist() - return_list = accurate_check_rouge(title, author, content, recall_data_list) + # 进入精确查重系统 + print("进入精确查重系统") - return_text = {"resilt": return_list, "probabilities": None, "status_code": 200} + return_list = accurate_check_rouge(title, author, content, recall_data_list) - load_result_path = "./new_data_logs/{}.json".format(queue_uuid) + return_text = {"resilt": return_list, "probabilities": None, "status_code": 200} - print("queue_uuid: ", queue_uuid) - print("load_result_path: ", load_result_path) + load_result_path = "./new_data_logs/{}.json".format(queue_uuid) - with open(load_result_path, 'w', encoding='utf8') as f2: - # ensure_ascii=False才能输入中文,否则是Unicode字符 - # indent=2 JSON数据的缩进,美观 - json.dump(return_text, f2, ensure_ascii=False, indent=4) + print("queue_uuid: ", queue_uuid) + print("load_result_path: ", load_result_path) - print(queue_uuid) - print(load_result_path) - redis_.set(queue_uuid, load_result_path, 86400) - redis_.srem(db_key_querying, queue_uuid) - except: - return_text = {"resilt": "", "probabilities": None, "status_code": 401} - load_result_path = "./new_data_logs/{}.json".format(queue_uuid) - - print("queue_uuid: ", queue_uuid) - print("load_result_path: ", load_result_path) - - with open(load_result_path, 'w', encoding='utf8') as f2: - # ensure_ascii=False才能输入中文,否则是Unicode字符 - # indent=2 JSON数据的缩进,美观 - json.dump(return_text, f2, ensure_ascii=False, indent=4) + with open(load_result_path, 'w', encoding='utf8') as f2: + # ensure_ascii=False才能输入中文,否则是Unicode字符 + # indent=2 JSON数据的缩进,美观 + json.dump(return_text, f2, ensure_ascii=False, indent=4) - print(queue_uuid) - print(load_result_path) - redis_.set(queue_uuid, load_result_path, 86400) - redis_.srem(db_key_querying, queue_uuid) + print(queue_uuid) + print(load_result_path) + redis_.set(queue_uuid, load_result_path, 86400) + redis_.srem(db_key_querying, queue_uuid) + # except: + # return_text = {"resilt": "", "probabilities": None, "status_code": 401} + # load_result_path = "./new_data_logs/{}.json".format(queue_uuid) + # + # print("queue_uuid: ", queue_uuid) + # print("load_result_path: ", load_result_path) + # + # with open(load_result_path, 'w', encoding='utf8') as f2: + # # ensure_ascii=False才能输入中文,否则是Unicode字符 + # # indent=2 JSON数据的缩进,美观 + # json.dump(return_text, f2, ensure_ascii=False, indent=4) + # + # print(queue_uuid) + # print(load_result_path) + # redis_.set(queue_uuid, load_result_path, 86400) + # redis_.srem(db_key_querying, queue_uuid) @app.route("/", methods=["POST"])