diff --git a/flask_check_bert_test.py b/flask_check_bert_test.py index c86d33a..584bae8 100644 --- a/flask_check_bert_test.py +++ b/flask_check_bert_test.py @@ -182,7 +182,7 @@ def similar_content_func(): }] -def original_text_contrast_func(data_sentence_dan, paper_dict): +def original_text_contrast_func(data_sentence_dan, paper_dict, centent_list): ''' 重复的对比详细信息 :param similar_content: @@ -241,9 +241,26 @@ def original_text_contrast_func(data_sentence_dan, paper_dict): similar_content.append(similar_content_dan) original_text_list = list(data_sentence_dan[0][1]) - original_text_list.insert(end, "") - original_text_list.insert(start, "") - original_text = "此处有 {} 字相似\n".format(str(end - start)) + "".join(original_text_list) + # original_text_list.insert(end, "\n") + # original_text_list.insert(start, "\n") + target_text_str = "".join(["\n"] + original_text_list[start: end] + ["\n"]) + + original_text_start = "".join(original_text_list[:start]) + original_text_end = "".join(original_text_list[end:]) + + if data_sentence_dan[0][4][0]-1 < 0: + start_sen = "" + else: + start_sen = centent_list[data_sentence_dan[0][4][0]-1] + + if data_sentence_dan[0][4][0]+1 > len(centent_list) -1: + end_sen = "" + else: + end_sen = centent_list[data_sentence_dan[0][4][2]+1] + + start_sen = start_sen + original_text_start + end_sen = original_text_end + end_sen + original_text = "此处有 {} 字相似\n".format(str(end - start)) + start_sen[-60:] + target_text_str + end_sen[:60] return_info = { "original_text": original_text, @@ -411,7 +428,7 @@ def section_data_func(section_details): } -def section_details_func(data_section_dan, paper_dict, num_words): +def section_details_func(data_section_dan, paper_dict, num_words, centent_list): ''' 章节详细信息 :param original_text_contrast: @@ -428,12 +445,12 @@ def section_details_func(data_section_dan, paper_dict, num_words): original_text_list = [] for sentence_dan in data_section_dan: - original_text_contrast_dan = original_text_contrast_func(sentence_dan, paper_dict) + original_text_contrast_dan = original_text_contrast_func(sentence_dan, paper_dict, centent_list) original_text_contrast.append(original_text_contrast_dan) repeat_words += original_text_contrast_dan["dan_sentence_word_nums"] original_text_list.append(original_text_contrast_dan["original_text"]) - original_text = "。".join(original_text_list) + original_text = "".join(original_text_list) repeat_rate = (repeat_words / section_words)* 100 repeat_rate = str(round(repeat_rate, 1)) + "%" @@ -453,7 +470,7 @@ def section_details_func(data_section_dan, paper_dict, num_words): } -def check_dict(similar_content_control, paper_dict, num_words, title, author): +def check_dict(similar_content_control, paper_dict, num_words, title, author, centent_list): ''' 生成返回字典 :param similar_content_control: @@ -472,7 +489,7 @@ def check_dict(similar_content_control, paper_dict, num_words, title, author): data_section_dan = data_dan # 章节详细信息 - section_details = section_details_func(data_section_dan, paper_dict, num_words) + section_details = section_details_func(data_section_dan, paper_dict, num_words, centent_list) section_details_list.append(section_details) # 模拟多个章节 @@ -612,7 +629,9 @@ def accurate_check_rouge( for i in centent_list_old: num_words += len(i) if len(i) < 300: - centent_list.append(i) + centent_list.append(i + "。") + if i == "": + continue for i in range(len(centent_list)): text = centent_list[i] rst.append(rouge_pre_m(text, recall_data_list)) @@ -655,23 +674,51 @@ def accurate_check_rouge( print("bool_check_sentense", bool_check_sentense) print("找出相似的句子序号完成") + biao_red = biaohong(bool_check_sentense, data_zong, recall_data_list) # [[[[0, 1, 2], [479, 480, 481]],[[0, 1, 2], [471, 472, 473]]], [[[3, 4, 5], [481, 482, 483]], [[3, 4, 5], [461, 462, 463]]]] print("biao_red", str(biao_red)) + + original_sentence_index = [] + # for i in biao_red: + # for j in i: + # original_sentence_index.append(j[0]) + sentence_0_list = [] sentence_1_list = [] sim_paper_name = [] for i in range(len(biao_red)): for j in range(len(biao_red[i])): - if recall_data_list[biao_red[i][j][1][0]][1] == recall_data_list[biao_red[i][j][1][1]][1] == recall_data_list[biao_red[i][j][1][2]][1]: - sentence_0_list.append("。".join([centent_list[biao_red[i][j][0][0]], centent_list[biao_red[i][j][0][1]], centent_list[biao_red[i][j][0][2]]])) - sentence_1_list.append( - "".join([recall_data_list[biao_red[i][j][1][0]][0], recall_data_list[biao_red[i][j][1][1]][0], recall_data_list[biao_red[i][j][1][2]][0]])) - sim_paper_name.append(recall_data_list[biao_red[i][j][1][0]][1]) - else: - continue + # if recall_data_list[biao_red[i][j][1][0]][1] == recall_data_list[biao_red[i][j][1][1]][1] == recall_data_list[biao_red[i][j][1][2]][1]: + # sentence_0_list.append("".join([centent_list[biao_red[i][j][0][0]], centent_list[biao_red[i][j][0][1]], centent_list[biao_red[i][j][0][2]]])) + # sentence_1_list.append( + # "".join([recall_data_list[biao_red[i][j][1][0]][0], recall_data_list[biao_red[i][j][1][1]][0], recall_data_list[biao_red[i][j][1][2]][0]])) + # sim_paper_name.append(recall_data_list[biao_red[i][j][1][0]][1]) + # else: + # continue + + + file_name = recall_data_list[biao_red[i][j][1][1]][1] + sentence_0_list_dan = [] + sentence_1_list_dan = [] + sentence_0_list_dan_index = [] + houxuna_file_list = [ + [recall_data_list[biao_red[i][j][1][0]][1], centent_list[biao_red[i][j][0][0]], recall_data_list[biao_red[i][j][1][0]][0]], + [recall_data_list[biao_red[i][j][1][1]][1], centent_list[biao_red[i][j][0][1]], recall_data_list[biao_red[i][j][1][1]][0]], + [recall_data_list[biao_red[i][j][1][2]][1], centent_list[biao_red[i][j][0][2]], recall_data_list[biao_red[i][j][1][2]][0]] + ] + for dan_sen_info in houxuna_file_list: + sentence_0_list_dan.append(dan_sen_info[1]) + if dan_sen_info[0] == file_name: + sentence_1_list_dan.append(dan_sen_info[2]) + + sentence_0_list.append("".join(sentence_0_list_dan)) + sentence_1_list.append("".join(sentence_1_list_dan)) + original_sentence_index.append(biao_red[i][j][0]) + sim_paper_name.append(recall_data_list[biao_red[i][j][1][1]][1]) + print("待标红句子筛选完成") sentence_0_list_new = [] @@ -713,18 +760,19 @@ def accurate_check_rouge( sentence_0_list_new_cursor = sentence_0_list_new[0] - for paper_dict_dan_id, sentence_0_dan, sentence_1_dan, sim_paper_name_dan in zip(range(len(paper_dict)), + for paper_dict_dan_id, sentence_0_dan, sentence_1_dan, sim_paper_name_dan, original_sentence_index_dan in zip(range(len(paper_dict)), sentence_0_list_new, sentence_1_list_new, - sim_paper_name): + sim_paper_name, + original_sentence_index): if sentence_0_list_new_cursor != sentence_0_dan: - similar_content_control.append([[paper_dict_dan_id, sentence_0_dan, sentence_1_dan, sim_paper_name_dan]]) + similar_content_control.append([[paper_dict_dan_id, sentence_0_dan, sentence_1_dan, sim_paper_name_dan, original_sentence_index_dan]]) sentence_0_list_new_cursor = sentence_0_dan else: - similar_content_control[-1].append([paper_dict_dan_id, sentence_0_dan, sentence_1_dan, sim_paper_name_dan]) + similar_content_control[-1].append([paper_dict_dan_id, sentence_0_dan, sentence_1_dan, sim_paper_name_dan, original_sentence_index_dan]) - paper_data = check_dict(similar_content_control, paper_dict, num_words, title, author) + paper_data = check_dict(similar_content_control, paper_dict, num_words, title, author, centent_list) # data = [similar_content_control] # # # 模拟多个章节 @@ -1227,40 +1275,55 @@ def classify_accurate_check(): goodsId = data_dict['goodsId'] callbackUrl = data_dict['callbackUrl'] + try: + print("查找相似的50篇完成") + print(len(content)) + + # with open("data/rell_json.txt") as f: + # recall_data_list_dict = eval(f.read()) + # recall_data_list = ulit_recall_paper(recall_data_list_dict) + print("文章格式转化完成") - print("查找相似的50篇完成") - print(len(content)) + # recall_data_list = pd.read_csv("data/10235513_大型商业建筑人员疏散设计研究_沈福禹/查重.csv", encoding="utf-8").values.tolist() - # with open("data/rell_json.txt") as f: - # recall_data_list_dict = eval(f.read()) - # recall_data_list = ulit_recall_paper(recall_data_list_dict) + # 进入精确查重系统 + print("进入精确查重系统") - print("文章格式转化完成") + return_list = accurate_check_rouge(title, author, content, recall_data_list) - # recall_data_list = pd.read_csv("data/10235513_大型商业建筑人员疏散设计研究_沈福禹/查重.csv", encoding="utf-8").values.tolist() + return_text = {"resilt": return_list, "probabilities": None, "status_code": 200} - # 进入精确查重系统 - print("进入精确查重系统") + load_result_path = "./new_data_logs/{}.json".format(queue_uuid) - return_list = accurate_check_rouge(title, author, content, recall_data_list) + print("queue_uuid: ", queue_uuid) + print("load_result_path: ", load_result_path) - return_text = {"resilt": return_list, "probabilities": None, "status_code": 200} + with open(load_result_path, 'w', encoding='utf8') as f2: + # ensure_ascii=False才能输入中文,否则是Unicode字符 + # indent=2 JSON数据的缩进,美观 + json.dump(return_text, f2, ensure_ascii=False, indent=4) - load_result_path = "./new_data_logs/{}.json".format(queue_uuid) + print(queue_uuid) + print(load_result_path) + redis_.set(queue_uuid, load_result_path, 86400) + redis_.srem(db_key_querying, queue_uuid) + except: + return_text = {"resilt": "", "probabilities": None, "status_code": 401} + load_result_path = "./new_data_logs/{}.json".format(queue_uuid) - print("queue_uuid: ", queue_uuid) - print("load_result_path: ", load_result_path) + print("queue_uuid: ", queue_uuid) + print("load_result_path: ", load_result_path) - with open(load_result_path, 'w', encoding='utf8') as f2: - # ensure_ascii=False才能输入中文,否则是Unicode字符 - # indent=2 JSON数据的缩进,美观 - json.dump(return_text, f2, ensure_ascii=False, indent=4) + with open(load_result_path, 'w', encoding='utf8') as f2: + # ensure_ascii=False才能输入中文,否则是Unicode字符 + # indent=2 JSON数据的缩进,美观 + json.dump(return_text, f2, ensure_ascii=False, indent=4) - print(queue_uuid) - print(load_result_path) - redis_.set(queue_uuid, load_result_path, 86400) - redis_.srem(db_key_querying, queue_uuid) + print(queue_uuid) + print(load_result_path) + redis_.set(queue_uuid, load_result_path, 86400) + redis_.srem(db_key_querying, queue_uuid) @app.route("/", methods=["POST"])