diff --git a/flask_check_bert.py b/flask_check_bert.py
index 267dd7d..30b398a 100644
--- a/flask_check_bert.py
+++ b/flask_check_bert.py
@@ -4,7 +4,7 @@ from numpy.linalg import norm
 import pandas as pd
 # from rouge import Rouge
 from rouge_chinese import Rouge
-from Rouge_w import Rouge_w,Rouge_l
+from Rouge_w import Rouge_w, Rouge_l
 import json
 import pymysql
 import re
@@ -15,6 +15,8 @@ import uuid
 import time
 import redis
 from threading import Thread
+from multiprocessing import Pool
+
 app = Flask(__name__)
 app.config["JSON_AS_ASCII"] = False
 
@@ -25,7 +27,7 @@ db_key_query = 'query'
 db_key_querying = 'querying'
 db_key_queryset = 'queryset'
 
-nums_cpus = 16
+nums_cpus = 24
 rouge = Rouge()
 rouge_model = Rouge_w()
 rouge_l_model = Rouge_l()
@@ -65,7 +67,6 @@ def bert_check(text, recall_data_list):
     return return_list
 
-
 def rouge_value_self(data_1, data_2):
     data_1 = [' '.join(i) for i in data_1]
     data_2 = [' '.join(i) for i in data_2]
@@ -81,7 +82,6 @@ def rouge_value_self(data_1, data_2):
 
 
 def rouge_pre(text, df_train_nuoche):
-
     return_list = []
     index_rouge_list = []
     text_list = [text] * len(df_train_nuoche)
@@ -100,12 +100,307 @@ def rouge_pre(text, df_train_nuoche):
     return return_list
 
 
+def rouge_pre_m(text, df_train_nuoche):
+    return_list = []
+    index_rouge_list = []
+
+    text_list = [text] * len(df_train_nuoche)
+
+    data_list = []
+    for data_dan in df_train_nuoche:
+        data_list.append(data_dan[0])
+    rouge_1, rouge_2, rouge_l = rouge_value_self(text_list, data_list)
+    index_rouge_list.extend(rouge_l)
+
+    re1 = [(i[0], i[1]) for i in sorted(list(enumerate(index_rouge_list)), key=lambda x: x[1], reverse=True)]
+
+    return_list.extend(re1)
+
+    return return_list
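Compared with `rouge_pre`, which keeps only the single best match, the new `rouge_pre_m` returns the full list of `(corpus_index, rouge_l_score)` pairs sorted by score in descending order; this is what later lets the caller collect every candidate above the 0.47 threshold instead of one per sentence. A toy illustration of that ranking step (invented scores, standard library only):

    # Toy ranking step (hypothetical scores): one ROUGE-L score per corpus sentence.
    scores = [0.12, 0.51, 0.08]
    pairs = sorted(enumerate(scores), key=lambda x: x[1], reverse=True)
    assert pairs == [(1, 0.51), (0, 0.12), (2, 0.08)]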
similar_content_dan["author"], similar_content_dan["degree"], + similar_content_dan["year"]]) + similar_content_dan["thesis_info"] = thesis_info + + similar_content.append(similar_content_dan) + + original_text_list = list(data_sentence_dan[0][1]) + original_text_list.insert(end, "") + original_text_list.insert(start, "") + original_text = "".join(original_text_list) + + return_info = { + "original_text": original_text, + "dan_sentence_word_nums": end - start, + "similar_content": similar_content + } + return return_info + + +def repeat_quote_info_func(original_text_contrast): + ''' + 重复的引用信息 + :return: + ''' + chongfuwendang = {} + + for sentence_dan in original_text_contrast: + for i in sentence_dan["similar_content"]: + thesis_info = i["thesis_info"] + if thesis_info not in chongfuwendang: + chongfuwendang[thesis_info] = { + "quote": False, + "thesis_author": i["author"], + "thesis_date": i["year"], + "thesis_info": thesis_info, + "thesis_repeat_rate": (i["paper_red_len_word"] / i["paper_len_word"]) * 100, + # round(repetition_rate, 3) * 100 + "thesis_title": i["title"], + "thesis_link": "", + "thesis_publish": i["degree"], + "thesis_repeat_word": i["paper_red_len_word"], + "thesis_teacher": "", + "paper_len_word": i["paper_len_word"] + } + else: + chongfuwendang[thesis_info]["thesis_repeat_word"] += i["paper_red_len_word"] + chongfuwendang[thesis_info]["thesis_repeat_rate"] = (chongfuwendang[thesis_info]["thesis_repeat_word"] / + chongfuwendang[thesis_info][ + "paper_len_word"]) * 100 + chongfuwendang = sorted(chongfuwendang.items(), + key=lambda x: x[1]["thesis_repeat_rate"], reverse=False) + chongfuwendang_list = [i[1] for i in chongfuwendang] + + return chongfuwendang_list + + +def total_data_func(section_data_list): + ''' + 总体数据 + :return: + ''' + # "end_page_index": 0, + # "name": "第1部分", + # "repeat_rate": repeat_rate, + # "repeat_words": repeat_words, + # "start_page_index": 0, + # "words": section_words, + # "original_text": original_text, + # "original_text_oneself": original_text, + # "original_text_contrast/重复的对比详细信息": original_text_contrast, + # "repeat_quote_info/重复的引用信息": repeat_quote_info + + repeat_words = 0 + words = 0 + + for i in section_data_list: + repeat_words += i["repeat_words"] + words += i["words"] + + exclude_personal_rate = str(repeat_words / words * 100) + "%" + exclude_quote_rate = str(repeat_words / words * 100) + "%" + single_max_rate = section_data_list[0]["repeat_quote_info"][0]["thesis_repeat_rate"] + single_max_repeat_words = section_data_list[0]["repeat_quote_info"][0]["thesis_repeat_word"] + total_repeat_rate = str(repeat_words / words * 100) + "%" + total_repeat_words = repeat_words + total_words = words + + return { + "back_repeat_words": "", + "exclude_personal_rate": exclude_personal_rate, + "exclude_quote_rate": exclude_quote_rate, + "front_repeat_words": "", + "single_max_rate": single_max_rate, + "single_max_repeat_words": single_max_repeat_words, + "suspected_paragraph": "", + "suspected_paragraph_max_repeat_words": "", + "suspected_paragraph_min_repeat_words": "", + "total_paragraph": "", + "total_repeat_rate": total_repeat_rate, + "total_repeat_words": total_repeat_words, + "total_words": total_words, + "tables": 0 + } + + +def section_data_func_dan(): + ''' + 章节信息单个 + :return: + ''' + # { + # "section_name": "章节名称", + # "section_repeat_rate": "重复率", + # "section_repeat_words": "重复字数", + # "section_words": "章节字数", + # "oneself_repeat_words": "去除本人后重复字数", + # "reference_repeat_words": "去除引用后重复字数", + # "section_oneself_rate": "去除本人后重复率" + # 
+
+
+def total_data_func(section_data_list):
+    '''
+    Overall statistics
+    :return:
+    '''
+    # "end_page_index": 0,
+    # "name": "第1部分",
+    # "repeat_rate": repeat_rate,
+    # "repeat_words": repeat_words,
+    # "start_page_index": 0,
+    # "words": section_words,
+    # "original_text": original_text,
+    # "original_text_oneself": original_text,
+    # "original_text_contrast/detailed duplicate comparison": original_text_contrast,
+    # "repeat_quote_info/duplicated reference info": repeat_quote_info
+
+    repeat_words = 0
+    words = 0
+
+    for i in section_data_list:
+        repeat_words += i["repeat_words"]
+        words += i["words"]
+
+    exclude_personal_rate = str(repeat_words / words * 100) + "%"
+    exclude_quote_rate = str(repeat_words / words * 100) + "%"
+    single_max_rate = section_data_list[0]["repeat_quote_info"][0]["thesis_repeat_rate"]
+    single_max_repeat_words = section_data_list[0]["repeat_quote_info"][0]["thesis_repeat_word"]
+    total_repeat_rate = str(repeat_words / words * 100) + "%"
+    total_repeat_words = repeat_words
+    total_words = words
+
+    return {
+        "back_repeat_words": "",
+        "exclude_personal_rate": exclude_personal_rate,
+        "exclude_quote_rate": exclude_quote_rate,
+        "front_repeat_words": "",
+        "single_max_rate": single_max_rate,
+        "single_max_repeat_words": single_max_repeat_words,
+        "suspected_paragraph": "",
+        "suspected_paragraph_max_repeat_words": "",
+        "suspected_paragraph_min_repeat_words": "",
+        "total_paragraph": "",
+        "total_repeat_rate": total_repeat_rate,
+        "total_repeat_words": total_repeat_words,
+        "total_words": total_words,
+        "tables": 0
+    }
+
+
+def section_data_func_dan():
+    '''
+    Info for a single section
+    :return:
+    '''
+    # {
+    #     "section_name": "section name",
+    #     "section_repeat_rate": "repeat rate",
+    #     "section_repeat_words": "number of repeated words",
+    #     "section_words": "section word count",
+    #     "oneself_repeat_words": "repeated words after removing the author's own papers",
+    #     "reference_repeat_words": "repeated words after removing quotations",
+    #     "section_oneself_rate": "repeat rate after removing the author's own papers"
+    # }
+
+    return {
+        "section_name": "",
+        "section_repeat_rate": "",
+        "section_repeat_words": "",
+        "section_words": "",
+        "oneself_repeat_words": "",
+        "reference_repeat_words": "",
+        "section_oneself_rate": ""
+    }
+
+
+def section_data_func(section_details):
+    '''
+    Section info
+    :return:
+    '''
+    # "end_page_index": 0,
+    # "name": "第1部分",
+    # "repeat_rate": repeat_rate,
+    # "repeat_words": repeat_words,
+    # "start_page_index": 0,
+    # "words": section_words,
+    # "original_text": original_text,
+    # "original_text_oneself": original_text,
+    # "original_text_contrast/detailed duplicate comparison": original_text_contrast,
+    # "repeat_quote_info/duplicated reference info": repeat_quote_info
+
+    section_name = section_details["name"]
+    section_repeat_rate = section_details["repeat_rate"]
+    section_repeat_words = section_details["repeat_words"]
+    section_words = section_details["words"]
+    oneself_repeat_words = section_details["repeat_words"]
+    reference_repeat_words = section_details["repeat_words"]
+    section_oneself_rate = section_details["repeat_rate"]
+
+    return {
+        "section_name": section_name,
+        "section_repeat_rate": section_repeat_rate,
+        "section_repeat_words": section_repeat_words,
+        "section_words": section_words,
+        "oneself_repeat_words": oneself_repeat_words,
+        "reference_repeat_words": reference_repeat_words,
+        "section_oneself_rate": section_oneself_rate
+    }
+
+
+def section_details_func(data_section_dan, paper_dict):
+    '''
+    Detailed section info
+    :param original_text_contrast:
+    :param repeat_quote_info:
+    :return:
+    '''
+    original_text_contrast = []
+    section_repeat_rate = ""
+    repeat_words = 0
+    section_words = 0
+    oneself_repeat_words = ""
+    reference_repeat_words = ""
+    section_oneself_rate = ""
+    original_text_list = []
+
+    for sentence_dan in data_section_dan:
+        original_text_contrast_dan = original_text_contrast_func(sentence_dan, paper_dict)
+        original_text_contrast.append(original_text_contrast_dan)
+        repeat_words += original_text_contrast_dan["dan_sentence_word_nums"]
+        original_text_list.append(original_text_contrast_dan["original_text"])
+        section_words += len(sentence_dan[0][1])
+
+    original_text = "。".join(original_text_list)
+    repeat_rate = repeat_words / section_words
+
+    repeat_quote_info = repeat_quote_info_func(original_text_contrast)
+
+    return {
+        "end_page_index": 0,
+        "name": "第1部分",
+        "repeat_rate": repeat_rate,
+        "repeat_words": repeat_words,
+        "start_page_index": 0,
+        "words": section_words,
+        "original_text": original_text,
+        "original_text_oneself": original_text,
+        "original_text_contrast": original_text_contrast,
+        "repeat_quote_info": repeat_quote_info
+    }
+
+
 def accurate_check_rouge(
         title,
         author,
         text_paper,
         recall_data_list
-    ):
+):
     '''
     Precise duplicate check: find similar sentences
     :param text:
     :return:
     '''
@@ -120,9 +415,23 @@
     sentence_word_nums = 0
 
     # ROUGE-based duplicate check
-    for text in centent_list:
-        rouge_pre_list = rouge_pre(text, recall_data_list)
-        data_zong.append(rouge_pre_list)
+    rst = []
+    p = Pool(nums_cpus)  # the pool holds nums_cpus worker processes
+
+    print("centent_list", centent_list)
+
+    for i in range(len(centent_list)):
+        text = centent_list[i]
+        a = p.apply_async(rouge_pre_m, args=(text, recall_data_list,))
+        rst.append(a)
+    p.close()
+    p.join()  # wait for all workers to finish; close() must be called before join(), and no new tasks can be added after close()
+
+    rst = [i.get() for i in rst]
+
+    for i in range(len(rst)):
+        print(rst[i])
+        data_zong.append(rst[i])
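The `Pool` block above is the standard `apply_async` fan-out: submit one task per sentence, `close()` the pool, `join()` it, then collect results with `.get()`, which preserves submission order. A self-contained sketch of the same pattern with a stand-in worker (`square` is hypothetical, standing in for `rouge_pre_m`):

    from multiprocessing import Pool

    def square(x):  # stand-in for rouge_pre_m(text, recall_data_list)
        return x * x

    if __name__ == "__main__":
        p = Pool(4)
        handles = [p.apply_async(square, args=(n,)) for n in range(8)]
        p.close()                              # no further tasks may be submitted
        p.join()                               # block until every worker finishes
        results = [h.get() for h in handles]   # .get() preserves submission order
        print(results)                         # [0, 1, 4, 9, 16, 25, 36, 49]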
 
     t0 = time.time()
     # BERT-based duplicate check
@@ -132,7 +441,6 @@
     t1 = time.time()
 
     original_dict = []
-
     # find the indices of similar sentences
     bool_check_sentense = []
     # BERT algorithm
@@ -142,12 +450,14 @@
     # ROUGE algorithm
     for i in range(len(data_zong)):
-        if data_zong[i][0] > 0.47:
-            bool_check_sentense.append([i,data_zong[i][1]])
-    biao_red = biaohong(bool_check_sentense, data_zong, recall_data_list)  # [[[0, 1, 2], [479, 480, 481]], [[3, 4, 5], [481, 482, 483]], [[6, 7, 8], [484, 485, 486]]]
-
-    print("bert精确查重时间", t1-t0)
+        for j in range(len(data_zong[i])):
+            if data_zong[i][j][1] > 0.47:
+                bool_check_sentense.append([i, data_zong[i][j][0]])
+    biao_red = biaohong(bool_check_sentense, data_zong,
+                        recall_data_list)  # [[[0, 1, 2], [479, 480, 481]], [[3, 4, 5], [481, 482, 483]], [[6, 7, 8], [484, 485, 486]]]
 
+    print("bert精确查重时间", t1 - t0)
+    print(biao_red)
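Because each row of `data_zong` is now a ranked list, the threshold pass nests one level deeper and may emit several `[sentence_index, corpus_index]` pairs per paper sentence, where the old code kept at most one. A toy shape-check with invented scores:

    # Toy shape check (hypothetical scores): row i holds (corpus_index, score) pairs.
    data_zong = [[(7, 0.80), (3, 0.52), (9, 0.11)],   # paper sentence 0
                 [(2, 0.30)]]                         # paper sentence 1
    pairs = [[i, idx] for i, row in enumerate(data_zong)
             for idx, score in row if score > 0.47]
    assert pairs == [[0, 7], [0, 3]]                  # sentence 1 has no hit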
sim_paper_name_dan["author"] - original_text_contrast_dict["similar_content"][0]["degree"] = sim_paper_name_dan["degree"] - original_text_contrast_dict["similar_content"][0]["year"] = sim_paper_name_dan["year"] - original_text_contrast_dict["similar_content"][0]["thesis_info"] = thesis_info - - original_text_contrast.append(original_text_contrast_dict) - - # for i in repeat_quote_info: - # if - - if thesis_info not in chongfuwendang: - chongfuwendang[thesis_info] = { - "quote": False, - "thesis_author": sim_paper_name_dan["author"], - "thesis_date" : sim_paper_name_dan["year"], - "thesis_info" : thesis_info, - "thesis_repeat_rate": (dan_sentence_word_nums/sim_paper_name_dan["paper_len_word"]) * 100, #round(repetition_rate, 3) * 100 - "thesis_title": sim_paper_name_dan["title"], - "thesis_link": "", - "thesis_publish": sim_paper_name_dan["degree"], - "thesis_repeat_word": dan_sentence_word_nums, - "thesis_teacher": "", - "paper_len_word": sim_paper_name_dan["paper_len_word"] - } + sentence_0_list_new_cursor = sentence_0_list_new[0] + for paper_dict_dan_id, sentence_0_dan, sentence_1_dan, sim_paper_name_dan in zip(range(len(paper_dict)), + sentence_0_list_new, + sentence_1_list_new, + sim_paper_name): + + if sentence_0_list_new_cursor != sentence_0_dan: + similar_content_control.append([[paper_dict_dan_id, sentence_0_dan, sentence_1_dan, sim_paper_name_dan]]) else: - chongfuwendang[thesis_info]["thesis_repeat_word"] += dan_sentence_word_nums - chongfuwendang[thesis_info]["thesis_repeat_rate"] = (chongfuwendang[thesis_info]["thesis_repeat_word"]/chongfuwendang[thesis_info]["paper_len_word"]) * 100 + similar_content_control[-1].append([paper_dict_dan_id, sentence_0_dan, sentence_1_dan, sim_paper_name_dan]) + data = [similar_content_control] - chongfuwendang = sorted(chongfuwendang.items(), - key=lambda x: x[1]["thesis_repeat_rate"], reverse=False) + # 模拟多个章节 + section_details_list = [] + for data_dan in data: + data_section_dan = data_dan + # 章节详细信息 + section_details = section_details_func(data_section_dan, paper_dict) + section_details_list.append(section_details) - for i in range(len(chongfuwendang)): - repeat_paper_one_info_dict = chongfuwendang[i][1] - repeat_paper_one_info_dict.pop("paper_len_word") - repeat_paper_one_info_dict["thesis_repeat_rate"] = str(round(repeat_paper_one_info_dict["thesis_repeat_rate"], 1)) + "%" - repeat_quote_info.append(repeat_paper_one_info_dict) + # 模拟多个章节 - original_text = "。".join(original_text) + section_data_list = [] + for section_details in section_details_list: + section_data = section_data_func(section_details) - repetition_rate = sentence_word_nums/len(text_paper) - repetition_rate = round(repetition_rate, 3) * 100 + total_data = total_data_func(section_details_list) format = '%Y-%m-%d %H:%M:%S' value = time.localtime(int(time.time())) dt = time.strftime(format, value) - return { + paper_data = { "author": author, "check_time": dt, - "title": title, "time_range": "1900-01-01至2023-08-08", - "section_data": [ - { - "oneself_repeat_words": sentence_word_nums, - "reference_repeat_words": sentence_word_nums, - "section_name": "第1部分", - "section_oneself_rate": "{}%".format(repetition_rate), - "section_repeat_rate": "{}%".format(repetition_rate), - "section_repeat_words": sentence_word_nums, - "section_words": len(text_paper) - } - ], - "section_details": [ - { - "end_page_index": 0, - "name": "", - "repeat_rate": "", - "repeat_words": "", - "words": "", - "original_text": original_text, - "original_text_oneself": original_text, - "original_text_contrast": 
 
-    chongfuwendang = sorted(chongfuwendang.items(),
-                            key=lambda x: x[1]["thesis_repeat_rate"], reverse=False)
+    # simulate multiple sections
+    section_details_list = []
+    for data_dan in data:
+        data_section_dan = data_dan
+        # detailed info for this section
+        section_details = section_details_func(data_section_dan, paper_dict)
+        section_details_list.append(section_details)
 
-    for i in range(len(chongfuwendang)):
-        repeat_paper_one_info_dict = chongfuwendang[i][1]
-        repeat_paper_one_info_dict.pop("paper_len_word")
-        repeat_paper_one_info_dict["thesis_repeat_rate"] = str(round(repeat_paper_one_info_dict["thesis_repeat_rate"], 1)) + "%"
-        repeat_quote_info.append(repeat_paper_one_info_dict)
+    # simulate multiple sections
 
-    original_text = "。".join(original_text)
+    section_data_list = []
+    for section_details in section_details_list:
+        section_data = section_data_func(section_details)
 
-    repetition_rate = sentence_word_nums/len(text_paper)
-    repetition_rate = round(repetition_rate, 3) * 100
+    total_data = total_data_func(section_details_list)
 
     format = '%Y-%m-%d %H:%M:%S'
     value = time.localtime(int(time.time()))
     dt = time.strftime(format, value)
 
-    return {
+    paper_data = {
         "author": author,
         "check_time": dt,
-        "title": title,
         "time_range": "1900-01-01至2023-08-08",
-        "section_data": [
-            {
-                "oneself_repeat_words": sentence_word_nums,
-                "reference_repeat_words": sentence_word_nums,
-                "section_name": "第1部分",
-                "section_oneself_rate": "{}%".format(repetition_rate),
-                "section_repeat_rate": "{}%".format(repetition_rate),
-                "section_repeat_words": sentence_word_nums,
-                "section_words": len(text_paper)
-            }
-        ],
-        "section_details": [
-            {
-                "end_page_index": 0,
-                "name": "",
-                "repeat_rate": "",
-                "repeat_words": "",
-                "words": "",
-                "original_text": original_text,
-                "original_text_oneself": original_text,
-                "original_text_contrast": original_text_contrast,
-                "repeat_quote_info": repeat_quote_info
-            }
-        ],
-        "total_data": {
-            "back_repeat_words": "",
-            "exclude_personal_rate": "{}%".format(repetition_rate),
-            "exclude_quote_rate": "{}%".format(repetition_rate),
-            "foot_end_note": "0",
-            "front_repeat_words": "",
-            "single_max_rate": "",
-            "single_max_repeat_words": "",
-            "suspected_paragraph": "1",
-            "suspected_paragraph_max_repeat_words": "",
-            "suspected_paragraph_min_repeat_words": "",
-            "tables": "0",
-            "total_paragraph": "1",
-            "total_repeat_rate": "{}%".format(repetition_rate),
-            "total_repeat_words": sentence_word_nums,
-            "total_words": len(text_paper)
-        }
+        "title": title,
+        "total_data": total_data,
+        "section_data": section_data_list,
+        "section_details": section_details_list
     }
-
-
-
+    return paper_data
 
 
 def biaohong(bool_check_sentense, data_zong, df_train_nuoche):
     '''
     Indices of the sentences to mark red, e.g. [[0,1,2],[3,4,5]]
     :param bool_check_sentense:
     :return:
     '''
     biao_red = []
     i = 0
@@ -331,15 +559,16 @@
     start = -1
     end = -1
     while True:
-        if i >= len(bool_check_sentense) or bool_check_sentense[i][0] +1 >= len(data_zong) or bool_check_sentense[i][1]+1 >= len(df_train_nuoche):
+        if i >= len(bool_check_sentense) or bool_check_sentense[i][0] + 1 >= len(data_zong) or bool_check_sentense[i][1] \
+                + 1 >= len(df_train_nuoche):
             break
-        elif bool_check_sentense[i][0]-1 == start:
+        elif bool_check_sentense[i][0] - 1 == start:
             i += 1
             continue
         elif bool_check_sentense[i][0] == end:
             i += 1
             continue
-        elif bool_check_sentense[i][0]-1 == end:
+        elif bool_check_sentense[i][0] - 1 == end:
             i += 1
             continue
         else:
@@ -347,10 +576,10 @@
             biao_red_dan.append([bool_check_sentense[i][0] - 1, bool_check_sentense[i][1] - 1])
             biao_red_dan.append([bool_check_sentense[i][0], bool_check_sentense[i][1]])
             biao_red_dan.append([bool_check_sentense[i][0] + 1, bool_check_sentense[i][1] + 1])
-            biao_red.append([[bool_check_sentense[i][0]-1, bool_check_sentense[i][0], bool_check_sentense[i][0]+1],
-                             [bool_check_sentense[i][1]-1, bool_check_sentense[i][1], bool_check_sentense[i][1]+1]])
-            start = bool_check_sentense[i][0]-1
-            end = bool_check_sentense[i][0]+1
+            biao_red.append([[bool_check_sentense[i][0] - 1, bool_check_sentense[i][0], bool_check_sentense[i][0] + 1],
+                             [bool_check_sentense[i][1] - 1, bool_check_sentense[i][1], bool_check_sentense[i][1] + 1]])
+            start = bool_check_sentense[i][0] - 1
+            end = bool_check_sentense[i][0] + 1
             i += 1
     return biao_red
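`biaohong` turns the above-threshold pairs into aligned three-sentence windows (previous, hit, next) on both the paper side and the corpus side, and skips hits already covered by the previous window. A toy run against the function as defined in this file (invented indices):

    # Toy run (hypothetical indices): each isolated hit expands to a 3-sentence window.
    hits = [[5, 100], [6, 101], [9, 200]]     # [paper_idx, corpus_idx]
    windows = biaohong(hits, list(range(20)), list(range(300)))
    assert windows == [[[4, 5, 6], [99, 100, 101]],
                       [[8, 9, 10], [199, 200, 201]]]   # [6, 101] falls inside window 1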
Response: {}" + "".format(url, response.status_code, response.text)) print(text) return {} @@ -410,7 +639,7 @@ def original_text_marked_red(text_original, bert_text, bert_text_pre): while True: if down_pointer >= len(bert_text_pre): break - elif down_pointer == len(bert_text_pre)-1: + elif down_pointer == len(bert_text_pre) - 1: if bert_text[up_pointer] == bert_text_pre[down_pointer]: pointer_list.append(up_pointer) break @@ -428,7 +657,7 @@ def original_text_marked_red(text_original, bert_text, bert_text_pre): up_pointer += 1 down_pointer += 1 else: - if bert_text_pre[down_pointer:down_pointer+5] == "[UNK]": + if bert_text_pre[down_pointer:down_pointer + 5] == "[UNK]": up_pointer += 1 down_pointer += 5 pointer_list.append(up_pointer) @@ -441,12 +670,11 @@ def original_text_marked_red(text_original, bert_text, bert_text_pre): down_pointer = 0 pointer_list = [] - start = pointer_list[0] end = pointer_list[-1] bert_text_list = list(bert_text) bert_text_list.insert(start, "") - bert_text_list.insert(end + 2 , "") + bert_text_list.insert(end + 2, "") text_original_list = list(text_original) @@ -482,30 +710,14 @@ def biaohong_bert_predict(sentence_0_list, sentence_1_list): :return: ''' - # sentence_0_list = [] - # sentence_1_list = [] - # sim_paper_name = [] - # - # for i in biaohong_list: - # sentence_0_list.append("。".join([paper_list[i[0][0]], paper_list[i[0][1]], paper_list[i[0][2]]])) - # sentence_1_list.append("。".join([recall_data_list[i[1][1]], recall_data_list[i[1][1]], recall_data_list[i[1][2]]])) - - paper_dict = dialog_line_parse("http://192.168.31.74:16003/", {"sentence_0": sentence_0_list, "sentence_1": sentence_1_list})["resilt"] - - # paper_dict - # print("原文:".format(i), paper_dict[i][0]) - # print("原文标红:".format(i), paper_dict[i][1]) - # print("相似:".format(i), paper_dict[i][2]) - # print("相似标红:".format(i), paper_dict[i][3]) - - # original_text - # - # - # for paper_dict_dan, sentence_0_dan, sentence_1_dan in zip(paper_dict, sentence_0_list, sentence_1_list): - # original_text_marked_red + paper_dict = \ + dialog_line_parse("http://192.168.31.74:16003/", + {"sentence_0": sentence_0_list, "sentence_1": sentence_1_list})[ + "resilt"] return paper_dict + def ulit_text(title, text): data = [] try: @@ -520,6 +732,7 @@ def ulit_text(title, text): data.append([i, title]) return data + def run_query(conn, sql, params): with conn.cursor() as cursor: cursor.execute(sql, params) @@ -587,9 +800,8 @@ def ulit_recall_paper(recall_data_list_dict): # data.append([sentence, filename]) # return data - data = [] - for i in list(recall_data_list_dict.items())[:5]: + for i in list(recall_data_list_dict.items())[:10]: data_one = processing_one_text(i[0]) data.extend(data_one) @@ -652,18 +864,18 @@ def uilt_content(content): key_word_bool = True break - if zhaiyao_bool== True and zhaiyao_en_bool == True: - pantten_zhaiyao = "{}(.*?){}".format(zhaiyao_str,zhaiyao_en_str) + if zhaiyao_bool == True and zhaiyao_en_bool == True: + pantten_zhaiyao = "{}(.*?){}".format(zhaiyao_str, zhaiyao_en_str) result_biaoti_list = re.findall(pantten_zhaiyao, content) zhaiyao_text = result_biaoti_list[0] elif zhaiyao_bool == True and key_word_bool == True: - pantten_zhaiyao = "{}(.*?){}".format(zhaiyao_str,key_word_str ) + pantten_zhaiyao = "{}(.*?){}".format(zhaiyao_str, key_word_str) result_biaoti_list = re.findall(pantten_zhaiyao, content) zhaiyao_text = result_biaoti_list[0] elif zhaiyao_bool == True and mulu_bool == True: - pantten_zhaiyao = "{}(.*?){}".format(zhaiyao_str,mulu_str) + pantten_zhaiyao = 
"{}(.*?){}".format(zhaiyao_str, mulu_str) result_biaoti_list = re.findall(pantten_zhaiyao, content) zhaiyao_text = result_biaoti_list[0] @@ -688,7 +900,6 @@ def ulit_request_file(file): return abst_zh, content - # @app.route("/", methods=["POST"]) # def handle_query(): # print(request.remote_addr) @@ -761,7 +972,6 @@ def classify(): # 调用模型,设置最大batch_size goodsId = data_dict['goodsId'] callbackUrl = data_dict['callbackUrl'] - # 调用宇鹏查询相似十篇 # recall_data_list_dict = recall_10(title, abst_zh, content) @@ -805,10 +1015,10 @@ def handle_query(): # request.form.get('prompt') dataBases = request.form.get("dataBases") - minSimilarity = request.form.get("minSimilarity") # txt + minSimilarity = request.form.get("minSimilarity") # txt minWords = request.form.get("minWords") title = request.form.get("title") - author = request.form.get("author") # txt + author = request.form.get("author") # txt file = request.files.get('file') token = request.form.get("token") account = request.form.get("account") @@ -833,7 +1043,7 @@ def handle_query(): 'account': account, 'goodsId': goodsId, 'callbackUrl': callbackUrl - } + } # 绑定文本和query id print(d) @@ -860,8 +1070,9 @@ def handle_query(): return_text = {'code': 1} return jsonify(return_text) # 返回结果 + t = Thread(target=classify) t.start() if __name__ == "__main__": - app.run(host="0.0.0.0", port=16001, threaded=True, debug=True, use_reloader=False) \ No newline at end of file + app.run(host="0.0.0.0", port=16001, threaded=True, debug=True) diff --git a/flask_check_bert_test.py b/flask_check_bert_test.py index ac36ec1..30b398a 100644 --- a/flask_check_bert_test.py +++ b/flask_check_bert_test.py @@ -4,7 +4,7 @@ from numpy.linalg import norm import pandas as pd # from rouge import Rouge from rouge_chinese import Rouge -from Rouge_w import Rouge_w,Rouge_l +from Rouge_w import Rouge_w, Rouge_l import json import pymysql import re @@ -16,6 +16,7 @@ import time import redis from threading import Thread from multiprocessing import Pool + app = Flask(__name__) app.config["JSON_AS_ASCII"] = False @@ -66,7 +67,6 @@ def bert_check(text, recall_data_list): return return_list - def rouge_value_self(data_1, data_2): data_1 = [' '.join(i) for i in data_1] data_2 = [' '.join(i) for i in data_2] @@ -82,7 +82,6 @@ def rouge_value_self(data_1, data_2): def rouge_pre(text, df_train_nuoche): - return_list = [] index_rouge_list = [] text_list = [text] * len(df_train_nuoche) @@ -102,7 +101,6 @@ def rouge_pre(text, df_train_nuoche): def rouge_pre_m(text, df_train_nuoche): - return_list = [] index_rouge_list = [] @@ -144,12 +142,11 @@ def original_text_contrast_func(data_sentence_dan, paper_dict): :return: ''' - original_text = "" start = len(data_sentence_dan[0][1]) end = 0 similar_content = [] - for i in data_sentence_dan: #可能有很多个暂且确定是一个 + for i in data_sentence_dan: # 可能有很多个暂且确定是一个 similar_content_dan = { "paper_red_len_word": "", @@ -216,7 +213,6 @@ def repeat_quote_info_func(original_text_contrast): ''' chongfuwendang = {} - for sentence_dan in original_text_contrast: for i in sentence_dan["similar_content"]: thesis_info = i["thesis_info"] @@ -238,7 +234,8 @@ def repeat_quote_info_func(original_text_contrast): else: chongfuwendang[thesis_info]["thesis_repeat_word"] += i["paper_red_len_word"] chongfuwendang[thesis_info]["thesis_repeat_rate"] = (chongfuwendang[thesis_info]["thesis_repeat_word"] / - chongfuwendang[thesis_info]["paper_len_word"]) * 100 + chongfuwendang[thesis_info][ + "paper_len_word"]) * 100 chongfuwendang = sorted(chongfuwendang.items(), key=lambda x: 
x[1]["thesis_repeat_rate"], reverse=False) chongfuwendang_list = [i[1] for i in chongfuwendang] @@ -265,16 +262,15 @@ def total_data_func(section_data_list): repeat_words = 0 words = 0 - for i in section_data_list: repeat_words += i["repeat_words"] words += i["words"] - exclude_personal_rate = str(repeat_words/words * 100) + "%" - exclude_quote_rate = str(repeat_words/words * 100) + "%" + exclude_personal_rate = str(repeat_words / words * 100) + "%" + exclude_quote_rate = str(repeat_words / words * 100) + "%" single_max_rate = section_data_list[0]["repeat_quote_info"][0]["thesis_repeat_rate"] single_max_repeat_words = section_data_list[0]["repeat_quote_info"][0]["thesis_repeat_word"] - total_repeat_rate = str(repeat_words/words * 100) + "%" + total_repeat_rate = str(repeat_words / words * 100) + "%" total_repeat_words = repeat_words total_words = words @@ -321,6 +317,7 @@ def section_data_func_dan(): "section_oneself_rate": "" } + def section_data_func(section_details): ''' 章节信息 @@ -380,13 +377,10 @@ def section_details_func(data_section_dan, paper_dict): section_words += len(sentence_dan[0][1]) original_text = "。".join(original_text_list) - repeat_rate = repeat_words/section_words + repeat_rate = repeat_words / section_words repeat_quote_info = repeat_quote_info_func(original_text_contrast) - - - return { "end_page_index": 0, "name": "第1部分", @@ -401,13 +395,12 @@ def section_details_func(data_section_dan, paper_dict): } - def accurate_check_rouge( title, author, text_paper, recall_data_list - ): +): ''' 精确查重出相似句子 :param text: @@ -448,7 +441,6 @@ def accurate_check_rouge( t1 = time.time() original_dict = [] - # 找出相似的句子序号 bool_check_sentense = [] # bert算法 @@ -460,13 +452,13 @@ def accurate_check_rouge( for i in range(len(data_zong)): for j in range(len(data_zong[i])): if data_zong[i][j][1] > 0.47: - bool_check_sentense.append([i,data_zong[i][j][0]]) - biao_red = biaohong(bool_check_sentense, data_zong, recall_data_list) # [[[0, 1, 2], [479, 480, 481]], [[3, 4, 5], [481, 482, 483]], [[6, 7, 8], [484, 485, 486]]] + bool_check_sentense.append([i, data_zong[i][j][0]]) + biao_red = biaohong(bool_check_sentense, data_zong, + recall_data_list) # [[[0, 1, 2], [479, 480, 481]], [[3, 4, 5], [481, 482, 483]], [[6, 7, 8], [484, 485, 486]]] - print("bert精确查重时间", t1-t0) + print("bert精确查重时间", t1 - t0) print(biao_red) - sentence_0_list = [] sentence_1_list = [] sim_paper_name = [] @@ -474,7 +466,8 @@ def accurate_check_rouge( for i in biao_red: if recall_data_list[i[1][0]][1] == recall_data_list[i[1][1]][1] == recall_data_list[i[1][2]][1]: sentence_0_list.append("。".join([centent_list[i[0][0]], centent_list[i[0][1]], centent_list[i[0][2]]])) - sentence_1_list.append("".join([recall_data_list[i[1][0]][0], recall_data_list[i[1][1]][0], recall_data_list[i[1][2]][0]])) + sentence_1_list.append( + "".join([recall_data_list[i[1][0]][0], recall_data_list[i[1][1]][0], recall_data_list[i[1][2]][0]])) sim_paper_name.append(recall_data_list[i[1][0]][1]) else: continue @@ -482,7 +475,6 @@ def accurate_check_rouge( sentence_0_list_new = [] sentence_1_list_new = [] - for i in zip(sentence_0_list, sentence_1_list): if len(i[0]) + len(i[1]) < 1200: sentence_0_list_new.append(i[0]) @@ -556,8 +548,6 @@ def accurate_check_rouge( return paper_data - - def biaohong(bool_check_sentense, data_zong, df_train_nuoche): ''' 标红的序号 [[0,1,2],[3,4,5]] @@ -569,15 +559,16 @@ def biaohong(bool_check_sentense, data_zong, df_train_nuoche): start = -1 end = -1 while True: - if i >= len(bool_check_sentense) or bool_check_sentense[i][0] +1 >= 
@@ -569,15 +559,16 @@ def biaohong(bool_check_sentense, data_zong, df_train_nuoche):
     start = -1
     end = -1
     while True:
-        if i >= len(bool_check_sentense) or bool_check_sentense[i][0] +1 >= len(data_zong) or bool_check_sentense[i][1]+1 >= len(df_train_nuoche):
+        if i >= len(bool_check_sentense) or bool_check_sentense[i][0] + 1 >= len(data_zong) or bool_check_sentense[i][1] \
+                + 1 >= len(df_train_nuoche):
             break
-        elif bool_check_sentense[i][0]-1 == start:
+        elif bool_check_sentense[i][0] - 1 == start:
             i += 1
             continue
         elif bool_check_sentense[i][0] == end:
             i += 1
             continue
-        elif bool_check_sentense[i][0]-1 == end:
+        elif bool_check_sentense[i][0] - 1 == end:
             i += 1
             continue
         else:
@@ -585,10 +576,10 @@ def biaohong(bool_check_sentense, data_zong, df_train_nuoche):
             biao_red_dan.append([bool_check_sentense[i][0] - 1, bool_check_sentense[i][1] - 1])
             biao_red_dan.append([bool_check_sentense[i][0], bool_check_sentense[i][1]])
             biao_red_dan.append([bool_check_sentense[i][0] + 1, bool_check_sentense[i][1] + 1])
-            biao_red.append([[bool_check_sentense[i][0]-1, bool_check_sentense[i][0], bool_check_sentense[i][0]+1],
-                             [bool_check_sentense[i][1]-1, bool_check_sentense[i][1], bool_check_sentense[i][1]+1]])
-            start = bool_check_sentense[i][0]-1
-            end = bool_check_sentense[i][0]+1
+            biao_red.append([[bool_check_sentense[i][0] - 1, bool_check_sentense[i][0], bool_check_sentense[i][0] + 1],
+                             [bool_check_sentense[i][1] - 1, bool_check_sentense[i][1], bool_check_sentense[i][1] + 1]])
+            start = bool_check_sentense[i][0] - 1
+            end = bool_check_sentense[i][0] + 1
             i += 1
     return biao_red
@@ -616,8 +607,8 @@ def dialog_line_parse(url, text):
         #     "".format(url, response.status_code, response.text)
         # )
         print("【{}】 Failed to get a proper response from remote "
-              "server. Status Code: {}. Response: {}"
-              "".format(url, response.status_code, response.text))
+              "server. Status Code: {}. Response: {}"
+              "".format(url, response.status_code, response.text))
         print(text)
         return {}
@@ -648,7 +639,7 @@ def original_text_marked_red(text_original, bert_text, bert_text_pre):
     while True:
         if down_pointer >= len(bert_text_pre):
             break
-        elif down_pointer == len(bert_text_pre)-1:
+        elif down_pointer == len(bert_text_pre) - 1:
             if bert_text[up_pointer] == bert_text_pre[down_pointer]:
                 pointer_list.append(up_pointer)
                 break
@@ -666,7 +657,7 @@ def original_text_marked_red(text_original, bert_text, bert_text_pre):
                 up_pointer += 1
                 down_pointer += 1
         else:
-            if bert_text_pre[down_pointer:down_pointer+5] == "[UNK]":
+            if bert_text_pre[down_pointer:down_pointer + 5] == "[UNK]":
                 up_pointer += 1
                 down_pointer += 5
                 pointer_list.append(up_pointer)
@@ -679,12 +670,11 @@ def original_text_marked_red(text_original, bert_text, bert_text_pre):
                 down_pointer = 0
                 pointer_list = []
 
-
     start = pointer_list[0]
    end = pointer_list[-1]
     bert_text_list = list(bert_text)
     bert_text_list.insert(start, "<red>")
-    bert_text_list.insert(end + 2 , "</red>")
+    bert_text_list.insert(end + 2, "</red>")
 
     text_original_list = list(text_original)
@@ -720,10 +710,14 @@ def biaohong_bert_predict(sentence_0_list, sentence_1_list):
     :return:
     '''
 
-    paper_dict = dialog_line_parse("http://192.168.31.74:16003/", {"sentence_0": sentence_0_list, "sentence_1": sentence_1_list})["resilt"]
+    paper_dict = \
+        dialog_line_parse("http://192.168.31.74:16003/",
+                          {"sentence_0": sentence_0_list, "sentence_1": sentence_1_list})[
+            "resilt"]
 
     return paper_dict
 
+
 def ulit_text(title, text):
     data = []
     try:
@@ -738,6 +732,7 @@ def ulit_text(title, text):
         data.append([i, title])
     return data
 
+
 def run_query(conn, sql, params):
     with conn.cursor() as cursor:
         cursor.execute(sql, params)
@@ -805,7 +800,6 @@ def ulit_recall_paper(recall_data_list_dict):
     #         data.append([sentence, filename])
     # return data
 
-
     data = []
     for i in list(recall_data_list_dict.items())[:10]:
         data_one = processing_one_text(i[0])
         data.extend(data_one)
@@ -870,18 +864,18 @@ def uilt_content(content):
             key_word_bool = True
             break
 
-    if zhaiyao_bool== True and zhaiyao_en_bool == True:
-        pantten_zhaiyao = "{}(.*?){}".format(zhaiyao_str,zhaiyao_en_str)
+    if zhaiyao_bool == True and zhaiyao_en_bool == True:
+        pantten_zhaiyao = "{}(.*?){}".format(zhaiyao_str, zhaiyao_en_str)
         result_biaoti_list = re.findall(pantten_zhaiyao, content)
         zhaiyao_text = result_biaoti_list[0]
 
     elif zhaiyao_bool == True and key_word_bool == True:
-        pantten_zhaiyao = "{}(.*?){}".format(zhaiyao_str,key_word_str )
+        pantten_zhaiyao = "{}(.*?){}".format(zhaiyao_str, key_word_str)
         result_biaoti_list = re.findall(pantten_zhaiyao, content)
         zhaiyao_text = result_biaoti_list[0]
 
     elif zhaiyao_bool == True and mulu_bool == True:
-        pantten_zhaiyao = "{}(.*?){}".format(zhaiyao_str,mulu_str)
+        pantten_zhaiyao = "{}(.*?){}".format(zhaiyao_str, mulu_str)
         result_biaoti_list = re.findall(pantten_zhaiyao, content)
         zhaiyao_text = result_biaoti_list[0]
@@ -906,7 +900,6 @@ def ulit_request_file(file):
 
     return abst_zh, content
 
-
 # @app.route("/", methods=["POST"])
 # def handle_query():
 #     print(request.remote_addr)
@@ -979,7 +972,6 @@ def classify():  # run the model, set the maximum batch_size
         goodsId = data_dict['goodsId']
         callbackUrl = data_dict['callbackUrl']
 
-
        # call Yupeng's service to recall the ten most similar papers
         # recall_data_list_dict = recall_10(title, abst_zh, content)
@@ -1023,10 +1015,10 @@ def handle_query():
     # request.form.get('prompt')
     dataBases = request.form.get("dataBases")
-    minSimilarity = request.form.get("minSimilarity") # txt
+    minSimilarity = request.form.get("minSimilarity")  # txt
     minWords = request.form.get("minWords")
     title = request.form.get("title")
-    author = request.form.get("author") # txt
+    author = request.form.get("author")  # txt
     file = request.files.get('file')
     token = request.form.get("token")
     account = request.form.get("account")
@@ -1051,7 +1043,7 @@ def handle_query():
         'account': account,
         'goodsId': goodsId,
         'callbackUrl': callbackUrl
-         }
+    }
 
     # bind the text to the query id
     print(d)
@@ -1078,8 +1070,9 @@ def handle_query():
         return_text = {'code': 1}
     return jsonify(return_text)  # return the result
 
+
 t = Thread(target=classify)
 t.start()
 
 if __name__ == "__main__":
-    app.run(host="0.0.0.0", port=16001, threaded=True, debug=True)
\ No newline at end of file
+    app.run(host="0.0.0.0", port=16001, threaded=True, debug=True)
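For completeness, a hypothetical client call against this service: the form field names come from `handle_query` above, while the URL path and the exact shape of the acknowledgement are assumptions, since the route decorator is not visible in these hunks; results are presumably delivered via `callbackUrl`.

    import requests

    resp = requests.post(
        "http://127.0.0.1:16001/",   # assumed path; the route decorator is not shown
        data={"dataBases": "", "minSimilarity": "0.47", "minWords": "5",
              "title": "测试论文", "author": "测试作者", "token": "", "account": "",
              "goodsId": "", "callbackUrl": "http://127.0.0.1:9000/cb"},
        files={"file": open("paper.docx", "rb")},
    )
    print(resp.json())               # e.g. {'code': 1}; full results arrive via the callback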