# Flask service for thesis duplicate checking: candidate papers are recalled,
# sentences are re-scored with ROUGE-L / Jaccard similarity, and overlapping
# spans are marked red in the report returned to the caller.
import os
import numpy as np
from numpy.linalg import norm
import pandas as pd
# from rouge import Rouge
from rouge_chinese import Rouge
from Rouge_w import Rouge_w, Rouge_l
import json
import pymysql
import re
import requests
from flask import Flask, jsonify
from flask import request
import uuid
import time
import redis
from threading import Thread
from multiprocessing import Pool

app = Flask(__name__)
app.config["JSON_AS_ASCII"] = False

pool = redis.ConnectionPool(host='localhost', port=63179, max_connections=100, db=8, password="zhicheng123*")
redis_ = redis.Redis(connection_pool=pool, decode_responses=True)

db_key_query = 'query_check_task'
db_key_querying = 'querying_check_task'
db_key_queryset = 'queryset_check_task'
db_key_query_recall = 'query_recall'

nums_cpus = 24
rouge = Rouge()
rouge_model = Rouge_w()
rouge_l_model = Rouge_l()


def jaccard_similarity(s1, s2):
    set1 = set(s1)
    set2 = set(s2)
    intersection = set1 & set2
    union = set1 | set2
    if not union:  # guard against two empty inputs
        return 0.0
    return len(intersection) / len(union)


def bert_check(text, recall_data_list):
    '''
    BERT-based duplicate check
    :return:
    '''
    sen_0 = [text] * len(recall_data_list)
    sen_1 = [i[0] for i in recall_data_list]
    return_list = []
    request_json = {
        "texts": [sen_0, sen_1],
    }
    paper_dict = dialog_line_parse("http://192.168.31.74:16002/", request_json)
    score_list = paper_dict["res"]

    # to be revised later
    # return_list.append(re1[0][1])
    # return_list.append(re1[0][0])

    if 1 in score_list:
        index_score = score_list.index(1)
    else:
        index_score = "NaN"

    if index_score == "NaN":
        return_list.append(0)
        return_list.append("")
    else:
        return_list.append(1)
        return_list.append(index_score)

    return return_list


def rouge_value_self(data_1, data_2):
    # ROUGE-L score for each aligned sentence pair; the first two return values
    # are unused placeholders kept for the (rouge_1, rouge_2, rouge_l) call sites.
    data_1 = [' '.join(i) for i in data_1]
    data_2 = [' '.join(i) for i in data_2]
    rouge_l_list = []

    for sen_1, sen_2 in zip(data_1, data_2):
        sen_1 = sen_1.split(" ")
        sen_2 = sen_2.split(" ")
        rouge_l_score = rouge_l_model.score(sen_1, sen_2)
        rouge_l_list.append(rouge_l_score)

    return "", "", rouge_l_list


def strsim_value(data_1, data_2):
    # Same interface as rouge_value_self, but scores each pair with Jaccard similarity.
    data_1 = [' '.join(i) for i in data_1]
    data_2 = [' '.join(i) for i in data_2]
    rouge_l_list = []

    for sen_1, sen_2 in zip(data_1, data_2):
        sen_1 = sen_1.split(" ")
        sen_2 = sen_2.split(" ")
        rouge_l_score = jaccard_similarity(sen_1, sen_2)
        rouge_l_list.append(rouge_l_score)

    return "", "", rouge_l_list


def rouge_pre(text, df_train_nuoche):
    # Returns [best score, index of the best-matching recalled sentence] for one sentence.
    return_list = []
    index_rouge_list = []
    text_list = [text] * len(df_train_nuoche)

    data_list = []
    for data_dan in df_train_nuoche:
        data_list.append(data_dan[0])
    rouge_1, rouge_2, rouge_l = rouge_value_self(text_list, data_list)
    index_rouge_list.extend(rouge_l)

    re1 = [(i[0], i[1]) for i in sorted(list(enumerate(index_rouge_list)), key=lambda x: x[1], reverse=True)]

    return_list.append(re1[0][1])
    return_list.append(re1[0][0])

    return return_list


def rouge_pre_m(text, df_train_nuoche):
    # Returns every (index, score) pair sorted by descending Jaccard similarity.
    return_list = []
    index_rouge_list = []
    text_list = [text] * len(df_train_nuoche)

    data_list = []
    for data_dan in df_train_nuoche:
        data_list.append(data_dan[0])
    rouge_1, rouge_2, rouge_l = strsim_value(text_list, data_list)
    index_rouge_list.extend(rouge_l)

    re1 = [(i[0], i[1]) for i in sorted(list(enumerate(index_rouge_list)), key=lambda x: x[1], reverse=True)]
    return_list.extend(re1)

    return return_list


def rouge_pre_m_1(bool_check_sentense, centent_list, recall_data_list):
    # bool_check_sentense [[[1, 223],[1, 226], [1, 562]],[[2, 243],[2, 226], [2, 561]]]
    bool_check_sentense_new = []
    for bool_check_sentense_dan in bool_check_sentense:
        bool_check_sentense_new_dan = []
        text_list = []
        data_list = []
        linshi = []
        for i in bool_check_sentense_dan:
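            # Each i is a [source-sentence index, recalled-sentence index] pair;
            # the loop gathers both sentences so every candidate pair can be
            # re-scored with ROUGE-L below, and only pairs scoring above 0.47 are kept.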
text1 = centent_list[i[0]] text2 = recall_data_list[i[1]][0] linshi.append([i[0], i[1]]) text_list.append(text1) data_list.append(text2) _, _, rouge_l_list = rouge_value_self(text_list, data_list) for i in range(len(rouge_l_list)): if rouge_l_list[i] > 0.47: bool_check_sentense_new_dan.append(linshi[i]) if bool_check_sentense_new_dan != []: bool_check_sentense_new.append(bool_check_sentense_new_dan) return bool_check_sentense_new # 以单个章节为例 def similar_content_func(): ''' 重复文章 :return: ''' return [{ "content": "重复的内容标红", "thesis_info": "论文标题 + 论文作者 + 来源 + 年份日期--敞开式奶牛舍环境控制系统的设计 李晓红,刘晓丽,余泳昌 - 商丘工学院机械工程学院 - 2015-04-01", "title": "标题", "year": "日期", "degree": "来源", "author": "作者" }] def original_text_contrast_func(data_sentence_dan, paper_dict, centent_list): ''' 重复的对比详细信息 :param similar_content: :return: ''' original_text = "" start = len(data_sentence_dan[0][1]) end = 0 similar_content = [] for i in data_sentence_dan: # 可能有很多个暂且确定是一个 similar_content_dan = { "paper_red_len_word": "", "content": "重复的内容标红", "thesis_info": "论文标题 + 论文作者 + 来源 + 年份日期--敞开式奶牛舍环境控制系统的设计 李晓红,刘晓丽,余泳昌 - 商丘工学院机械工程学院 - 2015-04-01", "title": "标题", "year": "日期", "degree": "来源", "author": "作者", "paper_len_word": "" } sentence_0_bool, sentence_0_dan_red = original_text_marked_red(i[1], paper_dict[i[0]][0], paper_dict[i[0]][4][0], paper_dict[i[0]][4][1]) # text_original, bert_text, bert_text_pre sentence_1_bool, sentence_1_dan_red = original_text_marked_red(i[2], paper_dict[i[0]][2], paper_dict[i[0]][4][2], paper_dict[i[0]][4][3]) # text_original, bert_text, bert_text_pre if sentence_0_bool == False or sentence_1_bool == False: continue start_dan = sentence_0_dan_red.index("") end_dan = sentence_0_dan_red.index("") - len("") if start_dan < start: start = start_dan if end_dan > end: end = end_dan similar_content_dan["content"] = sentence_1_dan_red similar_content_dan["title"] = i[3]["title"] similar_content_dan["author"] = i[3]["author"] similar_content_dan["degree"] = i[3]["degree"] similar_content_dan["year"] = i[3]["year"] similar_content_dan["paper_len_word"] = i[3]["paper_len_word"] similar_content_dan["paper_red_len_word"] = end_dan - start_dan thesis_info = " ".join( [similar_content_dan["title"], similar_content_dan["author"], similar_content_dan["degree"], similar_content_dan["year"]]) similar_content_dan["thesis_info"] = thesis_info similar_content.append(similar_content_dan) original_text_list = list(data_sentence_dan[0][1]) # original_text_list.insert(end, "\n") # original_text_list.insert(start, "\n") target_text_str = "".join(["\n"] + original_text_list[start: end] + ["\n"]) original_text_start = "".join(original_text_list[:start]) original_text_end = "".join(original_text_list[end:]) print(data_sentence_dan) if data_sentence_dan[0][4][0]-1 < 0: start_sen = "" else: start_sen = centent_list[data_sentence_dan[0][4][0]-1] if data_sentence_dan[0][4][-1]+1 >= len(centent_list): end_sen = "" else: end_sen = centent_list[data_sentence_dan[0][4][-1]+1] start_sen = start_sen + original_text_start end_sen = original_text_end + end_sen original_text = "此处有 {} 字相似\n".format(str(end - start)) + start_sen[-60:] + target_text_str + end_sen[:60] return_info = { "original_text": original_text, "dan_sentence_word_nums": end - start, "similar_content": similar_content } return return_info def repeat_quote_info_func(original_text_contrast, section_words): ''' 重复的引用信息 :return: ''' chongfuwendang = {} for sentence_dan in original_text_contrast: for i in sentence_dan["similar_content"]: thesis_info = i["thesis_info"] if thesis_info not 
in chongfuwendang: chongfuwendang[thesis_info] = { "quote": False, "thesis_author": i["author"], "thesis_date": i["year"], "thesis_info": thesis_info, "thesis_repeat_rate": (i["paper_red_len_word"] / section_words) * 100, # str(round(repeat_rate, 1)) + "%" # round(repetition_rate, 3) * 100 "thesis_title": i["title"], "thesis_link": "", "thesis_publish": i["degree"], "thesis_repeat_word": i["paper_red_len_word"], "thesis_teacher": "", "paper_len_word": i["paper_len_word"] } else: chongfuwendang[thesis_info]["thesis_repeat_word"] += i["paper_red_len_word"] chongfuwendang[thesis_info]["thesis_repeat_rate"] = (chongfuwendang[thesis_info]["thesis_repeat_word"] / section_words) * 100 chongfuwendang = sorted(chongfuwendang.items(), key=lambda x: x[1]["thesis_repeat_rate"], reverse=False) chongfuwendang_list = [] for i in chongfuwendang: chongfuwendang_dan = i[1] print(chongfuwendang_dan) chongfuwendang_dan["thesis_repeat_rate"] = str(round(chongfuwendang_dan["thesis_repeat_rate"], 1)) + "%" chongfuwendang_list.append(chongfuwendang_dan) return chongfuwendang_list def total_data_func(section_data_list): ''' 总体数据 :return: ''' # "end_page_index": 0, # "name": "第1部分", # "repeat_rate": repeat_rate, # "repeat_words": repeat_words, # "start_page_index": 0, # "words": section_words, # "original_text": original_text, # "original_text_oneself": original_text, # "original_text_contrast/重复的对比详细信息": original_text_contrast, # "repeat_quote_info/重复的引用信息": repeat_quote_info repeat_words = 0 words = 0 for i in section_data_list: repeat_words += i["repeat_words"] words += i["words"] baifenbi = (repeat_words / words) *100 exclude_personal_rate = str(round(baifenbi, 1)) + "%" exclude_quote_rate = str(round(baifenbi, 1)) + "%" single_max_rate = section_data_list[0]["repeat_quote_info"][0]["thesis_repeat_rate"] single_max_repeat_words = section_data_list[0]["repeat_quote_info"][0]["thesis_repeat_word"] total_repeat_rate = str(round(baifenbi, 1)) + "%" total_repeat_words = repeat_words total_words = words print(exclude_personal_rate) return { "back_repeat_words": "", "exclude_personal_rate": exclude_personal_rate, "exclude_quote_rate": exclude_quote_rate, "front_repeat_words": "", "single_max_rate": single_max_rate, "single_max_repeat_words": single_max_repeat_words, "suspected_paragraph": "", "suspected_paragraph_max_repeat_words": "", "suspected_paragraph_min_repeat_words": "", "total_paragraph": "", "total_repeat_rate": total_repeat_rate, "total_repeat_words": total_repeat_words, "total_words": total_words, "tables": 0 } def section_data_func_dan(): ''' 章节信息单个 :return: ''' # { # "section_name": "章节名称", # "section_repeat_rate": "重复率", # "section_repeat_words": "重复字数", # "section_words": "章节字数", # "oneself_repeat_words": "去除本人后重复字数", # "reference_repeat_words": "去除引用后重复字数", # "section_oneself_rate": "去除本人后重复率" # } return { "section_name": "", "section_repeat_rate": "", "section_repeat_words": "", "section_words": "", "oneself_repeat_words": "", "reference_repeat_words": "", "section_oneself_rate": "" } def section_data_func(section_details): ''' 章节信息 :return: ''' # "end_page_index": 0, # "name": "第1部分", # "repeat_rate": repeat_rate, # "repeat_words": repeat_words, # "start_page_index": 0, # "words": section_words, # "original_text": original_text, # "original_text_oneself": original_text, # "original_text_contrast/重复的对比详细信息": original_text_contrast, # "repeat_quote_info/重复的引用信息": repeat_quote_info section_name = section_details["name"] section_repeat_rate = section_details["repeat_rate"] section_repeat_words = 
section_details["repeat_words"] section_words = section_details["words"] oneself_repeat_words = section_details["repeat_words"] reference_repeat_words = section_details["repeat_words"] section_oneself_rate = section_details["repeat_rate"] return { "section_name": section_name, "section_repeat_rate": section_repeat_rate, "section_repeat_words": section_repeat_words, "section_words": section_words, "oneself_repeat_words": oneself_repeat_words, "reference_repeat_words": reference_repeat_words, "section_oneself_rate": section_oneself_rate } def section_details_func(data_section_dan, paper_dict, num_words, centent_list, index_centent_list_dan): ''' 章节详细信息 :param original_text_contrast: :param repeat_quote_info: :return: ''' original_text_contrast = [] section_repeat_rate = "" repeat_words = 0 section_words = num_words oneself_repeat_words = "" reference_repeat_words = "" section_oneself_rate = "" original_text_list = [] for sentence_dan in data_section_dan: original_text_contrast_dan = original_text_contrast_func(sentence_dan, paper_dict, centent_list) original_text_contrast.append(original_text_contrast_dan) repeat_words += original_text_contrast_dan["dan_sentence_word_nums"] original_text_list.append(original_text_contrast_dan["original_text"]) original_text = "".join(original_text_list) repeat_rate = (repeat_words / section_words) * 100 repeat_rate = str(round(repeat_rate, 1)) + "%" repeat_quote_info = repeat_quote_info_func(original_text_contrast, section_words) return { "end_page_index": 0, "name": "第{}部分".format(str(index_centent_list_dan)), "repeat_rate": repeat_rate, "repeat_words": repeat_words, "start_page_index": 0, "words": section_words, "original_text": original_text, "original_text_oneself": original_text, "original_text_contrast": original_text_contrast, "repeat_quote_info": repeat_quote_info } def check_dict(similar_content_control_zong, paper_dict_zong, num_words_zong, title, author, chapter_data, index_centent_list): # similar_content_control, paper_dict, num_words, title, author, centent_list ''' 生成返回字典 :param similar_content_control: :param paper_dict: :param num_words: :param title: :param author: :return: ''' if paper_dict_zong != []: # 模拟多个章节 section_details_list = [] for data_section_dan, paper_dict, num_words, centent_list, index_centent_list_dan in zip(similar_content_control_zong, paper_dict_zong, num_words_zong, chapter_data, index_centent_list): # 章节详细信息 section_details = section_details_func(data_section_dan, paper_dict, num_words, centent_list, index_centent_list_dan) section_details_list.append(section_details) # 模拟多个章节 section_data_list = [] for section_details in section_details_list: section_data = section_data_func(section_details) section_data_list.append(section_data) total_data = total_data_func(section_details_list) format = '%Y-%m-%d %H:%M:%S' value = time.localtime(int(time.time())) dt = time.strftime(format, value) paper_data = { "author": author, "check_time": dt, "time_range": "1900-01-01至2023-08-08", "title": title, "total_data": total_data, "section_data": section_data_list, "section_details": section_details_list } else: total_data = { "back_repeat_words": "", "exclude_personal_rate": 0, "exclude_quote_rate": 0, "front_repeat_words": "", "single_max_rate": 0, "single_max_repeat_words": 0, "suspected_paragraph": "", "suspected_paragraph_max_repeat_words": "", "suspected_paragraph_min_repeat_words": "", "total_paragraph": "", "total_repeat_rate": 0, "total_repeat_words": 0, "total_words": num_words, "tables": 0 } section_data_list = [{ 
"section_name": "第一部分", "section_repeat_rate": 0, "section_repeat_words": 0, "section_words": num_words, "oneself_repeat_words": 0, "reference_repeat_words": 0, "section_oneself_rate": 0 }] section_details_list = [ { "end_page_index": 0, "name": "第1部分", "repeat_rate": 0, "repeat_words": 0, "start_page_index": 0, "words": num_words, "original_text": "", "original_text_oneself": "", "original_text_contrast": [], "repeat_quote_info": [] } ] format = '%Y-%m-%d %H:%M:%S' value = time.localtime(int(time.time())) dt = time.strftime(format, value) paper_data = { "author": author, "check_time": dt, "time_range": "1900-01-01至2023-08-08", "title": title, "total_data": total_data, "section_data": section_data_list, "section_details": section_details_list } return paper_data def split_chapter(centent_list): ''' :param centent_list: :return: [[[sentence, sentence, ... sentence], 2000], [[sentence, sentence, ... sentence], 2000]] ''' centent_list_new = [] zishu = 2000 dangqianzishu = 0 i = 0 centent_list_dan = [] while True: if i >= len(centent_list): if centent_list_dan != []: centent_list_new.append([centent_list_dan, dangqianzishu]) break centent_list_dan.append(centent_list[i]) dangqianzishu += len(centent_list[i]) if dangqianzishu > zishu: centent_list_new.append([centent_list_dan, dangqianzishu]) dangqianzishu = 0 centent_list_dan = [] i += 1 return centent_list_new def chapter_check(dan_chapter_data, recall_data_list): # ============================================================================================= # 多进程算法 # rouge算法查重 # t1_0 = time.time() # rst = [] # p = Pool(nums_cpus) # 进程池中含有n个子进程 # # print("num_words", num_words) # for i in range(len(centent_list)): # text = centent_list[i] # a = p.apply_async(rouge_pre_m, args=(text, recall_data_list,)) # rst.append(a) # p.close() # p.join() # 等待所有子进程执行完毕。调用join()之前必须先调用close(),调用close()之后就不能继续添加新的Process了。 # # print("筛选句子完成") # rst = [i.get() for i in rst] # # t2_0 = time.time() # print(t2_0- t1_0) # ========================================================================================================= rst = [] for i in range(len(dan_chapter_data)): text = dan_chapter_data[i] rst.append(rouge_pre_m(text, recall_data_list)) # ======================================================================================================== data_zong = [] for i in range(len(rst)): # print(rst[i]) data_zong.append(rst[i]) t0 = time.time() # bert算法查重 # for text in centent_list: # bert_pre_list = bert_check(text, recall_data_list) # data_zong.append(bert_pre_list) t1 = time.time() original_dict = [] # 找出相似的句子序号 bool_check_sentense = [] # [[[1, 223],[1, 226], [1, 562]],[[2, 243],[2, 226], [2, 561]]] # bert算法 # for i in range(len(data_zong)): # if data_zong[i][0] == 1: # bool_check_sentense.append([i,data_zong[i][1]]) # rouge算法 for i in range(len(data_zong)): bool_check_sentense_dan = [] # [[1, 223],[1, 226], [1, 562]] for j in range(len(data_zong[i])): if data_zong[i][j][1] > 0.3: # print("data_zong[{}][{}]".format(i,j), data_zong[i][j][0]) bool_check_sentense_dan.append([i, data_zong[i][j][0]]) if bool_check_sentense_dan != []: bool_check_sentense.append(bool_check_sentense_dan) # 继续用rouge方法筛选 bool_check_sentense = rouge_pre_m_1(bool_check_sentense, dan_chapter_data, recall_data_list) # [[[1, 223],[1, 226], [1, 562]],[[2, 243],[2, 226], [2, 561]]] print("bool_check_sentense", bool_check_sentense) print("找出相似的句子序号完成") # print("data_zong", data_zong) biao_red = biaohong(bool_check_sentense, data_zong, recall_data_list) # [[[[0, 1, 2], [479, 480, 481]],[[0, 
1, 2], [471, 472, 473]]], [[[3, 4, 5], [481, 482, 483]], [[3, 4, 5], [461, 462, 463]]]] print("biao_red", str(biao_red)) original_sentence_index = [] # for i in biao_red: # for j in i: # original_sentence_index.append(j[0]) sentence_0_list = [] sentence_1_list = [] sim_paper_name = [] for i in range(len(biao_red)): for j in range(len(biao_red[i])): print("i,j",i, j) # if recall_data_list[biao_red[i][j][1][0]][1] == recall_data_list[biao_red[i][j][1][1]][1] == recall_data_list[biao_red[i][j][1][2]][1]: # sentence_0_list.append("".join([centent_list[biao_red[i][j][0][0]], centent_list[biao_red[i][j][0][1]], centent_list[biao_red[i][j][0][2]]])) # sentence_1_list.append( # "".join([recall_data_list[biao_red[i][j][1][0]][0], recall_data_list[biao_red[i][j][1][1]][0], recall_data_list[biao_red[i][j][1][2]][0]])) # sim_paper_name.append(recall_data_list[biao_red[i][j][1][0]][1]) # else: # continue file_name = recall_data_list[biao_red[i][j][1][1]][1] sentence_0_list_dan = [] sentence_1_list_dan = [] sentence_0_list_dan_index = [] # houxuna_file_list = [ # [recall_data_list[biao_red[i][j][1][0]][1], dan_chapter_data[biao_red[i][j][0][0]], # recall_data_list[biao_red[i][j][1][0]][0]], # [recall_data_list[biao_red[i][j][1][1]][1], dan_chapter_data[biao_red[i][j][0][1]], # recall_data_list[biao_red[i][j][1][1]][0]], # [recall_data_list[biao_red[i][j][1][2]][1], dan_chapter_data[biao_red[i][j][0][2]], # recall_data_list[biao_red[i][j][1][2]][0]] # ] sentence_0_list_dan = [dan_chapter_data[biao_red[i][j][0][index_simsentence]] for index_simsentence in range(len(biao_red[i][j][0]))] houxuna_file_list = [[recall_data_list[biao_red[i][j][1][index_simsentence]][1], recall_data_list[biao_red[i][j][1][index_simsentence]][0]] for index_simsentence in range(len(biao_red[i][j][0]))] for dan_sen_info in houxuna_file_list: if dan_sen_info[0] == file_name: sentence_1_list_dan.append(dan_sen_info[1]) if sentence_0_list_dan != [] and sentence_1_list_dan != []: sentence_0_list.append("".join(sentence_0_list_dan)) sentence_1_list.append("".join(sentence_1_list_dan)) original_sentence_index.append(biao_red[i][j][0]) sim_paper_name.append(recall_data_list[biao_red[i][j][1][1]][1]) print("待标红句子筛选完成") sentence_0_list_new = [] sentence_1_list_new = [] for i in zip(sentence_0_list, sentence_1_list): if len(i[0]) + len(i[1]) < 1200: sentence_0_list_new.append(i[0]) sentence_1_list_new.append(i[1]) else: print(len(i[0]) + len(i[1])) continue t2 = time.time() print() for i in sentence_0_list_new: print("sentence_0_list_new", i) if sentence_0_list_new == sentence_1_list_new == []: paper_dict = [] else: paper_dict = biaohong_bert_predict(sentence_0_list_new, sentence_1_list_new) t3 = time.time() print("标红完成") print("标红时间", t3 - t2) original_text = [] original_text_contrast = [] repeat_quote_info = [] chongfuwendang = {} print("paper_dict", paper_dict) print("sentence_0_list_new", sentence_0_list_new) print("sentence_1_list_new", sentence_1_list_new) print("sim_paper_name", sim_paper_name) similar_content_control = [[]] # with open("data/10235513_大型商业建筑人员疏散设计研究_沈福禹/paper_dict.json", "w") as f: # json.dump(paper_dict, f, ensure_ascii=False) sentence_0_list_new_cursor = sentence_0_list_new[0] for paper_dict_dan_id, sentence_0_dan, sentence_1_dan, sim_paper_name_dan, original_sentence_index_dan in zip( range(len(paper_dict)), sentence_0_list_new, sentence_1_list_new, sim_paper_name, original_sentence_index): if sentence_0_list_new_cursor != sentence_0_dan: similar_content_control.append( [[paper_dict_dan_id, sentence_0_dan, 
sentence_1_dan, sim_paper_name_dan, original_sentence_index_dan]]) sentence_0_list_new_cursor = sentence_0_dan else: similar_content_control[-1].append( [paper_dict_dan_id, sentence_0_dan, sentence_1_dan, sim_paper_name_dan, original_sentence_index_dan]) return similar_content_control, paper_dict def accurate_check_rouge( title, author, text_paper, recall_data_list ): ''' 精确查重出相似句子 :param text: :param recall_data_list: list [[sentence, filename],[sentence, filename],[sentence, filename]] :return: ''' # 文本处理 # centent_list = [] print("text_paper", len(text_paper)) text_paper = str(text_paper).replace("。\n", "。") centent_list_old = text_paper.split("。") sentence_word_nums = 0 centent_list = [] for i in centent_list_old: if len(i) < 300: centent_list.append(i + "。") if i == "": continue centent_list_zong = split_chapter(centent_list) # 分章 similar_content_control_zong = [] paper_dict_zong = [] num_words_zong = [] chapter_data = [] index_centent_list = [] for index_centent_list_zong in range(len(centent_list_zong)): dan_chapter_data, dan_chapter_num_words = centent_list_zong[index_centent_list_zong][0], centent_list_zong[index_centent_list_zong][1] similar_content_control, paper_dict = chapter_check(dan_chapter_data, recall_data_list) similar_content_control_zong.append(similar_content_control) paper_dict_zong.append(paper_dict) num_words_zong.append(dan_chapter_num_words) chapter_data.append(dan_chapter_data) index_centent_list.append(index_centent_list_zong) paper_data = check_dict(similar_content_control_zong, paper_dict_zong, num_words_zong, title, author, chapter_data, index_centent_list) # data = [similar_content_control] # # # 模拟多个章节 # section_details_list = [] # for data_dan in data: # data_section_dan = data_dan # # # 章节详细信息 # section_details = section_details_func(data_section_dan, paper_dict, num_words) # section_details_list.append(section_details) # # # 模拟多个章节 # # section_data_list = [] # for section_details in section_details_list: # section_data = section_data_func(section_details) # section_data_list.append(section_data) # # total_data = total_data_func(section_details_list) # # format = '%Y-%m-%d %H:%M:%S' # value = time.localtime(int(time.time())) # dt = time.strftime(format, value) # # paper_data = { # "author": author, # "check_time": dt, # "time_range": "1900-01-01至2023-08-08", # "title": title, # "total_data": total_data, # "section_data": section_data_list, # "section_details": section_details_list # } return paper_data def biaohong(bool_check_sentense, data_zong, df_train_nuoche): ''' 标红的序号 [[0,1,2],[3,4,5]] :param bool_check_sentense: # [[[1, 223],[1, 226], [1, 562]],[[2, 243],[2, 226], [2, 561]]] :return: list # [[[[0, 1, 2], [479, 480, 481]],[[0, 1, 2], [471, 472, 473]]], [[[3, 4, 5], [481, 482, 483]], [[3, 4, 5], [461, 462, 463]]]] ''' # print("bool_check_sentense", bool_check_sentense) biao_red = [] i = 0 start = -1 end = -1 tiaochu = False while True: # if i >= len(bool_check_sentense) or bool_check_sentense[i][0] + 1 >= len(data_zong) or bool_check_sentense[i][1] \ # + 1 >= len(df_train_nuoche): # break if i >= len(bool_check_sentense): break for j in bool_check_sentense[i]: # print("j", j) if j[0] + 1 > len(data_zong): tiaochu = True break # if bool_check_sentense[i][0][0] + 1 >= len(data_zong): # if bool_check_sentense[] # bool_check_sentense[i][0][0] + 1 = bool_check_sentense[i + 1][0][0] # break for j in bool_check_sentense[i]: if j[1] + 1 >= len(df_train_nuoche): tiaochu = True break if tiaochu == True: break # elif bool_check_sentense[i-1][0][0] == start: # 
biao_red_dan = [] # for j in range(len(bool_check_sentense[i-1])): # bool_check_sentense: [[[1, 223],[1, 226], [1, 562]],[[2, 243],[2, 226], [2, 561]]] # biao_red_dan.append([[bool_check_sentense[i-1][j][0], bool_check_sentense[i-1][j][0]+ 1, bool_check_sentense[i-1][j][0] + 2], # [bool_check_sentense[i-1][j][1] - 1, bool_check_sentense[i-1][j][1], bool_check_sentense[i+1][j][1] + 1]]) # biao_red.append(biao_red_dan) # # elif bool_check_sentense[i+1][0][0] == end: # biao_red_dan = [] # for j in range(len(bool_check_sentense[i+1])): # bool_check_sentense: [[[1, 223],[1, 226], [1, 562]],[[2, 243],[2, 226], [2, 561]]] # biao_red_dan.append([[bool_check_sentense[i+1][j][0]-2, bool_check_sentense[i+1][j][0]-1, bool_check_sentense[i+1][j][0]], # [bool_check_sentense[i+1][j][1] - 1, bool_check_sentense[i+1][j][1], bool_check_sentense[i+1][j][1] + 1]]) # biao_red.append(biao_red_dan) elif i == len(bool_check_sentense)-1: if end == bool_check_sentense[i][0][0]: i += 1 break elif bool_check_sentense[i][0][0]-1 == end + 1 and bool_check_sentense[i][0][0] == len(data_zong) -1: index_list = [ii for ii in range(bool_check_sentense[i][0][0]-1, bool_check_sentense[i][0][0] + 1)] elif bool_check_sentense[i][0][0]-1 == end and bool_check_sentense[i][0][0] == len(data_zong) -1: index_list = [ii for ii in range(bool_check_sentense[i][0][0], bool_check_sentense[i][0][0] + 1)] elif bool_check_sentense[i][0][0]-1 > end + 1 and bool_check_sentense[i][0][0] == len(data_zong) -1: index_list = [ii for ii in range(bool_check_sentense[i][0][0] - 1, bool_check_sentense[i][0][0] + 1)] else: index_list = [ii for ii in range(bool_check_sentense[i][0][0] - 1, bool_check_sentense[i][0][0] + 2)] biaohongset = set() biao_red_dan = [] for j in range(len(bool_check_sentense[ i])): # bool_check_sentense: [[[1, 223],[1, 226], [1, 562]],[[2, 243],[2, 226], [2, 561]]] if bool_check_sentense[i][j][1] not in biaohongset: biao_red_dan.append([index_list, [bool_check_sentense[i][j][1] - 1, bool_check_sentense[i][j][1], bool_check_sentense[i][j][1] + 1]]) biaohongset.add(bool_check_sentense[i][j][1] - 1) biaohongset.add(bool_check_sentense[i][j][1]) biaohongset.add(bool_check_sentense[i][j][1] + 1) else: continue i += 1 biao_red.append(biao_red_dan) break elif bool_check_sentense[i][0][0] - 1 == start: i += 1 continue elif bool_check_sentense[i][0][0] == end: i += 1 continue elif bool_check_sentense[i][0][0] - 1 == end: i += 1 continue else: biaohongset = set() biao_red_dan = [] for j in range(len(bool_check_sentense[i])): # bool_check_sentense: [[[1, 223],[1, 226], [1, 562]],[[2, 243],[2, 226], [2, 561]]] if bool_check_sentense[i][j][1] not in biaohongset: biao_red_dan.append([[bool_check_sentense[i][j][0] - 1, bool_check_sentense[i][j][0], bool_check_sentense[i][j][0] + 1], [bool_check_sentense[i][j][1] - 1, bool_check_sentense[i][j][1], bool_check_sentense[i][j][1] + 1]]) biaohongset.add(bool_check_sentense[i][j][1] - 1) biaohongset.add(bool_check_sentense[i][j][1]) biaohongset.add(bool_check_sentense[i][j][1] + 1) else: continue start = bool_check_sentense[i][0][0] - 1 end = bool_check_sentense[i][0][0] + 1 if bool_check_sentense[i-1][0][0] == start: for j in range(len(bool_check_sentense[i-1])): # bool_check_sentense: [[[1, 223],[1, 226], [1, 562]],[[2, 243],[2, 226], [2, 561]]] if bool_check_sentense[i - 1][j][1] not in biaohongset: biao_red_dan.append([[bool_check_sentense[i-1][j][0], bool_check_sentense[i-1][j][0] + 1, bool_check_sentense[i-1][j][0] + 2], [bool_check_sentense[i-1][j][1] - 1, bool_check_sentense[i-1][j][1], 
bool_check_sentense[i-1][j][1] + 1]]) biaohongset.add(bool_check_sentense[i-1][j][1] - 1) biaohongset.add(bool_check_sentense[i-1][j][1]) biaohongset.add(bool_check_sentense[i-1][j][1] + 1) else: continue if bool_check_sentense[i+1][0][0] == end: for j in range(len(bool_check_sentense[i+1])): # bool_check_sentense: [[[1, 223],[1, 226], [1, 562]],[[2, 243],[2, 226], [2, 561]]] if bool_check_sentense[i + 1][j][1] not in biaohongset: biao_red_dan.append([[bool_check_sentense[i+1][j][0]-2, bool_check_sentense[i+1][j][0]-1, bool_check_sentense[i+1][j][0]], [bool_check_sentense[i+1][j][1] - 1, bool_check_sentense[i+1][j][1], bool_check_sentense[i+1][j][1] + 1]]) biaohongset.add(bool_check_sentense[i+1][j][1] - 1) biaohongset.add(bool_check_sentense[i+1][j][1]) biaohongset.add(bool_check_sentense[i+1][j][1] + 1) else: continue i += 1 biao_red.append(biao_red_dan) return biao_red # [[[[0, 1, 2], [479, 480, 481]],[[0, 1, 2], [471, 472, 473]]], [[[3, 4, 5], [481, 482, 483]], [[3, 4, 5], [461, 462, 463]]]] def dialog_line_parse(url, text): """ 将数据输入模型进行分析并输出结果 :param url: 模型url :param text: 进入模型的数据 :return: 模型返回结果 """ response = requests.post( url, json=text, timeout=100000 ) if response.status_code == 200: return response.json() else: # logger.error( # "【{}】 Failed to get a proper response from remote " # "server. Status Code: {}. Response: {}" # "".format(url, response.status_code, response.text) # ) print("【{}】 Failed to get a proper response from remote " "server. Status Code: {}. Response: {}" "".format(url, response.status_code, response.text)) print(text) return {} def is_english_char(char): code = ord(char) return 32 <= code <= 126 def original_text_marked_red(text_original, bert_text, start, end): ''' 把原文标红字段找到 :param text_original: :param bert_text: :param bert_text_pre: :return: ''' try: fuhao = ["\n"] up_pointer = 0 down_pointer = 0 pointer_list = [] bert_text_list = list(bert_text) bert_text_list.insert(start, "") bert_text_list.insert(end + 2, "") text_original_list = list(text_original) up = 0 down = 0 while True: if up == len(text_original_list): break if text_original_list[up] == bert_text_list[down]: up += 1 down += 1 else: if bert_text_list[down] == "": down += 1 elif bert_text_list[down] == "": down += 1 else: bert_text_list.insert(down, text_original_list[up]) up += 1 down += 1 bert_text = "".join(bert_text_list) return True, bert_text except: print("句子标红报错") print(text_original, bert_text) return False, "" def biaohong_bert_predict(sentence_0_list, sentence_1_list): ''' 找出标红字符 :param bool_check_sentense: :return: ''' paper_dict = \ dialog_line_parse("http://192.168.31.74:16003/", {"sentence_0": sentence_0_list, "sentence_1": sentence_1_list})[ "resilt"] return paper_dict def ulit_text(title, text): data = [] try: text = json.loads(text)["content"] except: pass text = text.strip().replace("\n", "").replace(" ", "").replace("。", "。\n") text_list = text.split("\n") for i in text_list: data.append([i, title]) return data def run_query(conn, sql, params): with conn.cursor() as cursor: cursor.execute(sql, params) result = cursor.fetchall() return result def processing_one_text(paper_id): conn = pymysql.connect( host='192.168.31.145', port=3306, user='root', password='123456', db='zhiwang_db', charset='utf8mb4', cursorclass=pymysql.cursors.DictCursor ) sql = 'SELECT * FROM main_table_paper_detail_message WHERE doc_id=%s' params = (paper_id,) result = run_query(conn, sql, params) conn.close() print(result[0]['title'], result[0]['author']) title = result[0]['title'] author = 
result[0]['author'] degree = result[0]['degree'] year = result[0]['content'].split("/")[5] content_path = result[0]['content'] try: with open(content_path, encoding="utf-8") as f: text = f.read() except: with open(content_path, encoding="gbk") as f: text = f.read() paper_info = { "title": title, "author": author, "degree": degree, "year": year, "paper_len_word": len(text) } data = ulit_text(paper_info, text) return data def ulit_recall_paper(recall_data_list_dict): ''' 对返回的十篇文章路径读取并解析 :param recall_data_list_path: :return data: list [[sentence, filename],[sentence, filename],[sentence, filename]] ''' # data = [] # for path in recall_data_list_path: # filename = path.split("/")[-1] # with open(path, encoding="gbk") as f: # text = f.read() # text_list = text.split("\n") # for sentence in text_list: # if sentence != "": # data.append([sentence, filename]) # return data data = [] for i in list(recall_data_list_dict.items()): data_one = processing_one_text(i[0]) data.extend(data_one) return data def recall_10(queue_uuid, title, abst_zh, content): ''' 宇鹏召回接口 :param paper_name: :return: ''' request_json = { "uuid": queue_uuid, "title": title, "abst_zh": abst_zh, "content": content } print(request_json) dialog_line_parse("http://192.168.31.145:50004/check1", request_json) def uilt_content(content): zhaiyao_list = ["摘要"] zhaiyao_en_list = ["Abstract", "abstract"] mulu_list = ["目录"] key_word_list = ["关键词"] caikanwenxian = ["参考文献"] key_word_bool = False key_word_str = "" zhaiyao_bool = False zhaiyao_en_bool = False zhaiyao_str = "" zhaiyao_en_str = "" mulu_str = "" zhaiyao_text = "" mulu_bool = False for i in zhaiyao_list: if i in content: zhaiyao_bool = True zhaiyao_str = i break for i in zhaiyao_en_list: if i in content: zhaiyao_en_bool = True zhaiyao_en_str = i break for i in mulu_list: if i in content: mulu_str = i mulu_bool = True break for i in key_word_list: if i in content: key_word_str = i key_word_bool = True break if zhaiyao_bool == True and zhaiyao_en_bool == True: pantten_zhaiyao = "{}(.*?){}".format(zhaiyao_str, zhaiyao_en_str) result_biaoti_list = re.findall(pantten_zhaiyao, content) zhaiyao_text = result_biaoti_list[0] elif zhaiyao_bool == True and key_word_bool == True: pantten_zhaiyao = "{}(.*?){}".format(zhaiyao_str, key_word_str) result_biaoti_list = re.findall(pantten_zhaiyao, content) zhaiyao_text = result_biaoti_list[0] elif zhaiyao_bool == True and mulu_bool == True: pantten_zhaiyao = "{}(.*?){}".format(zhaiyao_str, mulu_str) result_biaoti_list = re.findall(pantten_zhaiyao, content) zhaiyao_text = result_biaoti_list[0] if zhaiyao_text == "": content = str(content).replace("。\n", "。") content_list = content.split("。") zhaiyao_text = "".join(content_list[:15]) return zhaiyao_text def ulit_request_file(file): file_name = file.filename if file_name.split(".")[-1] == "txt": file_name_save = "data/request/{}".format(file_name) file.save(file_name_save) try: with open(file_name_save, encoding="gbk") as f: content = f.read() except: with open(file_name_save, encoding="utf-8") as f: content = f.read() content = " ".join([i for i in content.split("\n") if i != ""]) abst_zh = uilt_content(content) return abst_zh, content # @app.route("/", methods=["POST"]) # def handle_query(): # print(request.remote_addr) # # # request.form.get('prompt') # dataBases = request.form.get("dataBases") # minSimilarity = request.form.get("minSimilarity") # txt # minWords = request.form.get("minWords") # title = request.form.get("title") # author = request.form.get("author") # txt # file = 
request.files.get('file') # token = request.form.get("token") # account = request.form.get("account") # goodsId = request.form.get("goodsId") # callbackUrl = request.form.get("callbackUrl") # # # t0 = time.time() # abst_zh, content = ulit_request_file(file) # # # 调用宇鹏查询相似十篇 # # recall_data_list_dict = recall_10(title, abst_zh, content) # # t1 = time.time() # print("查找相似的50篇完成") # with open("data/rell_json.txt") as f: # recall_data_list_dict = eval(f.read()) # # # 读取文章转化成格式数据 # recall_data_list = ulit_recall_paper(recall_data_list_dict) # print("文章格式转化完成") # # # recall_data_list = pd.read_csv("data/10235513_大型商业建筑人员疏散设计研究_沈福禹/查重.csv", encoding="utf-8").values.tolist() # # # 进入精确查重系统 # print("进入精确查重系统") # return_list = accurate_check_rouge(title, author, content, recall_data_list) # # print("召回50篇", t1 - t0) # # return_text = {"resilt": return_list, "probabilities": None, "status_code": 200} # return jsonify(return_text) # 返回结果 # def classify_recall(): # 调用模型,设置最大batch_size # while True: # if redis_.llen(db_key_query) == 0: # 若队列中没有元素就继续获取 # time.sleep(3) # continue # query = redis_.lpop(db_key_query).decode('UTF-8') # 获取query的text # data_dict_path = json.loads(query) # path = data_dict_path['path'] # # text_type = data_dict["text_type"] # # with open(path, encoding='utf8') as f1: # # 加载文件的对象 # data_dict = json.load(f1) # # queue_uuid = data_dict['id'] # print(queue_uuid) # dataBases = data_dict['dataBases'] # minSimilarity = data_dict['minSimilarity'] # minWords = data_dict['minWords'] # title = data_dict['title'] # author = data_dict['author'] # abst_zh = data_dict['abst_zh'] # content = data_dict['content'] # token = data_dict['token'] # account = data_dict['account'] # goodsId = data_dict['goodsId'] # callbackUrl = data_dict['callbackUrl'] # # # 调用宇鹏查询相似十篇 # recall_data_list_dict = recall_10(queue_uuid, title, abst_zh, content) # # # print("查找相似的50篇完成") # # with open("data/rell_json.txt") as f: # # recall_data_list_dict = eval(f.read()) # # # 读取文章转化成格式 def classify_accurate_check(): while True: if redis_.llen(db_key_query_recall) == 0: # 若队列中没有元素就继续获取 time.sleep(3) continue query_recall = redis_.lpop(db_key_query_recall).decode('UTF-8') # 获取query的text query_recall_dict = json.loads(query_recall) query_recall_uuid = query_recall_dict["uuid"] recall_data_list_dict = json.loads(query_recall_dict["data"]) recall_data_list = ulit_recall_paper(recall_data_list_dict) data_dict_path = redis_.get(query_recall_uuid + "_request_check") print(data_dict_path) with open(data_dict_path, encoding='utf8') as f: data_dict = json.loads(f.read()) queue_uuid = data_dict['id'] dataBases = data_dict['dataBases'] minSimilarity = data_dict['minSimilarity'] minWords = data_dict['minWords'] title = data_dict['title'] author = data_dict['author'] abst_zh = data_dict['abst_zh'] content = data_dict['content'] token = data_dict['token'] account = data_dict['account'] goodsId = data_dict['goodsId'] callbackUrl = data_dict['callbackUrl'] # try: print("查找相似的50篇完成") print(len(content)) # with open("data/rell_json.txt") as f: # recall_data_list_dict = eval(f.read()) # recall_data_list = ulit_recall_paper(recall_data_list_dict) print("文章格式转化完成") # recall_data_list = pd.read_csv("data/10235513_大型商业建筑人员疏散设计研究_沈福禹/查重.csv", encoding="utf-8").values.tolist() # 进入精确查重系统 print("进入精确查重系统") return_list = accurate_check_rouge(title, author, content, recall_data_list) return_text = {"resilt": return_list, "probabilities": None, "status_code": 200} load_result_path = "./new_data_logs/{}.json".format(queue_uuid) print("queue_uuid: ", 
              queue_uuid)
        print("load_result_path: ", load_result_path)

        with open(load_result_path, 'w', encoding='utf8') as f2:
            # ensure_ascii=False is needed to write Chinese characters instead of Unicode escapes;
            # indent keeps the JSON readable
            json.dump(return_text, f2, ensure_ascii=False, indent=4)

        print(queue_uuid)
        print(load_result_path)
        redis_.set(queue_uuid, load_result_path, 86400)
        redis_.srem(db_key_querying, queue_uuid)

        # except:
        #     return_text = {"resilt": "", "probabilities": None, "status_code": 401}
        #     load_result_path = "./new_data_logs/{}.json".format(queue_uuid)
        #
        #     print("queue_uuid: ", queue_uuid)
        #     print("load_result_path: ", load_result_path)
        #
        #     with open(load_result_path, 'w', encoding='utf8') as f2:
        #         # ensure_ascii=False is needed to write Chinese characters; indent keeps the JSON readable
        #         json.dump(return_text, f2, ensure_ascii=False, indent=4)
        #
        #     print(queue_uuid)
        #     print(load_result_path)
        #     redis_.set(queue_uuid, load_result_path, 86400)
        #     redis_.srem(db_key_querying, queue_uuid)


@app.route("/", methods=["POST"])
def handle_query():
    try:
        print(request.remote_addr)

        # request.form.get('prompt')
        dataBases = request.form.get("dataBases")
        minSimilarity = request.form.get("minSimilarity")  # txt
        minWords = request.form.get("minWords")
        title = request.form.get("title")
        author = request.form.get("author")  # txt
        file = request.files.get('file')
        token = request.form.get("token")
        account = request.form.get("account")
        goodsId = request.form.get("goodsId")
        callbackUrl = request.form.get("callbackUrl")

        abst_zh, content = ulit_request_file(file)

        id_ = str(uuid.uuid1())  # generate a unique id for this query
        print("uuid: ", id_)
        print(id_)
        d = {
            'id': id_,
            'dataBases': dataBases,
            'minSimilarity': minSimilarity,
            'minWords': minWords,
            'title': title,
            'author': author,
            'abst_zh': abst_zh,
            'content': content,
            'token': token,
            'account': account,
            'goodsId': goodsId,
            'callbackUrl': callbackUrl
        }
        print(d)

        # bind the text to the query id
        # recall_10(id_, title, abst_zh, content)
        Thread_rellce = Thread(target=recall_10, args=(id_, title, abst_zh, content,))
        Thread_rellce.start()

        load_request_path = './request_data_logs/{}.json'.format(id_)
        with open(load_request_path, 'w', encoding='utf8') as f2:
            # ensure_ascii=False is needed to write Chinese characters; indent keeps the JSON readable
            json.dump(d, f2, ensure_ascii=False, indent=4)

        # redis_.rpush(db_key_query, json.dumps({"id": id_, "path": load_request_path}))  # push to redis
        # redis_.sadd(db_key_querying, id_)
        # redis_.sadd(db_key_queryset, id_)
        redis_.set(id_ + "_request_check", load_request_path)

        return_text = {
            'code': 0,
            'msg': "请求成功",
            'data': {
                'balances': "",
                'orderId': id_,
                'consumeNum': ""
            }
        }
        print("ok")
    except Exception as e:
        # surface the error instead of failing silently
        print("handle_query error:", e)
        return_text = {'code': 1}
    return jsonify(return_text)  # return the result


t1 = Thread(target=classify_accurate_check)
t1.start()

# t = Thread(target=classify_recall)
# t.start()

if __name__ == "__main__":
    app.run(host="0.0.0.0", port=16001, threaded=True)
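

# ---------------------------------------------------------------------------
# Minimal client sketch (illustrative only; never called by this service).
# Assumptions: the service runs on http://localhost:16001/, this process can
# read the same Redis instance configured above, and the form fields mirror
# what handle_query() reads. classify_accurate_check() stores the result-file
# path in Redis under the returned orderId, which is what the polling loop
# reads back; the file name and the hypothetical helper name are placeholders.
# ---------------------------------------------------------------------------
def _example_submit_and_poll(txt_path="paper.txt"):
    form = {
        "dataBases": "",
        "minSimilarity": "",
        "minWords": "",
        "title": "example title",
        "author": "example author",
        "token": "",
        "account": "",
        "goodsId": "",
        "callbackUrl": ""
    }
    # ulit_request_file() only accepts ".txt" uploads, so the file name matters.
    with open(txt_path, "rb") as f:
        resp = requests.post("http://localhost:16001/", data=form, files={"file": f}).json()
    order_id = resp["data"]["orderId"]

    # Poll Redis until the worker thread has written the result JSON to disk.
    while True:
        result_path = redis_.get(order_id)
        if result_path:
            if isinstance(result_path, bytes):
                result_path = result_path.decode("utf-8")
            with open(result_path, encoding="utf8") as f:
                return json.load(f)
        time.sleep(3)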