import os
import numpy as np
from numpy.linalg import norm
import pandas as pd
# from rouge import Rouge
from rouge_chinese import Rouge
from Rouge_w import Rouge_w, Rouge_l
import json
import pymysql
import re
import requests
from flask import Flask, jsonify
from flask import request
import uuid
import time
import redis
from threading import Thread

app = Flask(__name__)
app.config["JSON_AS_ASCII"] = False

pool = redis.ConnectionPool(host='localhost', port=63179, max_connections=100, db=7,
                            password="zhicheng123*")
# note: decode_responses is ignored when an explicit connection_pool is passed,
# hence the manual .decode() on values read back in classify()
redis_ = redis.Redis(connection_pool=pool, decode_responses=True)

db_key_query = 'query'
db_key_querying = 'querying'
db_key_queryset = 'queryset'
nums_cpus = 16

rouge = Rouge()
rouge_model = Rouge_w()
rouge_l_model = Rouge_l()


def bert_check(text, recall_data_list):
    '''
    BERT-based duplicate check
    :return:
    '''
    sen_0 = [text] * len(recall_data_list)
    sen_1 = [i[0] for i in recall_data_list]
    return_list = []

    request_json = {
        "texts": [sen_0, sen_1],
    }
    paper_dict = dialog_line_parse("http://192.168.31.74:16002/", request_json)
    score_list = paper_dict["res"]

    # to be revised later
    # return_list.append(re1[0][1])
    # return_list.append(re1[0][0])

    if 1 in score_list:
        index_score = score_list.index(1)
    else:
        index_score = "NaN"

    if index_score == "NaN":
        return_list.append(0)
        return_list.append("")
    else:
        return_list.append(1)
        return_list.append(index_score)

    return return_list


def rouge_value_self(data_1, data_2):
    '''
    Character-level Rouge-L between sentence pairs; only the Rouge-L list is
    computed, the Rouge-1/Rouge-2 slots are returned empty.
    '''
    data_1 = [' '.join(i) for i in data_1]
    data_2 = [' '.join(i) for i in data_2]

    rouge_l_list = []
    for sen_1, sen_2 in zip(data_1, data_2):
        sen_1 = sen_1.split(" ")
        sen_2 = sen_2.split(" ")
        rouge_l_score = rouge_l_model.score(sen_1, sen_2)
        rouge_l_list.append(rouge_l_score)

    return "", "", rouge_l_list


def rouge_pre(text, df_train_nuoche):
    '''
    Return [best Rouge-L score, index of the best-matching recall sentence].
    '''
    return_list = []
    index_rouge_list = []
    text_list = [text] * len(df_train_nuoche)

    data_list = []
    for data_dan in df_train_nuoche:
        data_list.append(data_dan[0])
    rouge_1, rouge_2, rouge_l = rouge_value_self(text_list, data_list)
    index_rouge_list.extend(rouge_l)

    re1 = [(i[0], i[1]) for i in sorted(list(enumerate(index_rouge_list)),
                                        key=lambda x: x[1], reverse=True)]

    return_list.append(re1[0][1])
    return_list.append(re1[0][0])

    return return_list
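# A minimal, illustrative call to rouge_pre (not executed by the service; the
# sample sentences are made up): it returns the best Rouge-L score over the
# recall list together with the index of the best-matching sentence.
def _demo_rouge_pre():
    candidates = [["今天天气很好", "doc_a"], ["大型商业建筑人员疏散设计研究", "doc_b"]]
    score, index = rouge_pre("大型商业建筑的人员疏散设计", candidates)
    print(score, index)  # the second candidate should win, i.e. index == 1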
def accurate_check_rouge(title, author, text_paper, recall_data_list):
    '''
    Precise duplicate check: find the similar sentences
    :param text_paper:
    :param recall_data_list: list [[sentence, paper_info], [sentence, paper_info], ...]
    :return:
    '''
    # text preprocessing
    centent_list = []
    text_paper = str(text_paper).replace("。\n", "。")
    centent_list.extend(text_paper.split("。"))
    data_zong = []
    sentence_word_nums = 0

    # Rouge-based screening
    for text in centent_list:
        rouge_pre_list = rouge_pre(text, recall_data_list)
        data_zong.append(rouge_pre_list)

    t0 = time.time()
    # BERT-based screening
    # for text in centent_list:
    #     bert_pre_list = bert_check(text, recall_data_list)
    #     data_zong.append(bert_pre_list)
    t1 = time.time()

    # collect the indexes of similar sentences
    bool_check_sentense = []
    # BERT variant
    # for i in range(len(data_zong)):
    #     if data_zong[i][0] == 1:
    #         bool_check_sentense.append([i, data_zong[i][1]])
    # Rouge variant
    for i in range(len(data_zong)):
        if data_zong[i][0] > 0.47:
            bool_check_sentense.append([i, data_zong[i][1]])
    biao_red = biaohong(bool_check_sentense, data_zong, recall_data_list)
    # e.g. [[[0, 1, 2], [479, 480, 481]], [[3, 4, 5], [481, 482, 483]], [[6, 7, 8], [484, 485, 486]]]

    # (the BERT pass above is commented out, so this times an empty interval)
    print("bert精确查重时间", t1 - t0)

    sentence_0_list = []
    sentence_1_list = []
    sim_paper_name = []

    for i in biao_red:
        # keep a window only when all three recall sentences come from the same paper
        if recall_data_list[i[1][0]][1] == recall_data_list[i[1][1]][1] == recall_data_list[i[1][2]][1]:
            sentence_0_list.append("。".join([centent_list[i[0][0]], centent_list[i[0][1]], centent_list[i[0][2]]]))
            sentence_1_list.append("".join([recall_data_list[i[1][0]][0], recall_data_list[i[1][1]][0],
                                            recall_data_list[i[1][2]][0]]))
            sim_paper_name.append(recall_data_list[i[1][0]][1])
        else:
            continue

    sentence_0_list_new = []
    sentence_1_list_new = []
    for i in zip(sentence_0_list, sentence_1_list):
        if len(i[0]) + len(i[1]) < 1200:
            sentence_0_list_new.append(i[0])
            sentence_1_list_new.append(i[1])
        else:
            print(len(i[0]) + len(i[1]))
            continue

    t2 = time.time()
    paper_dict = biaohong_bert_predict(sentence_0_list_new, sentence_1_list_new)
    t3 = time.time()
    print("标红时间", t3 - t2)

    original_text = []
    original_text_contrast = []
    repeat_quote_info = []
    chongfuwendang = {}

    for paper_dict_dan_id, sentence_0_dan, sentence_1_dan, sim_paper_name_dan in zip(
            range(len(paper_dict)), sentence_0_list_new, sentence_1_list_new, sim_paper_name):
        print([sentence_0_dan, sentence_1_dan])
        original_text_contrast_dict = {
            "original_text": "",
            "similar_content": [
                {
                    "content": "",
                    "thesis_info": "",
                    "title": "",
                    "year": "",
                    "degree": "",
                    "author": "",
                }
            ]
        }
        try:
            sentence_0_bool, sentence_0_dan_red = original_text_marked_red(
                sentence_0_dan, paper_dict[paper_dict_dan_id][0], paper_dict[paper_dict_dan_id][1])
            # text_original, bert_text, bert_text_pre
        except Exception:
            print("报错", [sentence_0_dan, paper_dict[paper_dict_dan_id][0], paper_dict[paper_dict_dan_id][1]])
            continue

        sentence_1_bool, sentence_1_dan_red = original_text_marked_red(
            sentence_1_dan, paper_dict[paper_dict_dan_id][2], paper_dict[paper_dict_dan_id][3])
        # text_original, bert_text, bert_text_pre

        if sentence_0_bool == False or sentence_1_bool == False:
            continue

        dan_sentence_word_nums = len(paper_dict[paper_dict_dan_id][1])
        sentence_word_nums += dan_sentence_word_nums

        original_text.append(sentence_0_dan_red)
        original_text_contrast_dict["original_text"] = "此处有 {} 字相似\n".format(
            dan_sentence_word_nums) + sentence_0_dan_red

        thesis_info = " ".join([sim_paper_name_dan["title"], sim_paper_name_dan["author"],
                                sim_paper_name_dan["degree"], sim_paper_name_dan["year"]])
        original_text_contrast_dict["similar_content"][0]["content"] = sentence_1_dan_red
        original_text_contrast_dict["similar_content"][0]["title"] = sim_paper_name_dan["title"]
        original_text_contrast_dict["similar_content"][0]["author"] = sim_paper_name_dan["author"]
        original_text_contrast_dict["similar_content"][0]["degree"] = sim_paper_name_dan["degree"]
        original_text_contrast_dict["similar_content"][0]["year"] = sim_paper_name_dan["year"]
        original_text_contrast_dict["similar_content"][0]["thesis_info"] = thesis_info

        original_text_contrast.append(original_text_contrast_dict)

        # accumulate per-source-paper repeat statistics
        if thesis_info not in chongfuwendang:
            chongfuwendang[thesis_info] = {
                "quote": False,
                "thesis_author": sim_paper_name_dan["author"],
                "thesis_date": sim_paper_name_dan["year"],
                "thesis_info": thesis_info,
                "thesis_repeat_rate": (dan_sentence_word_nums / sim_paper_name_dan["paper_len_word"]) * 100,
                "thesis_title": sim_paper_name_dan["title"],
                "thesis_link": "",
                "thesis_publish": sim_paper_name_dan["degree"],
                "thesis_repeat_word": dan_sentence_word_nums,
                "thesis_teacher": "",
                "paper_len_word": sim_paper_name_dan["paper_len_word"]
            }
        else:
            chongfuwendang[thesis_info]["thesis_repeat_word"] += dan_sentence_word_nums
            chongfuwendang[thesis_info]["thesis_repeat_rate"] = (
                chongfuwendang[thesis_info]["thesis_repeat_word"]
                / chongfuwendang[thesis_info]["paper_len_word"]) * 100

    chongfuwendang = sorted(chongfuwendang.items(), key=lambda x: x[1]["thesis_repeat_rate"], reverse=False)

    for i in range(len(chongfuwendang)):
        repeat_paper_one_info_dict = chongfuwendang[i][1]
        repeat_paper_one_info_dict.pop("paper_len_word")
        repeat_paper_one_info_dict["thesis_repeat_rate"] = str(
            round(repeat_paper_one_info_dict["thesis_repeat_rate"], 1)) + "%"
        repeat_quote_info.append(repeat_paper_one_info_dict)

    original_text = "。".join(original_text)
    repetition_rate = sentence_word_nums / len(text_paper)
    # scale before rounding to avoid float noise such as 12.299999999999999
    repetition_rate = round(repetition_rate * 100, 1)

    format = '%Y-%m-%d %H:%M:%S'
    value = time.localtime(int(time.time()))
    dt = time.strftime(format, value)

    return {
        "author": author,
        "check_time": dt,
        "title": title,
        "time_range": "1900-01-01至2023-08-08",
        "section_data": [
            {
                "oneself_repeat_words": sentence_word_nums,
                "reference_repeat_words": sentence_word_nums,
                "section_name": "第1部分",
                "section_oneself_rate": "{}%".format(repetition_rate),
                "section_repeat_rate": "{}%".format(repetition_rate),
                "section_repeat_words": sentence_word_nums,
                "section_words": len(text_paper)
            }
        ],
        "section_details": [
            {
                "end_page_index": 0,
                "name": "",
                "repeat_rate": "",
                "repeat_words": "",
                "words": "",
                "original_text": original_text,
                "original_text_oneself": original_text,
                "original_text_contrast": original_text_contrast,
                "repeat_quote_info": repeat_quote_info
            }
        ],
        "total_data": {
            "back_repeat_words": "",
            "exclude_personal_rate": "{}%".format(repetition_rate),
            "exclude_quote_rate": "{}%".format(repetition_rate),
            "foot_end_note": "0",
            "front_repeat_words": "",
            "single_max_rate": "",
            "single_max_repeat_words": "",
            "suspected_paragraph": "1",
            "suspected_paragraph_max_repeat_words": "",
            "suspected_paragraph_min_repeat_words": "",
            "tables": "0",
            "total_paragraph": "1",
            "total_repeat_rate": "{}%".format(repetition_rate),
            "total_repeat_words": sentence_word_nums,
            "total_words": len(text_paper)
        }
    }
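# Shape of the report built above (summarised from accurate_check_rouge):
#   section_data     -- per-section word counts and repeat rates
#   section_details  -- the red-marked original text, per-window contrast pairs
#                       (original_text_contrast) and per-source-paper statistics
#                       (repeat_quote_info, ascending by thesis_repeat_rate)
#   total_data       -- whole-document totals and the overall repeat rate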
def biaohong(bool_check_sentense, data_zong, df_train_nuoche):
    '''
    Indexes of the sentence windows to highlight, e.g. [[0,1,2],[3,4,5]]
    :param bool_check_sentense:
    :return: list
    '''
    biao_red = []
    i = 0
    start = -1
    end = -1
    while True:
        if i >= len(bool_check_sentense) or bool_check_sentense[i][0] + 1 >= len(data_zong) \
                or bool_check_sentense[i][1] + 1 >= len(df_train_nuoche):
            break
        elif bool_check_sentense[i][0] - 1 == start:
            i += 1
            continue
        elif bool_check_sentense[i][0] == end:
            i += 1
            continue
        elif bool_check_sentense[i][0] - 1 == end:
            i += 1
            continue
        else:
            # expand each hit to an [i-1, i, i+1] window on both sides
            # (biao_red_dan mirrors the appended structure but is currently unused)
            biao_red_dan = []
            biao_red_dan.append([bool_check_sentense[i][0] - 1, bool_check_sentense[i][1] - 1])
            biao_red_dan.append([bool_check_sentense[i][0], bool_check_sentense[i][1]])
            biao_red_dan.append([bool_check_sentense[i][0] + 1, bool_check_sentense[i][1] + 1])
            biao_red.append([[bool_check_sentense[i][0] - 1, bool_check_sentense[i][0], bool_check_sentense[i][0] + 1],
                             [bool_check_sentense[i][1] - 1, bool_check_sentense[i][1], bool_check_sentense[i][1] + 1]])
            start = bool_check_sentense[i][0] - 1
            end = bool_check_sentense[i][0] + 1
            i += 1

    return biao_red


def dialog_line_parse(url, text):
    """
    Send the data to a model service for analysis and return the result
    :param url: model service url
    :param text: request payload for the model
    :return: model response
    """
    response = requests.post(
        url,
        json=text,
        timeout=100000
    )
    if response.status_code == 200:
        return response.json()
    else:
        # logger.error(
        #     "【{}】 Failed to get a proper response from remote "
        #     "server. Status Code: {}. Response: {}"
        #     "".format(url, response.status_code, response.text)
        # )
        print("【{}】 Failed to get a proper response from remote "
              "server. Status Code: {}. Response: {}"
              "".format(url, response.status_code, response.text))
        print(text)
        return {}
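# Illustration of biaohong's windowing (made-up indices, not executed by the
# service): a hit [i, j] is expanded to the triples [i-1, i, i+1] and
# [j-1, j, j+1], and hits whose windows overlap a previous window are skipped.
def _demo_biaohong():
    data_zong = [[0.5, 10]] * 8
    recall = [["sent", "doc"]] * 20
    hits = [[2, 11], [3, 12], [6, 15]]
    print(biaohong(hits, data_zong, recall))
    # expected: [[[1, 2, 3], [10, 11, 12]], [[5, 6, 7], [14, 15, 16]]]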
def is_english_char(char):
    # printable ASCII range
    code = ord(char)
    return 32 <= code <= 126


# NOTE: the two highlight-marker strings inserted below were lost when this file
# was flattened (the original code clearly compared against two distinct,
# non-empty markers); "<red>" / "</red>" are assumed placeholders, not the
# original values.
RED_START = "<red>"
RED_END = "</red>"


def original_text_marked_red(text_original, bert_text, bert_text_pre):
    '''
    Locate the span of bert_text_pre inside bert_text, wrap it with the red
    markers, then restore the characters the tokenizer dropped from text_original
    :param text_original:
    :param bert_text:
    :param bert_text_pre:
    :return:
    '''
    fuhao = ["\n"]
    up_pointer = 0
    down_pointer = 0
    pointer_list = []

    if len(bert_text_pre) > len(bert_text):
        return False, ""
    while True:
        if down_pointer >= len(bert_text_pre):
            break
        elif down_pointer == len(bert_text_pre) - 1:
            if bert_text[up_pointer] == bert_text_pre[down_pointer]:
                pointer_list.append(up_pointer)
                break
            else:
                up_pointer += 1
                down_pointer = 0
                pointer_list = []
        elif bert_text[up_pointer] in fuhao:
            up_pointer += 1
        else:
            if bert_text[up_pointer] == bert_text_pre[down_pointer]:
                pointer_list.append(up_pointer)
                up_pointer += 1
                down_pointer += 1
            else:
                if bert_text_pre[down_pointer:down_pointer + 5] == "[UNK]":
                    up_pointer += 1
                    down_pointer += 5
                    pointer_list.append(up_pointer)
                elif is_english_char(bert_text_pre[down_pointer]) == True:
                    up_pointer += 1
                    down_pointer += 1
                    pointer_list.append(up_pointer)
                else:
                    up_pointer += 1
                    down_pointer = 0
                    pointer_list = []

    start = pointer_list[0]
    end = pointer_list[-1]
    bert_text_list = list(bert_text)
    bert_text_list.insert(start, RED_START)
    bert_text_list.insert(end + 2, RED_END)

    text_original_list = list(text_original)

    up = 0
    down = 0
    while True:
        if up == len(text_original_list):
            break

        if text_original_list[up] == bert_text_list[down]:
            up += 1
            down += 1
        else:
            if bert_text_list[down] == RED_START:
                down += 1
            elif bert_text_list[down] == RED_END:
                down += 1
            else:
                bert_text_list.insert(down, text_original_list[up])
                up += 1
                down += 1

    bert_text = "".join(bert_text_list)
    return True, bert_text
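# Illustration of the marking (uses the assumed RED_START/RED_END placeholders
# above; sample strings are made up and the function is not called on import):
def _demo_marked_red():
    ok, marked = original_text_marked_red("人员疏散设计研究。", "人员疏散设计研究。", "疏散设计")
    print(ok, marked)  # expected: True 人员<red>疏散设计</red>研究。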
def biaohong_bert_predict(sentence_0_list, sentence_1_list):
    '''
    Find the characters to highlight
    :param bool_check_sentense:
    :return:
    '''
    # sentence_0_list = []
    # sentence_1_list = []
    # sim_paper_name = []
    #
    # for i in biaohong_list:
    #     sentence_0_list.append("。".join([paper_list[i[0][0]], paper_list[i[0][1]], paper_list[i[0][2]]]))
    #     sentence_1_list.append("。".join([recall_data_list[i[1][1]], recall_data_list[i[1][1]], recall_data_list[i[1][2]]]))

    # "resilt" (sic) is the key the remote service actually returns
    paper_dict = dialog_line_parse("http://192.168.31.74:16003/",
                                   {"sentence_0": sentence_0_list, "sentence_1": sentence_1_list})["resilt"]

    # for each pair, paper_dict[i] holds:
    # [0] original text, [1] its highlighted span, [2] similar text, [3] its highlighted span

    return paper_dict


def ulit_text(paper_info, text):
    # note: the second element of each pair is the whole paper_info dict,
    # from which accurate_check_rouge later reads the metadata fields
    data = []
    try:
        text = json.loads(text)["content"]
    except Exception:
        pass

    text = text.strip().replace("\n", "").replace(" ", "").replace("。", "。\n")
    text_list = text.split("\n")
    for i in text_list:
        data.append([i, paper_info])
    return data


def run_query(conn, sql, params):
    with conn.cursor() as cursor:
        cursor.execute(sql, params)
        result = cursor.fetchall()
    return result


def processing_one_text(paper_id):
    conn = pymysql.connect(
        host='192.168.31.145',
        port=3306,
        user='root',
        password='123456',
        db='zhiwang_db',
        charset='utf8mb4',
        cursorclass=pymysql.cursors.DictCursor
    )
    sql = 'SELECT * FROM main_table_paper_detail_message WHERE doc_id=%s'
    params = (paper_id,)
    result = run_query(conn, sql, params)
    conn.close()
    print(result[0]['title'], result[0]['author'])
    title = result[0]['title']
    author = result[0]['author']
    degree = result[0]['degree']
    year = result[0]['content'].split("/")[5]  # the year is encoded in the content path
    content_path = result[0]['content']
    try:
        with open(content_path, encoding="utf-8") as f:
            text = f.read()
    except Exception:
        with open(content_path, encoding="gbk") as f:
            text = f.read()
    paper_info = {
        "title": title,
        "author": author,
        "degree": degree,
        "year": year,
        "paper_len_word": len(text)
    }
    data = ulit_text(paper_info, text)
    return data


def ulit_recall_paper(recall_data_list_dict):
    '''
    Read and parse the recalled papers
    :param recall_data_list_dict:
    :return data: list [[sentence, paper_info], [sentence, paper_info], ...]
    '''
    # data = []
    # for path in recall_data_list_path:
    #     filename = path.split("/")[-1]
    #     with open(path, encoding="gbk") as f:
    #         text = f.read()
    #     text_list = text.split("\n")
    #     for sentence in text_list:
    #         if sentence != "":
    #             data.append([sentence, filename])
    # return data

    data = []
    # only the first 5 recalled doc_ids are expanded into sentences
    for i in list(recall_data_list_dict.items())[:5]:
        data_one = processing_one_text(i[0])
        data.extend(data_one)

    return data


def recall_10(title, abst_zh, content) -> dict:
    '''
    Recall service API (Yupeng's)
    :param title:
    :return:
    '''
    request_json = {
        "title": title,
        "abst_zh": abst_zh,
        "content": content
    }
    paper_dict = dialog_line_parse("http://192.168.31.145:50004/check", request_json)
    return paper_dict
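# Hypothetical use of the recall service (endpoint taken from recall_10 above;
# the response is assumed, per ulit_recall_paper, to be a dict keyed by doc_id,
# and the dict values are never read by this service):
def _demo_recall():
    recall_data_list_dict = recall_10("疏散设计研究", "摘要文本", "正文文本")
    recall_data_list = ulit_recall_paper(recall_data_list_dict)
    print(recall_data_list[:3])  # [[sentence, paper_info], ...]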
def uilt_content(content):
    zhaiyao_list = ["摘要"]
    zhaiyao_en_list = ["Abstract", "abstract"]
    mulu_list = ["目录"]
    key_word_list = ["关键词"]
    key_word_bool = False
    key_word_str = ""
    zhaiyao_bool = False
    zhaiyao_en_bool = False
    zhaiyao_str = ""
    zhaiyao_en_str = ""
    mulu_str = ""
    zhaiyao_text = ""
    mulu_bool = False

    for i in zhaiyao_list:
        if i in content:
            zhaiyao_bool = True
            zhaiyao_str = i
            break

    for i in zhaiyao_en_list:
        if i in content:
            zhaiyao_en_bool = True
            zhaiyao_en_str = i
            break

    for i in mulu_list:
        if i in content:
            mulu_str = i
            mulu_bool = True
            break

    for i in key_word_list:
        if i in content:
            key_word_str = i
            key_word_bool = True
            break

    # the abstract ends at the first of: English abstract, keywords, or table of contents
    if zhaiyao_bool == True and zhaiyao_en_bool == True:
        pantten_zhaiyao = "{}(.*?){}".format(zhaiyao_str, zhaiyao_en_str)
        result_biaoti_list = re.findall(pantten_zhaiyao, content)
        zhaiyao_text = result_biaoti_list[0]
    elif zhaiyao_bool == True and key_word_bool == True:
        pantten_zhaiyao = "{}(.*?){}".format(zhaiyao_str, key_word_str)
        result_biaoti_list = re.findall(pantten_zhaiyao, content)
        zhaiyao_text = result_biaoti_list[0]
    elif zhaiyao_bool == True and mulu_bool == True:
        pantten_zhaiyao = "{}(.*?){}".format(zhaiyao_str, mulu_str)
        result_biaoti_list = re.findall(pantten_zhaiyao, content)
        zhaiyao_text = result_biaoti_list[0]

    return zhaiyao_text


def ulit_request_file(file):
    file_name = file.filename
    if file_name.split(".")[-1] == "txt":
        file_name_save = "data/request/{}".format(file_name)
        file.save(file_name_save)
        try:
            with open(file_name_save, encoding="gbk") as f:
                content = f.read()
        except Exception:
            with open(file_name_save, encoding="utf-8") as f:
                content = f.read()

        content = content.strip().replace("\n", "").replace(" ", "")
        abst_zh = uilt_content(content)

    # note: non-txt uploads fall through with abst_zh/content unbound; the
    # resulting NameError is caught by the caller's try/except
    return abst_zh, content


# Former synchronous handler, kept for reference:
# @app.route("/", methods=["POST"])
# def handle_query():
#     print(request.remote_addr)
#     dataBases = request.form.get("dataBases")
#     minSimilarity = request.form.get("minSimilarity")  # txt
#     minWords = request.form.get("minWords")
#     title = request.form.get("title")
#     author = request.form.get("author")  # txt
#     file = request.files.get('file')
#     token = request.form.get("token")
#     account = request.form.get("account")
#     goodsId = request.form.get("goodsId")
#     callbackUrl = request.form.get("callbackUrl")
#
#     t0 = time.time()
#     abst_zh, content = ulit_request_file(file)
#
#     # recall the 10 most similar papers (Yupeng's service)
#     # recall_data_list_dict = recall_10(title, abst_zh, content)
#
#     t1 = time.time()
#     print("查找相似的50篇完成")
#     with open("data/rell_json.txt") as f:
#         recall_data_list_dict = eval(f.read())
#
#     # parse the recalled papers into the expected format
#     recall_data_list = ulit_recall_paper(recall_data_list_dict)
#     print("文章格式转化完成")
#
#     # recall_data_list = pd.read_csv("data/10235513_大型商业建筑人员疏散设计研究_沈福禹/查重.csv", encoding="utf-8").values.tolist()
#
#     # run the precise duplicate check
#     print("进入精确查重系统")
#     return_list = accurate_check_rouge(title, author, content, recall_data_list)
#
#     print("召回50篇", t1 - t0)
#
#     return_text = {"resilt": return_list, "probabilities": None, "status_code": 200}
#     return jsonify(return_text)  # return the result


def classify():  # worker loop: pop queued queries and run the check
    while True:
        if redis_.llen(db_key_query) == 0:  # keep polling while the queue is empty
            time.sleep(3)
            continue
        query = redis_.lpop(db_key_query).decode('UTF-8')  # fetch the query payload
        data_dict_path = json.loads(query)
        path = data_dict_path['path']
        with open(path, encoding='utf8') as f1:  # load the request file
            data_dict = json.load(f1)

        query_id = data_dict['id']
        print(query_id)
        dataBases = data_dict['dataBases']
        minSimilarity = data_dict['minSimilarity']
        minWords = data_dict['minWords']
        title = data_dict['title']
        author = data_dict['author']
        abst_zh = data_dict['abst_zh']
        content = data_dict['content']
        token = data_dict['token']
        account = data_dict['account']
        goodsId = data_dict['goodsId']
        callbackUrl = data_dict['callbackUrl']

        # recall the 10 most similar papers (Yupeng's service)
        # recall_data_list_dict = recall_10(title, abst_zh, content)

        t1 = time.time()
        print("查找相似的50篇完成")
        with open("data/rell_json.txt") as f:
            recall_data_list_dict = eval(f.read())

        # parse the recalled papers into the expected format
        recall_data_list = ulit_recall_paper(recall_data_list_dict)
        print("文章格式转化完成")

        # recall_data_list = pd.read_csv("data/10235513_大型商业建筑人员疏散设计研究_沈福禹/查重.csv", encoding="utf-8").values.tolist()

        # run the precise duplicate check
        print("进入精确查重系统")
        return_list = accurate_check_rouge(title, author, content, recall_data_list)

        return_text = {"resilt": return_list, "probabilities": None, "status_code": 200}

        load_result_path = "./new_data_logs/{}.json".format(query_id)
        print("query_id: ", query_id)
        print("load_result_path: ", load_result_path)
        with open(load_result_path, 'w', encoding='utf8') as f2:
            # ensure_ascii=False so Chinese is written as-is rather than \u escapes
            # indent=4 for readable JSON
            json.dump(return_text, f2, ensure_ascii=False, indent=4)
        print(query_id)
        print(load_result_path)
        redis_.set(query_id, load_result_path, 86400)
        redis_.srem(db_key_querying, query_id)
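# Sketch of the queue protocol (all keys grounded in handle_query/classify):
# the handler rpushes {"id", "path"} onto db_key_query, the worker writes
# ./new_data_logs/<id>.json and sets the redis key <id> to that path with a
# one-day TTL. A hypothetical result poller could therefore do:
def _demo_fetch_result(query_id):
    path = redis_.get(query_id)  # None until classify() has processed this id
    if path is None:
        return None
    # responses are bytes because the pool was created without decode_responses
    with open(path.decode("utf-8"), encoding="utf-8") as f:
        return json.load(f)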
@app.route("/", methods=["POST"])
def handle_query():
    try:
        print(request.remote_addr)
        dataBases = request.form.get("dataBases")
        minSimilarity = request.form.get("minSimilarity")  # txt
        minWords = request.form.get("minWords")
        title = request.form.get("title")
        author = request.form.get("author")  # txt
        file = request.files.get('file')
        token = request.form.get("token")
        account = request.form.get("account")
        goodsId = request.form.get("goodsId")
        callbackUrl = request.form.get("callbackUrl")

        abst_zh, content = ulit_request_file(file)

        id_ = str(uuid.uuid1())  # generate a unique id for the query
        print("uuid: ", id_)
        d = {
            'id': id_,
            'dataBases': dataBases,
            'minSimilarity': minSimilarity,
            'minWords': minWords,
            'title': title,
            'author': author,
            'abst_zh': abst_zh,
            'content': content,
            'token': token,
            'account': account,
            'goodsId': goodsId,
            'callbackUrl': callbackUrl
        }  # bind the text to the query id
        print(d)
        load_request_path = './request_data_logs/{}.json'.format(id_)
        with open(load_request_path, 'w', encoding='utf8') as f2:
            # ensure_ascii=False so Chinese is written as-is rather than \u escapes
            # indent=4 for readable JSON
            json.dump(d, f2, ensure_ascii=False, indent=4)
        redis_.rpush(db_key_query, json.dumps({"id": id_, "path": load_request_path}))  # enqueue in redis
        redis_.sadd(db_key_querying, id_)
        redis_.sadd(db_key_queryset, id_)
        return_text = {
            'code': 0,
            'msg': "请求成功",
            'data': {
                'balances': "",
                'orderId': id_,
                'consumeNum': ""
            }
        }
        print("ok")
    except Exception:
        return_text = {'code': 1}
    return jsonify(return_text)  # return the result


t = Thread(target=classify)
t.start()

if __name__ == "__main__":
    # note: debug=True runs the Werkzeug reloader, which re-imports this module
    # and would start a second classify thread
    app.run(host="0.0.0.0", port=16001, threaded=True, debug=True)
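# Example client request (illustrative only: the form fields mirror those read
# in handle_query, host/port come from app.run, and the file content is made up):
#
#   import requests
#   resp = requests.post(
#       "http://127.0.0.1:16001/",
#       data={"title": "测试论文", "author": "测试作者"},
#       files={"file": ("test.txt", "摘要正文。关键词", "text/plain")},
#   )
#   print(resp.json())  # {"code": 0, "msg": "请求成功", "data": {"orderId": "<uuid>", ...}}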