From 41a38d1d76914b13a509d084765d800adaa10b12 Mon Sep 17 00:00:00 2001
From: "majiahui@haimaqingfan.com" <majiahui@haimaqingfan.com>
Date: Tue, 22 Aug 2023 17:47:10 +0800
Subject: [PATCH] Interface debugging complete
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

---
 flask_check.py      | 474 ++++++++++++++++++++++++++++++++++++++++++++++++
 flask_check_bert.py | 523 +++++++++++++++++++++++++++++++++++++++++++++++++++
 2 files changed, 997 insertions(+)
 create mode 100644 flask_check.py
 create mode 100644 flask_check_bert.py

diff --git a/flask_check.py b/flask_check.py
new file mode 100644
index 0000000..f35afd4
--- /dev/null
+++ b/flask_check.py
@@ -0,0 +1,474 @@
+import os
+import ast
+import numpy as np
+from numpy.linalg import norm
+import pandas as pd
+# from rouge import Rouge
+from rouge_chinese import Rouge
+from Rouge_w import Rouge_w, Rouge_l
+import json
+import pymysql
+
+import requests
+from flask import Flask, jsonify
+from flask import request
+import uuid
+
+app = Flask(__name__)
+app.config["JSON_AS_ASCII"] = False
+
+nums_cpus = 16
+rouge = Rouge()
+rouge_model = Rouge_w()
+rouge_l_model = Rouge_l()
+
+
+def rouge_value_self(data_1, data_2):
+    # Turn each sentence into space-separated characters, then score ROUGE-L pairwise.
+    data_1 = [' '.join(i) for i in data_1]
+    data_2 = [' '.join(i) for i in data_2]
+    rouge_l_list = []
+
+    for sen_1, sen_2 in zip(data_1, data_2):
+        sen_1 = sen_1.split(" ")
+        sen_2 = sen_2.split(" ")
+        rouge_l_score = rouge_l_model.score(sen_1, sen_2)
+        rouge_l_list.append(rouge_l_score)
+
+    return "", "", rouge_l_list
+
+
+def rouge_pre(text, df_train_nuoche):
+    '''
+    Score one sentence against every recalled sentence.
+    :param text: a single sentence of the paper under check
+    :param df_train_nuoche: list of [sentence, filename] rows
+    :return: [best_rouge_l_score, index_of_best_match]
+    '''
+    return_list = []
+    index_rouge_list = []
+    text_list = [text] * len(df_train_nuoche)
+
+    data_list = []
+    for data_dan in df_train_nuoche:
+        data_list.append(data_dan[0])
+    rouge_1, rouge_2, rouge_l = rouge_value_self(text_list, data_list)
+    index_rouge_list.extend(rouge_l)
+
+    # Sort (index, score) pairs by score, best first.
+    re1 = sorted(enumerate(index_rouge_list), key=lambda x: x[1], reverse=True)
+
+    return_list.append(re1[0][1])
+    return_list.append(re1[0][0])
+
+    return return_list
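+
+
+# Illustrative call with hypothetical inputs:
+#   rouge_pre("大型商业建筑的人员疏散设计", [["大型商业建筑人员疏散设计研究", "a.txt"], ["一个无关的句子", "b.txt"]])
+# returns [best_score, 0]: the highest ROUGE-L score and the index of the
+# recalled sentence that produced it.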
+
+
+def accurate_check_rouge(text_paper, recall_data_list):
+    '''
+    Precise duplicate check: find the similar sentences.
+    :param text_paper: full text of the paper under check
+    :param recall_data_list: list [[sentence, filename], [sentence, filename], ...]
+    :return: dict with the marked-up original text and the per-passage contrasts
+    '''
+    # Text preprocessing
+    # with open(text_paper_path, encoding="gbk") as f:
+    #     text_paper = f.read()
+    content_list = []
+    text_paper = str(text_paper).replace("。\n", "。")
+    content_list.extend(text_paper.split("。"))
+    data_zong = []
+
+    # ROUGE-based duplicate check, sentence by sentence
+    for text in content_list:
+        rouge_pre_list = rouge_pre(text, recall_data_list)
+        data_zong.append(rouge_pre_list)
+
+    # Collect the indices of sentences whose best score crosses the similarity threshold
+    bool_check_sentense = []
+    for i in range(len(data_zong)):
+        if data_zong[i][0] > 0.47:
+            bool_check_sentense.append([i, data_zong[i][1]])
+    biao_red = biaohong(bool_check_sentense, data_zong, recall_data_list)  # e.g. [[[0, 1, 2], [479, 480, 481]], [[3, 4, 5], [481, 482, 483]], [[6, 7, 8], [484, 485, 486]]]
+
+    sentence_0_list = []
+    sentence_1_list = []
+    sim_paper_name = []
+
+    # Keep a triple only if all three recalled sentences come from the same paper
+    for i in biao_red:
+        if recall_data_list[i[1][0]][1] == recall_data_list[i[1][1]][1] == recall_data_list[i[1][2]][1]:
+            sentence_0_list.append("。".join([content_list[i[0][0]], content_list[i[0][1]], content_list[i[0][2]]]))
+            sentence_1_list.append("".join([recall_data_list[i[1][0]][0], recall_data_list[i[1][1]][0], recall_data_list[i[1][2]][0]]))
+            sim_paper_name.append(recall_data_list[i[1][0]][1])
+        else:
+            continue
+
+    sentence_0_list_new = []
+    sentence_1_list_new = []
+
+    # Drop pairs whose combined length exceeds the downstream model's input limit
+    for i in zip(sentence_0_list, sentence_1_list):
+        if len(i[0]) + len(i[1]) < 1200:
+            sentence_0_list_new.append(i[0])
+            sentence_1_list_new.append(i[1])
+        else:
+            print(len(i[0]) + len(i[1]))
+            continue
+    for i in zip(sentence_0_list_new, sentence_1_list_new):
+        print("over length", len(i[0]))
+        print("over length", len(i[1]))
+
+    paper_dict = biaohong_bert_predict(sentence_0_list_new, sentence_1_list_new)
+
+    # paper_dict rows:
+    # print("original:", paper_dict[i][0])
+    # print("original marked red:", paper_dict[i][1])
+    # print("similar:", paper_dict[i][2])
+    # print("similar marked red:", paper_dict[i][3])
+
+    original_text = []
+    original_text_contrast = []
+
+    for paper_dict_dan_id, sentence_0_dan, sentence_1_dan, sim_paper_name_dan in zip(range(len(paper_dict)), sentence_0_list_new, sentence_1_list_new, sim_paper_name):
+
+        print([sentence_0_dan, sentence_1_dan])
+        original_text_contrast_dict = {}
+        similar_content = {"author": ""}
+        try:
+            sentence_0_bool, sentence_0_dan_red = original_text_marked_red(sentence_0_dan, paper_dict[paper_dict_dan_id][0], paper_dict[paper_dict_dan_id][1])  # text_original, bert_text, bert_text_pre
+        except Exception:
+            print([sentence_0_dan, sentence_1_dan])
+            raise
+        sentence_1_bool, sentence_1_dan_red = original_text_marked_red(sentence_1_dan, paper_dict[paper_dict_dan_id][2], paper_dict[paper_dict_dan_id][3])  # text_original, bert_text, bert_text_pre
+
+        if not sentence_0_bool or not sentence_1_bool:
+            continue
+        original_text.append(sentence_0_dan_red)
+        original_text_contrast_dict["original_text"] = "此处有 {} 字相似\n".format(
+            len(paper_dict[paper_dict_dan_id][1])) + sentence_0_dan_red  # "{n} similar characters here"
+
+        similar_content["content"] = sentence_1_dan_red
+        similar_content["title"] = sim_paper_name_dan
+        original_text_contrast_dict["similar_content"] = similar_content
+
+        original_text_contrast.append(original_text_contrast_dict)
+
+    original_text = "。".join(original_text)
+
+    return {
+        "author": "",
+        "check_time": "",
+        "section_data": "",
+        "section_details": [
+            {
+                "end_page_index": 0,
+                "name": "",
+                "original_text": original_text,
+                "original_text_contrast": original_text_contrast
+            }
+        ]
+    }
+
+
+def biaohong(bool_check_sentense, data_zong, df_train_nuoche):
+    '''
+    Build the index triples to mark red, e.g. [[0, 1, 2], [3, 4, 5]].
+    :param bool_check_sentense: list of [paper_sentence_index, recall_sentence_index]
+    :return: list of [paper indices, recall indices] triples
+    '''
+    biao_red = []
+    i = 0
+    start = -1
+    end = -1
+    while True:
+        if i >= len(bool_check_sentense) or bool_check_sentense[i][0] + 1 >= len(data_zong) or bool_check_sentense[i][1] + 1 >= len(df_train_nuoche):
+            break
+        elif bool_check_sentense[i][0] - 1 == start:
+            i += 1
+            continue
+        elif bool_check_sentense[i][0] == end:
+            i += 1
+            continue
+        elif bool_check_sentense[i][0] - 1 == end:
+            i += 1
+            continue
+        else:
+            # Take the flagged sentence together with its neighbours on both sides.
+            biao_red.append([[bool_check_sentense[i][0] - 1, bool_check_sentense[i][0], bool_check_sentense[i][0] + 1],
+                             [bool_check_sentense[i][1] - 1, bool_check_sentense[i][1], bool_check_sentense[i][1] + 1]])
+            start = bool_check_sentense[i][0] - 1
+            end = bool_check_sentense[i][0] + 1
+            i += 1
+
+    return biao_red
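+
+
+# Illustrative trace: with bool_check_sentense = [[5, 12], [6, 13]] (and inputs long
+# enough for the boundary checks), the first hit yields the triple
+# [[4, 5, 6], [11, 12, 13]]; the second hit is skipped because sentence 6 already
+# falls inside the 4..6 window.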
+
+
+def dialog_line_parse(url, text):
+    """
+    Send the payload to a model service and return the parsed result.
+    :param url: model service url
+    :param text: payload for the model
+    :return: parsed JSON on success, [] on failure
+    """
+    response = requests.post(
+        url,
+        json=text,
+        timeout=100000
+    )
+    if response.status_code == 200:
+        return response.json()
+    else:
+        # logger.error(
+        #     "【{}】 Failed to get a proper response from remote "
+        #     "server. Status Code: {}. Response: {}"
+        #     "".format(url, response.status_code, response.text)
+        # )
+        print("【{}】 Failed to get a proper response from remote "
+              "server. Status Code: {}. Response: {}"
+              "".format(url, response.status_code, response.text))
+        print(text)
+        return []
+
+
+# Red-highlight markers wrapped around the flagged span. Placeholder values
+# (assumed); any open/close pair that cannot occur in the processed text will work.
+RED_OPEN = "<red>"
+RED_CLOSE = "</red>"
+
+
+def original_text_marked_red(text_original, bert_text, bert_text_pre):
+    '''
+    Locate the flagged span inside the original text and wrap it in red markers.
+    :param text_original: the original sentence string
+    :param bert_text: the model's rendering of that text
+    :param bert_text_pre: the sub-span the model flagged as similar
+    :return: (ok, text with red markers)
+    '''
+    fuhao = ["\n"]  # characters in bert_text to skip while matching
+    up_pointer = 0    # position in bert_text
+    down_pointer = 0  # position in bert_text_pre
+
+    pointer_list = []
+
+    if len(bert_text_pre) > len(bert_text):
+        return False, ""
+
+    while True:
+        if up_pointer >= len(bert_text):
+            # Flagged span not found; report failure instead of overrunning.
+            return False, ""
+        if down_pointer >= len(bert_text_pre):
+            break
+        elif down_pointer == len(bert_text_pre) - 1:
+            if bert_text[up_pointer] == bert_text_pre[down_pointer]:
+                pointer_list.append(up_pointer)
+                break
+            else:
+                up_pointer += 1
+                down_pointer = 0
+                pointer_list = []
+
+        elif bert_text[up_pointer] in fuhao:
+            up_pointer += 1
+
+        else:
+            if bert_text[up_pointer] == bert_text_pre[down_pointer]:
+                pointer_list.append(up_pointer)
+                up_pointer += 1
+                down_pointer += 1
+            else:
+                if bert_text_pre[down_pointer:down_pointer + 5] == "[UNK]":
+                    # A "[UNK]" token in the flagged span matches one character here.
+                    up_pointer += 1
+                    down_pointer += 5
+                    pointer_list.append(up_pointer)
+                else:
+                    # Mismatch: restart matching from the next position.
+                    up_pointer += 1
+                    down_pointer = 0
+                    pointer_list = []
+
+    if not pointer_list:
+        return False, ""
+
+    start = pointer_list[0]
+    end = pointer_list[-1]
+    bert_text_list = list(bert_text)
+    bert_text_list.insert(start, RED_OPEN)
+    bert_text_list.insert(end + 2, RED_CLOSE)
+
+    text_original_list = list(text_original)
+
+    up = 0
+    down = 0
+
+    # Merge the original characters back in, stepping over the two markers.
+    while True:
+        if up == len(text_original_list):
+            break
+
+        if text_original_list[up] == bert_text_list[down]:
+            up += 1
+            down += 1
+
+        else:
+            if bert_text_list[down] == RED_OPEN:
+                down += 1
+            elif bert_text_list[down] == RED_CLOSE:
+                down += 1
+            else:
+                bert_text_list.insert(down, text_original_list[up])
+                up += 1
+                down += 1
+
+    bert_text = "".join(bert_text_list)
+    return True, bert_text
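+
+
+# Illustrative trace with hypothetical strings:
+#   original_text_marked_red("abcdef", "abcdef", "cde")
+# matches the span [2, 4] and returns (True, "ab" + RED_OPEN + "cde" + RED_CLOSE + "f").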
+
+
+def biaohong_bert_predict(sentence_0_list, sentence_1_list):
+    '''
+    Ask the marking service which characters to redden.
+    :param sentence_0_list: passages from the paper under check
+    :param sentence_1_list: the matching recalled passages
+    :return: rows of [original, original_marked, similar, similar_marked]
+    '''
+    # sentence_0_list = []
+    # sentence_1_list = []
+    # sim_paper_name = []
+    #
+    # for i in biaohong_list:
+    #     sentence_0_list.append("。".join([paper_list[i[0][0]], paper_list[i[0][1]], paper_list[i[0][2]]]))
+    #     sentence_1_list.append("。".join([recall_data_list[i[1][1]], recall_data_list[i[1][1]], recall_data_list[i[1][2]]]))
+
+    # "resilt" (sic) is the key the remote service actually returns
+    paper_dict = dialog_line_parse("http://192.168.31.74:16003/", {"sentence_0": sentence_0_list, "sentence_1": sentence_1_list})["resilt"]
+
+    return paper_dict
+
+
+def ulit_text(title, text):
+    data = []
+    try:
+        text = json.loads(text)["content"]
+    except Exception:
+        pass
+
+    text = text.strip().replace("\n", "").replace(" ", "").replace("。", "。\n")
+    text_list = text.split("\n")
+
+    for i in text_list:
+        data.append([i, title])
+    return data
+
+
+def run_query(conn, sql, params):
+    with conn.cursor() as cursor:
+        cursor.execute(sql, params)
+        result = cursor.fetchall()
+    return result
+
+
+def processing_one_text(paper_id):
+    conn = pymysql.connect(
+        host='192.168.31.145',
+        port=3306,
+        user='root',
+        password='123456',
+        db='zhiwang_db',
+        charset='utf8mb4',
+        cursorclass=pymysql.cursors.DictCursor
+    )
+
+    sql = 'SELECT * FROM main_table_paper_detail_message WHERE doc_id=%s'
+    params = (paper_id,)
+
+    result = run_query(conn, sql, params)
+
+    conn.close()
+    print(result)
+    title = result[0]['title']
+    author = result[0]['author']
+    content_path = result[0]['content']
+
+    try:
+        with open(content_path, encoding="utf-8") as f:
+            text = f.read()
+    except UnicodeDecodeError:
+        with open(content_path, encoding="gbk") as f:
+            text = f.read()
+
+    data = ulit_text(title, text)
+    return data
+
+
+def ulit_recall_paper(recall_data_list_dict):
+    '''
+    Read and parse the recalled papers (the first five of the returned ten).
+    :param recall_data_list_dict: mapping of recalled paper ids
+    :return data: list [[sentence, filename], [sentence, filename], ...]
+    '''
+    # data = []
+    # for path in recall_data_list_path:
+    #     filename = path.split("/")[-1]
+    #     with open(path, encoding="gbk") as f:
+    #         text = f.read()
+    #     text_list = text.split("\n")
+    #     for sentence in text_list:
+    #         if sentence != "":
+    #             data.append([sentence, filename])
+    # return data
+
+    data = []
+    for i in list(recall_data_list_dict.items())[:5]:
+        data_one = processing_one_text(i[0])
+        data.extend(data_one)
+
+    return data
+
+
+def recall_10(title, abst_zh, content):
+    '''
+    Yupeng's recall interface: fetch the ten most similar papers.
+    :param title, abst_zh, content: fields of the paper under check
+    :return: mapping of recalled paper ids
+    '''
+    request_json = {
+        "title": title,
+        "abst_zh": abst_zh,
+        "content": content
+    }
+    paper_dict = dialog_line_parse("http://192.168.31.145:50002/check", request_json)
+
+    return paper_dict
+
+
+@app.route("/", methods=["POST"])
+def handle_query():
+    title = request.json["title"]
+    abst_zh = request.json["abst_zh"]  # txt
+    content = request.json["content"]
+
+    # Call Yupeng's service to recall the ten most similar papers
+    # recall_data_list_dict = recall_10(title, abst_zh, content)
+
+    with open("data/rell_json.txt") as f:
+        recall_data_list_dict = ast.literal_eval(f.read())  # a dict literal; literal_eval is safer than eval
+
+    # Read the recalled papers and convert them into [sentence, filename] rows
+    recall_data_list = ulit_recall_paper(recall_data_list_dict)
+
+    # recall_data_list = pd.read_csv("data/10235513_大型商业建筑人员疏散设计研究_沈福禹/查重.csv", encoding="utf-8").values.tolist()
+
+    # Run the precise duplicate check
+    return_list = accurate_check_rouge(content, recall_data_list)
+
+    return_text = {"resilt": return_list, "probabilities": None, "status_code": 200}  # "resilt" (sic) kept for client compatibility
+    return jsonify(return_text)  # return the result
+
+
+if __name__ == "__main__":
+    app.run(host="0.0.0.0", port=16001, threaded=True, debug=True)
\ No newline at end of file
diff --git a/flask_check_bert.py b/flask_check_bert.py
new file mode 100644
index 0000000..2a6010e
--- /dev/null
+++ b/flask_check_bert.py
@@ -0,0 +1,523 @@
+import os
+import ast
+import numpy as np
+from numpy.linalg import norm
+import pandas as pd
+# from rouge import Rouge
+from rouge_chinese import Rouge
+from Rouge_w import Rouge_w, Rouge_l
+import json
+import pymysql
+
+import requests
+from flask import Flask, jsonify
+from flask import request
+import uuid
+
+app = Flask(__name__)
+app.config["JSON_AS_ASCII"] = False
+
+nums_cpus = 16
+rouge = Rouge()
+rouge_model = Rouge_w()
+rouge_l_model = Rouge_l()
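+
+
+# The sentence-pair service used below takes {"texts": [sen_0, sen_1]}, two
+# equal-length lists of sentences to compare pairwise, and returns {"res": [...]}
+# with 1 where a pair is judged similar and 0 where it is not, as inferred from
+# how bert_check builds and consumes the payload.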
+
+
+def bert_check(text, recall_data_list):
+    '''
+    BERT duplicate check: compare one sentence against every recalled sentence.
+    :param text: a single sentence of the paper under check
+    :param recall_data_list: list of [sentence, filename] rows
+    :return: [1, index_of_first_hit] or [0, ""]
+    '''
+    sen_0 = [text] * len(recall_data_list)
+    sen_1 = [i[0] for i in recall_data_list]
+
+    return_list = []
+    request_json = {
+        "texts": [sen_0, sen_1],
+    }
+    paper_dict = dialog_line_parse("http://192.168.31.74:16002/", request_json)
+    score_list = paper_dict["res"]  # one 0/1 judgement per sentence pair
+
+    # To be reworked later
+    # return_list.append(re1[0][1])
+    # return_list.append(re1[0][0])
+    if 1 in score_list:
+        # Hit: record the index of the first recalled sentence judged similar
+        return_list.append(1)
+        return_list.append(score_list.index(1))
+    else:
+        # No hit
+        return_list.append(0)
+        return_list.append("")
+
+    return return_list
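+
+
+# Illustrative call with hypothetical inputs:
+#   bert_check("防火分区设计", [["防火分区的设计", "a.txt"], ["无关内容", "b.txt"]])
+# returns [1, 0] if the service flags the first pair, and [0, ""] if no pair is flagged.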
+
+
+def rouge_value_self(data_1, data_2):
+    # Turn each sentence into space-separated characters, then score ROUGE-L pairwise.
+    data_1 = [' '.join(i) for i in data_1]
+    data_2 = [' '.join(i) for i in data_2]
+    rouge_l_list = []
+
+    for sen_1, sen_2 in zip(data_1, data_2):
+        sen_1 = sen_1.split(" ")
+        sen_2 = sen_2.split(" ")
+        rouge_l_score = rouge_l_model.score(sen_1, sen_2)
+        rouge_l_list.append(rouge_l_score)
+
+    return "", "", rouge_l_list
+
+
+def rouge_pre(text, df_train_nuoche):
+    '''
+    Score one sentence against every recalled sentence.
+    :param text: a single sentence of the paper under check
+    :param df_train_nuoche: list of [sentence, filename] rows
+    :return: [best_rouge_l_score, index_of_best_match]
+    '''
+    return_list = []
+    index_rouge_list = []
+    text_list = [text] * len(df_train_nuoche)
+
+    data_list = []
+    for data_dan in df_train_nuoche:
+        data_list.append(data_dan[0])
+    rouge_1, rouge_2, rouge_l = rouge_value_self(text_list, data_list)
+    index_rouge_list.extend(rouge_l)
+
+    # Sort (index, score) pairs by score, best first.
+    re1 = sorted(enumerate(index_rouge_list), key=lambda x: x[1], reverse=True)
+
+    return_list.append(re1[0][1])
+    return_list.append(re1[0][0])
+
+    return return_list
+
+
+def accurate_check_rouge(text_paper, recall_data_list):
+    '''
+    Precise duplicate check: find the similar sentences.
+    :param text_paper: full text of the paper under check
+    :param recall_data_list: list [[sentence, filename], [sentence, filename], ...]
+    :return: dict with the marked-up original text and the per-passage contrasts
+    '''
+    # Text preprocessing
+    # with open(text_paper_path, encoding="gbk") as f:
+    #     text_paper = f.read()
+    content_list = []
+    text_paper = str(text_paper).replace("。\n", "。")
+    content_list.extend(text_paper.split("。"))
+    data_zong = []
+
+    # ROUGE-based duplicate check
+    # for text in content_list:
+    #     rouge_pre_list = rouge_pre(text, recall_data_list)
+    #     data_zong.append(rouge_pre_list)
+
+    # BERT-based duplicate check, sentence by sentence
+    for text in content_list:
+        bert_pre_list = bert_check(text, recall_data_list)
+        data_zong.append(bert_pre_list)
+
+    # Collect the indices of sentences the BERT service flagged as similar
+    bool_check_sentense = []
+    for i in range(len(data_zong)):
+        if data_zong[i][0] == 1:
+            bool_check_sentense.append([i, data_zong[i][1]])
+    biao_red = biaohong(bool_check_sentense, data_zong, recall_data_list)  # e.g. [[[0, 1, 2], [479, 480, 481]], [[3, 4, 5], [481, 482, 483]], [[6, 7, 8], [484, 485, 486]]]
+
+    sentence_0_list = []
+    sentence_1_list = []
+    sim_paper_name = []
+
+    # Keep a triple only if all three recalled sentences come from the same paper
+    for i in biao_red:
+        if recall_data_list[i[1][0]][1] == recall_data_list[i[1][1]][1] == recall_data_list[i[1][2]][1]:
+            sentence_0_list.append("。".join([content_list[i[0][0]], content_list[i[0][1]], content_list[i[0][2]]]))
+            sentence_1_list.append("".join([recall_data_list[i[1][0]][0], recall_data_list[i[1][1]][0], recall_data_list[i[1][2]][0]]))
+            sim_paper_name.append(recall_data_list[i[1][0]][1])
+        else:
+            continue
+
+    sentence_0_list_new = []
+    sentence_1_list_new = []
+
+    # Drop pairs whose combined length exceeds the downstream model's input limit
+    for i in zip(sentence_0_list, sentence_1_list):
+        if len(i[0]) + len(i[1]) < 1200:
+            sentence_0_list_new.append(i[0])
+            sentence_1_list_new.append(i[1])
+        else:
+            print(len(i[0]) + len(i[1]))
+            continue
+    for i in zip(sentence_0_list_new, sentence_1_list_new):
+        print("over length", len(i[0]))
+        print("over length", len(i[1]))
+
+    paper_dict = biaohong_bert_predict(sentence_0_list_new, sentence_1_list_new)
+
+    # paper_dict rows:
+    # print("original:", paper_dict[i][0])
+    # print("original marked red:", paper_dict[i][1])
+    # print("similar:", paper_dict[i][2])
+    # print("similar marked red:", paper_dict[i][3])
+
+    original_text = []
+    original_text_contrast = []
+
+    for paper_dict_dan_id, sentence_0_dan, sentence_1_dan, sim_paper_name_dan in zip(range(len(paper_dict)), sentence_0_list_new, sentence_1_list_new, sim_paper_name):
+
+        print([sentence_0_dan, sentence_1_dan])
+        original_text_contrast_dict = {}
+        similar_content = {"author": ""}
+        try:
+            sentence_0_bool, sentence_0_dan_red = original_text_marked_red(sentence_0_dan, paper_dict[paper_dict_dan_id][0], paper_dict[paper_dict_dan_id][1])  # text_original, bert_text, bert_text_pre
+        except Exception:
+            print("error", [sentence_0_dan, paper_dict[paper_dict_dan_id][0], paper_dict[paper_dict_dan_id][1]])
+            raise
+        sentence_1_bool, sentence_1_dan_red = original_text_marked_red(sentence_1_dan, paper_dict[paper_dict_dan_id][2], paper_dict[paper_dict_dan_id][3])  # text_original, bert_text, bert_text_pre
+
+        if not sentence_0_bool or not sentence_1_bool:
+            continue
+        original_text.append(sentence_0_dan_red)
+        original_text_contrast_dict["original_text"] = "此处有 {} 字相似\n".format(
+            len(paper_dict[paper_dict_dan_id][1])) + sentence_0_dan_red  # "{n} similar characters here"
+
+        similar_content["content"] = sentence_1_dan_red
+        similar_content["title"] = sim_paper_name_dan
+        original_text_contrast_dict["similar_content"] = similar_content
+
+        original_text_contrast.append(original_text_contrast_dict)
+
+    original_text = "。".join(original_text)
+
+    return {
+        "author": "",
+        "check_time": "",
+        "section_data": "",
+        "section_details": [
+            {
+                "end_page_index": 0,
+                "name": "",
+                "original_text": original_text,
+                "original_text_contrast": original_text_contrast
+            }
+        ]
+    }
+
+
+def biaohong(bool_check_sentense, data_zong, df_train_nuoche):
+    '''
+    Build the index triples to mark red, e.g. [[0, 1, 2], [3, 4, 5]].
+    :param bool_check_sentense: list of [paper_sentence_index, recall_sentence_index]
+    :return: list of [paper indices, recall indices] triples
+    '''
+    biao_red = []
+    i = 0
+    start = -1
+    end = -1
+    while True:
+        if i >= len(bool_check_sentense) or bool_check_sentense[i][0] + 1 >= len(data_zong) or bool_check_sentense[i][1] + 1 >= len(df_train_nuoche):
+            break
+        elif bool_check_sentense[i][0] - 1 == start:
+            i += 1
+            continue
+        elif bool_check_sentense[i][0] == end:
+            i += 1
+            continue
+        elif bool_check_sentense[i][0] - 1 == end:
+            i += 1
+            continue
+        else:
+            # Take the flagged sentence together with its neighbours on both sides.
+            biao_red.append([[bool_check_sentense[i][0] - 1, bool_check_sentense[i][0], bool_check_sentense[i][0] + 1],
+                             [bool_check_sentense[i][1] - 1, bool_check_sentense[i][1], bool_check_sentense[i][1] + 1]])
+            start = bool_check_sentense[i][0] - 1
+            end = bool_check_sentense[i][0] + 1
+            i += 1
+
+    return biao_red
+
+
+def dialog_line_parse(url, text):
+    """
+    Send the payload to a model service and return the parsed result.
+    :param url: model service url
+    :param text: payload for the model
+    :return: parsed JSON on success, [] on failure
+    """
+    response = requests.post(
+        url,
+        json=text,
+        timeout=100000
+    )
+    if response.status_code == 200:
+        return response.json()
+    else:
+        # logger.error(
+        #     "【{}】 Failed to get a proper response from remote "
+        #     "server. Status Code: {}. Response: {}"
+        #     "".format(url, response.status_code, response.text)
+        # )
+        print("【{}】 Failed to get a proper response from remote "
+              "server. Status Code: {}. Response: {}"
+              "".format(url, response.status_code, response.text))
+        print(text)
+        return []
+
+
+def is_english_char(char):
+    # True for printable ASCII (space through tilde).
+    code = ord(char)
+    return 32 <= code <= 126
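+
+
+# e.g. is_english_char("a") and is_english_char("?") are True; is_english_char("中") is False.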
Response: {}" + "".format(url, response.status_code, response.text)) + print(text) + return [] + + +def is_english_char(char): + code = ord(char) + return 32 <= code <= 126 + + +def original_text_marked_red(text_original, bert_text, bert_text_pre): + ''' + 把原文标红字段找到 + :param text_original: + :param bert_text: + :param bert_text_pre: + :return: + ''' + + fuhao = ["\n"] + up_pointer = 0 + down_pointer = 0 + + pointer_list = [] + + if len(bert_text_pre) > len(bert_text): + return False, "" + + while True: + if down_pointer >= len(bert_text_pre): + break + elif down_pointer == len(bert_text_pre)-1: + if bert_text[up_pointer] == bert_text_pre[down_pointer]: + pointer_list.append(up_pointer) + break + else: + up_pointer += 1 + down_pointer = 0 + pointer_list = [] + + elif bert_text[up_pointer] in fuhao: + up_pointer += 1 + + else: + if bert_text[up_pointer] == bert_text_pre[down_pointer]: + pointer_list.append(up_pointer) + up_pointer += 1 + down_pointer += 1 + else: + if bert_text_pre[down_pointer:down_pointer+5] == "[UNK]": + up_pointer += 1 + down_pointer += 5 + pointer_list.append(up_pointer) + elif is_english_char(bert_text_pre[down_pointer]) == True: + up_pointer += 1 + down_pointer += 1 + pointer_list.append(up_pointer) + else: + up_pointer += 1 + down_pointer = 0 + pointer_list = [] + + + start = pointer_list[0] + end = pointer_list[-1] + bert_text_list = list(bert_text) + bert_text_list.insert(start, "") + bert_text_list.insert(end + 2 , "") + + text_original_list = list(text_original) + + up = 0 + down = 0 + + while True: + if up == len(text_original_list): + break + + if text_original_list[up] == bert_text_list[down]: + up += 1 + down += 1 + + else: + if bert_text_list[down] == "": + down += 1 + elif bert_text_list[down] == "": + down += 1 + else: + bert_text_list.insert(down, text_original_list[up]) + up += 1 + down += 1 + + bert_text = "".join(bert_text_list) + return True, bert_text + + +def biaohong_bert_predict(sentence_0_list, sentence_1_list): + ''' + 找出标红字符 + :param bool_check_sentense: + :return: + ''' + + # sentence_0_list = [] + # sentence_1_list = [] + # sim_paper_name = [] + # + # for i in biaohong_list: + # sentence_0_list.append("。".join([paper_list[i[0][0]], paper_list[i[0][1]], paper_list[i[0][2]]])) + # sentence_1_list.append("。".join([recall_data_list[i[1][1]], recall_data_list[i[1][1]], recall_data_list[i[1][2]]])) + + paper_dict = dialog_line_parse("http://192.168.31.74:16003/", {"sentence_0": sentence_0_list, "sentence_1": sentence_1_list})["resilt"] + + # paper_dict + # print("原文:".format(i), paper_dict[i][0]) + # print("原文标红:".format(i), paper_dict[i][1]) + # print("相似:".format(i), paper_dict[i][2]) + # print("相似标红:".format(i), paper_dict[i][3]) + + # original_text + # + # + # for paper_dict_dan, sentence_0_dan, sentence_1_dan in zip(paper_dict, sentence_0_list, sentence_1_list): + # original_text_marked_red + + return paper_dict + +def ulit_text(title, text): + data = [] + try: + text = json.loads(text)["content"] + except: + pass + + text = text.strip().replace("\n", "").replace(" ", "").replace("。", "。\n") + text_list = text.split("\n") + + for i in text_list: + data.append([i, title]) + return data + +def run_query(conn, sql, params): + with conn.cursor() as cursor: + cursor.execute(sql, params) + result = cursor.fetchall() + return result + + +def processing_one_text(paper_id): + conn = pymysql.connect( + host='192.168.31.145', + port=3306, + user='root', + password='123456', + db='zhiwang_db', + charset='utf8mb4', + cursorclass=pymysql.cursors.DictCursor 
+
+
+def processing_one_text(paper_id):
+    conn = pymysql.connect(
+        host='192.168.31.145',
+        port=3306,
+        user='root',
+        password='123456',
+        db='zhiwang_db',
+        charset='utf8mb4',
+        cursorclass=pymysql.cursors.DictCursor
+    )
+
+    sql = 'SELECT * FROM main_table_paper_detail_message WHERE doc_id=%s'
+    params = (paper_id,)
+
+    result = run_query(conn, sql, params)
+
+    conn.close()
+    print(result)
+    title = result[0]['title']
+    author = result[0]['author']
+    content_path = result[0]['content']
+
+    try:
+        with open(content_path, encoding="utf-8") as f:
+            text = f.read()
+    except UnicodeDecodeError:
+        with open(content_path, encoding="gbk") as f:
+            text = f.read()
+
+    data = ulit_text(title, text)
+    return data
+
+
+def ulit_recall_paper(recall_data_list_dict):
+    '''
+    Read and parse the recalled papers (the first five of the returned ten).
+    :param recall_data_list_dict: mapping of recalled paper ids
+    :return data: list [[sentence, filename], [sentence, filename], ...]
+    '''
+    # data = []
+    # for path in recall_data_list_path:
+    #     filename = path.split("/")[-1]
+    #     with open(path, encoding="gbk") as f:
+    #         text = f.read()
+    #     text_list = text.split("\n")
+    #     for sentence in text_list:
+    #         if sentence != "":
+    #             data.append([sentence, filename])
+    # return data
+
+    data = []
+    for i in list(recall_data_list_dict.items())[:5]:
+        data_one = processing_one_text(i[0])
+        data.extend(data_one)
+
+    return data
+
+
+def recall_10(title, abst_zh, content):
+    '''
+    Yupeng's recall interface: fetch the ten most similar papers.
+    :param title, abst_zh, content: fields of the paper under check
+    :return: mapping of recalled paper ids
+    '''
+    request_json = {
+        "title": title,
+        "abst_zh": abst_zh,
+        "content": content
+    }
+    paper_dict = dialog_line_parse("http://192.168.31.145:50002/check", request_json)
+
+    return paper_dict
+
+
+@app.route("/", methods=["POST"])
+def handle_query():
+    print(request.remote_addr)
+    title = request.json["title"]
+    abst_zh = request.json["abst_zh"]  # txt
+    content = request.json["content"]
+
+    # Call Yupeng's service to recall the ten most similar papers
+    # recall_data_list_dict = recall_10(title, abst_zh, content)
+
+    with open("data/rell_json.txt") as f:
+        recall_data_list_dict = ast.literal_eval(f.read())  # a dict literal; literal_eval is safer than eval
+
+    # Read the recalled papers and convert them into [sentence, filename] rows
+    recall_data_list = ulit_recall_paper(recall_data_list_dict)
+
+    # recall_data_list = pd.read_csv("data/10235513_大型商业建筑人员疏散设计研究_沈福禹/查重.csv", encoding="utf-8").values.tolist()
+
+    # Run the precise duplicate check
+    return_list = accurate_check_rouge(content, recall_data_list)
+
+    return_text = {"resilt": return_list, "probabilities": None, "status_code": 200}  # "resilt" (sic) kept for client compatibility
+    return jsonify(return_text)  # return the result
+
+
+if __name__ == "__main__":
+    app.run(host="0.0.0.0", port=16001, threaded=True, debug=True)
\ No newline at end of file
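
A quick way to exercise the service once it is running, as a sketch: the port,
route, and JSON fields mirror the code above, while the host and sample text
are made up.

    import requests

    resp = requests.post(
        "http://127.0.0.1:16001/",
        json={
            "title": "大型商业建筑人员疏散设计研究",
            "abst_zh": "示例摘要文本",
            "content": "示例正文第一句。第二句。第三句。",
        },
    )
    report = resp.json()["resilt"]  # "resilt" (sic) is the service's result key
    print(report["section_details"][0]["original_text"])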