diff --git a/flask_check.py b/flask_check.py
new file mode 100644
index 0000000..f35afd4
--- /dev/null
+++ b/flask_check.py
@@ -0,0 +1,474 @@
+import os
+import numpy as np
+from numpy.linalg import norm
+import pandas as pd
+# from rouge import Rouge
+from rouge_chinese import Rouge
+from Rouge_w import Rouge_w, Rouge_l
+import json
+import ast
+import pymysql
+
+import requests
+from flask import Flask, jsonify
+from flask import request
+import uuid
+app = Flask(__name__)
+app.config["JSON_AS_ASCII"] = False
+
+
+nums_cpus = 16
+rouge = Rouge()
+rouge_model = Rouge_w()
+rouge_l_model = Rouge_l()
+
+
+def rouge_value_self(data_1, data_2):
+ data_1 = [' '.join(i) for i in data_1]
+ data_2 = [' '.join(i) for i in data_2]
+ rouge_l_list = []
+
+ for sen_1, sen_2 in zip(data_1, data_2):
+ sen_1 = sen_1.split(" ")
+ sen_2 = sen_2.split(" ")
+ rouge_l_score = rouge_l_model.score(sen_1, sen_2)
+ rouge_l_list.append(rouge_l_score)
+
+ return "", "", rouge_l_list
+
+
+def rouge_pre(text, df_train_nuoche):
+
+    # score `text` against every recalled sentence and keep the best hit
+    text_list = [text] * len(df_train_nuoche)
+    data_list = [data_dan[0] for data_dan in df_train_nuoche]
+    _, _, rouge_l = rouge_value_self(text_list, data_list)
+
+    # (index, score) pairs sorted by score, best first
+    re1 = sorted(enumerate(rouge_l), key=lambda x: x[1], reverse=True)
+
+    # [best_score, best_sentence_index]
+    return [re1[0][1], re1[0][0]]
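+
+# Shape sketch (hypothetical values): for three candidate rows where the second
+# scores highest, rouge_pre(text, rows) returns [best_score, best_row_index],
+# e.g. [0.52, 1].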
+
+
+def accurate_check_rouge(text_paper, recall_data_list):
+    '''
+    Fine-grained duplicate check: find the similar sentences
+    :param text_paper: full text of the paper being checked
+    :param recall_data_list: list [[sentence, filename], [sentence, filename], ...]
+    :return: dict with the red-marked original text and the matched sources
+    '''
+    # split the paper into sentences on the Chinese full stop
+    centent_list = []
+    text_paper = str(text_paper).replace("。\n", "。")
+    centent_list.extend(text_paper.split("。"))
+    data_zong = []
+
+    # duplicate check with the ROUGE algorithm
+ for text in centent_list:
+ rouge_pre_list = rouge_pre(text, recall_data_list)
+ data_zong.append(rouge_pre_list)
+
+
+    # collect the indices of sentences whose best ROUGE-L score clears an
+    # empirically chosen 0.47 threshold
+    bool_check_sentense = []
+    for i in range(len(data_zong)):
+        if data_zong[i][0] > 0.47:
+            bool_check_sentense.append([i, data_zong[i][1]])
+ biao_red = biaohong(bool_check_sentense, data_zong, recall_data_list) # [[[0, 1, 2], [479, 480, 481]], [[3, 4, 5], [481, 482, 483]], [[6, 7, 8], [484, 485, 486]]]
+
+ sentence_0_list = []
+ sentence_1_list = []
+ sim_paper_name = []
+
+    for i in biao_red:
+        # keep a window only when all three matched rows come from the same paper
+        if recall_data_list[i[1][0]][1] == recall_data_list[i[1][1]][1] == recall_data_list[i[1][2]][1]:
+            sentence_0_list.append("。".join([centent_list[i[0][0]], centent_list[i[0][1]], centent_list[i[0][2]]]))
+            sentence_1_list.append("".join([recall_data_list[i[1][0]][0], recall_data_list[i[1][1]][0], recall_data_list[i[1][2]][0]]))
+            sim_paper_name.append(recall_data_list[i[1][0]][1])
+
+ sentence_0_list_new = []
+ sentence_1_list_new = []
+
+    for i in zip(sentence_0_list, sentence_1_list):
+        if len(i[0]) + len(i[1]) < 1200:
+            sentence_0_list_new.append(i[0])
+            sentence_1_list_new.append(i[1])
+        else:
+            # the pair is too long for the marking model; log and drop it
+            print("over the length limit:", len(i[0]) + len(i[1]))
+
+ paper_dict = biaohong_bert_predict(sentence_0_list_new, sentence_1_list_new)
+
+    # each paper_dict entry: [original_text, original_red_span,
+    #                         similar_text, similar_red_span]
+    original_text = []
+    original_text_contrast = []
+
+
+    for paper_dict_dan_id, sentence_0_dan, sentence_1_dan, sim_paper_name_dan in zip(range(len(paper_dict)), sentence_0_list_new, sentence_1_list_new, sim_paper_name):
+        original_text_contrast_dict = {}
+        similar_content = {"author": ""}
+        try:
+            sentence_0_bool, sentence_0_dan_red = original_text_marked_red(sentence_0_dan, paper_dict[paper_dict_dan_id][0], paper_dict[paper_dict_dan_id][1])  # text_original, bert_text, bert_text_pre
+        except Exception:
+            # surface the failing pair before aborting
+            print([sentence_0_dan, sentence_1_dan])
+            raise
+        sentence_1_bool, sentence_1_dan_red = original_text_marked_red(sentence_1_dan, paper_dict[paper_dict_dan_id][2], paper_dict[paper_dict_dan_id][3])  # text_original, bert_text, bert_text_pre
+
+        if not sentence_0_bool or not sentence_1_bool:
+            continue
+ original_text.append(sentence_0_dan_red)
+ original_text_contrast_dict["original_text"] = "此处有 {} 字相似\n".format(
+ len(paper_dict[paper_dict_dan_id][1])) + sentence_0_dan_red
+
+ similar_content["content"] = sentence_1_dan_red
+ similar_content["title"] = sim_paper_name_dan
+ original_text_contrast_dict["similar_content"] = similar_content
+
+ original_text_contrast.append(original_text_contrast_dict)
+
+ original_text = "。".join(original_text)
+
+ return {
+ "author": "",
+ "check_time": "",
+ "section_data": "",
+ "section_details": [
+ {
+ "end_page_index": 0,
+ "name": "",
+ "original_text": original_text,
+ "original_text_contrast": original_text_contrast
+ }
+ ]
+ }
+
+
+def biaohong(bool_check_sentense, data_zong, df_train_nuoche):
+    '''
+    Build the sentence-index windows to mark red, e.g. [[0,1,2],[3,4,5]]
+    :param bool_check_sentense: [[sentence_index, matched_row_index], ...]
+    :return: list of [paper_window, recall_window] index triples
+    '''
+ biao_red = []
+ i = 0
+ start = -1
+ end = -1
+ while True:
+ if i >= len(bool_check_sentense) or bool_check_sentense[i][0] +1 >= len(data_zong) or bool_check_sentense[i][1]+1 >= len(df_train_nuoche):
+ break
+        elif (bool_check_sentense[i][0] - 1 == start
+              or bool_check_sentense[i][0] == end
+              or bool_check_sentense[i][0] - 1 == end):
+            # this hit is already inside or adjacent to the previous window
+            i += 1
+            continue
+        else:
+            biao_red.append([[bool_check_sentense[i][0]-1, bool_check_sentense[i][0], bool_check_sentense[i][0]+1],
+                             [bool_check_sentense[i][1]-1, bool_check_sentense[i][1], bool_check_sentense[i][1]+1]])
+            start = bool_check_sentense[i][0]-1
+            end = bool_check_sentense[i][0]+1
+            i += 1
+
+ return biao_red
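+
+# Worked example (hypothetical indices, assuming both lists are long enough):
+# hits at sentences 3 and 4 matching recall rows 480 and 481, plus an isolated
+# hit at (9, 100):
+#   biaohong([[3, 480], [4, 481], [9, 100]], data_zong, recall_rows)
+#   -> [[[2, 3, 4], [479, 480, 481]], [[8, 9, 10], [99, 100, 101]]]
+# The (4, 481) hit is swallowed by the first window because index 4 equals the
+# previous window's end.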
+
+
+def dialog_line_parse(url, text):
+    """
+    Send a payload to a model service and return the parsed response
+    :param url: model service url
+    :param text: JSON payload for the model
+    :return: parsed model response, or an empty list on failure
+    """
+
+ response = requests.post(
+ url,
+ json=text,
+ timeout=100000
+ )
+ if response.status_code == 200:
+ return response.json()
+ else:
+ print("【{}】 Failed to get a proper response from remote "
+ "server. Status Code: {}. Response: {}"
+ "".format(url, response.status_code, response.text))
+ print(text)
+ return []
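+
+# Usage sketch (hypothetical endpoint and payload): POST the dict as JSON and
+# get the decoded JSON body back on HTTP 200, or [] on any other status:
+#   res = dialog_line_parse("http://127.0.0.1:16002/", {"texts": [["a"], ["b"]]})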
+
+
+def original_text_marked_red(text_original, bert_text, bert_text_pre):
+    '''
+    Locate the span of the original text that should be marked red
+    :param text_original: the original sentence
+    :param bert_text: the sentence as the marking model returned it
+    :param bert_text_pre: the predicted red span inside bert_text
+    :return: (success_flag, marked_text)
+    '''
+
+    fuhao = ["\n"]  # separator characters the model may have dropped
+ up_pointer = 0
+ down_pointer = 0
+
+ pointer_list = []
+
+ if len(bert_text_pre) > len(bert_text):
+ return False, ""
+
+ while True:
+ if down_pointer >= len(bert_text_pre):
+ break
+ elif down_pointer == len(bert_text_pre)-1:
+ if bert_text[up_pointer] == bert_text_pre[down_pointer]:
+ pointer_list.append(up_pointer)
+ break
+ else:
+ up_pointer += 1
+ down_pointer = 0
+ pointer_list = []
+
+ elif bert_text[up_pointer] in fuhao:
+ up_pointer += 1
+
+ else:
+ if bert_text[up_pointer] == bert_text_pre[down_pointer]:
+ pointer_list.append(up_pointer)
+ up_pointer += 1
+ down_pointer += 1
+ else:
+                if bert_text_pre[down_pointer:down_pointer+5] == "[UNK]":
+                    # "[UNK]" in the prediction stands in for one original character
+                    up_pointer += 1
+                    down_pointer += 5
+                    pointer_list.append(up_pointer)
+ else:
+ up_pointer += 1
+ down_pointer = 0
+ pointer_list = []
+
+    if not pointer_list:
+        # nothing in bert_text_pre could be aligned
+        return False, ""
+
+    start = pointer_list[0]
+    end = pointer_list[-1]
+    bert_text_list = list(bert_text)
+    # NOTE: the two marker strings inserted around the red span appear to have
+    # been lost (they are empty strings), so the final join drops them again
+    bert_text_list.insert(start, "")
+    bert_text_list.insert(end + 2, "")
+
+ text_original_list = list(text_original)
+
+ up = 0
+ down = 0
+
+ while True:
+ if up == len(text_original_list):
+ break
+
+ if text_original_list[up] == bert_text_list[down]:
+ up += 1
+ down += 1
+
+        else:
+            # skip the (empty) marker strings inserted above
+            if bert_text_list[down] == "":
+                down += 1
+            else:
+                bert_text_list.insert(down, text_original_list[up])
+                up += 1
+                down += 1
+
+ bert_text = "".join(bert_text_list)
+ return True, bert_text
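+
+# Alignment sketch (hypothetical strings; bert_text_pre is a sub-span of
+# bert_text as tokenized by the model):
+#   ok, marked = original_text_marked_red("深度学习方法", "深度学习方法", "学习")
+#   # ok -> True; `marked` is the original text with the (currently empty)
+#   # marker strings inserted around the span "学习"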
+
+
+def biaohong_bert_predict(sentence_0_list, sentence_1_list):
+    '''
+    Ask the marking service for the red spans of each sentence pair
+    :param sentence_0_list: sentence windows from the checked paper
+    :param sentence_1_list: matched windows from the recalled papers
+    :return: list of [original, original_red, similar, similar_red] entries
+    '''
+
+    # "resilt" [sic] is the response key used by the remote service
+    paper_dict = dialog_line_parse("http://192.168.31.74:16003/", {"sentence_0": sentence_0_list, "sentence_1": sentence_1_list})["resilt"]
+
+    return paper_dict
+
+def ulit_text(title, text):
+    data = []
+    try:
+        # the stored content may be a JSON wrapper around the raw text
+        text = json.loads(text)["content"]
+    except Exception:
+        pass
+
+    # normalise whitespace, then split into sentences on the Chinese full stop
+    text = text.strip().replace("\n", "").replace(" ", "").replace("。", "。\n")
+    text_list = text.split("\n")
+
+    for i in text_list:
+        if i != "":
+            data.append([i, title])
+    return data
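+
+# Shape sketch (hypothetical title and content):
+#   ulit_text("论文A", '{"content": "句一。句二。"}')
+#   -> [["句一。", "论文A"], ["句二。", "论文A"]]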
+
+def run_query(conn, sql, params):
+ with conn.cursor() as cursor:
+ cursor.execute(sql, params)
+ result = cursor.fetchall()
+ return result
+
+
+def processing_one_text(paper_id):
+ conn = pymysql.connect(
+ host='192.168.31.145',
+ port=3306,
+ user='root',
+ password='123456',
+ db='zhiwang_db',
+ charset='utf8mb4',
+ cursorclass=pymysql.cursors.DictCursor
+ )
+
+ sql = 'SELECT * FROM main_table_paper_detail_message WHERE doc_id=%s'
+ params = (paper_id,)
+
+ result = run_query(conn, sql, params)
+
+ conn.close()
+ print(result)
+ title = result[0]['title']
+ author = result[0]['author']
+ content_path = result[0]['content']
+
+    try:
+        with open(content_path, encoding="utf-8") as f:
+            text = f.read()
+    except UnicodeDecodeError:
+        # older files are stored as gbk
+        with open(content_path, encoding="gbk") as f:
+            text = f.read()
+
+ data = ulit_text(title, text)
+ return data
+
+
+def ulit_recall_paper(recall_data_list_dict):
+    '''
+    Read and parse the recalled papers
+    :param recall_data_list_dict: {paper_id: ...} returned by the recall service
+    :return: list [[sentence, filename], [sentence, filename], ...]
+    '''
+
+    data = []
+    # only the top five recalled papers are expanded into sentences
+    for i in list(recall_data_list_dict.items())[:5]:
+        data_one = processing_one_text(i[0])
+        data.extend(data_one)
+
+    return data
+
+
+def recall_10(title, abst_zh, content) -> list:
+    '''
+    Call Yupeng's recall service for the most similar papers
+    :param title: paper title
+    :param abst_zh: Chinese abstract
+    :param content: paper body
+    :return: recall result keyed by paper id
+    '''
+
+ request_json = {
+ "title": title,
+ "abst_zh": abst_zh,
+ "content": content
+ }
+ paper_dict = dialog_line_parse("http://192.168.31.145:50002/check", request_json)
+
+ return paper_dict
+
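+# Example request (hypothetical values):
+#   curl -X POST http://127.0.0.1:16001/ \
+#        -H 'Content-Type: application/json' \
+#        -d '{"title": "示例标题", "abst_zh": "示例摘要", "content": "正文。"}'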
+@app.route("/", methods=["POST"])
+def handle_query():
+
+    title = request.json["title"]
+    abst_zh = request.json["abst_zh"]  # txt
+    content = request.json["content"]
+
+    # call Yupeng's service for the ten most similar papers
+    # recall_data_list_dict = recall_10(title, abst_zh, content)
+
+    # stubbed for now with a cached recall result
+    with open("data/rell_json.txt") as f:
+        recall_data_list_dict = ast.literal_eval(f.read())
+
+    # expand the recalled papers into [[sentence, filename], ...] rows
+    recall_data_list = ulit_recall_paper(recall_data_list_dict)
+
+    # run the fine-grained duplicate check
+    return_list = accurate_check_rouge(content, recall_data_list)
+
+    return_text = {"resilt": return_list, "probabilities": None, "status_code": 200}
+    return jsonify(return_text)
+
+
+if __name__ == "__main__":
+ app.run(host="0.0.0.0", port=16001, threaded=True, debug=True)
\ No newline at end of file
diff --git a/flask_check_bert.py b/flask_check_bert.py
new file mode 100644
index 0000000..2a6010e
--- /dev/null
+++ b/flask_check_bert.py
@@ -0,0 +1,523 @@
+import os
+import numpy as np
+from numpy.linalg import norm
+import pandas as pd
+# from rouge import Rouge
+from rouge_chinese import Rouge
+from Rouge_w import Rouge_w, Rouge_l
+import json
+import ast
+import pymysql
+
+import requests
+from flask import Flask, jsonify
+from flask import request
+import uuid
+app = Flask(__name__)
+app.config["JSON_AS_ASCII"] = False
+
+
+nums_cpus = 16
+rouge = Rouge()
+rouge_model = Rouge_w()
+rouge_l_model = Rouge_l()
+
+
+def bert_check(text, recall_data_list):
+    '''
+    Duplicate check against the BERT similarity service
+    :param text: one sentence from the checked paper
+    :param recall_data_list: [[sentence, filename], ...]
+    :return: [hit_flag, matched_row_index] ([0, ""] when nothing matches)
+    '''
+
+ sen_0 = [text] * len(recall_data_list)
+ sen_1 = [i[0] for i in recall_data_list]
+
+ return_list = []
+ request_json = {
+ "texts": [sen_0, sen_1],
+ }
+ paper_dict = dialog_line_parse("http://192.168.31.74:16002/", request_json)
+ score_list = paper_dict["res"]
+
+    # TODO (original note: rework later) - only an exact score of 1 counts as a hit
+    if 1 in score_list:
+        return_list.append(1)
+        return_list.append(score_list.index(1))
+    else:
+        return_list.append(0)
+        return_list.append("")
+
+    return return_list
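+
+# Shape sketch (hypothetical scores): the remote service returns one 0/1 score
+# per (text, candidate) pair, and bert_check reduces it to [hit_flag, row_index]:
+#   score_list = [0, 1, 0]  ->  bert_check returns [1, 1]
+#   score_list = [0, 0, 0]  ->  bert_check returns [0, ""]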
+
+
+
+def rouge_value_self(data_1, data_2):
+ data_1 = [' '.join(i) for i in data_1]
+ data_2 = [' '.join(i) for i in data_2]
+ rouge_l_list = []
+
+ for sen_1, sen_2 in zip(data_1, data_2):
+ sen_1 = sen_1.split(" ")
+ sen_2 = sen_2.split(" ")
+ rouge_l_score = rouge_l_model.score(sen_1, sen_2)
+ rouge_l_list.append(rouge_l_score)
+
+ return "", "", rouge_l_list
+
+
+def rouge_pre(text, df_train_nuoche):
+
+    # score `text` against every recalled sentence and keep the best hit
+    text_list = [text] * len(df_train_nuoche)
+    data_list = [data_dan[0] for data_dan in df_train_nuoche]
+    _, _, rouge_l = rouge_value_self(text_list, data_list)
+
+    # (index, score) pairs sorted by score, best first
+    re1 = sorted(enumerate(rouge_l), key=lambda x: x[1], reverse=True)
+
+    # [best_score, best_sentence_index]
+    return [re1[0][1], re1[0][0]]
+
+
+def accurate_check_rouge(text_paper, recall_data_list):
+    '''
+    Fine-grained duplicate check: find the similar sentences
+    :param text_paper: full text of the paper being checked
+    :param recall_data_list: list [[sentence, filename], [sentence, filename], ...]
+    :return: dict with the red-marked original text and the matched sources
+    '''
+    # split the paper into sentences on the Chinese full stop
+    centent_list = []
+    text_paper = str(text_paper).replace("。\n", "。")
+    centent_list.extend(text_paper.split("。"))
+    data_zong = []
+
+    # ROUGE-based check (kept for reference)
+    # for text in centent_list:
+    #     rouge_pre_list = rouge_pre(text, recall_data_list)
+    #     data_zong.append(rouge_pre_list)
+
+    # BERT-based check
+    for text in centent_list:
+        bert_pre_list = bert_check(text, recall_data_list)
+        data_zong.append(bert_pre_list)
+
+
+    # collect the indices of sentences flagged as exact BERT hits
+    bool_check_sentense = []
+    for i in range(len(data_zong)):
+        if data_zong[i][0] == 1:
+            bool_check_sentense.append([i, data_zong[i][1]])
+ biao_red = biaohong(bool_check_sentense, data_zong, recall_data_list) # [[[0, 1, 2], [479, 480, 481]], [[3, 4, 5], [481, 482, 483]], [[6, 7, 8], [484, 485, 486]]]
+
+ sentence_0_list = []
+ sentence_1_list = []
+ sim_paper_name = []
+
+    for i in biao_red:
+        # keep a window only when all three matched rows come from the same paper
+        if recall_data_list[i[1][0]][1] == recall_data_list[i[1][1]][1] == recall_data_list[i[1][2]][1]:
+            sentence_0_list.append("。".join([centent_list[i[0][0]], centent_list[i[0][1]], centent_list[i[0][2]]]))
+            sentence_1_list.append("".join([recall_data_list[i[1][0]][0], recall_data_list[i[1][1]][0], recall_data_list[i[1][2]][0]]))
+            sim_paper_name.append(recall_data_list[i[1][0]][1])
+
+ sentence_0_list_new = []
+ sentence_1_list_new = []
+
+
+    for i in zip(sentence_0_list, sentence_1_list):
+        if len(i[0]) + len(i[1]) < 1200:
+            sentence_0_list_new.append(i[0])
+            sentence_1_list_new.append(i[1])
+        else:
+            # the pair is too long for the marking model; log and drop it
+            print("over the length limit:", len(i[0]) + len(i[1]))
+
+ paper_dict = biaohong_bert_predict(sentence_0_list_new, sentence_1_list_new)
+
+    # each paper_dict entry: [original_text, original_red_span,
+    #                         similar_text, similar_red_span]
+    original_text = []
+    original_text_contrast = []
+
+
+    for paper_dict_dan_id, sentence_0_dan, sentence_1_dan, sim_paper_name_dan in zip(range(len(paper_dict)), sentence_0_list_new, sentence_1_list_new, sim_paper_name):
+        original_text_contrast_dict = {}
+        similar_content = {"author": ""}
+        try:
+            sentence_0_bool, sentence_0_dan_red = original_text_marked_red(sentence_0_dan, paper_dict[paper_dict_dan_id][0], paper_dict[paper_dict_dan_id][1])  # text_original, bert_text, bert_text_pre
+        except Exception:
+            # surface the failing triple before aborting
+            print("marking failed", [sentence_0_dan, paper_dict[paper_dict_dan_id][0], paper_dict[paper_dict_dan_id][1]])
+            raise
+        sentence_1_bool, sentence_1_dan_red = original_text_marked_red(sentence_1_dan, paper_dict[paper_dict_dan_id][2], paper_dict[paper_dict_dan_id][3])  # text_original, bert_text, bert_text_pre
+
+        if not sentence_0_bool or not sentence_1_bool:
+            continue
+ original_text.append(sentence_0_dan_red)
+ original_text_contrast_dict["original_text"] = "此处有 {} 字相似\n".format(
+ len(paper_dict[paper_dict_dan_id][1])) + sentence_0_dan_red
+
+ similar_content["content"] = sentence_1_dan_red
+ similar_content["title"] = sim_paper_name_dan
+ original_text_contrast_dict["similar_content"] = similar_content
+
+ original_text_contrast.append(original_text_contrast_dict)
+
+ original_text = "。".join(original_text)
+
+ return {
+ "author": "",
+ "check_time": "",
+ "section_data": "",
+ "section_details": [
+ {
+ "end_page_index": 0,
+ "name": "",
+ "original_text": original_text,
+ "original_text_contrast": original_text_contrast
+ }
+ ]
+ }
+
+
+def biaohong(bool_check_sentense, data_zong, df_train_nuoche):
+    '''
+    Build the sentence-index windows to mark red, e.g. [[0,1,2],[3,4,5]]
+    :param bool_check_sentense: [[sentence_index, matched_row_index], ...]
+    :return: list of [paper_window, recall_window] index triples
+    '''
+ biao_red = []
+ i = 0
+ start = -1
+ end = -1
+ while True:
+ if i >= len(bool_check_sentense) or bool_check_sentense[i][0] +1 >= len(data_zong) or bool_check_sentense[i][1]+1 >= len(df_train_nuoche):
+ break
+        elif (bool_check_sentense[i][0] - 1 == start
+              or bool_check_sentense[i][0] == end
+              or bool_check_sentense[i][0] - 1 == end):
+            # this hit is already inside or adjacent to the previous window
+            i += 1
+            continue
+        else:
+            biao_red.append([[bool_check_sentense[i][0]-1, bool_check_sentense[i][0], bool_check_sentense[i][0]+1],
+                             [bool_check_sentense[i][1]-1, bool_check_sentense[i][1], bool_check_sentense[i][1]+1]])
+            start = bool_check_sentense[i][0]-1
+            end = bool_check_sentense[i][0]+1
+            i += 1
+
+ return biao_red
+
+
+def dialog_line_parse(url, text):
+    """
+    Send a payload to a model service and return the parsed response
+    :param url: model service url
+    :param text: JSON payload for the model
+    :return: parsed model response, or an empty list on failure
+    """
+
+ response = requests.post(
+ url,
+ json=text,
+ timeout=100000
+ )
+ if response.status_code == 200:
+ return response.json()
+ else:
+ print("【{}】 Failed to get a proper response from remote "
+ "server. Status Code: {}. Response: {}"
+ "".format(url, response.status_code, response.text))
+ print(text)
+ return []
+
+
+def is_english_char(char):
+ code = ord(char)
+ return 32 <= code <= 126
+
+
+def original_text_marked_red(text_original, bert_text, bert_text_pre):
+    '''
+    Locate the span of the original text that should be marked red
+    :param text_original: the original sentence
+    :param bert_text: the sentence as the marking model returned it
+    :param bert_text_pre: the predicted red span inside bert_text
+    :return: (success_flag, marked_text)
+    '''
+
+    fuhao = ["\n"]  # separator characters the model may have dropped
+ up_pointer = 0
+ down_pointer = 0
+
+ pointer_list = []
+
+ if len(bert_text_pre) > len(bert_text):
+ return False, ""
+
+ while True:
+ if down_pointer >= len(bert_text_pre):
+ break
+ elif down_pointer == len(bert_text_pre)-1:
+ if bert_text[up_pointer] == bert_text_pre[down_pointer]:
+ pointer_list.append(up_pointer)
+ break
+ else:
+ up_pointer += 1
+ down_pointer = 0
+ pointer_list = []
+
+ elif bert_text[up_pointer] in fuhao:
+ up_pointer += 1
+
+ else:
+ if bert_text[up_pointer] == bert_text_pre[down_pointer]:
+ pointer_list.append(up_pointer)
+ up_pointer += 1
+ down_pointer += 1
+ else:
+                if bert_text_pre[down_pointer:down_pointer+5] == "[UNK]":
+                    # "[UNK]" in the prediction stands in for one original character
+                    up_pointer += 1
+                    down_pointer += 5
+                    pointer_list.append(up_pointer)
+                elif is_english_char(bert_text_pre[down_pointer]):
+                    # ASCII characters may be altered by tokenization; accept them
+                    up_pointer += 1
+                    down_pointer += 1
+                    pointer_list.append(up_pointer)
+ else:
+ up_pointer += 1
+ down_pointer = 0
+ pointer_list = []
+
+    if not pointer_list:
+        # nothing in bert_text_pre could be aligned
+        return False, ""
+
+    start = pointer_list[0]
+    end = pointer_list[-1]
+    bert_text_list = list(bert_text)
+    # NOTE: the two marker strings inserted around the red span appear to have
+    # been lost (they are empty strings), so the final join drops them again
+    bert_text_list.insert(start, "")
+    bert_text_list.insert(end + 2, "")
+
+ text_original_list = list(text_original)
+
+ up = 0
+ down = 0
+
+ while True:
+ if up == len(text_original_list):
+ break
+
+ if text_original_list[up] == bert_text_list[down]:
+ up += 1
+ down += 1
+
+        else:
+            # skip the (empty) marker strings inserted above
+            if bert_text_list[down] == "":
+                down += 1
+            else:
+                bert_text_list.insert(down, text_original_list[up])
+                up += 1
+                down += 1
+
+ bert_text = "".join(bert_text_list)
+ return True, bert_text
+
+
+def biaohong_bert_predict(sentence_0_list, sentence_1_list):
+    '''
+    Ask the marking service for the red spans of each sentence pair
+    :param sentence_0_list: sentence windows from the checked paper
+    :param sentence_1_list: matched windows from the recalled papers
+    :return: list of [original, original_red, similar, similar_red] entries
+    '''
+
+    # "resilt" [sic] is the response key used by the remote service
+    paper_dict = dialog_line_parse("http://192.168.31.74:16003/", {"sentence_0": sentence_0_list, "sentence_1": sentence_1_list})["resilt"]
+
+    return paper_dict
+
+def ulit_text(title, text):
+    data = []
+    try:
+        # the stored content may be a JSON wrapper around the raw text
+        text = json.loads(text)["content"]
+    except Exception:
+        pass
+
+    # normalise whitespace, then split into sentences on the Chinese full stop
+    text = text.strip().replace("\n", "").replace(" ", "").replace("。", "。\n")
+    text_list = text.split("\n")
+
+    for i in text_list:
+        if i != "":
+            data.append([i, title])
+    return data
+
+def run_query(conn, sql, params):
+ with conn.cursor() as cursor:
+ cursor.execute(sql, params)
+ result = cursor.fetchall()
+ return result
+
+
+def processing_one_text(paper_id):
+ conn = pymysql.connect(
+ host='192.168.31.145',
+ port=3306,
+ user='root',
+ password='123456',
+ db='zhiwang_db',
+ charset='utf8mb4',
+ cursorclass=pymysql.cursors.DictCursor
+ )
+
+ sql = 'SELECT * FROM main_table_paper_detail_message WHERE doc_id=%s'
+ params = (paper_id,)
+
+ result = run_query(conn, sql, params)
+
+ conn.close()
+ print(result)
+ title = result[0]['title']
+ author = result[0]['author']
+ content_path = result[0]['content']
+
+    try:
+        with open(content_path, encoding="utf-8") as f:
+            text = f.read()
+    except UnicodeDecodeError:
+        # older files are stored as gbk
+        with open(content_path, encoding="gbk") as f:
+            text = f.read()
+
+ data = ulit_text(title, text)
+ return data
+
+
+def ulit_recall_paper(recall_data_list_dict):
+    '''
+    Read and parse the recalled papers
+    :param recall_data_list_dict: {paper_id: ...} returned by the recall service
+    :return: list [[sentence, filename], [sentence, filename], ...]
+    '''
+
+    data = []
+    # only the top five recalled papers are expanded into sentences
+    for i in list(recall_data_list_dict.items())[:5]:
+        data_one = processing_one_text(i[0])
+        data.extend(data_one)
+
+    return data
+
+
+def recall_10(title, abst_zh, content) -> list:
+    '''
+    Call Yupeng's recall service for the most similar papers
+    :param title: paper title
+    :param abst_zh: Chinese abstract
+    :param content: paper body
+    :return: recall result keyed by paper id
+    '''
+
+ request_json = {
+ "title": title,
+ "abst_zh": abst_zh,
+ "content": content
+ }
+ paper_dict = dialog_line_parse("http://192.168.31.145:50002/check", request_json)
+
+ return paper_dict
+
+@app.route("/", methods=["POST"])
+def handle_query():
+    print(request.remote_addr)
+    title = request.json["title"]
+    abst_zh = request.json["abst_zh"]  # txt
+    content = request.json["content"]
+
+    # call Yupeng's service for the ten most similar papers
+    # recall_data_list_dict = recall_10(title, abst_zh, content)
+
+    # stubbed for now with a cached recall result
+    with open("data/rell_json.txt") as f:
+        recall_data_list_dict = ast.literal_eval(f.read())
+
+    # expand the recalled papers into [[sentence, filename], ...] rows
+    recall_data_list = ulit_recall_paper(recall_data_list_dict)
+
+    # run the fine-grained duplicate check
+    return_list = accurate_check_rouge(content, recall_data_list)
+
+    return_text = {"resilt": return_list, "probabilities": None, "status_code": 200}
+    return jsonify(return_text)
+
+
+if __name__ == "__main__":
+ app.run(host="0.0.0.0", port=16001, threaded=True, debug=True)
\ No newline at end of file