
2 changed files with 997 additions and 0 deletions
@@ -0,0 +1,474 @@
import os
import numpy as np
from numpy.linalg import norm
import pandas as pd
# from rouge import Rouge
from rouge_chinese import Rouge
from Rouge_w import Rouge_w, Rouge_l
import json
import pymysql

import requests
from flask import Flask, jsonify
from flask import request
import uuid

app = Flask(__name__)
app.config["JSON_AS_ASCII"] = False

nums_cpus = 16
rouge = Rouge()
rouge_model = Rouge_w()
rouge_l_model = Rouge_l()
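
# Rouge_w / Rouge_l are assumed to be a local module providing word-level and
# LCS-based ROUGE scorers; rouge_chinese supplies the standard Chinese ROUGE.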

def rouge_value_self(data_1, data_2):
    data_1 = [' '.join(i) for i in data_1]
    data_2 = [' '.join(i) for i in data_2]
    rouge_l_list = []

    for sen_1, sen_2 in zip(data_1, data_2):
        sen_1 = sen_1.split(" ")
        sen_2 = sen_2.split(" ")
        rouge_l_score = rouge_l_model.score(sen_1, sen_2)
        rouge_l_list.append(rouge_l_score)

    # Only ROUGE-L is computed; the first two return values are placeholders.
    return "", "", rouge_l_list

def rouge_pre(text, df_train_nuoche):
    return_list = []
    index_rouge_list = []
    text_list = [text] * len(df_train_nuoche)

    data_list = []
    for data_dan in df_train_nuoche:
        data_list.append(data_dan[0])
    rouge_1, rouge_2, rouge_l = rouge_value_self(text_list, data_list)
    index_rouge_list.extend(rouge_l)

    # Sort (index, score) pairs by score, highest first.
    re1 = [(i[0], i[1]) for i in sorted(list(enumerate(index_rouge_list)), key=lambda x: x[1], reverse=True)]

    return_list.append(re1[0][1])
    return_list.append(re1[0][0])

    return return_list
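
# rouge_pre returns [top_score, top_index]: the best ROUGE-L score of `text`
# against the recalled sentences, and the index of that recalled sentence.
# E.g. (hypothetical) rouge_pre("句子", [["句子", "a.txt"], ["别的", "b.txt"]])
# would yield something like [1.0, 0].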

def accurate_check_rouge(text_paper, recall_data_list):
    '''
    Precise duplicate check: find similar sentences
    :param text_paper:
    :param recall_data_list: list [[sentence, filename],[sentence, filename],[sentence, filename]]
    :return:
    '''
    # Text preprocessing
    # with open(text_paper_path, encoding="gbk") as f:
    #     text_paper = f.read()
    centent_list = []
    text_paper = str(text_paper).replace("。\n", "。")
    centent_list.extend(text_paper.split("。"))
    data_zong = []

    # Duplicate check with the ROUGE algorithm
    for text in centent_list:
        rouge_pre_list = rouge_pre(text, recall_data_list)
        data_zong.append(rouge_pre_list)

    # Find the indices of similar sentences (ROUGE-L above threshold)
    bool_check_sentense = []
    for i in range(len(data_zong)):
        if data_zong[i][0] > 0.47:
            bool_check_sentense.append([i, data_zong[i][1]])
    biao_red = biaohong(bool_check_sentense, data_zong, recall_data_list)  # e.g. [[[0, 1, 2], [479, 480, 481]], [[3, 4, 5], [481, 482, 483]], [[6, 7, 8], [484, 485, 486]]]

    sentence_0_list = []
    sentence_1_list = []
    sim_paper_name = []

    for i in biao_red:
        # Keep a group only if all three recalled sentences come from the same paper.
        if recall_data_list[i[1][0]][1] == recall_data_list[i[1][1]][1] == recall_data_list[i[1][2]][1]:
            sentence_0_list.append("。".join([centent_list[i[0][0]], centent_list[i[0][1]], centent_list[i[0][2]]]))
            sentence_1_list.append("".join([recall_data_list[i[1][0]][0], recall_data_list[i[1][1]][0], recall_data_list[i[1][2]][0]]))
            sim_paper_name.append(recall_data_list[i[1][0]][1])
        else:
            continue

    sentence_0_list_new = []
    sentence_1_list_new = []

    # Drop pairs whose combined length exceeds the model's input limit.
    for i in zip(sentence_0_list, sentence_1_list):
        if len(i[0]) + len(i[1]) < 1200:
            sentence_0_list_new.append(i[0])
            sentence_1_list_new.append(i[1])
        else:
            print(len(i[0]) + len(i[1]))
            continue
    # Debug: log the lengths of the kept pairs.
    for i in zip(sentence_0_list_new, sentence_1_list_new):
        print("length check", len(i[0]))
        print("length check", len(i[1]))

    paper_dict = biaohong_bert_predict(sentence_0_list_new, sentence_1_list_new)

    # Each row of paper_dict: [original, original marked red, similar, similar marked red]

    original_text = []
    original_text_contrast = []

    for paper_dict_dan_id, sentence_0_dan, sentence_1_dan, sim_paper_name_dan in zip(range(len(paper_dict)), sentence_0_list_new, sentence_1_list_new, sim_paper_name):
        print([sentence_0_dan, sentence_1_dan])
        original_text_contrast_dict = {}
        similar_content = {"author": ""}
        try:
            sentence_0_bool, sentence_0_dan_red = original_text_marked_red(sentence_0_dan, paper_dict[paper_dict_dan_id][0], paper_dict[paper_dict_dan_id][1])  # text_original, bert_text, bert_text_pre
        except Exception:
            print([sentence_0_dan, sentence_1_dan])
            raise
        sentence_1_bool, sentence_1_dan_red = original_text_marked_red(sentence_1_dan, paper_dict[paper_dict_dan_id][2], paper_dict[paper_dict_dan_id][3])  # text_original, bert_text, bert_text_pre

        if not sentence_0_bool or not sentence_1_bool:
            continue
        original_text.append(sentence_0_dan_red)
        original_text_contrast_dict["original_text"] = "此处有 {} 字相似\n".format(
            len(paper_dict[paper_dict_dan_id][1])) + sentence_0_dan_red

        similar_content["content"] = sentence_1_dan_red
        similar_content["title"] = sim_paper_name_dan
        original_text_contrast_dict["similar_content"] = similar_content

        original_text_contrast.append(original_text_contrast_dict)

    original_text = "。".join(original_text)

    return {
        "author": "",
        "check_time": "",
        "section_data": "",
        "section_details": [
            {
                "end_page_index": 0,
                "name": "",
                "original_text": original_text,
                "original_text_contrast": original_text_contrast
            }
        ]
    }
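
# A hedged sketch of how accurate_check_rouge is driven (data hypothetical):
#   recall_data_list = [["句子一。", "paper_a"], ["句子二。", "paper_a"], ...]
#   report = accurate_check_rouge("论文全文……", recall_data_list)
#   report["section_details"][0]["original_text_contrast"]  # marked-up matches
# Pipeline: split the paper on "。" -> score every sentence against the recall
# set with ROUGE-L -> group hits into 3-sentence windows (biaohong) -> ask the
# BERT service to mark the overlapping characters red.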

def biaohong(bool_check_sentense, data_zong, df_train_nuoche):
    '''
    Indices to mark red, e.g. [[0,1,2],[3,4,5]]
    :param bool_check_sentense: [paper sentence index, recalled sentence index] hits
    :param data_zong: per-sentence check results (used for bounds)
    :param df_train_nuoche: recalled sentence list (used for bounds)
    :return: list
    '''
    biao_red = []
    i = 0
    start = -1
    end = -1
    while True:
        if i >= len(bool_check_sentense) or bool_check_sentense[i][0] + 1 >= len(data_zong) or bool_check_sentense[i][1] + 1 >= len(df_train_nuoche):
            break
        elif bool_check_sentense[i][0] - 1 == start:
            i += 1
            continue
        elif bool_check_sentense[i][0] == end:
            i += 1
            continue
        elif bool_check_sentense[i][0] - 1 == end:
            i += 1
            continue
        else:
            # Expand each hit into a window of three consecutive sentence indices
            # on both the paper side and the recall side.
            biao_red.append([[bool_check_sentense[i][0] - 1, bool_check_sentense[i][0], bool_check_sentense[i][0] + 1],
                             [bool_check_sentense[i][1] - 1, bool_check_sentense[i][1], bool_check_sentense[i][1] + 1]])
            start = bool_check_sentense[i][0] - 1
            end = bool_check_sentense[i][0] + 1
            i += 1

    return biao_red
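
# Example (hypothetical indices): a hit [[5, 100]], i.e. sentence 5 of the
# paper matched recalled sentence 100, becomes [[[4, 5, 6], [99, 100, 101]]];
# later hits that overlap an already-emitted window are skipped.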

def dialog_line_parse(url, text):
    """
    Send data to a model service for analysis and return the result
    :param url: model service URL
    :param text: payload sent to the model
    :return: the model's response
    """
    response = requests.post(
        url,
        json=text,
        timeout=100000
    )
    if response.status_code == 200:
        return response.json()
    else:
        # logger.error(
        #     "【{}】 Failed to get a proper response from remote "
        #     "server. Status Code: {}. Response: {}"
        #     "".format(url, response.status_code, response.text)
        # )
        print("【{}】 Failed to get a proper response from remote "
              "server. Status Code: {}. Response: {}"
              "".format(url, response.status_code, response.text))
        print(text)
        return []
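
# Minimal usage sketch (endpoint and payload shape are those used below; the
# remote service itself is assumed to be running):
#   res = dialog_line_parse("http://192.168.31.74:16003/",
#                           {"sentence_0": ["……"], "sentence_1": ["……"]})
# res is the decoded JSON on HTTP 200, or [] on any other status code.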

def original_text_marked_red(text_original, bert_text, bert_text_pre):
    '''
    Locate the red-marked span inside the original text
    :param text_original:
    :param bert_text:
    :param bert_text_pre:
    :return:
    '''

    fuhao = ["\n"]
    up_pointer = 0
    down_pointer = 0

    pointer_list = []

    if len(bert_text_pre) > len(bert_text):
        return False, ""

    # Align bert_text_pre (the predicted red span) against bert_text,
    # collecting the matching character positions in pointer_list.
    while True:
        if down_pointer >= len(bert_text_pre):
            break
        elif up_pointer >= len(bert_text):
            # No alignment found; bail out instead of raising IndexError.
            return False, ""
        elif down_pointer == len(bert_text_pre) - 1:
            if bert_text[up_pointer] == bert_text_pre[down_pointer]:
                pointer_list.append(up_pointer)
                break
            else:
                up_pointer += 1
                down_pointer = 0
                pointer_list = []

        elif bert_text[up_pointer] in fuhao:
            up_pointer += 1

        else:
            if bert_text[up_pointer] == bert_text_pre[down_pointer]:
                pointer_list.append(up_pointer)
                up_pointer += 1
                down_pointer += 1
            else:
                if bert_text_pre[down_pointer:down_pointer + 5] == "[UNK]":
                    # Step over the 5-character "[UNK]" placeholder.
                    up_pointer += 1
                    down_pointer += 5
                    pointer_list.append(up_pointer)
                else:
                    # Mismatch: restart the match from the next position.
                    up_pointer += 1
                    down_pointer = 0
                    pointer_list = []

    if not pointer_list:
        return False, ""

    start = pointer_list[0]
    end = pointer_list[-1]
    bert_text_list = list(bert_text)
    bert_text_list.insert(start, "<red>")
    bert_text_list.insert(end + 2, "</red>")

    text_original_list = list(text_original)

    up = 0
    down = 0

    # Re-insert any characters (e.g. whitespace) that the BERT tokenizer dropped,
    # so the marked text matches the original character-for-character.
    while True:
        if up == len(text_original_list):
            break

        if text_original_list[up] == bert_text_list[down]:
            up += 1
            down += 1

        else:
            if bert_text_list[down] == "<red>":
                down += 1
            elif bert_text_list[down] == "</red>":
                down += 1
            else:
                bert_text_list.insert(down, text_original_list[up])
                up += 1
                down += 1

    bert_text = "".join(bert_text_list)
    return True, bert_text
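
# Worked example (hypothetical strings): with text_original = "我 爱北京",
# bert_text = "我爱北京" and bert_text_pre = "爱北京", the aligner collects
# positions [1, 2, 3], producing "我<red>爱北京</red>", then re-inserts the
# dropped space and returns (True, "我<red> 爱北京</red>").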

def biaohong_bert_predict(sentence_0_list, sentence_1_list):
    '''
    Find the characters to mark red
    :param sentence_0_list:
    :param sentence_1_list:
    :return:
    '''
    # "resilt" is the (misspelled) key actually returned by the remote service.
    paper_dict = dialog_line_parse("http://192.168.31.74:16003/", {"sentence_0": sentence_0_list, "sentence_1": sentence_1_list})["resilt"]

    # Each row of paper_dict: [original, original marked red, similar, similar marked red]

    return paper_dict

def ulit_text(title, text):
    data = []
    try:
        # The content may arrive as a JSON document with a "content" field.
        text = json.loads(text)["content"]
    except (ValueError, KeyError, TypeError):
        # Otherwise treat it as plain text.
        pass

    # Split into sentences on the Chinese full stop.
    text = text.strip().replace("\n", "").replace(" ", "").replace("。", "。\n")
    text_list = text.split("\n")

    for i in text_list:
        data.append([i, title])
    return data
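
# Example: ulit_text("论文A", "第一句。第二句。") ->
#   [["第一句。", "论文A"], ["第二句。", "论文A"], ["", "论文A"]]
# (the trailing empty entry comes from the final "。\n" split).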

def run_query(conn, sql, params):
    with conn.cursor() as cursor:
        cursor.execute(sql, params)
        result = cursor.fetchall()
    return result
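
# With DictCursor (configured below), each row comes back as a dict keyed by
# column name; %s placeholders let PyMySQL handle escaping of the parameters.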

def processing_one_text(paper_id):
    conn = pymysql.connect(
        host='192.168.31.145',
        port=3306,
        user='root',
        password='123456',
        db='zhiwang_db',
        charset='utf8mb4',
        cursorclass=pymysql.cursors.DictCursor
    )

    sql = 'SELECT * FROM main_table_paper_detail_message WHERE doc_id=%s'
    params = (paper_id,)

    result = run_query(conn, sql, params)

    conn.close()
    print(result)
    title = result[0]['title']
    author = result[0]['author']
    content_path = result[0]['content']

    # The content column stores a file path; try UTF-8 first, fall back to GBK.
    try:
        with open(content_path, encoding="utf-8") as f:
            text = f.read()
    except UnicodeDecodeError:
        with open(content_path, encoding="gbk") as f:
            text = f.read()

    data = ulit_text(title, text)
    return data

def ulit_recall_paper(recall_data_list_dict):
    '''
    Read and parse the recalled articles
    :param recall_data_list_dict: dict keyed by paper id
    :return data: list [[sentence, filename],[sentence, filename],[sentence, filename]]
    '''
    # An earlier version read the articles from paths on disk:
    # data = []
    # for path in recall_data_list_path:
    #     filename = path.split("/")[-1]
    #     with open(path, encoding="gbk") as f:
    #         text = f.read()
    #     text_list = text.split("\n")
    #     for sentence in text_list:
    #         if sentence != "":
    #             data.append([sentence, filename])
    # return data

    # Only the top five recalled papers are used.
    data = []
    for i in list(recall_data_list_dict.items())[:5]:
        data_one = processing_one_text(i[0])
        data.extend(data_one)

    return data

def recall_10(title, abst_zh, content) -> list:
    '''
    Yupeng's recall API: fetch the ten most similar papers
    :param title:
    :param abst_zh:
    :param content:
    :return:
    '''
    request_json = {
        "title": title,
        "abst_zh": abst_zh,
        "content": content
    }
    paper_dict = dialog_line_parse("http://192.168.31.145:50002/check", request_json)

    return paper_dict

@app.route("/", methods=["POST"])
def handle_query():
    title = request.json["title"]
    abst_zh = request.json["abst_zh"]  # plain text abstract
    content = request.json["content"]

    # Call Yupeng's service to retrieve the ten most similar papers
    # recall_data_list_dict = recall_10(title, abst_zh, content)

    # NOTE: eval on a trusted local cache file; json.loads would be safer if the
    # file held valid JSON.
    with open("data/rell_json.txt") as f:
        recall_data_list_dict = eval(f.read())

    # Read the articles and convert them into [[sentence, filename], ...] format
    recall_data_list = ulit_recall_paper(recall_data_list_dict)

    # recall_data_list = pd.read_csv("data/10235513_大型商业建筑人员疏散设计研究_沈福禹/查重.csv", encoding="utf-8").values.tolist()

    # Run the precise duplicate-check stage
    return_list = accurate_check_rouge(content, recall_data_list)

    return_text = {"resilt": return_list, "probabilities": None, "status_code": 200}
    return jsonify(return_text)  # return the result


if __name__ == "__main__":
    app.run(host="0.0.0.0", port=16001, threaded=True, debug=True)
@@ -0,0 +1,523 @@
import os
import numpy as np
from numpy.linalg import norm
import pandas as pd
# from rouge import Rouge
from rouge_chinese import Rouge
from Rouge_w import Rouge_w, Rouge_l
import json
import pymysql

import requests
from flask import Flask, jsonify
from flask import request
import uuid

app = Flask(__name__)
app.config["JSON_AS_ASCII"] = False

nums_cpus = 16
rouge = Rouge()
rouge_model = Rouge_w()
rouge_l_model = Rouge_l()

def bert_check(text, recall_data_list):
    '''
    BERT-based duplicate check
    :return:
    '''
    sen_0 = [text] * len(recall_data_list)
    sen_1 = [i[0] for i in recall_data_list]

    return_list = []
    request_json = {
        "texts": [sen_0, sen_1],
    }
    paper_dict = dialog_line_parse("http://192.168.31.74:16002/", request_json)
    score_list = paper_dict["res"]

    # TODO: revisit later
    # return_list.append(re1[0][1])
    # return_list.append(re1[0][0])
    # A score of 1 marks a duplicate pair; take the first such index.
    if 1 in score_list:
        index_score = score_list.index(1)
    else:
        index_score = "NaN"

    if index_score == "NaN":
        return_list.append(0)
        return_list.append("")
    else:
        return_list.append(1)
        return_list.append(index_score)

    return return_list
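
# Sketch (hypothetical response): if the 16002 service returns
# {"res": [0, 0, 1, 0]}, bert_check yields [1, 2] (a duplicate at recall
# index 2); with no 1 anywhere in the list it yields [0, ""].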

def rouge_value_self(data_1, data_2):
    data_1 = [' '.join(i) for i in data_1]
    data_2 = [' '.join(i) for i in data_2]
    rouge_l_list = []

    for sen_1, sen_2 in zip(data_1, data_2):
        sen_1 = sen_1.split(" ")
        sen_2 = sen_2.split(" ")
        rouge_l_score = rouge_l_model.score(sen_1, sen_2)
        rouge_l_list.append(rouge_l_score)

    return "", "", rouge_l_list

def rouge_pre(text, df_train_nuoche):
    return_list = []
    index_rouge_list = []
    text_list = [text] * len(df_train_nuoche)

    data_list = []
    for data_dan in df_train_nuoche:
        data_list.append(data_dan[0])
    rouge_1, rouge_2, rouge_l = rouge_value_self(text_list, data_list)
    index_rouge_list.extend(rouge_l)

    re1 = [(i[0], i[1]) for i in sorted(list(enumerate(index_rouge_list)), key=lambda x: x[1], reverse=True)]

    return_list.append(re1[0][1])
    return_list.append(re1[0][0])

    return return_list

def accurate_check_rouge(text_paper, recall_data_list):
    '''
    Precise duplicate check: find similar sentences
    :param text_paper:
    :param recall_data_list: list [[sentence, filename],[sentence, filename],[sentence, filename]]
    :return:
    '''
    # Text preprocessing
    # with open(text_paper_path, encoding="gbk") as f:
    #     text_paper = f.read()
    centent_list = []
    text_paper = str(text_paper).replace("。\n", "。")
    centent_list.extend(text_paper.split("。"))
    data_zong = []

    # Duplicate check with the ROUGE algorithm (superseded by the BERT check below)
    # for text in centent_list:
    #     rouge_pre_list = rouge_pre(text, recall_data_list)
    #     data_zong.append(rouge_pre_list)

    # Duplicate check with the BERT service
    for text in centent_list:
        bert_pre_list = bert_check(text, recall_data_list)
        data_zong.append(bert_pre_list)

    # Find the indices of similar sentences (BERT label == 1)
    bool_check_sentense = []
    for i in range(len(data_zong)):
        if data_zong[i][0] == 1:
            bool_check_sentense.append([i, data_zong[i][1]])
    biao_red = biaohong(bool_check_sentense, data_zong, recall_data_list)  # e.g. [[[0, 1, 2], [479, 480, 481]], [[3, 4, 5], [481, 482, 483]], [[6, 7, 8], [484, 485, 486]]]

    sentence_0_list = []
    sentence_1_list = []
    sim_paper_name = []

    for i in biao_red:
        # Keep a group only if all three recalled sentences come from the same paper.
        if recall_data_list[i[1][0]][1] == recall_data_list[i[1][1]][1] == recall_data_list[i[1][2]][1]:
            sentence_0_list.append("。".join([centent_list[i[0][0]], centent_list[i[0][1]], centent_list[i[0][2]]]))
            sentence_1_list.append("".join([recall_data_list[i[1][0]][0], recall_data_list[i[1][1]][0], recall_data_list[i[1][2]][0]]))
            sim_paper_name.append(recall_data_list[i[1][0]][1])
        else:
            continue

    sentence_0_list_new = []
    sentence_1_list_new = []

    # Drop pairs whose combined length exceeds the model's input limit.
    for i in zip(sentence_0_list, sentence_1_list):
        if len(i[0]) + len(i[1]) < 1200:
            sentence_0_list_new.append(i[0])
            sentence_1_list_new.append(i[1])
        else:
            print(len(i[0]) + len(i[1]))
            continue
    # Debug: log the lengths of the kept pairs.
    for i in zip(sentence_0_list_new, sentence_1_list_new):
        print("length check", len(i[0]))
        print("length check", len(i[1]))

    paper_dict = biaohong_bert_predict(sentence_0_list_new, sentence_1_list_new)

    # Each row of paper_dict: [original, original marked red, similar, similar marked red]

    original_text = []
    original_text_contrast = []

    for paper_dict_dan_id, sentence_0_dan, sentence_1_dan, sim_paper_name_dan in zip(range(len(paper_dict)), sentence_0_list_new, sentence_1_list_new, sim_paper_name):
        print([sentence_0_dan, sentence_1_dan])
        original_text_contrast_dict = {}
        similar_content = {"author": ""}
        try:
            sentence_0_bool, sentence_0_dan_red = original_text_marked_red(sentence_0_dan, paper_dict[paper_dict_dan_id][0], paper_dict[paper_dict_dan_id][1])  # text_original, bert_text, bert_text_pre
        except Exception:
            print("error", [sentence_0_dan, paper_dict[paper_dict_dan_id][0], paper_dict[paper_dict_dan_id][1]])
            raise
        sentence_1_bool, sentence_1_dan_red = original_text_marked_red(sentence_1_dan, paper_dict[paper_dict_dan_id][2], paper_dict[paper_dict_dan_id][3])  # text_original, bert_text, bert_text_pre

        if not sentence_0_bool or not sentence_1_bool:
            continue
        original_text.append(sentence_0_dan_red)
        original_text_contrast_dict["original_text"] = "此处有 {} 字相似\n".format(
            len(paper_dict[paper_dict_dan_id][1])) + sentence_0_dan_red

        similar_content["content"] = sentence_1_dan_red
        similar_content["title"] = sim_paper_name_dan
        original_text_contrast_dict["similar_content"] = similar_content

        original_text_contrast.append(original_text_contrast_dict)

    original_text = "。".join(original_text)

    return {
        "author": "",
        "check_time": "",
        "section_data": "",
        "section_details": [
            {
                "end_page_index": 0,
                "name": "",
                "original_text": original_text,
                "original_text_contrast": original_text_contrast
            }
        ]
    }

def biaohong(bool_check_sentense, data_zong, df_train_nuoche):
    '''
    Indices to mark red, e.g. [[0,1,2],[3,4,5]]
    :param bool_check_sentense: [paper sentence index, recalled sentence index] hits
    :param data_zong: per-sentence check results (used for bounds)
    :param df_train_nuoche: recalled sentence list (used for bounds)
    :return: list
    '''
    biao_red = []
    i = 0
    start = -1
    end = -1
    while True:
        if i >= len(bool_check_sentense) or bool_check_sentense[i][0] + 1 >= len(data_zong) or bool_check_sentense[i][1] + 1 >= len(df_train_nuoche):
            break
        elif bool_check_sentense[i][0] - 1 == start:
            i += 1
            continue
        elif bool_check_sentense[i][0] == end:
            i += 1
            continue
        elif bool_check_sentense[i][0] - 1 == end:
            i += 1
            continue
        else:
            biao_red.append([[bool_check_sentense[i][0] - 1, bool_check_sentense[i][0], bool_check_sentense[i][0] + 1],
                             [bool_check_sentense[i][1] - 1, bool_check_sentense[i][1], bool_check_sentense[i][1] + 1]])
            start = bool_check_sentense[i][0] - 1
            end = bool_check_sentense[i][0] + 1
            i += 1

    return biao_red

def dialog_line_parse(url, text):
    """
    Send data to a model service for analysis and return the result
    :param url: model service URL
    :param text: payload sent to the model
    :return: the model's response
    """
    response = requests.post(
        url,
        json=text,
        timeout=100000
    )
    if response.status_code == 200:
        return response.json()
    else:
        # logger.error(
        #     "【{}】 Failed to get a proper response from remote "
        #     "server. Status Code: {}. Response: {}"
        #     "".format(url, response.status_code, response.text)
        # )
        print("【{}】 Failed to get a proper response from remote "
              "server. Status Code: {}. Response: {}"
              "".format(url, response.status_code, response.text))
        print(text)
        return []

def is_english_char(char):
    # True for any printable ASCII character (letters, digits, punctuation, space).
    code = ord(char)
    return 32 <= code <= 126
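
# Note the name is broader than "English": is_english_char("a"), ("7") and
# ("!") are all True, while any CJK character is False. The aligner below uses
# it to tolerate mismatched ASCII characters in the predicted span.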

def original_text_marked_red(text_original, bert_text, bert_text_pre):
    '''
    Locate the red-marked span inside the original text
    :param text_original:
    :param bert_text:
    :param bert_text_pre:
    :return:
    '''

    fuhao = ["\n"]
    up_pointer = 0
    down_pointer = 0

    pointer_list = []

    if len(bert_text_pre) > len(bert_text):
        return False, ""

    # Align bert_text_pre (the predicted red span) against bert_text,
    # collecting the matching character positions in pointer_list.
    while True:
        if down_pointer >= len(bert_text_pre):
            break
        elif up_pointer >= len(bert_text):
            # No alignment found; bail out instead of raising IndexError.
            return False, ""
        elif down_pointer == len(bert_text_pre) - 1:
            if bert_text[up_pointer] == bert_text_pre[down_pointer]:
                pointer_list.append(up_pointer)
                break
            else:
                up_pointer += 1
                down_pointer = 0
                pointer_list = []

        elif bert_text[up_pointer] in fuhao:
            up_pointer += 1

        else:
            if bert_text[up_pointer] == bert_text_pre[down_pointer]:
                pointer_list.append(up_pointer)
                up_pointer += 1
                down_pointer += 1
            else:
                if bert_text_pre[down_pointer:down_pointer + 5] == "[UNK]":
                    # Step over the 5-character "[UNK]" placeholder.
                    up_pointer += 1
                    down_pointer += 5
                    pointer_list.append(up_pointer)
                elif is_english_char(bert_text_pre[down_pointer]):
                    # Tolerate mismatched ASCII characters: advance both pointers.
                    up_pointer += 1
                    down_pointer += 1
                    pointer_list.append(up_pointer)
                else:
                    # Mismatch: restart the match from the next position.
                    up_pointer += 1
                    down_pointer = 0
                    pointer_list = []

    if not pointer_list:
        return False, ""

    start = pointer_list[0]
    end = pointer_list[-1]
    bert_text_list = list(bert_text)
    bert_text_list.insert(start, "<red>")
    bert_text_list.insert(end + 2, "</red>")

    text_original_list = list(text_original)

    up = 0
    down = 0

    # Re-insert any characters (e.g. whitespace) that the BERT tokenizer dropped,
    # so the marked text matches the original character-for-character.
    while True:
        if up == len(text_original_list):
            break

        if text_original_list[up] == bert_text_list[down]:
            up += 1
            down += 1

        else:
            if bert_text_list[down] == "<red>":
                down += 1
            elif bert_text_list[down] == "</red>":
                down += 1
            else:
                bert_text_list.insert(down, text_original_list[up])
                up += 1
                down += 1

    bert_text = "".join(bert_text_list)
    return True, bert_text

def biaohong_bert_predict(sentence_0_list, sentence_1_list):
    '''
    Find the characters to mark red
    :param sentence_0_list:
    :param sentence_1_list:
    :return:
    '''
    # "resilt" is the (misspelled) key actually returned by the remote service.
    paper_dict = dialog_line_parse("http://192.168.31.74:16003/", {"sentence_0": sentence_0_list, "sentence_1": sentence_1_list})["resilt"]

    # Each row of paper_dict: [original, original marked red, similar, similar marked red]

    return paper_dict

def ulit_text(title, text):
    data = []
    try:
        # The content may arrive as a JSON document with a "content" field.
        text = json.loads(text)["content"]
    except (ValueError, KeyError, TypeError):
        # Otherwise treat it as plain text.
        pass

    # Split into sentences on the Chinese full stop.
    text = text.strip().replace("\n", "").replace(" ", "").replace("。", "。\n")
    text_list = text.split("\n")

    for i in text_list:
        data.append([i, title])
    return data

def run_query(conn, sql, params):
    with conn.cursor() as cursor:
        cursor.execute(sql, params)
        result = cursor.fetchall()
    return result

def processing_one_text(paper_id):
    conn = pymysql.connect(
        host='192.168.31.145',
        port=3306,
        user='root',
        password='123456',
        db='zhiwang_db',
        charset='utf8mb4',
        cursorclass=pymysql.cursors.DictCursor
    )

    sql = 'SELECT * FROM main_table_paper_detail_message WHERE doc_id=%s'
    params = (paper_id,)

    result = run_query(conn, sql, params)

    conn.close()
    print(result)
    title = result[0]['title']
    author = result[0]['author']
    content_path = result[0]['content']

    # The content column stores a file path; try UTF-8 first, fall back to GBK.
    try:
        with open(content_path, encoding="utf-8") as f:
            text = f.read()
    except UnicodeDecodeError:
        with open(content_path, encoding="gbk") as f:
            text = f.read()

    data = ulit_text(title, text)
    return data

def ulit_recall_paper(recall_data_list_dict):
    '''
    Read and parse the recalled articles
    :param recall_data_list_dict: dict keyed by paper id
    :return data: list [[sentence, filename],[sentence, filename],[sentence, filename]]
    '''
    # An earlier version read the articles from paths on disk:
    # data = []
    # for path in recall_data_list_path:
    #     filename = path.split("/")[-1]
    #     with open(path, encoding="gbk") as f:
    #         text = f.read()
    #     text_list = text.split("\n")
    #     for sentence in text_list:
    #         if sentence != "":
    #             data.append([sentence, filename])
    # return data

    # Only the top five recalled papers are used.
    data = []
    for i in list(recall_data_list_dict.items())[:5]:
        data_one = processing_one_text(i[0])
        data.extend(data_one)

    return data

def recall_10(title, abst_zh, content) -> list:
    '''
    Yupeng's recall API: fetch the ten most similar papers
    :param title:
    :param abst_zh:
    :param content:
    :return:
    '''
    request_json = {
        "title": title,
        "abst_zh": abst_zh,
        "content": content
    }
    paper_dict = dialog_line_parse("http://192.168.31.145:50002/check", request_json)

    return paper_dict

@app.route("/", methods=["POST"])
def handle_query():
    print(request.remote_addr)  # log the caller's address
    title = request.json["title"]
    abst_zh = request.json["abst_zh"]  # plain text abstract
    content = request.json["content"]

    # Call Yupeng's service to retrieve the ten most similar papers
    # recall_data_list_dict = recall_10(title, abst_zh, content)

    # NOTE: eval on a trusted local cache file; json.loads would be safer if the
    # file held valid JSON.
    with open("data/rell_json.txt") as f:
        recall_data_list_dict = eval(f.read())

    # Read the articles and convert them into [[sentence, filename], ...] format
    recall_data_list = ulit_recall_paper(recall_data_list_dict)

    # recall_data_list = pd.read_csv("data/10235513_大型商业建筑人员疏散设计研究_沈福禹/查重.csv", encoding="utf-8").values.tolist()

    # Run the precise duplicate-check stage
    return_list = accurate_check_rouge(content, recall_data_list)

    return_text = {"resilt": return_list, "probabilities": None, "status_code": 200}
    return jsonify(return_text)  # return the result


if __name__ == "__main__":
    app.run(host="0.0.0.0", port=16001, threaded=True, debug=True)