
import os
import numpy as np
from numpy.linalg import norm
import pandas as pd
# from rouge import Rouge
from rouge_chinese import Rouge
from Rouge_w import Rouge_w, Rouge_l
import json
import pymysql
import re
import requests
from flask import Flask, jsonify
from flask import request
import uuid
import time
import redis
from threading import Thread
from multiprocessing import Pool

app = Flask(__name__)
app.config["JSON_AS_ASCII"] = False

pool = redis.ConnectionPool(host='localhost', port=63179, max_connections=100, db=7, password="zhicheng123*")
redis_ = redis.Redis(connection_pool=pool, decode_responses=True)

db_key_query = 'query'
db_key_querying = 'querying'
db_key_queryset = 'queryset'

nums_cpus = 24
rouge = Rouge()
rouge_model = Rouge_w()
rouge_l_model = Rouge_l()
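
# Added note: this module wires a Flask endpoint to a Redis work queue. handle_query
# (at the bottom of the file) enqueues each request under db_key_query; the background
# classify() thread pops requests, recalls candidate papers, runs the ROUGE comparison,
# writes the result JSON to ./new_data_logs/, and stores its path in Redis under the
# request uuid. Host, port and password values above are deployment-specific.
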
def bert_check(text, recall_data_list):
    '''
    BERT-based duplicate check
    :return:
    '''

    sen_0 = [text] * len(recall_data_list)
    sen_1 = [i[0] for i in recall_data_list]

    return_list = []
    request_json = {
        "texts": [sen_0, sen_1],
    }
    paper_dict = dialog_line_parse("http://192.168.31.74:16002/", request_json)
    score_list = paper_dict["res"]

    # to be revised later
    # return_list.append(re1[0][1])
    # return_list.append(re1[0][0])
    if 1 in score_list:
        index_score = score_list.index(1)
    else:
        index_score = "NaN"

    if index_score == "NaN":
        return_list.append(0)
        return_list.append("")
    else:
        return_list.append(1)
        return_list.append(index_score)

    return return_list

def rouge_value_self(data_1, data_2):
    data_1 = [' '.join(i) for i in data_1]
    data_2 = [' '.join(i) for i in data_2]
    rouge_l_list = []

    for sen_1, sen_2 in zip(data_1, data_2):
        sen_1 = sen_1.split(" ")
        sen_2 = sen_2.split(" ")
        rouge_l_score = rouge_l_model.score(sen_1, sen_2)
        rouge_l_list.append(rouge_l_score)

    return "", "", rouge_l_list

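# Illustrative usage (not part of the original flow): rouge_value_self takes two
# equal-length lists of sentences, joins each into space-separated characters, and
# scores them pairwise with Rouge_l, e.g.
#   _, _, scores = rouge_value_self(["今天天气不错"], ["今天天气很好"])
# where scores[i] is the ROUGE-L similarity between data_1[i] and data_2[i].
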
def rouge_pre(text, df_train_nuoche):

    return_list = []
    index_rouge_list = []
    text_list = [text] * len(df_train_nuoche)

    data_list = []
    for data_dan in df_train_nuoche:
        data_list.append(data_dan[0])
    rouge_1, rouge_2, rouge_l = rouge_value_self(text_list, data_list)
    index_rouge_list.extend(rouge_l)

    re1 = [(i[0], i[1]) for i in sorted(list(enumerate(index_rouge_list)), key=lambda x: x[1], reverse=True)]

    return_list.append(re1[0][1])
    return_list.append(re1[0][0])

    return return_list

def rouge_pre_m(text, df_train_nuoche):

    return_list = []
    index_rouge_list = []

    text_list = [text] * len(df_train_nuoche)

    data_list = []
    for data_dan in df_train_nuoche:
        data_list.append(data_dan[0])
    rouge_1, rouge_2, rouge_l = rouge_value_self(text_list, data_list)
    index_rouge_list.extend(rouge_l)

    re1 = [(i[0], i[1]) for i in sorted(list(enumerate(index_rouge_list)), key=lambda x: x[1], reverse=True)]

    return_list.append(re1[0][1])
    return_list.append(re1[0][0])

    return return_list

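# Added note: rouge_pre and rouge_pre_m above are identical; rouge_pre_m is the variant
# dispatched to the multiprocessing Pool in accurate_check_rouge below. For one sentence
# it returns [best_rouge_l_score, index_of_best_matching_recall_sentence].
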
def accurate_check_rouge(
        title,
        author,
        text_paper,
        recall_data_list
):
    '''
    Precise duplicate check: find the similar sentences
    :param text:
    :param recall_data_list: list [[sentence, filename],[sentence, filename],[sentence, filename]]
    :return:
    '''
    # Text preprocessing
    centent_list = []
    text_paper = str(text_paper).replace("。\n", "。")
    centent_list.extend(text_paper.split("。"))
    data_zong = []
    sentence_word_nums = 0

    # Duplicate check with the ROUGE algorithm
    rst = []
    p = Pool(nums_cpus)  # process pool with nums_cpus worker processes

    print("centent_list", centent_list)

    for i in range(len(centent_list)):
        text = centent_list[i]
        a = p.apply_async(rouge_pre_m, args=(text, recall_data_list,))
        rst.append(a)
    p.close()
    p.join()  # wait for all child processes to finish; close() must be called before join(), and no new tasks can be added after close()

    rst = [i.get() for i in rst]

    for i in range(len(rst)):
        print(rst[i])
        data_zong.append(rst[i])

    t0 = time.time()
    # Duplicate check with the BERT model (disabled)
    # for text in centent_list:
    #     bert_pre_list = bert_check(text, recall_data_list)
    #     data_zong.append(bert_pre_list)
    t1 = time.time()
    original_dict = []

    # Find the indices of similar sentences
    bool_check_sentense = []
    # BERT approach
    # for i in range(len(data_zong)):
    #     if data_zong[i][0] == 1:
    #         bool_check_sentense.append([i, data_zong[i][1]])

    # ROUGE approach
    for i in range(len(data_zong)):
        if data_zong[i][0] > 0.47:
            bool_check_sentense.append([i, data_zong[i][1]])
    biao_red = biaohong(bool_check_sentense, data_zong, recall_data_list)  # [[[0, 1, 2], [479, 480, 481]], [[3, 4, 5], [481, 482, 483]], [[6, 7, 8], [484, 485, 486]]]

    print("bert精确查重时间", t1 - t0)

    sentence_0_list = []
    sentence_1_list = []
    sim_paper_name = []

    for i in biao_red:
        if recall_data_list[i[1][0]][1] == recall_data_list[i[1][1]][1] == recall_data_list[i[1][2]][1]:
            sentence_0_list.append("。".join([centent_list[i[0][0]], centent_list[i[0][1]], centent_list[i[0][2]]]))
            sentence_1_list.append("".join([recall_data_list[i[1][0]][0], recall_data_list[i[1][1]][0], recall_data_list[i[1][2]][0]]))
            sim_paper_name.append(recall_data_list[i[1][0]][1])
        else:
            continue

    sentence_0_list_new = []
    sentence_1_list_new = []

    for i in zip(sentence_0_list, sentence_1_list):
        if len(i[0]) + len(i[1]) < 1200:
            sentence_0_list_new.append(i[0])
            sentence_1_list_new.append(i[1])
        else:
            print(len(i[0]) + len(i[1]))
            continue
    t2 = time.time()
    paper_dict = biaohong_bert_predict(sentence_0_list_new, sentence_1_list_new)

    t3 = time.time()
    print("标红时间", t3 - t2)
    original_text = []
    original_text_contrast = []
    repeat_quote_info = []

    chongfuwendang = {}

    for paper_dict_dan_id, sentence_0_dan, sentence_1_dan, sim_paper_name_dan in zip(range(len(paper_dict)), sentence_0_list_new, sentence_1_list_new, sim_paper_name):

        print([sentence_0_dan, sentence_1_dan])
        original_text_contrast_dict = {
            "original_text": "",
            "similar_content": [
                {
                    "content": "",
                    "thesis_info": "",
                    "title": "",
                    "year": "",
                    "degree": "",
                    "author": "",
                }
            ]
        }
        try:
            sentence_0_bool, sentence_0_dan_red = original_text_marked_red(sentence_0_dan, paper_dict[paper_dict_dan_id][0], paper_dict[paper_dict_dan_id][1])  # text_original, bert_text, bert_text_pre
        except:
            print("报错", [sentence_0_dan, paper_dict[paper_dict_dan_id][0], paper_dict[paper_dict_dan_id][1]])
            continue
        # 9/0
        sentence_1_bool, sentence_1_dan_red = original_text_marked_red(sentence_1_dan, paper_dict[paper_dict_dan_id][2], paper_dict[paper_dict_dan_id][3])  # text_original, bert_text, bert_text_pre

        if sentence_0_bool == False or sentence_1_bool == False:
            continue

        dan_sentence_word_nums = len(paper_dict[paper_dict_dan_id][1])
        sentence_word_nums += dan_sentence_word_nums

        original_text.append(sentence_0_dan_red)
        original_text_contrast_dict["original_text"] = "此处有 {} 字相似\n".format(
            dan_sentence_word_nums) + sentence_0_dan_red

        thesis_info = " ".join([sim_paper_name_dan["title"], sim_paper_name_dan["author"], sim_paper_name_dan["degree"], sim_paper_name_dan["year"]])
        original_text_contrast_dict["similar_content"][0]["content"] = sentence_1_dan_red
        original_text_contrast_dict["similar_content"][0]["title"] = sim_paper_name_dan["title"]
        original_text_contrast_dict["similar_content"][0]["author"] = sim_paper_name_dan["author"]
        original_text_contrast_dict["similar_content"][0]["degree"] = sim_paper_name_dan["degree"]
        original_text_contrast_dict["similar_content"][0]["year"] = sim_paper_name_dan["year"]
        original_text_contrast_dict["similar_content"][0]["thesis_info"] = thesis_info

        original_text_contrast.append(original_text_contrast_dict)

        # for i in repeat_quote_info:
        #     if

        if thesis_info not in chongfuwendang:
            chongfuwendang[thesis_info] = {
                "quote": False,
                "thesis_author": sim_paper_name_dan["author"],
                "thesis_date": sim_paper_name_dan["year"],
                "thesis_info": thesis_info,
                "thesis_repeat_rate": (dan_sentence_word_nums / sim_paper_name_dan["paper_len_word"]) * 100,  # round(repetition_rate, 3) * 100
                "thesis_title": sim_paper_name_dan["title"],
                "thesis_link": "",
                "thesis_publish": sim_paper_name_dan["degree"],
                "thesis_repeat_word": dan_sentence_word_nums,
                "thesis_teacher": "",
                "paper_len_word": sim_paper_name_dan["paper_len_word"]
            }
        else:
            chongfuwendang[thesis_info]["thesis_repeat_word"] += dan_sentence_word_nums
            chongfuwendang[thesis_info]["thesis_repeat_rate"] = (chongfuwendang[thesis_info]["thesis_repeat_word"] / chongfuwendang[thesis_info]["paper_len_word"]) * 100

    chongfuwendang = sorted(chongfuwendang.items(),
                            key=lambda x: x[1]["thesis_repeat_rate"], reverse=False)

    for i in range(len(chongfuwendang)):
        repeat_paper_one_info_dict = chongfuwendang[i][1]
        repeat_paper_one_info_dict.pop("paper_len_word")
        repeat_paper_one_info_dict["thesis_repeat_rate"] = str(round(repeat_paper_one_info_dict["thesis_repeat_rate"], 1)) + "%"
        repeat_quote_info.append(repeat_paper_one_info_dict)

    original_text = "。".join(original_text)

    repetition_rate = sentence_word_nums / len(text_paper)
    repetition_rate = round(repetition_rate, 3) * 100

    format = '%Y-%m-%d %H:%M:%S'
    value = time.localtime(int(time.time()))
    dt = time.strftime(format, value)

    return {
        "author": author,
        "check_time": dt,
        "title": title,
        "time_range": "1900-01-01至2023-08-08",
        "section_data": [
            {
                "oneself_repeat_words": sentence_word_nums,
                "reference_repeat_words": sentence_word_nums,
                "section_name": "第1部分",
                "section_oneself_rate": "{}%".format(repetition_rate),
                "section_repeat_rate": "{}%".format(repetition_rate),
                "section_repeat_words": sentence_word_nums,
                "section_words": len(text_paper)
            }
        ],
        "section_details": [
            {
                "end_page_index": 0,
                "name": "",
                "repeat_rate": "",
                "repeat_words": "",
                "words": "",
                "original_text": original_text,
                "original_text_oneself": original_text,
                "original_text_contrast": original_text_contrast,
                "repeat_quote_info": repeat_quote_info
            }
        ],
        "total_data": {
            "back_repeat_words": "",
            "exclude_personal_rate": "{}%".format(repetition_rate),
            "exclude_quote_rate": "{}%".format(repetition_rate),
            "foot_end_note": "0",
            "front_repeat_words": "",
            "single_max_rate": "",
            "single_max_repeat_words": "",
            "suspected_paragraph": "1",
            "suspected_paragraph_max_repeat_words": "",
            "suspected_paragraph_min_repeat_words": "",
            "tables": "0",
            "total_paragraph": "1",
            "total_repeat_rate": "{}%".format(repetition_rate),
            "total_repeat_words": sentence_word_nums,
            "total_words": len(text_paper)
        }
    }

def biaohong(bool_check_sentense, data_zong, df_train_nuoche):
    '''
    Indices of sentences to be marked red, e.g. [[0,1,2],[3,4,5]]
    :param bool_check_sentense:
    :return: list
    '''
    biao_red = []
    i = 0
    start = -1
    end = -1
    while True:
        if i >= len(bool_check_sentense) or bool_check_sentense[i][0] + 1 >= len(data_zong) or bool_check_sentense[i][1] + 1 >= len(df_train_nuoche):
            break
        elif bool_check_sentense[i][0] - 1 == start:
            i += 1
            continue
        elif bool_check_sentense[i][0] == end:
            i += 1
            continue
        elif bool_check_sentense[i][0] - 1 == end:
            i += 1
            continue
        else:
            biao_red_dan = []
            biao_red_dan.append([bool_check_sentense[i][0] - 1, bool_check_sentense[i][1] - 1])
            biao_red_dan.append([bool_check_sentense[i][0], bool_check_sentense[i][1]])
            biao_red_dan.append([bool_check_sentense[i][0] + 1, bool_check_sentense[i][1] + 1])
            biao_red.append([[bool_check_sentense[i][0] - 1, bool_check_sentense[i][0], bool_check_sentense[i][0] + 1],
                             [bool_check_sentense[i][1] - 1, bool_check_sentense[i][1], bool_check_sentense[i][1] + 1]])
            start = bool_check_sentense[i][0] - 1
            end = bool_check_sentense[i][0] + 1
            i += 1

    return biao_red

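# Example shape (taken from the call site in accurate_check_rouge): biaohong expands each
# matched sentence pair into a window of three consecutive indices on both sides, e.g.
#   [[[0, 1, 2], [479, 480, 481]], [[3, 4, 5], [481, 482, 483]]]
# where the first triple indexes sentences of the submitted paper and the second triple
# the matching sentences in the recalled corpus.
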
def dialog_line_parse(url, text):
    """
    Send the data to a model service for analysis and return its output
    :param url: model service url
    :param text: data passed to the model
    :return: result returned by the model
    """

    response = requests.post(
        url,
        json=text,
        timeout=100000
    )
    if response.status_code == 200:
        return response.json()
    else:
        # logger.error(
        #     "【{}】 Failed to get a proper response from remote "
        #     "server. Status Code: {}. Response: {}"
        #     "".format(url, response.status_code, response.text)
        # )
        print("【{}】 Failed to get a proper response from remote "
              "server. Status Code: {}. Response: {}"
              "".format(url, response.status_code, response.text))
        print(text)
        return {}

def is_english_char(char):
    code = ord(char)
    return 32 <= code <= 126

def original_text_marked_red(text_original, bert_text, bert_text_pre):
    '''
    Locate the span of the original text to be marked red
    :param text_original:
    :param bert_text:
    :param bert_text_pre:
    :return:
    '''

    fuhao = ["\n"]
    up_pointer = 0
    down_pointer = 0

    pointer_list = []

    if len(bert_text_pre) > len(bert_text):
        return False, ""

    while True:
        if down_pointer >= len(bert_text_pre):
            break
        elif down_pointer == len(bert_text_pre) - 1:
            if bert_text[up_pointer] == bert_text_pre[down_pointer]:
                pointer_list.append(up_pointer)
                break
            else:
                up_pointer += 1
                down_pointer = 0
                pointer_list = []

        elif bert_text[up_pointer] in fuhao:
            up_pointer += 1

        else:
            if bert_text[up_pointer] == bert_text_pre[down_pointer]:
                pointer_list.append(up_pointer)
                up_pointer += 1
                down_pointer += 1
            else:
                if bert_text_pre[down_pointer:down_pointer + 5] == "[UNK]":
                    up_pointer += 1
                    down_pointer += 5
                    pointer_list.append(up_pointer)
                elif is_english_char(bert_text_pre[down_pointer]) == True:
                    up_pointer += 1
                    down_pointer += 1
                    pointer_list.append(up_pointer)
                else:
                    up_pointer += 1
                    down_pointer = 0
                    pointer_list = []

    start = pointer_list[0]
    end = pointer_list[-1]
    bert_text_list = list(bert_text)
    bert_text_list.insert(start, "<red>")
    bert_text_list.insert(end + 2, "</red>")

    text_original_list = list(text_original)

    up = 0
    down = 0

    while True:
        if up == len(text_original_list):
            break

        if text_original_list[up] == bert_text_list[down]:
            up += 1
            down += 1

        else:
            if bert_text_list[down] == "<red>":
                down += 1
            elif bert_text_list[down] == "</red>":
                down += 1
            else:
                bert_text_list.insert(down, text_original_list[up])
                up += 1
                down += 1

    bert_text = "".join(bert_text_list)
    return True, bert_text

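# Illustrative behaviour (hypothetical inputs, inferred from the logic above):
#   original_text_marked_red("今天天气不错", "今天天气不错", "气不错")
# returns (True, "今天天<red>气不错</red>"): the span of bert_text aligned with
# bert_text_pre is wrapped in <red>...</red>, and characters present in text_original
# but missing from bert_text are re-inserted by the second loop.
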
def biaohong_bert_predict(sentence_0_list, sentence_1_list):
    '''
    Find the characters to be marked red
    :param bool_check_sentense:
    :return:
    '''

    # sentence_0_list = []
    # sentence_1_list = []
    # sim_paper_name = []
    #
    # for i in biaohong_list:
    #     sentence_0_list.append("。".join([paper_list[i[0][0]], paper_list[i[0][1]], paper_list[i[0][2]]]))
    #     sentence_1_list.append("。".join([recall_data_list[i[1][1]], recall_data_list[i[1][1]], recall_data_list[i[1][2]]]))

    paper_dict = dialog_line_parse("http://192.168.31.74:16003/", {"sentence_0": sentence_0_list, "sentence_1": sentence_1_list})["resilt"]

    # paper_dict
    # print("原文:".format(i), paper_dict[i][0])
    # print("原文标红:".format(i), paper_dict[i][1])
    # print("相似:".format(i), paper_dict[i][2])
    # print("相似标红:".format(i), paper_dict[i][3])

    # original_text
    #
    # for paper_dict_dan, sentence_0_dan, sentence_1_dan in zip(paper_dict, sentence_0_list, sentence_1_list):
    #     original_text_marked_red

    return paper_dict

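# Shape of paper_dict (inferred from the commented prints above and the indexing in
# accurate_check_rouge): paper_dict[i] is
#   [original_sentence, original_matched_fragment, similar_sentence, similar_matched_fragment],
# so indices 0/1 feed original_text_marked_red for the submitted paper and 2/3 for the
# recalled paper.
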
def ulit_text(title, text):
    data = []
    try:
        text = json.loads(text)["content"]
    except:
        pass

    text = text.strip().replace("\n", "").replace(" ", "").replace("。", "。\n")
    text_list = text.split("\n")

    for i in text_list:
        data.append([i, title])
    return data

def run_query(conn, sql, params):
    with conn.cursor() as cursor:
        cursor.execute(sql, params)
        result = cursor.fetchall()
    return result

def processing_one_text(paper_id):
    conn = pymysql.connect(
        host='192.168.31.145',
        port=3306,
        user='root',
        password='123456',
        db='zhiwang_db',
        charset='utf8mb4',
        cursorclass=pymysql.cursors.DictCursor
    )

    sql = 'SELECT * FROM main_table_paper_detail_message WHERE doc_id=%s'
    params = (paper_id,)

    result = run_query(conn, sql, params)

    conn.close()
    print(result[0]['title'], result[0]['author'])
    title = result[0]['title']
    author = result[0]['author']
    degree = result[0]['degree']
    year = result[0]['content'].split("/")[5]
    content_path = result[0]['content']

    try:
        with open(content_path, encoding="utf-8") as f:
            text = f.read()
    except:
        with open(content_path, encoding="gbk") as f:
            text = f.read()

    paper_info = {
        "title": title,
        "author": author,
        "degree": degree,
        "year": year,
        "paper_len_word": len(text)
    }
    data = ulit_text(paper_info, text)
    return data

def ulit_recall_paper(recall_data_list_dict):
    '''
    Read and parse the recalled papers from their returned paths
    :param recall_data_list_path:
    :return data: list [[sentence, filename],[sentence, filename],[sentence, filename]]
    '''

    # data = []
    # for path in recall_data_list_path:
    #     filename = path.split("/")[-1]
    #     with open(path, encoding="gbk") as f:
    #         text = f.read()
    #     text_list = text.split("\n")
    #     for sentence in text_list:
    #         if sentence != "":
    #             data.append([sentence, filename])
    # return data

    data = []
    for i in list(recall_data_list_dict.items())[:5]:
        data_one = processing_one_text(i[0])
        data.extend(data_one)

    return data

def recall_10(title, abst_zh, content) -> dict:
    '''
    Yupeng's recall interface: retrieve candidate papers
    :param paper_name:
    :return:
    '''

    request_json = {
        "title": title,
        "abst_zh": abst_zh,
        "content": content
    }
    paper_dict = dialog_line_parse("http://192.168.31.145:50004/check", request_json)

    return paper_dict

def uilt_content(content):
    zhaiyao_list = ["摘要"]
    zhaiyao_en_list = ["Abstract", "abstract"]
    mulu_list = ["目录"]
    key_word_list = ["关键词"]
    key_word_bool = False
    key_word_str = ""
    zhaiyao_bool = False
    zhaiyao_en_bool = False
    zhaiyao_str = ""
    zhaiyao_en_str = ""
    mulu_str = ""
    zhaiyao_text = ""
    mulu_bool = False

    for i in zhaiyao_list:
        if i in content:
            zhaiyao_bool = True
            zhaiyao_str = i
            break

    for i in zhaiyao_en_list:
        if i in content:
            zhaiyao_en_bool = True
            zhaiyao_en_str = i
            break

    for i in mulu_list:
        if i in content:
            mulu_str = i
            mulu_bool = True
            break

    for i in key_word_list:
        if i in content:
            key_word_str = i
            key_word_bool = True
            break

    if zhaiyao_bool == True and zhaiyao_en_bool == True:
        pantten_zhaiyao = "{}(.*?){}".format(zhaiyao_str, zhaiyao_en_str)
        result_biaoti_list = re.findall(pantten_zhaiyao, content)
        zhaiyao_text = result_biaoti_list[0]

    elif zhaiyao_bool == True and key_word_bool == True:
        pantten_zhaiyao = "{}(.*?){}".format(zhaiyao_str, key_word_str)
        result_biaoti_list = re.findall(pantten_zhaiyao, content)
        zhaiyao_text = result_biaoti_list[0]

    elif zhaiyao_bool == True and mulu_bool == True:
        pantten_zhaiyao = "{}(.*?){}".format(zhaiyao_str, mulu_str)
        result_biaoti_list = re.findall(pantten_zhaiyao, content)
        zhaiyao_text = result_biaoti_list[0]

    return zhaiyao_text

def ulit_request_file(file):
    file_name = file.filename
    if file_name.split(".")[-1] == "txt":
        file_name_save = "data/request/{}".format(file_name)
        file.save(file_name_save)
        try:
            with open(file_name_save, encoding="gbk") as f:
                content = f.read()
        except:
            with open(file_name_save, encoding="utf-8") as f:
                content = f.read()

        content = content.strip().replace("\n", "").replace(" ", "")
        abst_zh = uilt_content(content)

    return abst_zh, content

# @app.route("/", methods=["POST"])
# def handle_query():
#     print(request.remote_addr)
#
#     # request.form.get('prompt')
#     dataBases = request.form.get("dataBases")
#     minSimilarity = request.form.get("minSimilarity")  # txt
#     minWords = request.form.get("minWords")
#     title = request.form.get("title")
#     author = request.form.get("author")  # txt
#     file = request.files.get('file')
#     token = request.form.get("token")
#     account = request.form.get("account")
#     goodsId = request.form.get("goodsId")
#     callbackUrl = request.form.get("callbackUrl")
#
#     t0 = time.time()
#     abst_zh, content = ulit_request_file(file)
#
#     # Call Yupeng's recall service for the ten similar papers
#     # recall_data_list_dict = recall_10(title, abst_zh, content)
#
#     t1 = time.time()
#     print("查找相似的50篇完成")
#     with open("data/rell_json.txt") as f:
#         recall_data_list_dict = eval(f.read())
#
#     # Read the recalled papers and convert them into the expected format
#     recall_data_list = ulit_recall_paper(recall_data_list_dict)
#     print("文章格式转化完成")
#
#     # recall_data_list = pd.read_csv("data/10235513_大型商业建筑人员疏散设计研究_沈福禹/查重.csv", encoding="utf-8").values.tolist()
#
#     # Enter the precise duplicate-check stage
#     print("进入精确查重系统")
#     return_list = accurate_check_rouge(title, author, content, recall_data_list)
#
#     print("召回50篇", t1 - t0)
#
#     return_text = {"resilt": return_list, "probabilities": None, "status_code": 200}
#     return jsonify(return_text)  # return the result

def classify():  # invoke the model; set the maximum batch_size
    while True:
        if redis_.llen(db_key_query) == 0:  # if the queue is empty, keep polling
            time.sleep(3)
            continue
        query = redis_.lpop(db_key_query).decode('UTF-8')  # fetch the query text
        data_dict_path = json.loads(query)
        path = data_dict_path['path']
        # text_type = data_dict["text_type"]

        with open(path, encoding='utf8') as f1:
            # load the request object from the file
            data_dict = json.load(f1)

        query_id = data_dict['id']
        print(query_id)
        dataBases = data_dict['dataBases']
        minSimilarity = data_dict['minSimilarity']
        minWords = data_dict['minWords']
        title = data_dict['title']
        author = data_dict['author']
        abst_zh = data_dict['abst_zh']
        content = data_dict['content']
        token = data_dict['token']
        account = data_dict['account']
        goodsId = data_dict['goodsId']
        callbackUrl = data_dict['callbackUrl']

        # Call Yupeng's recall service for the ten similar papers
        # recall_data_list_dict = recall_10(title, abst_zh, content)

        t1 = time.time()
        print("查找相似的50篇完成")
        with open("data/rell_json.txt") as f:
            recall_data_list_dict = eval(f.read())

        # Read the recalled papers and convert them into the expected format
        recall_data_list = ulit_recall_paper(recall_data_list_dict)
        print("文章格式转化完成")

        # recall_data_list = pd.read_csv("data/10235513_大型商业建筑人员疏散设计研究_沈福禹/查重.csv", encoding="utf-8").values.tolist()

        # Enter the precise duplicate-check stage
        print("进入精确查重系统")
        return_list = accurate_check_rouge(title, author, content, recall_data_list)

        return_text = {"resilt": return_list, "probabilities": None, "status_code": 200}

        load_result_path = "./new_data_logs/{}.json".format(query_id)

        print("query_id: ", query_id)
        print("load_result_path: ", load_result_path)

        with open(load_result_path, 'w', encoding='utf8') as f2:
            # ensure_ascii=False is needed to write Chinese characters; otherwise they are escaped as Unicode
            # indent=4: pretty-print the JSON
            json.dump(return_text, f2, ensure_ascii=False, indent=4)

        print(query_id)
        print(load_result_path)
        redis_.set(query_id, load_result_path, 86400)
        redis_.srem(db_key_querying, query_id)

@app.route("/", methods=["POST"])
def handle_query():
    try:
        print(request.remote_addr)

        # request.form.get('prompt')
        dataBases = request.form.get("dataBases")
        minSimilarity = request.form.get("minSimilarity")  # txt
        minWords = request.form.get("minWords")
        title = request.form.get("title")
        author = request.form.get("author")  # txt
        file = request.files.get('file')
        token = request.form.get("token")
        account = request.form.get("account")
        goodsId = request.form.get("goodsId")
        callbackUrl = request.form.get("callbackUrl")

        abst_zh, content = ulit_request_file(file)

        id_ = str(uuid.uuid1())  # generate a unique id for the query
        print("uuid: ", id_)
        print(id_)
        d = {
            'id': id_,
            'dataBases': dataBases,
            'minSimilarity': minSimilarity,
            'minWords': minWords,
            'title': title,
            'author': author,
            'abst_zh': abst_zh,
            'content': content,
            'token': token,
            'account': account,
            'goodsId': goodsId,
            'callbackUrl': callbackUrl
        }

        # bind the request data to the query id
        print(d)
        load_request_path = './request_data_logs/{}.json'.format(id_)
        with open(load_request_path, 'w', encoding='utf8') as f2:
            # ensure_ascii=False is needed to write Chinese characters; otherwise they are escaped as Unicode
            # indent=4: pretty-print the JSON
            json.dump(d, f2, ensure_ascii=False, indent=4)
        redis_.rpush(db_key_query, json.dumps({"id": id_, "path": load_request_path}))  # push onto the redis queue
        redis_.sadd(db_key_querying, id_)
        redis_.sadd(db_key_queryset, id_)
        return_text = {
            'code': 0,
            'msg': "请求成功",
            'data': {
                'balances': "",
                'orderId': id_,
                'consumeNum': ""
            }
        }

        print("ok")
    except:
        return_text = {'code': 1}
    return jsonify(return_text)  # return the result

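# Illustrative client call (hypothetical file name and form values; field names and port
# are taken from handle_query above and app.run below):
#   import requests
#   with open("paper.txt", "rb") as fh:
#       resp = requests.post(
#           "http://127.0.0.1:16001/",
#           data={"dataBases": "", "minSimilarity": "", "minWords": "", "title": "示例论文",
#                 "author": "张三", "token": "", "account": "", "goodsId": "", "callbackUrl": ""},
#           files={"file": fh},
#       )
#   # resp.json()["data"]["orderId"] is the query id; classify() later stores the path of
#   # the result JSON in Redis under this id.
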
t = Thread(target=classify)
t.start()

if __name__ == "__main__":
    app.run(host="0.0.0.0", port=16001, threaded=True, debug=True)