Add multi-passage detection for single sentences

master
majiahui@haimaqingfan.com 2 years ago
parent commit 37e4f2e0d2

flask_check_bert_test.py (345 changed lines)

@@ -20,12 +20,14 @@ from multiprocessing import Pool
app = Flask(__name__)
app.config["JSON_AS_ASCII"] = False
pool = redis.ConnectionPool(host='localhost', port=63179, max_connections=100, db=7, password="zhicheng123*")
pool = redis.ConnectionPool(host='localhost', port=63179, max_connections=100, db=8, password="zhicheng123*")
redis_ = redis.Redis(connection_pool=pool, decode_responses=True)
db_key_query = 'query'
db_key_querying = 'querying'
db_key_queryset = 'queryset'
db_key_query = 'query_check_task'
db_key_querying = 'querying_check_task'
db_key_queryset = 'queryset_check_task'
db_key_query_recall = 'query_recall'
nums_cpus = 24
rouge = Rouge()
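The pool moves to db 8 and the task keys gain a _check_task suffix; query_recall is new and carries the recall service's results back in. A minimal sketch of what a producer on that list might look like, assuming the payload shape that classify_accurate_check() reads further down:

import json
import redis

r = redis.Redis(host="localhost", port=63179, db=8, password="zhicheng123*")

def publish_recall_result(uuid_, recall_data_list_dict):
    # the worker pops this from db_key_query_recall and joins it with the
    # request stored under "<uuid>_request_check"
    r.rpush("query_recall", json.dumps({
        "uuid": uuid_,
        "data": json.dumps(recall_data_list_dict),
    }))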
@@ -101,6 +103,9 @@ def rouge_pre(text, df_train_nuoche):
def rouge_pre_m(text, df_train_nuoche):
return_list = []
index_rouge_list = []
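rouge_pre_m is the multi-match variant behind the commit title: rather than keeping only the best-scoring recalled sentence, it collects a score for every candidate per sentence (the three added lines are elided in this view). A hypothetical reconstruction of that pattern, assuming the rouge package and space-tokenized Chinese text:

from rouge import Rouge

rouge = Rouge()

def rouge_pre_m_sketch(text, recalled_sentences):
    # score one local sentence against every recalled sentence and keep
    # all (index, score) pairs instead of a single argmax
    hyp = " ".join(text)  # rouge expects whitespace-separated tokens
    return [
        (idx, rouge.get_scores(hyp, " ".join(ref))[0]["rouge-l"]["f"])
        for idx, ref in enumerate(recalled_sentences)
    ]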
@@ -266,14 +271,17 @@ def total_data_func(section_data_list):
repeat_words += i["repeat_words"]
words += i["words"]
exclude_personal_rate = str(repeat_words / words * 100) + "%"
exclude_quote_rate = str(repeat_words / words * 100) + "%"
baifenbi = (repeat_words / words) *100
exclude_personal_rate = str(round(baifenbi, 1)) + "%"
exclude_quote_rate = str(round(baifenbi, 1)) + "%"
single_max_rate = section_data_list[0]["repeat_quote_info"][0]["thesis_repeat_rate"]
single_max_repeat_words = section_data_list[0]["repeat_quote_info"][0]["thesis_repeat_word"]
total_repeat_rate = str(repeat_words / words * 100) + "%"
total_repeat_rate = str(round(baifenbi, 1)) + "%"
total_repeat_words = repeat_words
total_words = words
print(exclude_personal_rate)
return {
"back_repeat_words": "",
"exclude_personal_rate": exclude_personal_rate,
@@ -353,7 +361,7 @@ def section_data_func(section_details):
}
def section_details_func(data_section_dan, paper_dict):
def section_details_func(data_section_dan, paper_dict, num_words):
'''
Section detail information
:param original_text_contrast:
@@ -363,21 +371,22 @@ def section_details_func(data_section_dan, paper_dict):
original_text_contrast = []
section_repeat_rate = ""
repeat_words = 0
section_words = 0
section_words = num_words
oneself_repeat_words = ""
reference_repeat_words = ""
section_oneself_rate = ""
original_text_list = []
for sentence_dan in data_section_dan:
print("sentence_dan", sentence_dan)
original_text_contrast_dan = original_text_contrast_func(sentence_dan, paper_dict)
original_text_contrast.append(original_text_contrast_dan)
repeat_words += original_text_contrast_dan["dan_sentence_word_nums"]
original_text_list.append(original_text_contrast_dan["original_text"])
section_words += len(sentence_dan[0][1])
original_text = "".join(original_text_list)
repeat_rate = repeat_words / section_words
repeat_rate = (repeat_words / section_words) * 100
repeat_rate = str(round(repeat_rate, 1)) + "%"
repeat_quote_info = repeat_quote_info_func(original_text_contrast)
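Seeding section_words with num_words (the full character count, passed in by the caller) instead of summing only matched sentences changes what the denominator means. A worked comparison with hypothetical numbers:

# 120 repeated characters, 400 characters of matched sentences,
# 10,000 characters in the whole paper (all numbers hypothetical)
old_rate = 120 / 400 * 100     # 30.0% -- against matched sentences only
new_rate = 120 / 10000 * 100   #  1.2% -- against the whole document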
@@ -395,6 +404,108 @@ def section_details_func(data_section_dan, paper_dict):
}
def check_dict(similar_content_control, paper_dict, num_words, title, author):
'''
Build the response dictionary
:param similar_content_control:
:param paper_dict:
:param num_words:
:param title:
:param author:
:return:
'''
if paper_dict != []:
data = [similar_content_control]
# simulate multiple sections
section_details_list = []
for data_dan in data:
data_section_dan = data_dan
# section details
section_details = section_details_func(data_section_dan, paper_dict, num_words)
section_details_list.append(section_details)
# simulate multiple sections
section_data_list = []
for section_details in section_details_list:
section_data = section_data_func(section_details)
section_data_list.append(section_data)
total_data = total_data_func(section_details_list)
format = '%Y-%m-%d %H:%M:%S'
value = time.localtime(int(time.time()))
dt = time.strftime(format, value)
paper_data = {
"author": author,
"check_time": dt,
"time_range": "1900-01-01至2023-08-08",
"title": title,
"total_data": total_data,
"section_data": section_data_list,
"section_details": section_details_list
}
else:
total_data = {
"back_repeat_words": "",
"exclude_personal_rate": 0,
"exclude_quote_rate": 0,
"front_repeat_words": "",
"single_max_rate": 0,
"single_max_repeat_words": 0,
"suspected_paragraph": "",
"suspected_paragraph_max_repeat_words": "",
"suspected_paragraph_min_repeat_words": "",
"total_paragraph": "",
"total_repeat_rate": 0,
"total_repeat_words": 0,
"total_words": num_words,
"tables": 0
}
section_data_list = [{
"section_name": "第一部分",
"section_repeat_rate": 0,
"section_repeat_words": 0,
"section_words": num_words,
"oneself_repeat_words": 0,
"reference_repeat_words": 0,
"section_oneself_rate": 0
}]
section_details_list = [
{
"end_page_index": 0,
"name": "第1部分",
"repeat_rate": 0,
"repeat_words": 0,
"start_page_index": 0,
"words": num_words,
"original_text": "",
"original_text_oneself": "",
"original_text_contrast": [],
"repeat_quote_info": []
}
]
format = '%Y-%m-%d %H:%M:%S'
value = time.localtime(int(time.time()))
dt = time.strftime(format, value)
paper_data = {
"author": author,
"check_time": dt,
"time_range": "1900-01-01至2023-08-08",
"title": title,
"total_data": total_data,
"section_data": section_data_list,
"section_details": section_details_list
}
return paper_data
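check_dict pulls the report assembly out of accurate_check_rouge and adds an explicit zero-report branch for paper_dict == []. A hedged usage sketch of that branch (title, author, and the character count are hypothetical values):

empty_report = check_dict(
    similar_content_control=[[]],  # no aligned matches
    paper_dict=[],                 # triggers the all-zero branch
    num_words=8421,                # hypothetical character count
    title="示例论文",
    author="张三",
)
assert empty_report["total_data"]["total_words"] == 8421
assert empty_report["section_data"][0]["section_words"] == 8421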
def accurate_check_rouge(
title,
author,
@@ -408,18 +519,27 @@ def accurate_check_rouge(
:return:
'''
# text preprocessing
centent_list = []
# centent_list = []
print("text_paper", len(text_paper))
text_paper = str(text_paper).replace("\n", "")
centent_list.extend(text_paper.split("。"))
centent_list_old = text_paper.split("。")
data_zong = []
sentence_word_nums = 0
# duplicate check with the ROUGE algorithm
rst = []
p = Pool(nums_cpus)  # process pool with n child processes
p = Pool(nums_cpus)  # process pool with n child processes
print("centent_list", centent_list)
# print("centent_list", centent_list)
num_words = 0
centent_list = []
for i in centent_list_old:
num_words += len(i)
if len(i) < 300:
centent_list.append(i)
print("num_words", num_words)
for i in range(len(centent_list)):
text = centent_list[i]
a = p.apply_async(rouge_pre_m, args=(text, recall_data_list,))
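Splitting, counting, and filtering now happen in one pass: num_words counts every character of every sentence, while sentences of 300+ characters (usually segmentation artifacts such as tables or reference lists) are excluded from scoring. The same preprocessing isolated as a sketch, assuming "。" as the sentence delimiter:

def split_for_check(text_paper, max_len=300):
    text_paper = str(text_paper).replace("\n", "")
    sentences = text_paper.split("。")
    num_words = sum(len(s) for s in sentences)      # counted before filtering
    checkable = [s for s in sentences if len(s) < max_len]
    return num_words, checkable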
@@ -430,7 +550,7 @@
rst = [i.get() for i in rst]
for i in range(len(rst)):
print(rst[i])
# print(rst[i])
data_zong.append(rst[i])
t0 = time.time()
@@ -453,6 +573,7 @@
for j in range(len(data_zong[i])):
if data_zong[i][j][1] > 0.47:
bool_check_sentense.append([i, data_zong[i][j][0]])
biao_red = biaohong(bool_check_sentense, data_zong,
recall_data_list) # [[[0, 1, 2], [479, 480, 481]], [[3, 4, 5], [481, 482, 483]], [[6, 7, 8], [484, 485, 486]]]
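Each row of data_zong holds (recalled_index, rouge_score) pairs for one local sentence; anything above 0.47 is flagged and handed to biaohong(), which groups hits into aligned index windows (see the inline comment). The thresholding step on toy data, as a sketch:

SIM_THRESHOLD = 0.47  # cutoff used in this hunk

# toy data: per local sentence, a list of (recalled_index, rouge_score)
data_zong = [[(479, 0.61), (480, 0.12)], [(481, 0.30)]]

bool_check_sentense = [
    [i, ref_idx]
    for i, matches in enumerate(data_zong)
    for ref_idx, score in matches
    if score > SIM_THRESHOLD
]
# -> [[0, 479]]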
@@ -483,7 +604,11 @@
print(len(i[0]) + len(i[1]))
continue
t2 = time.time()
paper_dict = biaohong_bert_predict(sentence_0_list_new, sentence_1_list_new)
if sentence_0_list_new == sentence_1_list_new == []:
paper_dict = []
else:
paper_dict = biaohong_bert_predict(sentence_0_list_new, sentence_1_list_new)
t3 = time.time()
print("标红时间", t3 - t2)
@@ -498,9 +623,8 @@
print("sentence_1_list_new", sentence_1_list_new)
print("sim_paper_name", sim_paper_name)
similar_content_control = [[]]
with open("data/10235513_大型商业建筑人员疏散设计研究_沈福禹/paper_dict.json", "w") as f:
json.dump(paper_dict, f, ensure_ascii=False)
# with open("data/10235513_大型商业建筑人员疏散设计研究_沈福禹/paper_dict.json", "w") as f:
# json.dump(paper_dict, f, ensure_ascii=False)
sentence_0_list_new_cursor = sentence_0_list_new[0]
for paper_dict_dan_id, sentence_0_dan, sentence_1_dan, sim_paper_name_dan in zip(range(len(paper_dict)),
@@ -513,38 +637,40 @@
else:
similar_content_control[-1].append([paper_dict_dan_id, sentence_0_dan, sentence_1_dan, sim_paper_name_dan])
data = [similar_content_control]
# simulate multiple sections
section_details_list = []
for data_dan in data:
data_section_dan = data_dan
# section details
section_details = section_details_func(data_section_dan, paper_dict)
section_details_list.append(section_details)
# simulate multiple sections
section_data_list = []
for section_details in section_details_list:
section_data = section_data_func(section_details)
total_data = total_data_func(section_details_list)
format = '%Y-%m-%d %H:%M:%S'
value = time.localtime(int(time.time()))
dt = time.strftime(format, value)
paper_data = {
"author": author,
"check_time": dt,
"time_range": "1900-01-01至2023-08-08",
"title": title,
"total_data": total_data,
"section_data": section_data_list,
"section_details": section_details_list
}
paper_data = check_dict(similar_content_control, paper_dict, num_words, title, author)
# data = [similar_content_control]
#
# # simulate multiple sections
# section_details_list = []
# for data_dan in data:
# data_section_dan = data_dan
#
# # section details
# section_details = section_details_func(data_section_dan, paper_dict, num_words)
# section_details_list.append(section_details)
#
# # simulate multiple sections
#
# section_data_list = []
# for section_details in section_details_list:
# section_data = section_data_func(section_details)
# section_data_list.append(section_data)
#
# total_data = total_data_func(section_details_list)
#
# format = '%Y-%m-%d %H:%M:%S'
# value = time.localtime(int(time.time()))
# dt = time.strftime(format, value)
#
# paper_data = {
# "author": author,
# "check_time": dt,
# "time_range": "1900-01-01至2023-08-08",
# "title": title,
# "total_data": total_data,
# "section_data": section_data_list,
# "section_details": section_details_list
# }
return paper_data
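With assembly delegated to check_dict, accurate_check_rouge reduces to: split and filter the paper, score sentences with ROUGE in a process pool, align and highlight with BERT, then build the report. A hedged invocation sketch (parameter names after title and author are inferred from the function body, since the signature is truncated in this view; the input file is hypothetical):

with open("paper.txt", encoding="utf-8") as f:   # hypothetical input file
    text_paper = f.read()

# recall_data_list is the sentence table produced by ulit_recall_paper() below
report = accurate_check_rouge(
    "大型商业建筑人员疏散设计研究",  # title
    "沈福禹",                        # author
    text_paper,
    recall_data_list,
)
print(report["total_data"]["total_repeat_rate"])  # e.g. '23.1%'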
@@ -801,14 +927,14 @@ def ulit_recall_paper(recall_data_list_dict):
# return data
data = []
for i in list(recall_data_list_dict.items())[:10]:
for i in list(recall_data_list_dict.items()):  # process every recalled paper, not just the first 10
data_one = processing_one_text(i[0])
data.extend(data_one)
return data
def recall_10(queue_uuid, title, abst_zh, content) -> dict:
def recall_10(queue_uuid, title, abst_zh, content):
'''
Yupeng's recall interface
:param paper_name:
@@ -821,9 +947,9 @@ def recall_10(queue_uuid, title, abst_zh, content) -> dict:
"abst_zh": abst_zh,
"content": content
}
paper_dict = dialog_line_parse("http://192.168.31.145:50004/check", request_json)
print(request_json)
da = dialog_line_parse("http://192.168.31.145:50004/check", request_json)
return da
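recall_10 now fires the recall request and ignores the synchronous response; results return asynchronously through the query_recall list. dialog_line_parse is presumably a thin JSON-POST helper along these lines (an assumption, not this repo's actual implementation):

import requests

def dialog_line_parse(url, request_json):
    # assumed shape: synchronous JSON POST returning parsed JSON
    resp = requests.post(url, json=request_json, timeout=600)
    resp.raise_for_status()
    return resp.json()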
def uilt_content(content):
@@ -831,6 +957,7 @@ def uilt_content(content):
zhaiyao_en_list = ["Abstract", "abstract"]
mulu_list = ["目录"]
key_word_list = ["关键词"]
caikanwenxian = ["参考文献"]
key_word_bool = False
key_word_str = ""
zhaiyao_bool = False
@@ -880,6 +1007,10 @@ def uilt_content(content):
result_biaoti_list = re.findall(pantten_zhaiyao, content)
zhaiyao_text = result_biaoti_list[0]
if zhaiyao_text == "":
content = str(content).replace("\n", "")
content_list = content.split("。")
zhaiyao_text = "".join(content_list[:15])
return zhaiyao_text
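When the regex finds no abstract, the fallback takes the first 15 sentences of the body so the recall service always receives a non-empty abst_zh. The fallback isolated as a sketch, assuming "。" as the delimiter:

def fallback_abstract(content, n_sentences=15):
    content = str(content).replace("\n", "")
    return "".join(content.split("。")[:n_sentences])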
@@ -895,7 +1026,7 @@ def ulit_request_file(file):
with open(file_name_save, encoding="utf-8") as f:
content = f.read()
content = content.strip().replace("\n", "").replace(" ", "")
content = " ".join([i for i in content.split("\n") if i != ""])
abst_zh = uilt_content(content)
return abst_zh, content
@@ -945,22 +1076,61 @@ def ulit_request_file(file):
# return jsonify(return_text) # 返回结果
def classify():  # invoke the model; set the max batch_size
# def classify_recall():  # invoke the model; set the max batch_size
# while True:
# if redis_.llen(db_key_query) == 0:  # if the queue is empty, keep polling
# time.sleep(3)
# continue
# query = redis_.lpop(db_key_query).decode('UTF-8')  # fetch the query text
# data_dict_path = json.loads(query)
# path = data_dict_path['path']
# # text_type = data_dict["text_type"]
#
# with open(path, encoding='utf8') as f1:
# # load the file object
# data_dict = json.load(f1)
#
# queue_uuid = data_dict['id']
# print(queue_uuid)
# dataBases = data_dict['dataBases']
# minSimilarity = data_dict['minSimilarity']
# minWords = data_dict['minWords']
# title = data_dict['title']
# author = data_dict['author']
# abst_zh = data_dict['abst_zh']
# content = data_dict['content']
# token = data_dict['token']
# account = data_dict['account']
# goodsId = data_dict['goodsId']
# callbackUrl = data_dict['callbackUrl']
#
# # call Yupeng's service for the 10 most similar papers
# recall_data_list_dict = recall_10(queue_uuid, title, abst_zh, content)
#
# # print("查找相似的50篇完成")
# # with open("data/rell_json.txt") as f:
# # recall_data_list_dict = eval(f.read())
#
# # read the papers and convert the format
def classify_accurate_check():
while True:
if redis_.llen(db_key_query) == 0:  # if the queue is empty, keep polling
if redis_.llen(db_key_query_recall) == 0:  # if the queue is empty, keep polling
time.sleep(3)
continue
query = redis_.lpop(db_key_query).decode('UTF-8')  # fetch the query text
data_dict_path = json.loads(query)
path = data_dict_path['path']
# text_type = data_dict["text_type"]
with open(path, encoding='utf8') as f1:
# load the file object
data_dict = json.load(f1)
query_recall = redis_.lpop(db_key_query_recall).decode('UTF-8')  # fetch the query text
query_recall_dict = json.loads(query_recall)
query_recall_uuid = query_recall_dict["uuid"]
recall_data_list_dict = json.loads(query_recall_dict["data"])
recall_data_list = ulit_recall_paper(recall_data_list_dict)
data_dict_path = redis_.get(query_recall_uuid + "_request_check")
with open(data_dict_path, encoding='utf8') as f:
data_dict = json.loads(f.read())
queue_uuid = data_dict['id']
print(queue_uuid)
dataBases = data_dict['dataBases']
minSimilarity = data_dict['minSimilarity']
minWords = data_dict['minWords']
@@ -973,21 +1143,22 @@ def classify():  # invoke the model; set the max batch_size
goodsId = data_dict['goodsId']
callbackUrl = data_dict['callbackUrl']
# call Yupeng's service for the 10 most similar papers
recall_data_list_dict = recall_10(queue_uuid, title, abst_zh, content)
# print("查找相似的50篇完成")
print("查找相似的50篇完成")
print(len(content))
# with open("data/rell_json.txt") as f:
# recall_data_list_dict = eval(f.read())
# recall_data_list = ulit_recall_paper(recall_data_list_dict)
# read the papers and convert to formatted data
recall_data_list = ulit_recall_paper(recall_data_list_dict)
print("文章格式转化完成")
# recall_data_list = pd.read_csv("data/10235513_大型商业建筑人员疏散设计研究_沈福禹/查重.csv", encoding="utf-8").values.tolist()
# enter the accurate duplicate-check stage
print("进入精确查重系统")
return_list = accurate_check_rouge(title, author, content, recall_data_list)
return_text = {"resilt": return_list, "probabilities": None, "status_code": 200}
@@ -1028,7 +1199,7 @@ def handle_query():
abst_zh, content = ulit_request_file(file)
id_ = str(uuid.uuid1())  # generate a unique id for the query
print("uuid: ", uuid)
print("uuid: ", id_)
print(id_)
d = {
'id': id_,
@@ -1044,17 +1215,21 @@ def handle_query():
'goodsId': goodsId,
'callbackUrl': callbackUrl
}
# bind the text to the query id
print(d)
# bind the text to the query id
# recall_10(id_, title, abst_zh, content)
Thread_rellce = Thread(target=recall_10, args=(id_, title, abst_zh, content,))
Thread_rellce.start()
load_request_path = './request_data_logs/{}.json'.format(id_)
with open(load_request_path, 'w', encoding='utf8') as f2:
# ensure_ascii=False keeps Chinese readable instead of Unicode escapes,
# indent pretty-prints the JSON
with open(load_request_path, 'w', encoding='utf8') as f2:  # ensure_ascii=False keeps Chinese readable; indent pretty-prints the JSON
json.dump(d, f2, ensure_ascii=False, indent=4)
redis_.rpush(db_key_query, json.dumps({"id": id_, "path": load_request_path}))  # push onto redis
redis_.sadd(db_key_querying, id_)
redis_.sadd(db_key_queryset, id_)
# redis_.rpush(db_key_query, json.dumps({"id": id_, "path": load_request_path})) # push onto redis
# redis_.sadd(db_key_querying, id_)
# redis_.sadd(db_key_queryset, id_)
redis_.set(id_ + "_request_check", load_request_path)
return_text = {
'code': 0,
'msg': "请求成功",
@@ -1070,9 +1245,11 @@ def handle_query():
return_text = {'code': 1}
return jsonify(return_text) # 返回结果
t1 = Thread(target=classify_accurate_check)
t1.start()
t = Thread(target=classify)
t.start()
# t = Thread(target=classify_recall)
# t.start()
if __name__ == "__main__":
app.run(host="0.0.0.0", port=16001, threaded=True, debug=True)
app.run(host="0.0.0.0", port=16001, threaded=True)
