Browse Source

修改bug,接口和飞度的保持一致

master
majiahui@haimaqingfan.com 2 years ago
parent
commit
1bc6659ecb
  1. 326
      flask_check_bert.py
  2. 116
      redis_search_uuid.py

326
flask_check_bert.py

@ -12,9 +12,18 @@ import requests
from flask import Flask, jsonify
from flask import request
import uuid
import time
import redis
from threading import Thread
# Flask application object for this service.
app = Flask(__name__)
# Keep non-ASCII characters (e.g. Chinese text) unescaped in JSON responses.
app.config["JSON_AS_ASCII"] = False
# Redis connection pool shared by the request handler and the background worker.
# NOTE(review): host, port and password are hard-coded here; consider loading
# them from configuration instead of source.
pool = redis.ConnectionPool(host='localhost', port=63179, max_connections=100, db=7, password="zhicheng123*")
# NOTE(review): code in this file still calls .decode() on Redis replies, which
# suggests decode_responses is not taking effect when a connection_pool is
# supplied -- confirm against the redis-py documentation.
redis_ = redis.Redis(connection_pool=pool, decode_responses=True)
# Redis key names used by this service.
db_key_query = 'query'  # list used as the job queue (rpush by the handler, lpop by the worker)
db_key_querying = 'querying'  # set of ids added on submission and removed when the worker finishes
db_key_queryset = 'queryset'  # set that every submitted id is added to
nums_cpus = 16  # usage is not visible in this part of the file
rouge = Rouge()  # shared Rouge instance
@ -91,7 +100,12 @@ def rouge_pre(text, df_train_nuoche):
return return_list
def accurate_check_rouge(text_paper, recall_data_list):
def accurate_check_rouge(
title,
author,
text_paper,
recall_data_list
):
'''
精确查重出相似句子
:param text:
@ -99,8 +113,6 @@ def accurate_check_rouge(text_paper, recall_data_list):
:return:
'''
# 文本处理
# with open(text_paper_path, encoding="gbk") as f:
# text_paper = f.read()
centent_list = []
text_paper = str(text_paper).replace("\n", "")
centent_list.extend(text_paper.split(""))
@ -108,25 +120,34 @@ def accurate_check_rouge(text_paper, recall_data_list):
sentence_word_nums = 0
# rouge算法查重
# for text in centent_list:
# rouge_pre_list = rouge_pre(text, recall_data_list)
# data_zong.append(rouge_pre_list)
# bert算法查重
for text in centent_list:
bert_pre_list = bert_check(text, recall_data_list)
data_zong.append(bert_pre_list)
rouge_pre_list = rouge_pre(text, recall_data_list)
data_zong.append(rouge_pre_list)
t0 = time.time()
# bert算法查重
# for text in centent_list:
# bert_pre_list = bert_check(text, recall_data_list)
# data_zong.append(bert_pre_list)
t1 = time.time()
original_dict = []
# 找出相似的句子序号
bool_check_sentense = []
# bert算法
# for i in range(len(data_zong)):
# if data_zong[i][0] == 1:
# bool_check_sentense.append([i,data_zong[i][1]])
# rouge算法
for i in range(len(data_zong)):
if data_zong[i][0] == 1:
if data_zong[i][0] > 0.47:
bool_check_sentense.append([i,data_zong[i][1]])
biao_red = biaohong(bool_check_sentense, data_zong, recall_data_list) # [[[0, 1, 2], [479, 480, 481]], [[3, 4, 5], [481, 482, 483]], [[6, 7, 8], [484, 485, 486]]]
print("bert精确查重时间", t1-t0)
sentence_0_list = []
sentence_1_list = []
@ -151,22 +172,16 @@ def accurate_check_rouge(text_paper, recall_data_list):
else:
print(len(i[0]) + len(i[1]))
continue
for i in zip(sentence_0_list_new, sentence_1_list_new):
print("超过字数", len(i[0]))
print("超过字数", len(i[1]))
t2 = time.time()
paper_dict = biaohong_bert_predict(sentence_0_list_new, sentence_1_list_new)
# paper_dict
# print("原文:".format(i), paper_dict[i][0])
# print("原文标红:".format(i), paper_dict[i][1])
# print("相似:".format(i), paper_dict[i][2])
# print("相似标红:".format(i), paper_dict[i][3])
# original_text
t3 = time.time()
print("标红时间", t3 - t2)
original_text = []
original_text_contrast = []
repeat_quote_info = []
chongfuwendang = {}
for paper_dict_dan_id, sentence_0_dan, sentence_1_dan, sim_paper_name_dan in zip(range(len(paper_dict)), sentence_0_list_new, sentence_1_list_new, sim_paper_name):
@ -184,7 +199,6 @@ def accurate_check_rouge(text_paper, recall_data_list):
}
]
}
similar_content = {"author": ""}
try:
sentence_0_bool, sentence_0_dan_red = original_text_marked_red(sentence_0_dan, paper_dict[paper_dict_dan_id][0], paper_dict[paper_dict_dan_id][1]) # text_original, bert_text, bert_text_pre
except:
@ -203,23 +217,62 @@ def accurate_check_rouge(text_paper, recall_data_list):
original_text_contrast_dict["original_text"] = "此处有 {} 字相似\n".format(
dan_sentence_word_nums) + sentence_0_dan_red
# similar_content["content"] = sentence_1_dan_red
# similar_content["title"] = sim_paper_name_dan
# original_text_contrast_dict["similar_content"][0] = similar_content
thesis_info = " ".join([sim_paper_name_dan["title"], sim_paper_name_dan["author"], sim_paper_name_dan["degree"], sim_paper_name_dan["year"]])
original_text_contrast_dict["similar_content"][0]["content"] = sentence_1_dan_red
original_text_contrast_dict["similar_content"][0]["title"] = sim_paper_name_dan
original_text_contrast_dict["similar_content"][0]["title"] = sim_paper_name_dan["title"]
original_text_contrast_dict["similar_content"][0]["author"] = sim_paper_name_dan["author"]
original_text_contrast_dict["similar_content"][0]["degree"] = sim_paper_name_dan["degree"]
original_text_contrast_dict["similar_content"][0]["year"] = sim_paper_name_dan["year"]
original_text_contrast_dict["similar_content"][0]["thesis_info"] = thesis_info
original_text_contrast.append(original_text_contrast_dict)
# for i in repeat_quote_info:
# if
if thesis_info not in chongfuwendang:
chongfuwendang[thesis_info] = {
"quote": False,
"thesis_author": sim_paper_name_dan["author"],
"thesis_date" : sim_paper_name_dan["year"],
"thesis_info" : thesis_info,
"thesis_repeat_rate": (dan_sentence_word_nums/sim_paper_name_dan["paper_len_word"]) * 100, #round(repetition_rate, 3) * 100
"thesis_title": sim_paper_name_dan["title"],
"thesis_link": "",
"thesis_publish": sim_paper_name_dan["degree"],
"thesis_repeat_word": dan_sentence_word_nums,
"thesis_teacher": "",
"paper_len_word": sim_paper_name_dan["paper_len_word"]
}
else:
chongfuwendang[thesis_info]["thesis_repeat_word"] += dan_sentence_word_nums
chongfuwendang[thesis_info]["thesis_repeat_rate"] = (chongfuwendang[thesis_info]["thesis_repeat_word"]/chongfuwendang[thesis_info]["paper_len_word"]) * 100
chongfuwendang = sorted(chongfuwendang.items(),
key=lambda x: x[1]["thesis_repeat_rate"], reverse=False)
for i in range(len(chongfuwendang)):
repeat_paper_one_info_dict = chongfuwendang[i][1]
repeat_paper_one_info_dict.pop("paper_len_word")
repeat_paper_one_info_dict["thesis_repeat_rate"] = str(round(repeat_paper_one_info_dict["thesis_repeat_rate"], 1)) + "%"
repeat_quote_info.append(repeat_paper_one_info_dict)
original_text = "".join(original_text)
repetition_rate = sentence_word_nums/len(text_paper)
repetition_rate = round(repetition_rate, 3) *100
repetition_rate = round(repetition_rate, 3) * 100
format = '%Y-%m-%d %H:%M:%S'
value = time.localtime(int(time.time()))
dt = time.strftime(format, value)
return {
"author": "",
"check_time": "",
"author": author,
"check_time": dt,
"title": title,
"time_range": "1900-01-01至2023-08-08",
"section_data": [
{
"oneself_repeat_words": sentence_word_nums,
@ -240,11 +293,10 @@ def accurate_check_rouge(text_paper, recall_data_list):
"words": "",
"original_text": original_text,
"original_text_oneself": original_text,
"original_text_contrast": original_text_contrast
"original_text_contrast": original_text_contrast,
"repeat_quote_info": repeat_quote_info
}
],
"time_range": "1900-01-01至2023-08-08",
"title": "3",
"total_data": {
"back_repeat_words": "",
"exclude_personal_rate": "{}%".format(repetition_rate),
@ -329,7 +381,7 @@ def dialog_line_parse(url, text):
"server. Status Code: {}. Response: {}"
"".format(url, response.status_code, response.text))
print(text)
return []
return {}
def is_english_char(char):
@ -492,9 +544,11 @@ def processing_one_text(paper_id):
result = run_query(conn, sql, params)
conn.close()
print(result)
print(result[0]['title'], result[0]['author'])
title = result[0]['title']
author = result[0]['author']
degree = result[0]['degree']
year = result[0]['content'].split("/")[5]
content_path = result[0]['content']
try:
@ -504,7 +558,14 @@ def processing_one_text(paper_id):
with open(content_path, encoding="gbk") as f:
text = f.read()
data = ulit_text(title, text)
paper_info = {
"title": title,
"author": author,
"degree": degree,
"year": year,
"paper_len_word": len(text)
}
data = ulit_text(paper_info, text)
return data
@ -535,7 +596,7 @@ def ulit_recall_paper(recall_data_list_dict):
return data
def recall_10(title, abst_zh, content) -> list:
def recall_10(title, abst_zh, content) -> dict:
'''
宇鹏召回接口
:param paper_name:
@ -606,8 +667,6 @@ def uilt_content(content):
result_biaoti_list = re.findall(pantten_zhaiyao, content)
zhaiyao_text = result_biaoti_list[0]
return zhaiyao_text
@ -630,42 +689,179 @@ def ulit_request_file(file):
@app.route("/", methods=["POST"])
def handle_query():
print(request.remote_addr)
# @app.route("/", methods=["POST"])
# def handle_query():
# print(request.remote_addr)
#
# # request.form.get('prompt')
# dataBases = request.form.get("dataBases")
# minSimilarity = request.form.get("minSimilarity") # txt
# minWords = request.form.get("minWords")
# title = request.form.get("title")
# author = request.form.get("author") # txt
# file = request.files.get('file')
# token = request.form.get("token")
# account = request.form.get("account")
# goodsId = request.form.get("goodsId")
# callbackUrl = request.form.get("callbackUrl")
#
#
# t0 = time.time()
# abst_zh, content = ulit_request_file(file)
#
# # 调用宇鹏查询相似十篇
# # recall_data_list_dict = recall_10(title, abst_zh, content)
#
# t1 = time.time()
# print("查找相似的50篇完成")
# with open("data/rell_json.txt") as f:
# recall_data_list_dict = eval(f.read())
#
# # 读取文章转化成格式数据
# recall_data_list = ulit_recall_paper(recall_data_list_dict)
# print("文章格式转化完成")
#
# # recall_data_list = pd.read_csv("data/10235513_大型商业建筑人员疏散设计研究_沈福禹/查重.csv", encoding="utf-8").values.tolist()
#
# # 进入精确查重系统
# print("进入精确查重系统")
# return_list = accurate_check_rouge(title, author, content, recall_data_list)
#
# print("召回50篇", t1 - t0)
#
# return_text = {"resilt": return_list, "probabilities": None, "status_code": 200}
# return jsonify(return_text) # 返回结果
def classify(): # 调用模型,设置最大batch_size
while True:
if redis_.llen(db_key_query) == 0: # 若队列中没有元素就继续获取
time.sleep(3)
continue
query = redis_.lpop(db_key_query).decode('UTF-8') # 获取query的text
data_dict_path = json.loads(query)
path = data_dict_path['path']
# text_type = data_dict["text_type"]
with open(path, encoding='utf8') as f1:
# 加载文件的对象
data_dict = json.load(f1)
query_id = data_dict['id']
print(query_id)
dataBases = data_dict['dataBases']
minSimilarity = data_dict['minSimilarity']
minWords = data_dict['minWords']
title = data_dict['title']
author = data_dict['author']
abst_zh = data_dict['abst_zh']
content = data_dict['content']
token = data_dict['token']
account = data_dict['account']
goodsId = data_dict['goodsId']
callbackUrl = data_dict['callbackUrl']
# request.form.get('prompt')
dataBases = request.form.get("dataBases")
minSimilarity = request.form.get("minSimilarity") # txt
minWords = request.form.get("minWords")
title = request.form.get("title")
author = request.form.get("author") # txt
file = request.files.get('file')
token = request.form.get("token")
account = request.form.get("account")
goodsId = request.form.get("goodsId")
callbackUrl = request.form.get("callbackUrl")
# 调用宇鹏查询相似十篇
# recall_data_list_dict = recall_10(title, abst_zh, content)
t1 = time.time()
print("查找相似的50篇完成")
with open("data/rell_json.txt") as f:
recall_data_list_dict = eval(f.read())
abst_zh, content = ulit_request_file(file)
# 调用宇鹏查询相似十篇
recall_data_list_dict = recall_10(title, abst_zh, content)
# with open("data/rell_json.txt") as f:
# recall_data_list_dict = eval(f.read())
# 读取文章转化成格式数据
recall_data_list = ulit_recall_paper(recall_data_list_dict)
print("文章格式转化完成")
# recall_data_list = pd.read_csv("data/10235513_大型商业建筑人员疏散设计研究_沈福禹/查重.csv", encoding="utf-8").values.tolist()
# 读取文章转化成格式数据
recall_data_list = ulit_recall_paper(recall_data_list_dict)
# 进入精确查重系统
print("进入精确查重系统")
return_list = accurate_check_rouge(title, author, content, recall_data_list)
return_text = {"resilt": return_list, "probabilities": None, "status_code": 200}
# recall_data_list = pd.read_csv("data/10235513_大型商业建筑人员疏散设计研究_沈福禹/查重.csv", encoding="utf-8").values.tolist()
load_result_path = "./new_data_logs/{}.json".format(query_id)
# 进入精确查重系统
return_list = accurate_check_rouge(content, recall_data_list)
print("query_id: ", query_id)
print("load_result_path: ", load_result_path)
return_text = {"resilt": return_list, "probabilities": None, "status_code": 200}
with open(load_result_path, 'w', encoding='utf8') as f2:
# ensure_ascii=False才能输入中文,否则是Unicode字符
# indent=2 JSON数据的缩进,美观
json.dump(return_text, f2, ensure_ascii=False, indent=4)
print(query_id)
print(load_result_path)
redis_.set(query_id, load_result_path, 86400)
redis_.srem(db_key_querying, query_id)
@app.route("/", methods=["POST"])
def handle_query():
    """Accept a check request, persist it to disk and queue it for the worker.

    Reads the form fields and the uploaded ``file`` from the request, extracts
    the abstract and content with ``ulit_request_file``, writes the whole
    request to ``./request_data_logs/<id>.json`` and pushes
    ``{"id", "path"}`` onto the Redis list ``db_key_query``.

    Returns:
        A JSON response. On success ``code`` is 0 and ``data["orderId"]``
        carries the generated id; on any failure the body is ``{"code": 1}``.
    """
    try:
        print(request.remote_addr)

        form = request.form
        file = request.files.get('file')
        abst_zh, content = ulit_request_file(file)

        id_ = str(uuid.uuid1())  # unique identifier for this query
        print("uuid: ", id_)

        d = {
            'id': id_,
            'dataBases': form.get("dataBases"),
            'minSimilarity': form.get("minSimilarity"),
            'minWords': form.get("minWords"),
            'title': form.get("title"),
            'author': form.get("author"),
            'abst_zh': abst_zh,
            'content': content,
            'token': form.get("token"),
            'account': form.get("account"),
            'goodsId': form.get("goodsId"),
            'callbackUrl': form.get("callbackUrl")
        }
        print(d)

        # Persist the request so the queue only has to carry the id and path.
        load_request_path = './request_data_logs/{}.json'.format(id_)
        with open(load_request_path, 'w', encoding='utf8') as f2:
            # ensure_ascii=False keeps Chinese text readable in the file.
            json.dump(d, f2, ensure_ascii=False, indent=4)

        # Register the id before enqueueing: the worker removes the id from
        # the "querying" set when it finishes, so the id must already be
        # there by the time the job becomes visible on the queue.
        redis_.sadd(db_key_querying, id_)
        redis_.sadd(db_key_queryset, id_)
        redis_.rpush(db_key_query, json.dumps({"id": id_, "path": load_request_path}))

        return_text = {
            'code': 0,
            'msg': "请求成功",
            'data': {
                'balances': "",
                'orderId': id_,
                'consumeNum': ""
            }
        }
        print("ok")
    except Exception:
        # Keep the best-effort contract (always answer with JSON), but record
        # the traceback instead of discarding it.
        app.logger.exception("failed to accept query")
        return_text = {'code': 1}
    return jsonify(return_text)  # return the result
# Start the background consumer (``classify``) that drains the Redis queue.
# This runs at import time, so exactly one import of this module must happen
# per process for there to be a single consumer.
t = Thread(target=classify)
t.start()

if __name__ == "__main__":
    # debug is off: with debug=True the reloader imports this module a second
    # time (starting a second consumer thread), and the interactive debugger
    # would be reachable on 0.0.0.0.
    app.run(host="0.0.0.0", port=16001, threaded=True, debug=False)

116
redis_search_uuid.py

@ -0,0 +1,116 @@
# -*- coding: utf-8 -*-
"""
@Time : 2023/3/2 19:31
@Author :
@FileName:
@Software:
@Describe:
"""
#
# import redis
#
# redis_pool = redis.ConnectionPool(host='127.0.0.1', port=6379, password='', db=0)
# redis_conn = redis.Redis(connection_pool=redis_pool)
#
#
# name_dict = {
# 'name_4' : 'Zarten_4',
# 'name_5' : 'Zarten_5'
# }
# redis_conn.mset(name_dict)
import flask
import redis
import uuid
import json
from threading import Thread
import time
# Flask application object for the result-lookup service.
app = flask.Flask(__name__)
# Redis connection pool used by the /search handler.
# NOTE(review): host, port and password are hard-coded here; consider loading
# them from configuration instead of source.
pool = redis.ConnectionPool(host='localhost', port=63179, max_connections=100, db=7, password="zhicheng123*")
# NOTE(review): the handler below still calls .decode() on Redis replies, which
# suggests decode_responses is not taking effect when a connection_pool is
# supplied -- confirm against the redis-py documentation.
redis_ = redis.Redis(connection_pool=pool, decode_responses=True)
# Redis key names used by this service.
db_key_query = 'query'  # list of queued jobs; each entry is JSON with an 'id' field
db_key_querying = 'querying'  # name of the set of in-flight ids
@app.route("/search", methods=["POST"])
def handle_query():
    """Return the current state of a previously submitted query.

    Reads ``id`` from the JSON request body and looks it up in Redis. When a
    value is stored under that id it is treated as the path of a JSON result
    file, and that file's ``"resilt"`` entry is returned with ``status`` 9.
    When nothing is stored, ``status`` 1 is returned with an empty
    ``"resilt"``. The payload is also written to
    ``./request_data_logs_203/<id>.json``.

    Returns:
        A JSON response: ``{"code": 0, "msg": ..., "data": {...}}`` on
        success, or ``{"code": 1, "msg": ...}`` when anything fails
        (including a missing or malformed ``id``).
    """
    try:
        id_ = flask.request.json['id']  # id of the query being polled

        # The id comes from the client and is interpolated into a file path
        # below. Only accept a canonical UUID string so it cannot point
        # outside the log directory (e.g. "../../something").
        if str(uuid.UUID(str(id_))) != id_:
            raise ValueError("id is not a canonical UUID string")

        result = redis_.get(id_)  # path of the result file, if one is stored
        if result is not None:
            result_path = result.decode('UTF-8')
            with open(result_path, encoding='utf8') as f1:
                result_dict = json.load(f1)
            status, resilt = 9, result_dict["resilt"]
        else:
            # No stored result. The previous implementation also scanned the
            # "querying" set and the whole pending queue here, but every
            # combination produced this same status-1 payload, so those
            # Redis round-trips were dropped.
            # NOTE(review): an id that was never submitted is therefore also
            # reported with status 1 -- confirm whether it needs its own status.
            status, resilt = 1, ""

        result_text = {'status': status,
                       'resilt': resilt,
                       'reportId': "",
                       'downloadurl': "",
                       'similarity': ""
                       }

        # Keep a copy of what was answered for this id.
        load_request_path = './request_data_logs_203/{}.json'.format(id_)
        with open(load_request_path, 'w', encoding='utf8') as f2:
            # ensure_ascii=False keeps Chinese text readable in the file.
            json.dump(result_text, f2, ensure_ascii=False, indent=4)

        result = {'code': 0,
                  "msg": "请求成功",
                  "data": result_text}
    except Exception:
        # Keep the best-effort contract (always answer with JSON), but record
        # the traceback instead of discarding it.
        app.logger.exception("search request failed")
        result = {'code': 1,
                  "msg": "请求失败"
                  }
    return flask.jsonify(result)  # return the result
# Script entry point: serve the lookup API on all interfaces, port 16002,
# with Flask debug mode disabled.
if __name__ == "__main__":
    app.run(debug=False, host='0.0.0.0', port=16002)
Loading…
Cancel
Save