
Fix bugs; keep the interface consistent with 飞度

Branch: master
Author: majiahui@haimaqingfan.com · 2 years ago
Parent commit: 1bc6659ecb
Changed files:
  1. flask_check_bert.py (304 lines changed)
  2. redis_search_uuid.py (116 lines added, new file)

flask_check_bert.py (304 lines changed)

@@ -12,9 +12,18 @@ import requests
 from flask import Flask, jsonify
 from flask import request
 import uuid
+import time
+import redis
+from threading import Thread

 app = Flask(__name__)
 app.config["JSON_AS_ASCII"] = False
+pool = redis.ConnectionPool(host='localhost', port=63179, max_connections=100, db=7, password="zhicheng123*")
+redis_ = redis.Redis(connection_pool=pool, decode_responses=True)
+db_key_query = 'query'
+db_key_querying = 'querying'
+db_key_queryset = 'queryset'

 nums_cpus = 16
 rouge = Rouge()
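The connection pool and the three key names added here drive a Redis-backed task queue: the HTTP handler pushes jobs onto the 'query' list and records their ids in the 'querying' set, while a worker thread pops jobs and publishes each result path under the job id. A minimal sketch of that pattern, assuming the same Redis instance and key names (the helper function names here are illustrative, not part of the commit):

import json
import uuid
import redis

r = redis.Redis(host="localhost", port=63179, db=7, password="zhicheng123*")

def enqueue(request_path: str) -> str:
    """Producer side: queue one job and mark it as in flight."""
    job_id = str(uuid.uuid1())
    r.rpush("query", json.dumps({"id": job_id, "path": request_path}))
    r.sadd("querying", job_id)
    return job_id

def work_one() -> None:
    """Consumer side: pop one job, process it, publish the result path."""
    raw = r.lpop("query")
    if raw is None:
        return  # queue is empty
    job = json.loads(raw)
    result_path = "./new_data_logs/{}.json".format(job["id"])
    # ... run the duplicate check and write the result file here ...
    r.set(job["id"], result_path, ex=86400)  # result key expires after a day
    r.srem("querying", job["id"])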
@@ -91,7 +100,12 @@ def rouge_pre(text, df_train_nuoche):
     return return_list


-def accurate_check_rouge(text_paper, recall_data_list):
+def accurate_check_rouge(
+        title,
+        author,
+        text_paper,
+        recall_data_list
+):
     '''
     Exact duplicate check: find the similar sentences
     :param text:
@@ -99,8 +113,6 @@ def accurate_check_rouge(text_paper, recall_data_list):
     :return:
     '''
     # text preprocessing
-    # with open(text_paper_path, encoding="gbk") as f:
-    #     text_paper = f.read()
     centent_list = []
     text_paper = str(text_paper).replace("\n", "")
     centent_list.extend(text_paper.split("。"))
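The check is sentence-level: the paper is flattened to one line and split into sentences before scoring. A quick illustration, assuming the delimiter is the Chinese full stop (。), which this page's rendering dropped:

text_paper = "第一句话。第二句话。第三句话。"
centent_list = text_paper.replace("\n", "").split("。")
print(centent_list)  # ['第一句话', '第二句话', '第三句话', ''] -- note the trailing empty string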
@@ -108,25 +120,34 @@ def accurate_check_rouge(text_paper, recall_data_list):
     sentence_word_nums = 0

     # duplicate check via the ROUGE algorithm
-    # for text in centent_list:
-    #     rouge_pre_list = rouge_pre(text, recall_data_list)
-    #     data_zong.append(rouge_pre_list)
-
-    # duplicate check via the BERT algorithm
     for text in centent_list:
-        bert_pre_list = bert_check(text, recall_data_list)
-        data_zong.append(bert_pre_list)
+        rouge_pre_list = rouge_pre(text, recall_data_list)
+        data_zong.append(rouge_pre_list)
+
+    t0 = time.time()
+    # duplicate check via the BERT algorithm
+    # for text in centent_list:
+    #     bert_pre_list = bert_check(text, recall_data_list)
+    #     data_zong.append(bert_pre_list)
+    t1 = time.time()

     original_dict = []

     # find the indices of similar sentences
     bool_check_sentense = []
+    # BERT algorithm
+    # for i in range(len(data_zong)):
+    #     if data_zong[i][0] == 1:
+    #         bool_check_sentense.append([i,data_zong[i][1]])
+
+    # ROUGE algorithm
     for i in range(len(data_zong)):
-        if data_zong[i][0] == 1:
+        if data_zong[i][0] > 0.47:
             bool_check_sentense.append([i,data_zong[i][1]])

     biao_red = biaohong(bool_check_sentense, data_zong, recall_data_list)  # [[[0, 1, 2], [479, 480, 481]], [[3, 4, 5], [481, 482, 483]], [[6, 7, 8], [484, 485, 486]]]
+    print("bert精确查重时间", t1-t0)

     sentence_0_list = []
     sentence_1_list = []
@@ -151,22 +172,16 @@ def accurate_check_rouge(text_paper, recall_data_list):
         else:
             print(len(i[0]) + len(i[1]))
             continue

-    for i in zip(sentence_0_list_new, sentence_1_list_new):
-        print("超过字数", len(i[0]))
-        print("超过字数", len(i[1]))
+    t2 = time.time()
     paper_dict = biaohong_bert_predict(sentence_0_list_new, sentence_1_list_new)
-    # paper_dict
-    # print("原文:".format(i), paper_dict[i][0])
-    # print("原文标红:".format(i), paper_dict[i][1])
-    # print("相似:".format(i), paper_dict[i][2])
-    # print("相似标红:".format(i), paper_dict[i][3])
-    # original_text
+    t3 = time.time()
+    print("标红时间", t3 - t2)

     original_text = []
     original_text_contrast = []
+    repeat_quote_info = []
+    chongfuwendang = {}

     for paper_dict_dan_id, sentence_0_dan, sentence_1_dan, sim_paper_name_dan in zip(range(len(paper_dict)), sentence_0_list_new, sentence_1_list_new, sim_paper_name):
@@ -184,7 +199,6 @@ def accurate_check_rouge(text_paper, recall_data_list):
                 }
             ]
         }
-        similar_content = {"author": ""}
         try:
             sentence_0_bool, sentence_0_dan_red = original_text_marked_red(sentence_0_dan, paper_dict[paper_dict_dan_id][0], paper_dict[paper_dict_dan_id][1])  # text_original, bert_text, bert_text_pre
         except:
@@ -203,23 +217,62 @@ def accurate_check_rouge(text_paper, recall_data_list):
         original_text_contrast_dict["original_text"] = "此处有 {} 字相似\n".format(
             dan_sentence_word_nums) + sentence_0_dan_red

-        # similar_content["content"] = sentence_1_dan_red
-        # similar_content["title"] = sim_paper_name_dan
-        # original_text_contrast_dict["similar_content"][0] = similar_content
+        thesis_info = " ".join([sim_paper_name_dan["title"], sim_paper_name_dan["author"], sim_paper_name_dan["degree"], sim_paper_name_dan["year"]])
         original_text_contrast_dict["similar_content"][0]["content"] = sentence_1_dan_red
-        original_text_contrast_dict["similar_content"][0]["title"] = sim_paper_name_dan
+        original_text_contrast_dict["similar_content"][0]["title"] = sim_paper_name_dan["title"]
+        original_text_contrast_dict["similar_content"][0]["author"] = sim_paper_name_dan["author"]
+        original_text_contrast_dict["similar_content"][0]["degree"] = sim_paper_name_dan["degree"]
+        original_text_contrast_dict["similar_content"][0]["year"] = sim_paper_name_dan["year"]
+        original_text_contrast_dict["similar_content"][0]["thesis_info"] = thesis_info

         original_text_contrast.append(original_text_contrast_dict)

+        # for i in repeat_quote_info:
+        #     if
+        if thesis_info not in chongfuwendang:
+            chongfuwendang[thesis_info] = {
+                "quote": False,
+                "thesis_author": sim_paper_name_dan["author"],
+                "thesis_date": sim_paper_name_dan["year"],
+                "thesis_info": thesis_info,
+                "thesis_repeat_rate": (dan_sentence_word_nums/sim_paper_name_dan["paper_len_word"]) * 100,  # round(repetition_rate, 3) * 100
+                "thesis_title": sim_paper_name_dan["title"],
+                "thesis_link": "",
+                "thesis_publish": sim_paper_name_dan["degree"],
+                "thesis_repeat_word": dan_sentence_word_nums,
+                "thesis_teacher": "",
+                "paper_len_word": sim_paper_name_dan["paper_len_word"]
+            }
+        else:
+            chongfuwendang[thesis_info]["thesis_repeat_word"] += dan_sentence_word_nums
+            chongfuwendang[thesis_info]["thesis_repeat_rate"] = (chongfuwendang[thesis_info]["thesis_repeat_word"]/chongfuwendang[thesis_info]["paper_len_word"]) * 100
+
+    chongfuwendang = sorted(chongfuwendang.items(),
+                            key=lambda x: x[1]["thesis_repeat_rate"], reverse=False)
+    for i in range(len(chongfuwendang)):
+        repeat_paper_one_info_dict = chongfuwendang[i][1]
+        repeat_paper_one_info_dict.pop("paper_len_word")
+        repeat_paper_one_info_dict["thesis_repeat_rate"] = str(round(repeat_paper_one_info_dict["thesis_repeat_rate"], 1)) + "%"
+        repeat_quote_info.append(repeat_paper_one_info_dict)
+
     original_text = "".join(original_text)

     repetition_rate = sentence_word_nums/len(text_paper)
-    repetition_rate = round(repetition_rate, 3) *100
+    repetition_rate = round(repetition_rate, 3) * 100
+
+    format = '%Y-%m-%d %H:%M:%S'
+    value = time.localtime(int(time.time()))
+    dt = time.strftime(format, value)

     return {
-        "author": "",
-        "check_time": "",
+        "author": author,
+        "check_time": dt,
+        "title": title,
+        "time_range": "1900-01-01至2023-08-08",
         "section_data": [
             {
                 "oneself_repeat_words": sentence_word_nums,
@@ -240,11 +293,10 @@ def accurate_check_rouge(text_paper, recall_data_list):
                 "words": "",
                 "original_text": original_text,
                 "original_text_oneself": original_text,
-                "original_text_contrast": original_text_contrast
+                "original_text_contrast": original_text_contrast,
+                "repeat_quote_info": repeat_quote_info
             }
         ],
-        "time_range": "1900-01-01至2023-08-08",
-        "title": "3",
         "total_data": {
             "back_repeat_words": "",
             "exclude_personal_rate": "{}%".format(repetition_rate),
@@ -329,7 +381,7 @@ def dialog_line_parse(url, text):
             "server. Status Code: {}. Response: {}"
             "".format(url, response.status_code, response.text))
         print(text)
-        return []
+        return {}


def is_english_char(char):
@@ -492,9 +544,11 @@ def processing_one_text(paper_id):
     result = run_query(conn, sql, params)
     conn.close()
-    print(result)
+    print(result[0]['title'], result[0]['author'])
     title = result[0]['title']
     author = result[0]['author']
+    degree = result[0]['degree']
+    year = result[0]['content'].split("/")[5]
     content_path = result[0]['content']

     try:
@@ -504,7 +558,14 @@ def processing_one_text(paper_id):
         with open(content_path, encoding="gbk") as f:
             text = f.read()

-    data = ulit_text(title, text)
+    paper_info = {
+        "title": title,
+        "author": author,
+        "degree": degree,
+        "year": year,
+        "paper_len_word": len(text)
+    }
+    data = ulit_text(paper_info, text)
     return data
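Note that the new year field in processing_one_text is sliced out of the sixth slash-separated segment of the content path, which only works if the storage layout puts the year there. An illustration with an assumed, hypothetical path layout (the real paths are not shown in this commit):

content_path = "/mnt/data/papers/degree/2021/10235513_some_paper.txt"
parts = content_path.split("/")  # ['', 'mnt', 'data', 'papers', 'degree', '2021', '10235513_some_paper.txt']
year = parts[5]
print(year)  # '2021'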
@@ -535,7 +596,7 @@ def ulit_recall_paper(recall_data_list_dict):
     return data


-def recall_10(title, abst_zh, content) -> list:
+def recall_10(title, abst_zh, content) -> dict:
     '''
     Yupeng's recall interface
     :param paper_name:
@@ -606,8 +667,6 @@ def uilt_content(content):
     result_biaoti_list = re.findall(pantten_zhaiyao, content)
     zhaiyao_text = result_biaoti_list[0]
     return zhaiyao_text
-
-
@@ -630,8 +689,118 @@ def ulit_request_file(file):

+# @app.route("/", methods=["POST"])
+# def handle_query():
+#     print(request.remote_addr)
+#
+#     # request.form.get('prompt')
+#     dataBases = request.form.get("dataBases")
+#     minSimilarity = request.form.get("minSimilarity")  # txt
+#     minWords = request.form.get("minWords")
+#     title = request.form.get("title")
+#     author = request.form.get("author")  # txt
+#     file = request.files.get('file')
+#     token = request.form.get("token")
+#     account = request.form.get("account")
+#     goodsId = request.form.get("goodsId")
+#     callbackUrl = request.form.get("callbackUrl")
+#
+#     t0 = time.time()
+#     abst_zh, content = ulit_request_file(file)
+#
+#     # call Yupeng's service to recall the similar papers
+#     # recall_data_list_dict = recall_10(title, abst_zh, content)
+#
+#     t1 = time.time()
+#     print("查找相似的50篇完成")
+#     with open("data/rell_json.txt") as f:
+#         recall_data_list_dict = eval(f.read())
+#
+#     # read the recalled papers and convert them into the expected format
+#     recall_data_list = ulit_recall_paper(recall_data_list_dict)
+#     print("文章格式转化完成")
+#
+#     # recall_data_list = pd.read_csv("data/10235513_大型商业建筑人员疏散设计研究_沈福禹/查重.csv", encoding="utf-8").values.tolist()
+#
+#     # enter the exact duplicate-check stage
+#     print("进入精确查重系统")
+#     return_list = accurate_check_rouge(title, author, content, recall_data_list)
+#
+#     print("召回50篇", t1 - t0)
+#
+#     return_text = {"resilt": return_list, "probabilities": None, "status_code": 200}
+#     return jsonify(return_text)  # return the result


+def classify():  # invoke the model, set the maximum batch_size
+    while True:
+        if redis_.llen(db_key_query) == 0:  # if the queue is empty, keep waiting
+            time.sleep(3)
+            continue
+        query = redis_.lpop(db_key_query).decode('UTF-8')  # fetch the queued query
+        data_dict_path = json.loads(query)
+        path = data_dict_path['path']
+        # text_type = data_dict["text_type"]
+        with open(path, encoding='utf8') as f1:
+            # load the request object from the file
+            data_dict = json.load(f1)
+
+        query_id = data_dict['id']
+        print(query_id)
+        dataBases = data_dict['dataBases']
+        minSimilarity = data_dict['minSimilarity']
+        minWords = data_dict['minWords']
+        title = data_dict['title']
+        author = data_dict['author']
+        abst_zh = data_dict['abst_zh']
+        content = data_dict['content']
+        token = data_dict['token']
+        account = data_dict['account']
+        goodsId = data_dict['goodsId']
+        callbackUrl = data_dict['callbackUrl']
+
+        # call Yupeng's service to recall the similar papers
+        # recall_data_list_dict = recall_10(title, abst_zh, content)
+
+        t1 = time.time()
+        print("查找相似的50篇完成")
+        with open("data/rell_json.txt") as f:
+            recall_data_list_dict = eval(f.read())
+
+        # read the recalled papers and convert them into the expected format
+        recall_data_list = ulit_recall_paper(recall_data_list_dict)
+        print("文章格式转化完成")
+
+        # recall_data_list = pd.read_csv("data/10235513_大型商业建筑人员疏散设计研究_沈福禹/查重.csv", encoding="utf-8").values.tolist()
+
+        # enter the exact duplicate-check stage
+        print("进入精确查重系统")
+        return_list = accurate_check_rouge(title, author, content, recall_data_list)
+
+        return_text = {"resilt": return_list, "probabilities": None, "status_code": 200}
+        load_result_path = "./new_data_logs/{}.json".format(query_id)
+
+        print("query_id: ", query_id)
+        print("load_result_path: ", load_result_path)
+
+        with open(load_result_path, 'w', encoding='utf8') as f2:
+            # ensure_ascii=False is needed to write Chinese characters instead of Unicode escapes
+            # indent pretty-prints the JSON data
+            json.dump(return_text, f2, ensure_ascii=False, indent=4)
+
+        print(query_id)
+        print(load_result_path)
+        redis_.set(query_id, load_result_path, 86400)
+        redis_.srem(db_key_querying, query_id)


 @app.route("/", methods=["POST"])
 def handle_query():
-    print(request.remote_addr)
-
-    # request.form.get('prompt')
+    try:
+        print(request.remote_addr)
+
+        # request.form.get('prompt')
@@ -646,26 +815,53 @@ def handle_query():
-    goodsId = request.form.get("goodsId")
-    callbackUrl = request.form.get("callbackUrl")
-
-    abst_zh, content = ulit_request_file(file)
-
-    # call Yupeng's service to recall the similar papers
-    recall_data_list_dict = recall_10(title, abst_zh, content)
-
-    # with open("data/rell_json.txt") as f:
-    #     recall_data_list_dict = eval(f.read())
-
-    # read the recalled papers and convert them into the expected format
-    recall_data_list = ulit_recall_paper(recall_data_list_dict)
-
-    # recall_data_list = pd.read_csv("data/10235513_大型商业建筑人员疏散设计研究_沈福禹/查重.csv", encoding="utf-8").values.tolist()
-
-    # enter the exact duplicate-check stage
-    return_list = accurate_check_rouge(content, recall_data_list)
-
-    return_text = {"resilt": return_list, "probabilities": None, "status_code": 200}
+        goodsId = request.form.get("goodsId")
+        callbackUrl = request.form.get("callbackUrl")
+
+        abst_zh, content = ulit_request_file(file)
+
+        id_ = str(uuid.uuid1())  # generate a unique id for the query
+        print("uuid: ", uuid)
+        print(id_)
+        d = {
+            'id': id_,
+            'dataBases': dataBases,
+            'minSimilarity': minSimilarity,
+            'minWords': minWords,
+            'title': title,
+            'author': author,
+            'abst_zh': abst_zh,
+            'content': content,
+            'token': token,
+            'account': account,
+            'goodsId': goodsId,
+            'callbackUrl': callbackUrl
+        }
+
+        # bind the text to the query id
+        print(d)
+        load_request_path = './request_data_logs/{}.json'.format(id_)
+        with open(load_request_path, 'w', encoding='utf8') as f2:
+            # ensure_ascii=False is needed to write Chinese characters instead of Unicode escapes
+            # indent pretty-prints the JSON data
+            json.dump(d, f2, ensure_ascii=False, indent=4)
+        redis_.rpush(db_key_query, json.dumps({"id": id_, "path": load_request_path}))  # push onto the redis queue
+        redis_.sadd(db_key_querying, id_)
+        redis_.sadd(db_key_queryset, id_)
+        return_text = {
+            'code': 0,
+            'msg': "请求成功",
+            'data': {
+                'balances': "",
+                'orderId': id_,
+                'consumeNum': ""
+            }
+        }
+        print("ok")
+    except:
+        return_text = {'code': 1}
     return jsonify(return_text)  # return the result


+t = Thread(target=classify)
+t.start()

 if __name__ == "__main__":
     app.run(host="0.0.0.0", port=16001, threaded=True, debug=True)
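With this change the endpoint no longer blocks on the duplicate check: it validates the upload, queues the job, and immediately returns an orderId for later polling. A hypothetical client call (field names match the handler above; host and port assumed from app.run):

import requests

resp = requests.post(
    "http://127.0.0.1:16001/",
    data={
        "dataBases": "", "minSimilarity": "", "minWords": "",
        "title": "测试论文", "author": "张三",
        "token": "", "account": "", "goodsId": "", "callbackUrl": "",
    },
    files={"file": open("paper.docx", "rb")},
)
order_id = resp.json()["data"]["orderId"]  # poll the /search service with this id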

redis_search_uuid.py (116 lines added, new file)

@@ -0,0 +1,116 @@
+# -*- coding: utf-8 -*-
+"""
+@Time    : 2023/3/2 19:31
+@Author  :
+@FileName:
+@Software:
+@Describe:
+"""
+#
+# import redis
+#
+# redis_pool = redis.ConnectionPool(host='127.0.0.1', port=6379, password='', db=0)
+# redis_conn = redis.Redis(connection_pool=redis_pool)
+#
+#
+# name_dict = {
+#     'name_4' : 'Zarten_4',
+#     'name_5' : 'Zarten_5'
+# }
+# redis_conn.mset(name_dict)

+import flask
+import redis
+import uuid
+import json
+from threading import Thread
+import time

+app = flask.Flask(__name__)
+pool = redis.ConnectionPool(host='localhost', port=63179, max_connections=100, db=7, password="zhicheng123*")
+redis_ = redis.Redis(connection_pool=pool, decode_responses=True)

+db_key_query = 'query'
+db_key_querying = 'querying'


+@app.route("/search", methods=["POST"])
+def handle_query():
+    try:
+        id_ = flask.request.json['id']  # get the text of the user's query, e.g. "I love you"
+        result = redis_.get(id_)  # fetch the model result for this query
+        if result is not None:
+            # redis_.delete(id_)
+            result_path = result.decode('UTF-8')
+            with open(result_path, encoding='utf8') as f1:
+                # load the result object from the file
+                result_dict = json.load(f1)
+            resilt = result_dict["resilt"]
+            result_text = {'status': 9,
+                           'resilt': resilt,
+                           'reportId': "",
+                           'downloadurl': "",
+                           'similarity': ""
+                           }
+        else:
+            querying_list = list(redis_.smembers("querying"))
+            querying_set = set()
+            for i in querying_list:
+                querying_set.add(i.decode())
+
+            querying_bool = False
+            if id_ in querying_set:
+                querying_bool = True
+
+            query_list_json = redis_.lrange(db_key_query, 0, -1)
+            query_set_ids = set()
+            for i in query_list_json:
+                data_dict = json.loads(i)
+                query_id = data_dict['id']
+                query_set_ids.add(query_id)
+
+            query_bool = False
+            if id_ in query_set_ids:
+                query_bool = True
+
+            if querying_bool == True and query_bool == True:
+                result_text = {'status': 1,
+                               'resilt': "",
+                               'reportId': "",
+                               'downloadurl': "",
+                               'similarity': ""
+                               }
+            elif querying_bool == True and query_bool == False:
+                result_text = {'status': 1,
+                               'resilt': "",
+                               'reportId': "",
+                               'downloadurl': "",
+                               'similarity': ""
+                               }
+            else:
+                result_text = {'status': 1,
+                               'resilt': "",
+                               'reportId': "",
+                               'downloadurl': "",
+                               'similarity': ""
+                               }
+
+        load_request_path = './request_data_logs_203/{}.json'.format(id_)
+        with open(load_request_path, 'w', encoding='utf8') as f2:
+            # ensure_ascii=False is needed to write Chinese characters instead of Unicode escapes
+            # indent pretty-prints the JSON data
+            json.dump(result_text, f2, ensure_ascii=False, indent=4)
+        result = {'code': 0,
+                  "msg": "请求成功",
+                  "data": result_text}
+    except:
+        result = {'code': 1,
+                  "msg": "请求失败"
+                  }
+
+    return flask.jsonify(result)  # return the result


+if __name__ == "__main__":
+    app.run(debug=False, host='0.0.0.0', port=16002)
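A hypothetical polling loop for this endpoint (status 9 means the result is ready, status 1 means the job is still queued or running, per the branches above; the helper name and timeout are illustrative):

import time
import requests

def wait_for_result(order_id, interval=3.0, timeout=600):
    deadline = time.time() + timeout
    while time.time() < deadline:
        body = requests.post("http://127.0.0.1:16002/search", json={"id": order_id}).json()
        if body["code"] != 0:
            raise RuntimeError("search request failed")
        if body["data"]["status"] == 9:
            return body["data"]["resilt"]  # key name as spelled in the service
        time.sleep(interval)  # still processing
    raise TimeoutError("duplicate check did not finish in time")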