diff --git a/flask_check_bert.py b/flask_check_bert.py
index bbc6170..b5c9818 100644
--- a/flask_check_bert.py
+++ b/flask_check_bert.py
@@ -12,9 +12,18 @@ import requests
 from flask import Flask, jsonify
 from flask import request
 import uuid
+import time
+import redis
+from threading import Thread

 app = Flask(__name__)
 app.config["JSON_AS_ASCII"] = False
+pool = redis.ConnectionPool(host='localhost', port=63179, max_connections=100, db=7, password="zhicheng123*")
+redis_ = redis.Redis(connection_pool=pool, decode_responses=True)
+
+db_key_query = 'query'
+db_key_querying = 'querying'
+db_key_queryset = 'queryset'

 nums_cpus = 16
 rouge = Rouge()
@@ -91,7 +100,12 @@ def rouge_pre(text, df_train_nuoche):
     return return_list


-def accurate_check_rouge(text_paper, recall_data_list):
+def accurate_check_rouge(
+        title,
+        author,
+        text_paper,
+        recall_data_list
+        ):
     '''
     Accurate duplicate check: find the similar sentences
     :param text:
@@ -99,8 +113,6 @@ def accurate_check_rouge(text_paper, recall_data_list):
     :return:
     '''
     # text preprocessing
-    # with open(text_paper_path, encoding="gbk") as f:
-    #     text_paper = f.read()
     centent_list = []
     text_paper = str(text_paper).replace("。\n", "。")
     centent_list.extend(text_paper.split("。"))
@@ -108,25 +120,34 @@ def accurate_check_rouge(text_paper, recall_data_list):
     sentence_word_nums = 0

     # ROUGE-based duplicate checking
-    # for text in centent_list:
-    #     rouge_pre_list = rouge_pre(text, recall_data_list)
-    #     data_zong.append(rouge_pre_list)
-
-    # BERT-based duplicate checking
     for text in centent_list:
-        bert_pre_list = bert_check(text, recall_data_list)
-        data_zong.append(bert_pre_list)
+        rouge_pre_list = rouge_pre(text, recall_data_list)
+        data_zong.append(rouge_pre_list)

+    t0 = time.time()
+    # BERT-based duplicate checking
+    # for text in centent_list:
+    #     bert_pre_list = bert_check(text, recall_data_list)
+    #     data_zong.append(bert_pre_list)
+    t1 = time.time()
     original_dict = []

     # find the indices of the similar sentences
     bool_check_sentense = []
+    # BERT criterion
+    # for i in range(len(data_zong)):
+    #     if data_zong[i][0] == 1:
+    #         bool_check_sentense.append([i,data_zong[i][1]])
+
+    # ROUGE criterion
    for i in range(len(data_zong)):
-        if data_zong[i][0] == 1:
+        if data_zong[i][0] > 0.47:
             bool_check_sentense.append([i,data_zong[i][1]])
     biao_red = biaohong(bool_check_sentense, data_zong, recall_data_list)  # [[[0, 1, 2], [479, 480, 481]], [[3, 4, 5], [481, 482, 483]], [[6, 7, 8], [484, 485, 486]]]

+    print("bert精确查重时间", t1-t0)
+
     sentence_0_list = []
     sentence_1_list = []
@@ -151,22 +172,16 @@ def accurate_check_rouge(text_paper, recall_data_list):
         else:
             print(len(i[0]) + len(i[1]))
             continue
-    for i in zip(sentence_0_list_new, sentence_1_list_new):
-        print("超过字数", len(i[0]))
-        print("超过字数", len(i[1]))
-
+    t2 = time.time()
     paper_dict = biaohong_bert_predict(sentence_0_list_new, sentence_1_list_new)
-    # paper_dict
-    # print("原文:".format(i), paper_dict[i][0])
-    # print("原文标红:".format(i), paper_dict[i][1])
-    # print("相似:".format(i), paper_dict[i][2])
-    # print("相似标红:".format(i), paper_dict[i][3])
-
-    # original_text
+    t3 = time.time()
+    print("标红时间", t3 - t2)
     original_text = []
     original_text_contrast = []
+    repeat_quote_info = []
+    chongfuwendang = {}

     for paper_dict_dan_id, sentence_0_dan, sentence_1_dan, sim_paper_name_dan in zip(range(len(paper_dict)), sentence_0_list_new, sentence_1_list_new, sim_paper_name):
@@ -184,7 +199,6 @@ def accurate_check_rouge(text_paper, recall_data_list):
                 }
             ]
         }
-        similar_content = {"author": ""}
         try:
             sentence_0_bool, sentence_0_dan_red = original_text_marked_red(sentence_0_dan, paper_dict[paper_dict_dan_id][0], paper_dict[paper_dict_dan_id][1])  # text_original, bert_text, bert_text_pre
         except:
@@ -203,23 +217,62 @@ def accurate_check_rouge(text_paper, recall_data_list):
             original_text_contrast_dict["original_text"] = "此处有 {} 字相似\n".format(
                 dan_sentence_word_nums) + sentence_0_dan_red

-            # similar_content["content"] = sentence_1_dan_red
-            # similar_content["title"] = sim_paper_name_dan
-            # original_text_contrast_dict["similar_content"][0] = similar_content
-
+            thesis_info = " ".join([sim_paper_name_dan["title"], sim_paper_name_dan["author"], sim_paper_name_dan["degree"], sim_paper_name_dan["year"]])
             original_text_contrast_dict["similar_content"][0]["content"] = sentence_1_dan_red
-            original_text_contrast_dict["similar_content"][0]["title"] = sim_paper_name_dan
+            original_text_contrast_dict["similar_content"][0]["title"] = sim_paper_name_dan["title"]
+            original_text_contrast_dict["similar_content"][0]["author"] = sim_paper_name_dan["author"]
+            original_text_contrast_dict["similar_content"][0]["degree"] = sim_paper_name_dan["degree"]
+            original_text_contrast_dict["similar_content"][0]["year"] = sim_paper_name_dan["year"]
+            original_text_contrast_dict["similar_content"][0]["thesis_info"] = thesis_info

             original_text_contrast.append(original_text_contrast_dict)

+            # aggregate per-source repeat statistics, keyed by thesis_info
+            if thesis_info not in chongfuwendang:
+                chongfuwendang[thesis_info] = {
+                    "quote": False,
+                    "thesis_author": sim_paper_name_dan["author"],
+                    "thesis_date": sim_paper_name_dan["year"],
+                    "thesis_info": thesis_info,
+                    "thesis_repeat_rate": (dan_sentence_word_nums/sim_paper_name_dan["paper_len_word"]) * 100,
+                    "thesis_title": sim_paper_name_dan["title"],
+                    "thesis_link": "",
+                    "thesis_publish": sim_paper_name_dan["degree"],
+                    "thesis_repeat_word": dan_sentence_word_nums,
+                    "thesis_teacher": "",
+                    "paper_len_word": sim_paper_name_dan["paper_len_word"]
+                }
+            else:
+                chongfuwendang[thesis_info]["thesis_repeat_word"] += dan_sentence_word_nums
+                chongfuwendang[thesis_info]["thesis_repeat_rate"] = (chongfuwendang[thesis_info]["thesis_repeat_word"]/chongfuwendang[thesis_info]["paper_len_word"]) * 100
+
+    chongfuwendang = sorted(chongfuwendang.items(),
+                            key=lambda x: x[1]["thesis_repeat_rate"], reverse=False)
+
+    for i in range(len(chongfuwendang)):
+        repeat_paper_one_info_dict = chongfuwendang[i][1]
+        repeat_paper_one_info_dict.pop("paper_len_word")
+        repeat_paper_one_info_dict["thesis_repeat_rate"] = str(round(repeat_paper_one_info_dict["thesis_repeat_rate"], 1)) + "%"
+        repeat_quote_info.append(repeat_paper_one_info_dict)
+
     original_text = "。".join(original_text)
     repetition_rate = sentence_word_nums/len(text_paper)
-    repetition_rate = round(repetition_rate, 3) *100
+    repetition_rate = round(repetition_rate, 3) * 100
+
+    fmt = '%Y-%m-%d %H:%M:%S'
+    value = time.localtime(int(time.time()))
+    dt = time.strftime(fmt, value)

     return {
-        "author": "",
-        "check_time": "",
+        "author": author,
+        "check_time": dt,
+        "title": title,
+        "time_range": "1900-01-01至2023-08-08",
         "section_data": [
             {
                 "oneself_repeat_words": sentence_word_nums,
@@ -240,11 +293,10 @@ def accurate_check_rouge(text_paper, recall_data_list):
                 "words": "",
                 "original_text": original_text,
                 "original_text_oneself": original_text,
-                "original_text_contrast": original_text_contrast
+                "original_text_contrast": original_text_contrast,
+                "repeat_quote_info": repeat_quote_info
             }
         ],
-        "time_range": "1900-01-01至2023-08-08",
-        "title": "3",
         "total_data": {
             "back_repeat_words": "",
             "exclude_personal_rate": "{}%".format(repetition_rate),
@@ -329,7 +381,7 @@ def dialog_line_parse(url, text):
                "server. Status Code: {}. Response: {}"
                "".format(url, response.status_code, response.text))
         print(text)
-        return []
+        return {}


 def is_english_char(char):
@@ -492,9 +544,11 @@ def processing_one_text(paper_id):
     result = run_query(conn, sql, params)
     conn.close()

-    print(result)
+    print(result[0]['title'], result[0]['author'])
     title = result[0]['title']
     author = result[0]['author']
+    degree = result[0]['degree']
+    year = result[0]['content'].split("/")[5]
     content_path = result[0]['content']

     try:
@@ -504,7 +558,14 @@ def processing_one_text(paper_id):
         with open(content_path, encoding="gbk") as f:
             text = f.read()

-    data = ulit_text(title, text)
+    paper_info = {
+        "title": title,
+        "author": author,
+        "degree": degree,
+        "year": year,
+        "paper_len_word": len(text)
+    }
+    data = ulit_text(paper_info, text)
     return data
@@ -535,7 +596,7 @@ def ulit_recall_paper(recall_data_list_dict):
     return data


-def recall_10(title, abst_zh, content) -> list:
+def recall_10(title, abst_zh, content) -> dict:
     '''
     Yupeng's recall API
     :param paper_name:
@@ -606,8 +667,6 @@ def uilt_content(content):
             result_biaoti_list = re.findall(pantten_zhaiyao, content)
             zhaiyao_text = result_biaoti_list[0]

-
-
     return zhaiyao_text


@@ -630,42 +689,179 @@ def ulit_request_file(file):



-@app.route("/", methods=["POST"])
-def handle_query():
-    print(request.remote_addr)
+# @app.route("/", methods=["POST"])
+# def handle_query():
+#     print(request.remote_addr)
+#
+#     # request.form.get('prompt')
+#     dataBases = request.form.get("dataBases")
+#     minSimilarity = request.form.get("minSimilarity")  # txt
+#     minWords = request.form.get("minWords")
+#     title = request.form.get("title")
+#     author = request.form.get("author")  # txt
+#     file = request.files.get('file')
+#     token = request.form.get("token")
+#     account = request.form.get("account")
+#     goodsId = request.form.get("goodsId")
+#     callbackUrl = request.form.get("callbackUrl")
+#
+#
+#     t0 = time.time()
+#     abst_zh, content = ulit_request_file(file)
+#
+#     # call Yupeng's recall service for the most similar papers
+#     # recall_data_list_dict = recall_10(title, abst_zh, content)
+#
+#     t1 = time.time()
+#     print("查找相似的50篇完成")
+#     with open("data/rell_json.txt") as f:
+#         recall_data_list_dict = eval(f.read())
+#
+#     # read the recalled papers and convert them into the expected format
+#     recall_data_list = ulit_recall_paper(recall_data_list_dict)
+#     print("文章格式转化完成")
+#
+#     # recall_data_list = pd.read_csv("data/10235513_大型商业建筑人员疏散设计研究_沈福禹/查重.csv", encoding="utf-8").values.tolist()
+#
+#     # enter the accurate duplicate-checking stage
+#     print("进入精确查重系统")
+#     return_list = accurate_check_rouge(title, author, content, recall_data_list)
+#
+#     print("召回50篇", t1 - t0)
+#
+#     return_text = {"resilt": return_list, "probabilities": None, "status_code": 200}
+#     return jsonify(return_text)  # return the result


+def classify():  # invoke the model; set the maximum batch_size
+    while True:
+        if redis_.llen(db_key_query) == 0:  # if the queue is empty, keep polling
+            time.sleep(3)
+            continue
+        query = redis_.lpop(db_key_query).decode('UTF-8')  # pop one queued query
+        data_dict_path = json.loads(query)
+        path = data_dict_path['path']
+        # text_type = data_dict["text_type"]
+
+        with open(path, encoding='utf8') as f1:
+            # load the request object saved by handle_query
+            data_dict = json.load(f1)
+
+        query_id = data_dict['id']
+        print(query_id)
+        dataBases = data_dict['dataBases']
+        minSimilarity = data_dict['minSimilarity']
+        minWords = data_dict['minWords']
+        title = data_dict['title']
+        author = data_dict['author']
+        abst_zh = data_dict['abst_zh']
+        content = data_dict['content']
+        token = data_dict['token']
+        account = data_dict['account']
+        goodsId = data_dict['goodsId']
+        callbackUrl = data_dict['callbackUrl']
+
-    # request.form.get('prompt')
request.form.get("dataBases") - minSimilarity = request.form.get("minSimilarity") # txt - minWords = request.form.get("minWords") - title = request.form.get("title") - author = request.form.get("author") # txt - file = request.files.get('file') - token = request.form.get("token") - account = request.form.get("account") - goodsId = request.form.get("goodsId") - callbackUrl = request.form.get("callbackUrl") + # 调用宇鹏查询相似十篇 + # recall_data_list_dict = recall_10(title, abst_zh, content) + t1 = time.time() + print("查找相似的50篇完成") + with open("data/rell_json.txt") as f: + recall_data_list_dict = eval(f.read()) - abst_zh, content = ulit_request_file(file) - # 调用宇鹏查询相似十篇 - recall_data_list_dict = recall_10(title, abst_zh, content) - # with open("data/rell_json.txt") as f: - # recall_data_list_dict = eval(f.read()) + # 读取文章转化成格式数据 + recall_data_list = ulit_recall_paper(recall_data_list_dict) + print("文章格式转化完成") + # recall_data_list = pd.read_csv("data/10235513_大型商业建筑人员疏散设计研究_沈福禹/查重.csv", encoding="utf-8").values.tolist() - # 读取文章转化成格式数据 - recall_data_list = ulit_recall_paper(recall_data_list_dict) + # 进入精确查重系统 + print("进入精确查重系统") + return_list = accurate_check_rouge(title, author, content, recall_data_list) + return_text = {"resilt": return_list, "probabilities": None, "status_code": 200} - # recall_data_list = pd.read_csv("data/10235513_大型商业建筑人员疏散设计研究_沈福禹/查重.csv", encoding="utf-8").values.tolist() + load_result_path = "./new_data_logs/{}.json".format(query_id) - # 进入精确查重系统 - return_list = accurate_check_rouge(content, recall_data_list) + print("query_id: ", query_id) + print("load_result_path: ", load_result_path) - return_text = {"resilt": return_list, "probabilities": None, "status_code": 200} + with open(load_result_path, 'w', encoding='utf8') as f2: + # ensure_ascii=False才能输入中文,否则是Unicode字符 + # indent=2 JSON数据的缩进,美观 + json.dump(return_text, f2, ensure_ascii=False, indent=4) + + print(query_id) + print(load_result_path) + redis_.set(query_id, load_result_path, 86400) + redis_.srem(db_key_querying, query_id) + + +@app.route("/", methods=["POST"]) +def handle_query(): + try: + print(request.remote_addr) + + # request.form.get('prompt') + dataBases = request.form.get("dataBases") + minSimilarity = request.form.get("minSimilarity") # txt + minWords = request.form.get("minWords") + title = request.form.get("title") + author = request.form.get("author") # txt + file = request.files.get('file') + token = request.form.get("token") + account = request.form.get("account") + goodsId = request.form.get("goodsId") + callbackUrl = request.form.get("callbackUrl") + + abst_zh, content = ulit_request_file(file) + + id_ = str(uuid.uuid1()) # 为query生成唯一标识 + print("uuid: ", uuid) + print(id_) + d = { + 'id': id_, + 'dataBases': dataBases, + 'minSimilarity': minSimilarity, + 'minWords': minWords, + 'title': title, + 'author': author, + 'abst_zh': abst_zh, + 'content': content, + 'token': token, + 'account': account, + 'goodsId': goodsId, + 'callbackUrl': callbackUrl + } + + # 绑定文本和query id + print(d) + load_request_path = './request_data_logs/{}.json'.format(id_) + with open(load_request_path, 'w', encoding='utf8') as f2: + # ensure_ascii=False才能输入中文,否则是Unicode字符 + # indent=2 JSON数据的缩进,美观 + json.dump(d, f2, ensure_ascii=False, indent=4) + redis_.rpush(db_key_query, json.dumps({"id": id_, "path": load_request_path})) # 加入redis + redis_.sadd(db_key_querying, id_) + redis_.sadd(db_key_queryset, id_) + return_text = { + 'code': 0, + 'msg': "请求成功", + 'data': { + 'balances': "", + 'orderId': id_, + 'consumeNum': "" + } + } 
+
+        print("ok")
+    except Exception:
+        return_text = {'code': 1}
     return jsonify(return_text)  # return the result

+t = Thread(target=classify)
+t.start()

 if __name__ == "__main__":
     app.run(host="0.0.0.0", port=16001, threaded=True, debug=True)
\ No newline at end of file
diff --git a/redis_search_uuid.py b/redis_search_uuid.py
new file mode 100644
index 0000000..d042ab1
--- /dev/null
+++ b/redis_search_uuid.py
@@ -0,0 +1,116 @@
+# -*- coding: utf-8 -*-
+
+"""
+@Time    : 2023/3/2 19:31
+@Author  :
+@FileName:
+@Software:
+@Describe:
+"""
+#
+# import redis
+#
+# redis_pool = redis.ConnectionPool(host='127.0.0.1', port=6379, password='', db=0)
+# redis_conn = redis.Redis(connection_pool=redis_pool)
+#
+#
+# name_dict = {
+#     'name_4' : 'Zarten_4',
+#     'name_5' : 'Zarten_5'
+# }
+# redis_conn.mset(name_dict)
+
+import flask
+import redis
+import uuid
+import json
+from threading import Thread
+import time
+
+app = flask.Flask(__name__)
+pool = redis.ConnectionPool(host='localhost', port=63179, max_connections=100, db=7, password="zhicheng123*")
+redis_ = redis.Redis(connection_pool=pool, decode_responses=True)
+
+db_key_query = 'query'
+db_key_querying = 'querying'
+
+
+@app.route("/search", methods=["POST"])
+def handle_query():
+    try:
+        id_ = flask.request.json['id']  # the query id issued at submission time
+        result = redis_.get(id_)  # path of the stored model result, if the worker has finished
+        if result is not None:
+            # redis_.delete(id_)
+            result_path = result.decode('UTF-8')
+            with open(result_path, encoding='utf8') as f1:
+                # load the result object written by the worker
+                result_dict = json.load(f1)
+            resilt = result_dict["resilt"]
+            result_text = {'status': 9,
+                           'resilt': resilt,
+                           'reportId': "",
+                           'downloadurl': "",
+                           'similarity': ""
+                           }
+        else:
+            querying_list = list(redis_.smembers(db_key_querying))
+            querying_set = set()
+            for i in querying_list:
+                querying_set.add(i.decode())
+
+            querying_bool = id_ in querying_set
+
+            query_list_json = redis_.lrange(db_key_query, 0, -1)
+            query_set_ids = set()
+            for i in query_list_json:
+                data_dict = json.loads(i)
+                query_id = data_dict['id']
+                query_set_ids.add(query_id)
+
+            query_bool = id_ in query_set_ids
+
+            # the three pending states are kept separate for future use;
+            # all of them currently report status 1
+            if querying_bool and query_bool:
+                result_text = {'status': 1,
+                               'resilt': "",
+                               'reportId': "",
+                               'downloadurl': "",
+                               'similarity': ""
+                               }
+            elif querying_bool and not query_bool:
+                result_text = {'status': 1,
+                               'resilt': "",
+                               'reportId': "",
+                               'downloadurl': "",
+                               'similarity': ""
+                               }
+            else:
+                result_text = {'status': 1,
+                               'resilt': "",
+                               'reportId': "",
+                               'downloadurl': "",
+                               'similarity': ""
+                               }
+        load_request_path = './request_data_logs_203/{}.json'.format(id_)
+        with open(load_request_path, 'w', encoding='utf8') as f2:
+            # ensure_ascii=False is required to write Chinese characters rather than Unicode escapes
+            # indent: pretty-print the JSON for readability
+            json.dump(result_text, f2, ensure_ascii=False, indent=4)
+
+        result = {'code': 0,
+                  "msg": "请求成功",
+                  "data": result_text}
+    except Exception:
+        result = {'code': 1,
+                  "msg": "请求失败"
+                  }
+
+    return flask.jsonify(result)  # return the result
+
+
+if __name__ == "__main__":
+    app.run(debug=False, host='0.0.0.0', port=16002)
\ No newline at end of file
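Usage note (not part of the patch): this diff turns the synchronous / endpoint into an asynchronous queue. handle_query (port 16001) persists the request, pushes its id onto the 'query' list and immediately returns an orderId; the classify worker thread pops the queue, runs accurate_check_rouge and stores the result path in Redis under that id; redis_search_uuid.py (port 16002) serves the polling endpoint. Below is a minimal client sketch of that submit-then-poll flow, assuming both services run on localhost as configured above; the file name, placeholder title/author, and polling interval are illustrative, while the field names (title, author, file, id, orderId, status, resilt) mirror the handlers in this diff.

import time

import requests

SUBMIT_URL = "http://localhost:16001/"        # flask_check_bert.py service
SEARCH_URL = "http://localhost:16002/search"  # redis_search_uuid.py service


def submit(path, title, author):
    """POST a document; returns the queue id (orderId) issued by handle_query."""
    with open(path, "rb") as f:
        resp = requests.post(SUBMIT_URL,
                             data={"title": title, "author": author},
                             files={"file": f})
    body = resp.json()
    if body.get("code") != 0:
        raise RuntimeError("submit failed: {}".format(body))
    return body["data"]["orderId"]


def poll(order_id, interval=5, timeout=600):
    """Poll /search until the classify worker has written a result (status 9)."""
    deadline = time.time() + timeout
    while time.time() < deadline:
        body = requests.post(SEARCH_URL, json={"id": order_id}).json()
        if body.get("code") != 0:
            raise RuntimeError("search failed: {}".format(body))
        data = body["data"]
        if data["status"] == 9:
            return data["resilt"]   # "resilt" is the key actually used by the service
        time.sleep(interval)        # status 1: still queued or being processed
    raise TimeoutError("no result for {}".format(order_id))


if __name__ == "__main__":
    oid = submit("paper.docx", "Example Title", "Example Author")  # hypothetical inputs
    print(poll(oid))

Status 9 marks a finished report (the resilt field carries the report produced by accurate_check_rouge); status 1 means the id is still queued or in progress, so the client keeps polling.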