修改bug，接口和飞度的保持一致

2 years ago · 1bc6659ecb
2 changed files with 377 additions and 65 deletions
--- a/flask_check_bert.py
+++ b/flask_check_bert.py
@ -12,9 +12,18 @@ import requests
 from flask import Flask, jsonify
 from flask import request
 import uuid
+import time
+import redis
+from threading import Thread
 app = Flask(__name__)
 app.config["JSON_AS_ASCII"] = False

+pool = redis.ConnectionPool(host='localhost', port=63179, max_connections=100, db=7, password="zhicheng123*")
+redis_ = redis.Redis(connection_pool=pool, decode_responses=True)
+
+db_key_query = 'query'
+db_key_querying = 'querying'
+db_key_queryset = 'queryset'

 nums_cpus = 16
 rouge = Rouge()
@ -91,7 +100,12 @@ def rouge_pre(text, df_train_nuoche):
    return return_list


-def accurate_check_rouge(text_paper, recall_data_list):
+def accurate_check_rouge(
+        title,
+        author,
+        text_paper,
+        recall_data_list
+    ):
    '''
    精确查重出相似句子
    :param text:
@ -99,8 +113,6 @@ def accurate_check_rouge(text_paper, recall_data_list):
    :return:
    '''
    # 文本处理
-    # with open(text_paper_path, encoding="gbk") as f:
-    #     text_paper = f.read()
    centent_list = []
    text_paper = str(text_paper).replace("。\n", "。")
    centent_list.extend(text_paper.split("。"))
@ -108,25 +120,34 @@ def accurate_check_rouge(text_paper, recall_data_list):
    sentence_word_nums = 0

    # rouge算法查重
-    # for text in centent_list:
-    #     rouge_pre_list = rouge_pre(text, recall_data_list)
-    #     data_zong.append(rouge_pre_list)
-
-    # bert算法查重
    for text in centent_list:
-        bert_pre_list = bert_check(text, recall_data_list)
-        data_zong.append(bert_pre_list)
+        rouge_pre_list = rouge_pre(text, recall_data_list)
+        data_zong.append(rouge_pre_list)

+    t0 = time.time()
+    # bert算法查重
+    # for text in centent_list:
+    #     bert_pre_list = bert_check(text, recall_data_list)
+    #     data_zong.append(bert_pre_list)
+    t1 = time.time()
    original_dict = []


    # 找出相似的句子序号
    bool_check_sentense = []
+    # bert算法
+    # for i in range(len(data_zong)):
+    #     if data_zong[i][0] == 1:
+    #         bool_check_sentense.append([i,data_zong[i][1]])
+
+    # rouge算法
    for i in range(len(data_zong)):
-        if data_zong[i][0] == 1:
+        if data_zong[i][0] > 0.47:
            bool_check_sentense.append([i,data_zong[i][1]])
    biao_red = biaohong(bool_check_sentense, data_zong, recall_data_list) # [[[0, 1, 2], [479, 480, 481]], [[3, 4, 5], [481, 482, 483]], [[6, 7, 8], [484, 485, 486]]]

+    print("bert精确查重时间", t1-t0)
+

    sentence_0_list = []
    sentence_1_list = []
@ -151,22 +172,16 @@ def accurate_check_rouge(text_paper, recall_data_list):
        else:
            print(len(i[0]) + len(i[1]))
            continue
-    for i in zip(sentence_0_list_new, sentence_1_list_new):
-        print("超过字数", len(i[0]))
-        print("超过字数", len(i[1]))
-
+    t2 = time.time()
    paper_dict = biaohong_bert_predict(sentence_0_list_new, sentence_1_list_new)

-    # paper_dict
-    # print("原文：".format(i), paper_dict[i][0])
-    # print("原文标红：".format(i), paper_dict[i][1])
-    # print("相似：".format(i), paper_dict[i][2])
-    # print("相似标红：".format(i), paper_dict[i][3])
-
-    # original_text
+    t3 = time.time()
+    print("标红时间", t3 - t2)
    original_text = []
    original_text_contrast = []
+    repeat_quote_info = []

+    chongfuwendang = {}

    for paper_dict_dan_id, sentence_0_dan, sentence_1_dan, sim_paper_name_dan in zip(range(len(paper_dict)), sentence_0_list_new, sentence_1_list_new, sim_paper_name):

@ -184,7 +199,6 @@ def accurate_check_rouge(text_paper, recall_data_list):
                }
            ]
        }
-        similar_content = {"author": ""}
        try:
            sentence_0_bool, sentence_0_dan_red = original_text_marked_red(sentence_0_dan, paper_dict[paper_dict_dan_id][0], paper_dict[paper_dict_dan_id][1]) # text_original, bert_text, bert_text_pre
        except:
@ -203,23 +217,62 @@ def accurate_check_rouge(text_paper, recall_data_list):
        original_text_contrast_dict["original_text"] = "此处有 {} 字相似\n".format(
            dan_sentence_word_nums) + sentence_0_dan_red

-        # similar_content["content"] = sentence_1_dan_red
-        # similar_content["title"] = sim_paper_name_dan
-        # original_text_contrast_dict["similar_content"][0] = similar_content
-
+        thesis_info = " ".join([sim_paper_name_dan["title"], sim_paper_name_dan["author"], sim_paper_name_dan["degree"], sim_paper_name_dan["year"]])
        original_text_contrast_dict["similar_content"][0]["content"] = sentence_1_dan_red
-        original_text_contrast_dict["similar_content"][0]["title"] = sim_paper_name_dan
+        original_text_contrast_dict["similar_content"][0]["title"] = sim_paper_name_dan["title"]
+        original_text_contrast_dict["similar_content"][0]["author"] = sim_paper_name_dan["author"]
+        original_text_contrast_dict["similar_content"][0]["degree"] = sim_paper_name_dan["degree"]
+        original_text_contrast_dict["similar_content"][0]["year"] = sim_paper_name_dan["year"]
+        original_text_contrast_dict["similar_content"][0]["thesis_info"] = thesis_info

        original_text_contrast.append(original_text_contrast_dict)

+        # for i in repeat_quote_info:
+        #     if
+
+        if thesis_info not in chongfuwendang:
+            chongfuwendang[thesis_info] = {
+                "quote": False,
+                "thesis_author": sim_paper_name_dan["author"],
+                "thesis_date" : sim_paper_name_dan["year"],
+                "thesis_info" : thesis_info,
+                "thesis_repeat_rate": (dan_sentence_word_nums/sim_paper_name_dan["paper_len_word"]) * 100, #round(repetition_rate, 3) * 100
+                "thesis_title": sim_paper_name_dan["title"],
+                "thesis_link": "",
+                "thesis_publish": sim_paper_name_dan["degree"],
+                "thesis_repeat_word": dan_sentence_word_nums,
+                "thesis_teacher": "",
+                "paper_len_word": sim_paper_name_dan["paper_len_word"]
+            }
+        else:
+            chongfuwendang[thesis_info]["thesis_repeat_word"] += dan_sentence_word_nums
+            chongfuwendang[thesis_info]["thesis_repeat_rate"] = (chongfuwendang[thesis_info]["thesis_repeat_word"]/chongfuwendang[thesis_info]["paper_len_word"]) * 100
+
+
+    chongfuwendang = sorted(chongfuwendang.items(),
+                            key=lambda x: x[1]["thesis_repeat_rate"], reverse=False)
+
+
+    for i in range(len(chongfuwendang)):
+        repeat_paper_one_info_dict = chongfuwendang[i][1]
+        repeat_paper_one_info_dict.pop("paper_len_word")
+        repeat_paper_one_info_dict["thesis_repeat_rate"] = str(round(repeat_paper_one_info_dict["thesis_repeat_rate"], 1)) + "%"
+        repeat_quote_info.append(repeat_paper_one_info_dict)
+
    original_text = "。".join(original_text)

    repetition_rate = sentence_word_nums/len(text_paper)
-    repetition_rate = round(repetition_rate, 3) *100
+    repetition_rate = round(repetition_rate, 3) * 100
+
+    format = '%Y-%m-%d %H:%M:%S'
+    value = time.localtime(int(time.time()))
+    dt = time.strftime(format, value)

    return {
-        "author": "",
-        "check_time": "",
+        "author": author,
+        "check_time": dt,
+        "title": title,
+        "time_range": "1900-01-01至2023-08-08",
        "section_data": [
            {
                "oneself_repeat_words": sentence_word_nums,
@ -240,11 +293,10 @@ def accurate_check_rouge(text_paper, recall_data_list):
                "words": "",
                "original_text": original_text,
                "original_text_oneself": original_text,
-                "original_text_contrast": original_text_contrast
+                "original_text_contrast": original_text_contrast,
+                "repeat_quote_info": repeat_quote_info
            }
        ],
-        "time_range": "1900-01-01至2023-08-08",
-        "title": "3",
        "total_data": {
            "back_repeat_words": "",
            "exclude_personal_rate": "{}%".format(repetition_rate),
@ -329,7 +381,7 @@ def dialog_line_parse(url, text):
            "server. Status Code: {}. Response: {}"
            "".format(url, response.status_code, response.text))
        print(text)
-        return []
+        return {}


 def is_english_char(char):
@ -492,9 +544,11 @@ def processing_one_text(paper_id):
    result = run_query(conn, sql, params)

    conn.close()
-    print(result)
+    print(result[0]['title'], result[0]['author'])
    title = result[0]['title']
    author = result[0]['author']
+    degree = result[0]['degree']
+    year = result[0]['content'].split("/")[5]
    content_path = result[0]['content']

    try:
@ -504,7 +558,14 @@ def processing_one_text(paper_id):
        with open(content_path, encoding="gbk") as f:
            text = f.read()

-    data = ulit_text(title, text)
+    paper_info = {
+        "title": title,
+        "author": author,
+        "degree": degree,
+        "year": year,
+        "paper_len_word": len(text)
+    }
+    data = ulit_text(paper_info, text)
    return data


@ -535,7 +596,7 @@ def ulit_recall_paper(recall_data_list_dict):
    return data


-def recall_10(title, abst_zh, content) -> list:
+def recall_10(title, abst_zh, content) -> dict:
    '''
    宇鹏召回接口
    :param paper_name:
@ -606,8 +667,6 @@ def uilt_content(content):
        result_biaoti_list = re.findall(pantten_zhaiyao, content)
        zhaiyao_text = result_biaoti_list[0]

-
-
    return zhaiyao_text


@ -630,8 +689,118 @@ def ulit_request_file(file):



+# @app.route("/", methods=["POST"])
+# def handle_query():
+#     print(request.remote_addr)
+#
+#     # request.form.get('prompt')
+#     dataBases = request.form.get("dataBases")
+#     minSimilarity = request.form.get("minSimilarity") # txt
+#     minWords = request.form.get("minWords")
+#     title = request.form.get("title")
+#     author = request.form.get("author") # txt
+#     file = request.files.get('file')
+#     token = request.form.get("token")
+#     account = request.form.get("account")
+#     goodsId = request.form.get("goodsId")
+#     callbackUrl = request.form.get("callbackUrl")
+#
+#
+#     t0 = time.time()
+#     abst_zh, content = ulit_request_file(file)
+#
+#     # 调用宇鹏查询相似十篇
+#     # recall_data_list_dict = recall_10(title, abst_zh, content)
+#
+#     t1 = time.time()
+#     print("查找相似的50篇完成")
+#     with open("data/rell_json.txt") as f:
+#         recall_data_list_dict = eval(f.read())
+#
+#     # 读取文章转化成格式数据
+#     recall_data_list = ulit_recall_paper(recall_data_list_dict)
+#     print("文章格式转化完成")
+#
+#     # recall_data_list = pd.read_csv("data/10235513_大型商业建筑人员疏散设计研究_沈福禹/查重.csv", encoding="utf-8").values.tolist()
+#
+#     # 进入精确查重系统
+#     print("进入精确查重系统")
+#     return_list = accurate_check_rouge(title, author, content, recall_data_list)
+#
+#     print("召回50篇", t1 - t0)
+#
+#     return_text = {"resilt": return_list, "probabilities": None, "status_code": 200}
+#     return jsonify(return_text)  # 返回结果
+
+
+def classify():  # 调用模型，设置最大batch_size
+    while True:
+        if redis_.llen(db_key_query) == 0:  # 若队列中没有元素就继续获取
+            time.sleep(3)
+            continue
+        query = redis_.lpop(db_key_query).decode('UTF-8')  # 获取query的text
+        data_dict_path = json.loads(query)
+        path = data_dict_path['path']
+        # text_type = data_dict["text_type"]
+
+        with open(path, encoding='utf8') as f1:
+            # 加载文件的对象
+            data_dict = json.load(f1)
+
+        query_id = data_dict['id']
+        print(query_id)
+        dataBases = data_dict['dataBases']
+        minSimilarity = data_dict['minSimilarity']
+        minWords = data_dict['minWords']
+        title = data_dict['title']
+        author = data_dict['author']
+        abst_zh = data_dict['abst_zh']
+        content = data_dict['content']
+        token = data_dict['token']
+        account = data_dict['account']
+        goodsId = data_dict['goodsId']
+        callbackUrl = data_dict['callbackUrl']
+
+
+        # 调用宇鹏查询相似十篇
+        # recall_data_list_dict = recall_10(title, abst_zh, content)
+
+        t1 = time.time()
+        print("查找相似的50篇完成")
+        with open("data/rell_json.txt") as f:
+            recall_data_list_dict = eval(f.read())
+
+        # 读取文章转化成格式数据
+        recall_data_list = ulit_recall_paper(recall_data_list_dict)
+        print("文章格式转化完成")
+
+        # recall_data_list = pd.read_csv("data/10235513_大型商业建筑人员疏散设计研究_沈福禹/查重.csv", encoding="utf-8").values.tolist()
+
+        # 进入精确查重系统
+        print("进入精确查重系统")
+        return_list = accurate_check_rouge(title, author, content, recall_data_list)
+
+        return_text = {"resilt": return_list, "probabilities": None, "status_code": 200}
+
+        load_result_path = "./new_data_logs/{}.json".format(query_id)
+
+        print("query_id: ", query_id)
+        print("load_result_path: ", load_result_path)
+
+        with open(load_result_path, 'w', encoding='utf8') as f2:
+            # ensure_ascii=False才能输入中文，否则是Unicode字符
+            # indent=2 JSON数据的缩进，美观
+            json.dump(return_text, f2, ensure_ascii=False, indent=4)
+
+        print(query_id)
+        print(load_result_path)
+        redis_.set(query_id, load_result_path, 86400)
+        redis_.srem(db_key_querying, query_id)
+
+
@app.route("/", methods=["POST"])
 def handle_query():
+    try:
        print(request.remote_addr)

        # request.form.get('prompt')
@ -646,26 +815,53 @@ def handle_query():
        goodsId = request.form.get("goodsId")
        callbackUrl = request.form.get("callbackUrl")

-
        abst_zh, content = ulit_request_file(file)
-    # 调用宇鹏查询相似十篇
-    recall_data_list_dict = recall_10(title, abst_zh, content)
-    # with open("data/rell_json.txt") as f:
-    #     recall_data_list_dict = eval(f.read())
-
-
-    # 读取文章转化成格式数据
-    recall_data_list = ulit_recall_paper(recall_data_list_dict)

+        id_ = str(uuid.uuid1())  # 为query生成唯一标识
+        print("uuid: ", uuid)
+        print(id_)
+        d = {
+            'id': id_,
+            'dataBases': dataBases,
+            'minSimilarity': minSimilarity,
+            'minWords': minWords,
+            'title': title,
+            'author': author,
+            'abst_zh': abst_zh,
+            'content': content,
+            'token': token,
+            'account': account,
+            'goodsId': goodsId,
+            'callbackUrl': callbackUrl
+            }

-    # recall_data_list = pd.read_csv("data/10235513_大型商业建筑人员疏散设计研究_沈福禹/查重.csv", encoding="utf-8").values.tolist()
-
-    # 进入精确查重系统
-    return_list = accurate_check_rouge(content, recall_data_list)
+        # 绑定文本和query id
+        print(d)
+        load_request_path = './request_data_logs/{}.json'.format(id_)
+        with open(load_request_path, 'w', encoding='utf8') as f2:
+            # ensure_ascii=False才能输入中文，否则是Unicode字符
+            # indent=2 JSON数据的缩进，美观
+            json.dump(d, f2, ensure_ascii=False, indent=4)
+        redis_.rpush(db_key_query, json.dumps({"id": id_, "path": load_request_path}))  # 加入redis
+        redis_.sadd(db_key_querying, id_)
+        redis_.sadd(db_key_queryset, id_)
+        return_text = {
+            'code': 0,
+            'msg': "请求成功",
+            'data': {
+                'balances': "",
+                'orderId': id_,
+                'consumeNum': ""
+            }
+        }

-    return_text = {"resilt": return_list, "probabilities": None, "status_code": 200}
+        print("ok")
+    except:
+        return_text = {'code': 1}
    return jsonify(return_text)  # 返回结果

+t = Thread(target=classify)
+t.start()

 if __name__ == "__main__":
    app.run(host="0.0.0.0", port=16001, threaded=True, debug=True)
--- a/redis_search_uuid.py
+++ b/redis_search_uuid.py
@ -0,0 +1,116 @@
+# -*- coding: utf-8 -*-
+
+"""
+@Time    :  2023/3/2 19:31
+@Author  :
+@FileName:
+@Software:
+@Describe:
+"""
+#
+# import redis
+#
+# redis_pool = redis.ConnectionPool(host='127.0.0.1', port=6379, password='', db=0)
+# redis_conn = redis.Redis(connection_pool=redis_pool)
+#
+#
+# name_dict = {
+#     'name_4' : 'Zarten_4',
+#     'name_5' : 'Zarten_5'
+# }
+# redis_conn.mset(name_dict)
+
+import flask
+import redis
+import uuid
+import json
+from threading import Thread
+import time
+
+app = flask.Flask(__name__)
+pool = redis.ConnectionPool(host='localhost', port=63179, max_connections=100, db=7, password="zhicheng123*")
+redis_ = redis.Redis(connection_pool=pool, decode_responses=True)
+
+db_key_query = 'query'
+db_key_querying = 'querying'
+
+@app.route("/search", methods=["POST"])
+def handle_query():
+    try:
+        id_ = flask.request.json['id']  # 获取用户query中的文本 例如"I love you"
+        result = redis_.get(id_)  # 获取该query的模型结果
+        if result is not None:
+            # redis_.delete(id_)
+            result_path = result.decode('UTF-8')
+            with open(result_path, encoding='utf8') as f1:
+                # 加载文件的对象
+                result_dict = json.load(f1)
+            resilt = result_dict["resilt"]
+            result_text = {'status': 9,
+                           'resilt': resilt,
+                           'reportId': "",
+                           'downloadurl': "",
+                           'similarity': ""
+                           }
+        else:
+            querying_list = list(redis_.smembers("querying"))
+            querying_set = set()
+            for i in querying_list:
+                querying_set.add(i.decode())
+
+            querying_bool = False
+            if id_ in querying_set:
+                querying_bool = True
+
+            query_list_json = redis_.lrange(db_key_query, 0, -1)
+            query_set_ids = set()
+            for i in query_list_json:
+                data_dict = json.loads(i)
+                query_id = data_dict['id']
+                query_set_ids.add(query_id)
+
+            query_bool = False
+            if id_ in query_set_ids:
+                query_bool = True
+
+            if querying_bool == True and query_bool == True:
+                result_text = {'status': 1,
+                           'resilt': "",
+                           'reportId': "",
+                           'downloadurl': "",
+                           'similarity': ""
+                           }
+            elif querying_bool == True and query_bool == False:
+                result_text = {'status': 1,
+                               'resilt': "",
+                               'reportId': "",
+                               'downloadurl': "",
+                               'similarity': ""
+                               }
+            else:
+                result_text = {'status': 1,
+                               'resilt': "",
+                               'reportId': "",
+                               'downloadurl': "",
+                               'similarity': ""
+                               }
+                load_request_path = './request_data_logs_203/{}.json'.format(id_)
+                with open(load_request_path, 'w', encoding='utf8') as f2:
+                    # ensure_ascii=False才能输入中文，否则是Unicode字符
+                    # indent=2 JSON数据的缩进，美观
+                    json.dump(result_text, f2, ensure_ascii=False, indent=4)
+
+        result = {'code':0,
+                  "msg": "请求成功",
+                  "data": result_text}
+    except:
+        result = {'code':1,
+                  "msg": "请求失败"
+                  }
+
+
+    return flask.jsonify(result)  # 返回结果
+
+
+if __name__ == "__main__":
+    app.run(debug=False, host='0.0.0.0', port=16002)