Add multi-passage detection for single sentences

master
majiahui@haimaqingfan.com 2 years ago
parent commit 37e4f2e0d2

flask_check_bert_test.py (345 changed lines)

@@ -20,12 +20,14 @@ from multiprocessing import Pool
app = Flask(__name__)
app.config["JSON_AS_ASCII"] = False
pool = redis.ConnectionPool(host='localhost', port=63179, max_connections=100, db=7, password="zhicheng123*")
pool = redis.ConnectionPool(host='localhost', port=63179, max_connections=100, db=8, password="zhicheng123*")
redis_ = redis.Redis(connection_pool=pool, decode_responses=True)
db_key_query = 'query'
db_key_querying = 'querying'
db_key_queryset = 'queryset'
db_key_query = 'query_check_task'
db_key_querying = 'querying_check_task'
db_key_queryset = 'queryset_check_task'
db_key_query_recall = 'query_recall'
nums_cpus = 24
rouge = Rouge()
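The pool moves to db 8 and the task keys gain a _check_task suffix; query_recall is new and carries the recall service's results back in. A minimal sketch of what a producer on that list might look like, assuming the payload shape that classify_accurate_check() reads further down:

import json
import redis

r = redis.Redis(host="localhost", port=63179, db=8, password="zhicheng123*")

def publish_recall_result(uuid_, recall_data_list_dict):
    # the worker pops this from db_key_query_recall and joins it with the
    # request stored under "<uuid>_request_check"
    r.rpush("query_recall", json.dumps({
        "uuid": uuid_,
        "data": json.dumps(recall_data_list_dict),
    }))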
@@ -101,6 +103,9 @@ def rouge_pre(text, df_train_nuoche):
def rouge_pre_m(text, df_train_nuoche):
return_list = []
index_rouge_list = []
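rouge_pre_m is the multi-match variant behind the commit title: rather than keeping only the best-scoring recalled sentence, it collects a score for every candidate per sentence (the three added lines are elided in this view). A hypothetical reconstruction of that pattern, assuming the rouge package and space-tokenized Chinese text:

from rouge import Rouge

rouge = Rouge()

def rouge_pre_m_sketch(text, recalled_sentences):
    # score one local sentence against every recalled sentence and keep
    # all (index, score) pairs instead of a single argmax
    hyp = " ".join(text)  # rouge expects whitespace-separated tokens
    return [
        (idx, rouge.get_scores(hyp, " ".join(ref))[0]["rouge-l"]["f"])
        for idx, ref in enumerate(recalled_sentences)
    ]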
@@ -266,14 +271,17 @@ def total_data_func(section_data_list):
repeat_words += i["repeat_words"]
words += i["words"]
exclude_personal_rate = str(repeat_words / words * 100) + "%"
exclude_quote_rate = str(repeat_words / words * 100) + "%"
baifenbi = (repeat_words / words) *100
exclude_personal_rate = str(round(baifenbi, 1)) + "%"
exclude_quote_rate = str(round(baifenbi, 1)) + "%"
single_max_rate = section_data_list[0]["repeat_quote_info"][0]["thesis_repeat_rate"]
single_max_repeat_words = section_data_list[0]["repeat_quote_info"][0]["thesis_repeat_word"]
total_repeat_rate = str(repeat_words / words * 100) + "%"
total_repeat_rate = str(round(baifenbi, 1)) + "%"
total_repeat_words = repeat_words
total_words = words
print(exclude_personal_rate)
return {
"back_repeat_words": "",
"exclude_personal_rate": exclude_personal_rate,
@@ -353,7 +361,7 @@ def section_data_func(section_details):
}
def section_details_func(data_section_dan, paper_dict):
def section_details_func(data_section_dan, paper_dict, num_words):
'''
Section detail information
:param original_text_contrast:
@@ -363,21 +371,22 @@ def section_details_func(data_section_dan, paper_dict):
original_text_contrast = []
section_repeat_rate = ""
repeat_words = 0
section_words = 0
section_words = num_words
oneself_repeat_words = ""
reference_repeat_words = ""
section_oneself_rate = ""
original_text_list = []
for sentence_dan in data_section_dan:
print("sentence_dan", sentence_dan)
original_text_contrast_dan = original_text_contrast_func(sentence_dan, paper_dict)
original_text_contrast.append(original_text_contrast_dan)
repeat_words += original_text_contrast_dan["dan_sentence_word_nums"]
original_text_list.append(original_text_contrast_dan["original_text"])
section_words += len(sentence_dan[0][1])
original_text = "".join(original_text_list)
repeat_rate = repeat_words / section_words
repeat_rate = (repeat_words / section_words) * 100
repeat_rate = str(round(repeat_rate, 1)) + "%"
repeat_quote_info = repeat_quote_info_func(original_text_contrast)
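Seeding section_words with num_words (the full character count, passed in by the caller) instead of summing only matched sentences changes what the denominator means. A worked comparison with hypothetical numbers:

# 120 repeated characters, 400 characters of matched sentences,
# 10,000 characters in the whole paper (all numbers hypothetical)
old_rate = 120 / 400 * 100     # 30.0% -- against matched sentences only
new_rate = 120 / 10000 * 100   #  1.2% -- against the whole document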
@@ -395,6 +404,108 @@ def section_details_func(data_section_dan, paper_dict):
}
def check_dict(similar_content_control, paper_dict, num_words, title, author):
'''
Build the response dictionary
:param similar_content_control:
:param paper_dict:
:param num_words:
:param title:
:param author:
:return:
'''
if paper_dict != []:
data = [similar_content_control]
# simulate multiple sections
section_details_list = []
for data_dan in data:
data_section_dan = data_dan
# section details
section_details = section_details_func(data_section_dan, paper_dict, num_words)
section_details_list.append(section_details)
# simulate multiple sections
section_data_list = []
for section_details in section_details_list:
section_data = section_data_func(section_details)
section_data_list.append(section_data)
total_data = total_data_func(section_details_list)
format = '%Y-%m-%d %H:%M:%S'
value = time.localtime(int(time.time()))
dt = time.strftime(format, value)
paper_data = {
"author": author,
"check_time": dt,
"time_range": "1900-01-01至2023-08-08",
"title": title,
"total_data": total_data,
"section_data": section_data_list,
"section_details": section_details_list
}
else:
total_data = {
"back_repeat_words": "",
"exclude_personal_rate": 0,
"exclude_quote_rate": 0,
"front_repeat_words": "",
"single_max_rate": 0,
"single_max_repeat_words": 0,
"suspected_paragraph": "",
"suspected_paragraph_max_repeat_words": "",
"suspected_paragraph_min_repeat_words": "",
"total_paragraph": "",
"total_repeat_rate": 0,
"total_repeat_words": 0,
"total_words": num_words,
"tables": 0
}
section_data_list = [{
"section_name": "第一部分",
"section_repeat_rate": 0,
"section_repeat_words": 0,
"section_words": num_words,
"oneself_repeat_words": 0,
"reference_repeat_words": 0,
"section_oneself_rate": 0
}]
section_details_list = [
{
"end_page_index": 0,
"name": "第1部分",
"repeat_rate": 0,
"repeat_words": 0,
"start_page_index": 0,
"words": num_words,
"original_text": "",
"original_text_oneself": "",
"original_text_contrast": [],
"repeat_quote_info": []
}
]
format = '%Y-%m-%d %H:%M:%S'
value = time.localtime(int(time.time()))
dt = time.strftime(format, value)
paper_data = {
"author": author,
"check_time": dt,
"time_range": "1900-01-01至2023-08-08",
"title": title,
"total_data": total_data,
"section_data": section_data_list,
"section_details": section_details_list
}
return paper_data
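check_dict pulls the report assembly out of accurate_check_rouge and adds an explicit zero-report branch for paper_dict == []. A hedged usage sketch of that branch (title, author, and the character count are hypothetical values):

empty_report = check_dict(
    similar_content_control=[[]],  # no aligned matches
    paper_dict=[],                 # triggers the all-zero branch
    num_words=8421,                # hypothetical character count
    title="示例论文",
    author="张三",
)
assert empty_report["total_data"]["total_words"] == 8421
assert empty_report["section_data"][0]["section_words"] == 8421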
def accurate_check_rouge(
title,
author,
@@ -408,18 +519,27 @@ def accurate_check_rouge(
:return:
'''
# text preprocessing
centent_list = []
# centent_list = []
print("text_paper", len(text_paper))
text_paper = str(text_paper).replace("\n", "")
centent_list.extend(text_paper.split("。"))
centent_list_old = text_paper.split("。")
data_zong = []
sentence_word_nums = 0
# duplicate check with the ROUGE algorithm
rst = []
p = Pool(nums_cpus)  # process pool with n child processes
p = Pool(nums_cpus)  # process pool with n child processes
print("centent_list", centent_list)
# print("centent_list", centent_list)
num_words = 0
centent_list = []
for i in centent_list_old:
num_words += len(i)
if len(i) < 300:
centent_list.append(i)
print("num_words", num_words)
for i in range(len(centent_list)):
text = centent_list[i]
a = p.apply_async(rouge_pre_m, args=(text, recall_data_list,))
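Splitting, counting, and filtering now happen in one pass: num_words counts every character of every sentence, while sentences of 300+ characters (usually segmentation artifacts such as tables or reference lists) are excluded from scoring. The same preprocessing isolated as a sketch, assuming "。" as the sentence delimiter:

def split_for_check(text_paper, max_len=300):
    text_paper = str(text_paper).replace("\n", "")
    sentences = text_paper.split("。")
    num_words = sum(len(s) for s in sentences)      # counted before filtering
    checkable = [s for s in sentences if len(s) < max_len]
    return num_words, checkable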
@@ -430,7 +550,7 @@
rst = [i.get() for i in rst]
for i in range(len(rst)):
print(rst[i])
# print(rst[i])
data_zong.append(rst[i])
t0 = time.time()
@@ -453,6 +573,7 @@
for j in range(len(data_zong[i])):
if data_zong[i][j][1] > 0.47:
bool_check_sentense.append([i, data_zong[i][j][0]])
biao_red = biaohong(bool_check_sentense, data_zong,
recall_data_list) # [[[0, 1, 2], [479, 480, 481]], [[3, 4, 5], [481, 482, 483]], [[6, 7, 8], [484, 485, 486]]]
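Each row of data_zong holds (recalled_index, rouge_score) pairs for one local sentence; anything above 0.47 is flagged and handed to biaohong(), which groups hits into aligned index windows (see the inline comment). The thresholding step on toy data, as a sketch:

SIM_THRESHOLD = 0.47  # cutoff used in this hunk

# toy data: per local sentence, a list of (recalled_index, rouge_score)
data_zong = [[(479, 0.61), (480, 0.12)], [(481, 0.30)]]

bool_check_sentense = [
    [i, ref_idx]
    for i, matches in enumerate(data_zong)
    for ref_idx, score in matches
    if score > SIM_THRESHOLD
]
# -> [[0, 479]]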
@@ -483,7 +604,11 @@
print(len(i[0]) + len(i[1]))
continue
t2 = time.time()
paper_dict = biaohong_bert_predict(sentence_0_list_new, sentence_1_list_new)
if sentence_0_list_new == sentence_1_list_new == []:
paper_dict = []
else:
paper_dict = biaohong_bert_predict(sentence_0_list_new, sentence_1_list_new)
t3 = time.time()
print("标红时间", t3 - t2)
@@ -498,9 +623,8 @@
print("sentence_1_list_new", sentence_1_list_new)
print("sim_paper_name", sim_paper_name)
similar_content_control = [[]]
with open("data/10235513_大型商业建筑人员疏散设计研究_沈福禹/paper_dict.json", "w") as f:
json.dump(paper_dict, f, ensure_ascii=False)
# with open("data/10235513_大型商业建筑人员疏散设计研究_沈福禹/paper_dict.json", "w") as f:
# json.dump(paper_dict, f, ensure_ascii=False)
sentence_0_list_new_cursor = sentence_0_list_new[0]
for paper_dict_dan_id, sentence_0_dan, sentence_1_dan, sim_paper_name_dan in zip(range(len(paper_dict)),
@@ -513,38 +637,40 @@
else:
similar_content_control[-1].append([paper_dict_dan_id, sentence_0_dan, sentence_1_dan, sim_paper_name_dan])
data = [similar_content_control]
# simulate multiple sections
section_details_list = []
for data_dan in data:
data_section_dan = data_dan
# section details
section_details = section_details_func(data_section_dan, paper_dict)
section_details_list.append(section_details)
# simulate multiple sections
section_data_list = []
for section_details in section_details_list:
section_data = section_data_func(section_details)
total_data = total_data_func(section_details_list)
format = '%Y-%m-%d %H:%M:%S'
value = time.localtime(int(time.time()))
dt = time.strftime(format, value)
paper_data = {
"author": author,
"check_time": dt,
"time_range": "1900-01-01至2023-08-08",
"title": title,
"total_data": total_data,
"section_data": section_data_list,
"section_details": section_details_list
}
paper_data = check_dict(similar_content_control, paper_dict, num_words, title, author)
# data = [similar_content_control]
#
# # simulate multiple sections
# section_details_list = []
# for data_dan in data:
# data_section_dan = data_dan
#
# # section details
# section_details = section_details_func(data_section_dan, paper_dict, num_words)
# section_details_list.append(section_details)
#
# # simulate multiple sections
#
# section_data_list = []
# for section_details in section_details_list:
# section_data = section_data_func(section_details)
# section_data_list.append(section_data)
#
# total_data = total_data_func(section_details_list)
#
# format = '%Y-%m-%d %H:%M:%S'
# value = time.localtime(int(time.time()))
# dt = time.strftime(format, value)
#
# paper_data = {
# "author": author,
# "check_time": dt,
# "time_range": "1900-01-01至2023-08-08",
# "title": title,
# "total_data": total_data,
# "section_data": section_data_list,
# "section_details": section_details_list
# }
return paper_data
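With assembly delegated to check_dict, accurate_check_rouge reduces to: split and filter the paper, score sentences with ROUGE in a process pool, align and highlight with BERT, then build the report. A hedged invocation sketch (parameter names after title and author are inferred from the function body, since the signature is truncated in this view; the input file is hypothetical):

with open("paper.txt", encoding="utf-8") as f:   # hypothetical input file
    text_paper = f.read()

# recall_data_list is the sentence table produced by ulit_recall_paper() below
report = accurate_check_rouge(
    "大型商业建筑人员疏散设计研究",  # title
    "沈福禹",                        # author
    text_paper,
    recall_data_list,
)
print(report["total_data"]["total_repeat_rate"])  # e.g. '23.1%'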
@@ -801,14 +927,14 @@ def ulit_recall_paper(recall_data_list_dict):
# return data
data = []
for i in list(recall_data_list_dict.items())[:10]:
for i in list(recall_data_list_dict.items()):  # process every recalled paper, not just the first 10
data_one = processing_one_text(i[0])
data.extend(data_one)
return data
def recall_10(queue_uuid, title, abst_zh, content) -> dict:
def recall_10(queue_uuid, title, abst_zh, content):
'''
Yupeng's recall interface
:param paper_name:
@@ -821,9 +947,9 @@ def recall_10(queue_uuid, title, abst_zh, content) -> dict:
"abst_zh": abst_zh,
"content": content
}
paper_dict = dialog_line_parse("http://192.168.31.145:50004/check", request_json)
print(request_json)
da = dialog_line_parse("http://192.168.31.145:50004/check", request_json)
return da
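recall_10 now fires the recall request and ignores the synchronous response; results return asynchronously through the query_recall list. dialog_line_parse is presumably a thin JSON-POST helper along these lines (an assumption, not this repo's actual implementation):

import requests

def dialog_line_parse(url, request_json):
    # assumed shape: synchronous JSON POST returning parsed JSON
    resp = requests.post(url, json=request_json, timeout=600)
    resp.raise_for_status()
    return resp.json()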
def uilt_content(content):
@@ -831,6 +957,7 @@ def uilt_content(content):
zhaiyao_en_list = ["Abstract", "abstract"]
mulu_list = ["目录"]
key_word_list = ["关键词"]
caikanwenxian = ["参考文献"]
key_word_bool = False
key_word_str = ""
zhaiyao_bool = False
@@ -880,6 +1007,10 @@ def uilt_content(content):
result_biaoti_list = re.findall(pantten_zhaiyao, content)
zhaiyao_text = result_biaoti_list[0]
if zhaiyao_text == "":
content = str(content).replace("\n", "")
content_list = content.split("。")
zhaiyao_text = "".join(content_list[:15])
return zhaiyao_text
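When the regex finds no abstract, the fallback takes the first 15 sentences of the body so the recall service always receives a non-empty abst_zh. The fallback isolated as a sketch, assuming "。" as the delimiter:

def fallback_abstract(content, n_sentences=15):
    content = str(content).replace("\n", "")
    return "".join(content.split("。")[:n_sentences])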
@@ -895,7 +1026,7 @@ def ulit_request_file(file):
with open(file_name_save, encoding="utf-8") as f:
content = f.read()
content = content.strip().replace("\n", "").replace(" ", "")
content = " ".join([i for i in content.split("\n") if i != ""])
abst_zh = uilt_content(content)
return abst_zh, content
@@ -945,22 +1076,61 @@ def ulit_request_file(file):
# return jsonify(return_text) # 返回结果
def classify():  # invoke the model; set the max batch_size
# def classify_recall():  # invoke the model; set the max batch_size
# while True:
# if redis_.llen(db_key_query) == 0:  # if the queue is empty, keep polling
# time.sleep(3)
# continue
# query = redis_.lpop(db_key_query).decode('UTF-8')  # fetch the query text
# data_dict_path = json.loads(query)
# path = data_dict_path['path']
# # text_type = data_dict["text_type"]
#
# with open(path, encoding='utf8') as f1:
# # load the file object
# data_dict = json.load(f1)
#
# queue_uuid = data_dict['id']
# print(queue_uuid)
# dataBases = data_dict['dataBases']
# minSimilarity = data_dict['minSimilarity']
# minWords = data_dict['minWords']
# title = data_dict['title']
# author = data_dict['author']
# abst_zh = data_dict['abst_zh']
# content = data_dict['content']
# token = data_dict['token']
# account = data_dict['account']
# goodsId = data_dict['goodsId']
# callbackUrl = data_dict['callbackUrl']
#
# # call Yupeng's service for the 10 most similar papers
# recall_data_list_dict = recall_10(queue_uuid, title, abst_zh, content)
#
# # print("查找相似的50篇完成")
# # with open("data/rell_json.txt") as f:
# # recall_data_list_dict = eval(f.read())
#
# # read the papers and convert the format
def classify_accurate_check():
while True:
if redis_.llen(db_key_query) == 0:  # if the queue is empty, keep polling
if redis_.llen(db_key_query_recall) == 0:  # if the queue is empty, keep polling
time.sleep(3)
continue
query = redis_.lpop(db_key_query).decode('UTF-8')  # fetch the query text
data_dict_path = json.loads(query)
path = data_dict_path['path']
# text_type = data_dict["text_type"]
with open(path, encoding='utf8') as f1:
# load the file object
data_dict = json.load(f1)
query_recall = redis_.lpop(db_key_query_recall).decode('UTF-8')  # fetch the query text
query_recall_dict = json.loads(query_recall)
query_recall_uuid = query_recall_dict["uuid"]
recall_data_list_dict = json.loads(query_recall_dict["data"])
recall_data_list = ulit_recall_paper(recall_data_list_dict)
data_dict_path = redis_.get(query_recall_uuid + "_request_check")
with open(data_dict_path, encoding='utf8') as f:
data_dict = json.loads(f.read())
queue_uuid = data_dict['id']
print(queue_uuid)
dataBases = data_dict['dataBases']
minSimilarity = data_dict['minSimilarity']
minWords = data_dict['minWords']
@@ -973,21 +1143,22 @@ def classify():  # invoke the model; set the max batch_size
goodsId = data_dict['goodsId']
callbackUrl = data_dict['callbackUrl']
# call Yupeng's service for the 10 most similar papers
recall_data_list_dict = recall_10(queue_uuid, title, abst_zh, content)
# print("查找相似的50篇完成")
print("查找相似的50篇完成")
print(len(content))
# with open("data/rell_json.txt") as f:
# recall_data_list_dict = eval(f.read())
# recall_data_list = ulit_recall_paper(recall_data_list_dict)
# read the papers and convert to formatted data
recall_data_list = ulit_recall_paper(recall_data_list_dict)
print("文章格式转化完成")
# recall_data_list = pd.read_csv("data/10235513_大型商业建筑人员疏散设计研究_沈福禹/查重.csv", encoding="utf-8").values.tolist()
# enter the accurate duplicate-check stage
print("进入精确查重系统")
return_list = accurate_check_rouge(title, author, content, recall_data_list)
return_text = {"resilt": return_list, "probabilities": None, "status_code": 200}
@@ -1028,7 +1199,7 @@ def handle_query():
abst_zh, content = ulit_request_file(file)
id_ = str(uuid.uuid1())  # generate a unique id for the query
print("uuid: ", uuid)
print("uuid: ", id_)
print(id_)
d = {
'id': id_,
@@ -1044,17 +1215,21 @@ def handle_query():
'goodsId': goodsId,
'callbackUrl': callbackUrl
}
# bind the text to the query id
print(d)
# bind the text to the query id
# recall_10(id_, title, abst_zh, content)
Thread_rellce = Thread(target=recall_10, args=(id_, title, abst_zh, content,))
Thread_rellce.start()
load_request_path = './request_data_logs/{}.json'.format(id_)
with open(load_request_path, 'w', encoding='utf8') as f2:
# ensure_ascii=False keeps Chinese readable instead of Unicode escapes,
# indent pretty-prints the JSON
with open(load_request_path, 'w', encoding='utf8') as f2:  # ensure_ascii=False keeps Chinese readable; indent pretty-prints the JSON
json.dump(d, f2, ensure_ascii=False, indent=4)
redis_.rpush(db_key_query, json.dumps({"id": id_, "path": load_request_path}))  # push onto redis
redis_.sadd(db_key_querying, id_)
redis_.sadd(db_key_queryset, id_)
# redis_.rpush(db_key_query, json.dumps({"id": id_, "path": load_request_path})) # push onto redis
# redis_.sadd(db_key_querying, id_)
# redis_.sadd(db_key_queryset, id_)
redis_.set(id_ + "_request_check", load_request_path)
return_text = {
'code': 0,
'msg': "请求成功",
@@ -1070,9 +1245,11 @@ def handle_query():
return_text = {'code': 1}
return jsonify(return_text) # 返回结果
t1 = Thread(target=classify_accurate_check)
t1.start()
t = Thread(target=classify)
t.start()
# t = Thread(target=classify_recall)
# t.start()
if __name__ == "__main__":
app.run(host="0.0.0.0", port=16001, threaded=True, debug=True)
app.run(host="0.0.0.0", port=16001, threaded=True)
