Browse Source

修改bug,接口和飞度的对齐

master
majiahui@haimaqingfan.com 2 years ago
parent
commit
b35679bdd0
  1. 180
      flask_check_bert.py

180
flask_check_bert.py

@ -7,7 +7,7 @@ from rouge_chinese import Rouge
from Rouge_w import Rouge_w,Rouge_l
import json
import pymysql
import re
import requests
from flask import Flask, jsonify
from flask import request
@ -105,6 +105,7 @@ def accurate_check_rouge(text_paper, recall_data_list):
text_paper = str(text_paper).replace("\n", "")
centent_list.extend(text_paper.split(""))
data_zong = []
sentence_word_nums = 0
# rouge算法查重
# for text in centent_list:
@ -126,6 +127,7 @@ def accurate_check_rouge(text_paper, recall_data_list):
bool_check_sentense.append([i,data_zong[i][1]])
biao_red = biaohong(bool_check_sentense, data_zong, recall_data_list) # [[[0, 1, 2], [479, 480, 481]], [[3, 4, 5], [481, 482, 483]], [[6, 7, 8], [484, 485, 486]]]
sentence_0_list = []
sentence_1_list = []
sim_paper_name = []
@ -169,41 +171,97 @@ def accurate_check_rouge(text_paper, recall_data_list):
for paper_dict_dan_id, sentence_0_dan, sentence_1_dan, sim_paper_name_dan in zip(range(len(paper_dict)), sentence_0_list_new, sentence_1_list_new, sim_paper_name):
print([sentence_0_dan, sentence_1_dan])
original_text_contrast_dict = {}
original_text_contrast_dict = {
"original_text": "",
"similar_content": [
{
"content": "",
"thesis_info": "",
"title": "",
"year": "",
"degree": "",
"author": "",
}
]
}
similar_content = {"author": ""}
try:
sentence_0_bool, sentence_0_dan_red = original_text_marked_red(sentence_0_dan, paper_dict[paper_dict_dan_id][0], paper_dict[paper_dict_dan_id][1]) # text_original, bert_text, bert_text_pre
except:
print("报错", [sentence_0_dan, paper_dict[paper_dict_dan_id][0], paper_dict[paper_dict_dan_id][1]])
9/0
continue
# 9/0
sentence_1_bool, sentence_1_dan_red = original_text_marked_red(sentence_1_dan, paper_dict[paper_dict_dan_id][2], paper_dict[paper_dict_dan_id][3]) # text_original, bert_text, bert_text_pre
if sentence_0_bool == False or sentence_1_bool == False:
continue
dan_sentence_word_nums = len(paper_dict[paper_dict_dan_id][1])
sentence_word_nums += dan_sentence_word_nums
original_text.append(sentence_0_dan_red)
original_text_contrast_dict["original_text"] = "此处有 {} 字相似\n".format(
len(paper_dict[paper_dict_dan_id][1])) + sentence_0_dan_red
dan_sentence_word_nums) + sentence_0_dan_red
# similar_content["content"] = sentence_1_dan_red
# similar_content["title"] = sim_paper_name_dan
# original_text_contrast_dict["similar_content"][0] = similar_content
similar_content["content"] = sentence_1_dan_red
similar_content["title"] = sim_paper_name_dan
original_text_contrast_dict["similar_content"] = similar_content
original_text_contrast_dict["similar_content"][0]["content"] = sentence_1_dan_red
original_text_contrast_dict["similar_content"][0]["title"] = sim_paper_name_dan
original_text_contrast.append(original_text_contrast_dict)
original_text = "".join(original_text)
repetition_rate = sentence_word_nums/len(text_paper)
repetition_rate = round(repetition_rate, 3) *100
return {
"author": "",
"check_time": "",
"section_data": "",
"section_data": [
{
"oneself_repeat_words": sentence_word_nums,
"reference_repeat_words": sentence_word_nums,
"section_name": "第1部分",
"section_oneself_rate": "{}%".format(repetition_rate),
"section_repeat_rate": "{}%".format(repetition_rate),
"section_repeat_words": sentence_word_nums,
"section_words": len(text_paper)
}
],
"section_details": [
{
"end_page_index": 0,
"name": "",
"repeat_rate": "",
"repeat_words": "",
"words": "",
"original_text": original_text,
"original_text_oneself": original_text,
"original_text_contrast": original_text_contrast
}
]
],
"time_range": "1900-01-01至2023-08-08",
"title": "3",
"total_data": {
"back_repeat_words": "",
"exclude_personal_rate": "{}%".format(repetition_rate),
"exclude_quote_rate": "{}%".format(repetition_rate),
"foot_end_note": "0",
"front_repeat_words": "",
"single_max_rate": "",
"single_max_repeat_words": "",
"suspected_paragraph": "1",
"suspected_paragraph_max_repeat_words": "",
"suspected_paragraph_min_repeat_words": "",
"tables": "0",
"total_paragraph": "1",
"total_repeat_rate": "{}%".format(repetition_rate),
"total_repeat_words": sentence_word_nums,
"total_words": len(text_paper)
}
}
@ -489,21 +547,111 @@ def recall_10(title, abst_zh, content) -> list:
"abst_zh": abst_zh,
"content": content
}
paper_dict = dialog_line_parse("http://192.168.31.145:50002/check", request_json)
paper_dict = dialog_line_parse("http://192.168.31.145:50004/check", request_json)
return paper_dict
def uilt_content(content):
    """Extract the Chinese abstract ("摘要" section) from a paper's full text.

    Finds the "摘要" start marker, then pairs it with the first end marker
    that both occurs in the text and yields a match, tried in the original
    priority order: English abstract marker ("Abstract"/"abstract"), then
    keyword marker ("关键词"), then table-of-contents marker ("目录").

    Args:
        content: full paper text as a single string (the caller strips
            newlines and spaces beforehand).

    Returns:
        The text between the start and end markers, or "" when no abstract
        section can be located.
    """
    zhaiyao_list = ["摘要"]
    zhaiyao_en_list = ["Abstract", "abstract"]
    mulu_list = ["目录"]
    key_word_list = ["关键词"]

    def _first_marker(markers):
        # Return the first marker string present in content, or "" if none is.
        for marker in markers:
            if marker in content:
                return marker
        return ""

    zhaiyao_str = _first_marker(zhaiyao_list)
    zhaiyao_en_str = _first_marker(zhaiyao_en_list)
    key_word_str = _first_marker(key_word_list)
    mulu_str = _first_marker(mulu_list)

    zhaiyao_text = ""
    if zhaiyao_str:
        # Try end markers in the original priority order.  Guarding on the
        # findall result (instead of indexing [0] unconditionally) fixes an
        # IndexError in the original when the end marker appears only BEFORE
        # the "摘要" marker; in that case we fall through to the next marker.
        for end_str in (zhaiyao_en_str, key_word_str, mulu_str):
            if not end_str:
                continue
            # re.escape guards against regex metacharacters in the markers.
            pantten_zhaiyao = "{}(.*?){}".format(re.escape(zhaiyao_str), re.escape(end_str))
            result_biaoti_list = re.findall(pantten_zhaiyao, content)
            if result_biaoti_list:
                zhaiyao_text = result_biaoti_list[0]
                break
    return zhaiyao_text
def ulit_request_file(file):
    """Persist an uploaded .txt file, read its text, and extract the abstract.

    Args:
        file: uploaded file object exposing ``filename`` and ``save(path)``
            (presumably a werkzeug ``FileStorage`` — confirm with caller).

    Returns:
        ``(abst_zh, content)`` for ".txt" uploads, where ``content`` has all
        newlines and spaces stripped.  NOTE(review): for any other extension
        the function implicitly returns ``None``, which the caller unpacks
        into two names and would raise TypeError — kept as-is to avoid an
        interface change, but worth confirming upstream.
    """
    import os

    file_name = file.filename
    if file_name.split(".")[-1] == "txt":
        file_name_save = "data/request/{}".format(file_name)
        # Ensure the target directory exists; the original crashed on a
        # fresh deployment where data/request/ had not been created yet.
        os.makedirs(os.path.dirname(file_name_save), exist_ok=True)
        file.save(file_name_save)
        try:
            # Many Chinese submissions arrive GBK-encoded; try that first.
            with open(file_name_save, encoding="gbk") as f:
                content = f.read()
        except UnicodeDecodeError:
            # Narrowed from a bare except: only fall back to UTF-8 on a
            # decode failure, so genuine I/O errors still surface.
            with open(file_name_save, encoding="utf-8") as f:
                content = f.read()
        content = content.strip().replace("\n", "").replace(" ", "")
        abst_zh = uilt_content(content)
        return abst_zh, content
@app.route("/", methods=["POST"])
def handle_query():
print(request.remote_addr)
title = request.json["title"]
abst_zh = request.json["abst_zh"] # txt
content = request.json["content"]
# request.form.get('prompt')
dataBases = request.form.get("dataBases")
minSimilarity = request.form.get("minSimilarity") # txt
minWords = request.form.get("minWords")
title = request.form.get("title")
author = request.form.get("author") # txt
file = request.files.get('file')
token = request.form.get("token")
account = request.form.get("account")
goodsId = request.form.get("goodsId")
callbackUrl = request.form.get("callbackUrl")
abst_zh, content = ulit_request_file(file)
# 调用宇鹏查询相似十篇
# recall_data_list_dict = recall_10(title, abst_zh, content)
with open("data/rell_json.txt") as f:
recall_data_list_dict = eval(f.read())
recall_data_list_dict = recall_10(title, abst_zh, content)
# with open("data/rell_json.txt") as f:
# recall_data_list_dict = eval(f.read())
# 读取文章转化成格式数据

Loading…
Cancel
Save