diff --git a/flask_check_bert.py b/flask_check_bert.py
index 2a6010e..bbc6170 100644
--- a/flask_check_bert.py
+++ b/flask_check_bert.py
@@ -7,7 +7,7 @@ from rouge_chinese import Rouge
 from Rouge_w import Rouge_w,Rouge_l
 import json
 import pymysql
-
+import re
 import requests
 from flask import Flask, jsonify
 from flask import request
@@ -105,6 +105,7 @@ def accurate_check_rouge(text_paper, recall_data_list):
     text_paper = str(text_paper).replace("。\n", "。")
     centent_list.extend(text_paper.split("。"))
     data_zong = []
+    sentence_word_nums = 0

     # Rouge-based plagiarism check
     # for text in centent_list:
@@ -126,6 +127,7 @@ def accurate_check_rouge(text_paper, recall_data_list):
             bool_check_sentense.append([i,data_zong[i][1]])
     biao_red = biaohong(bool_check_sentense, data_zong, recall_data_list)  # [[[0, 1, 2], [479, 480, 481]], [[3, 4, 5], [481, 482, 483]], [[6, 7, 8], [484, 485, 486]]]
+
     sentence_0_list = []
     sentence_1_list = []
     sim_paper_name = []
@@ -169,41 +171,97 @@ def accurate_check_rouge(text_paper, recall_data_list):

     for paper_dict_dan_id, sentence_0_dan, sentence_1_dan, sim_paper_name_dan in zip(range(len(paper_dict)), sentence_0_list_new, sentence_1_list_new, sim_paper_name):
         print([sentence_0_dan, sentence_1_dan])
-        original_text_contrast_dict = {}
+        original_text_contrast_dict = {
+            "original_text": "",
+            "similar_content": [
+                {
+                    "content": "",
+                    "thesis_info": "",
+                    "title": "",
+                    "year": "",
+                    "degree": "",
+                    "author": "",
+                }
+            ]
+        }
         similar_content = {"author": ""}
         try:
             sentence_0_bool, sentence_0_dan_red = original_text_marked_red(sentence_0_dan, paper_dict[paper_dict_dan_id][0], paper_dict[paper_dict_dan_id][1])  # text_original, bert_text, bert_text_pre
         except:
             print("报错", [sentence_0_dan, paper_dict[paper_dict_dan_id][0], paper_dict[paper_dict_dan_id][1]])
-            9/0
+            continue
         sentence_1_bool, sentence_1_dan_red = original_text_marked_red(sentence_1_dan, paper_dict[paper_dict_dan_id][2], paper_dict[paper_dict_dan_id][3])  # text_original, bert_text, bert_text_pre

         if sentence_0_bool == False or sentence_1_bool == False:
             continue
+
+        dan_sentence_word_nums = len(paper_dict[paper_dict_dan_id][1])
+        sentence_word_nums += dan_sentence_word_nums
+
         original_text.append(sentence_0_dan_red)
         original_text_contrast_dict["original_text"] = "此处有 {} 字相似\n".format(
-            len(paper_dict[paper_dict_dan_id][1])) + sentence_0_dan_red
+            dan_sentence_word_nums) + sentence_0_dan_red

-        similar_content["content"] = sentence_1_dan_red
-        similar_content["title"] = sim_paper_name_dan
-        original_text_contrast_dict["similar_content"] = similar_content
+        original_text_contrast_dict["similar_content"][0]["content"] = sentence_1_dan_red
+        original_text_contrast_dict["similar_content"][0]["title"] = sim_paper_name_dan

         original_text_contrast.append(original_text_contrast_dict)

     original_text = "。".join(original_text)

+    # Multiply before rounding so the percentage itself is rounded to one
+    # decimal place; rounding the ratio first and then multiplying by 100
+    # reintroduces float noise such as 12.299999999999999.
+    repetition_rate = round(sentence_word_nums / len(text_paper) * 100, 1)
+
     return {
         "author": "",
         "check_time": "",
-        "section_data": "",
+        "section_data": [
+            {
+                "oneself_repeat_words": sentence_word_nums,
+                "reference_repeat_words": sentence_word_nums,
+                "section_name": "第1部分",
+                "section_oneself_rate": "{}%".format(repetition_rate),
+                "section_repeat_rate": "{}%".format(repetition_rate),
+                "section_repeat_words": sentence_word_nums,
+                "section_words": len(text_paper)
+            }
+        ],
         "section_details": [
             {
"end_page_index": 0, "name": "", + "repeat_rate": "", + "repeat_words": "", + "words": "", "original_text": original_text, + "original_text_oneself": original_text, "original_text_contrast": original_text_contrast } - ] + ], + "time_range": "1900-01-01至2023-08-08", + "title": "3", + "total_data": { + "back_repeat_words": "", + "exclude_personal_rate": "{}%".format(repetition_rate), + "exclude_quote_rate": "{}%".format(repetition_rate), + "foot_end_note": "0", + "front_repeat_words": "", + "single_max_rate": "", + "single_max_repeat_words": "", + "suspected_paragraph": "1", + "suspected_paragraph_max_repeat_words": "", + "suspected_paragraph_min_repeat_words": "", + "tables": "0", + "total_paragraph": "1", + "total_repeat_rate": "{}%".format(repetition_rate), + "total_repeat_words": sentence_word_nums, + "total_words": len(text_paper) + } } @@ -489,21 +547,111 @@ def recall_10(title, abst_zh, content) -> list: "abst_zh": abst_zh, "content": content } - paper_dict = dialog_line_parse("http://192.168.31.145:50002/check", request_json) + paper_dict = dialog_line_parse("http://192.168.31.145:50004/check", request_json) return paper_dict + +def uilt_content(content): + zhaiyao_list = ["摘要"] + zhaiyao_en_list = ["Abstract", "abstract"] + mulu_list = ["目录"] + key_word_list = ["关键词"] + key_word_bool = False + key_word_str = "" + zhaiyao_bool = False + zhaiyao_en_bool = False + zhaiyao_str = "" + zhaiyao_en_str = "" + mulu_str = "" + zhaiyao_text = "" + mulu_bool = False + + for i in zhaiyao_list: + if i in content: + zhaiyao_bool = True + zhaiyao_str = i + break + + for i in zhaiyao_en_list: + if i in content: + zhaiyao_en_bool = True + zhaiyao_en_str = i + break + + for i in mulu_list: + if i in content: + mulu_str = i + mulu_bool = True + break + + for i in key_word_list: + if i in content: + key_word_str = i + key_word_bool = True + break + + if zhaiyao_bool== True and zhaiyao_en_bool == True: + pantten_zhaiyao = "{}(.*?){}".format(zhaiyao_str,zhaiyao_en_str) + result_biaoti_list = re.findall(pantten_zhaiyao, content) + zhaiyao_text = result_biaoti_list[0] + + elif zhaiyao_bool == True and key_word_bool == True: + pantten_zhaiyao = "{}(.*?){}".format(zhaiyao_str,key_word_str ) + result_biaoti_list = re.findall(pantten_zhaiyao, content) + zhaiyao_text = result_biaoti_list[0] + + elif zhaiyao_bool == True and mulu_bool == True: + pantten_zhaiyao = "{}(.*?){}".format(zhaiyao_str,mulu_str) + result_biaoti_list = re.findall(pantten_zhaiyao, content) + zhaiyao_text = result_biaoti_list[0] + + + + return zhaiyao_text + + +def ulit_request_file(file): + file_name = file.filename + if file_name.split(".")[-1] == "txt": + file_name_save = "data/request/{}".format(file_name) + file.save(file_name_save) + try: + with open(file_name_save, encoding="gbk") as f: + content = f.read() + except: + with open(file_name_save, encoding="utf-8") as f: + content = f.read() + + content = content.strip().replace("\n", "").replace(" ", "") + abst_zh = uilt_content(content) + + return abst_zh, content + + + @app.route("/", methods=["POST"]) def handle_query(): print(request.remote_addr) - title = request.json["title"] - abst_zh = request.json["abst_zh"] # txt - content = request.json["content"] + # request.form.get('prompt') + dataBases = request.form.get("dataBases") + minSimilarity = request.form.get("minSimilarity") # txt + minWords = request.form.get("minWords") + title = request.form.get("title") + author = request.form.get("author") # txt + file = request.files.get('file') + token = request.form.get("token") + 
+    account = request.form.get("account")
+    goodsId = request.form.get("goodsId")
+    callbackUrl = request.form.get("callbackUrl")
+
+    abst_zh, content = util_request_file(file)

     # Call Yupeng's recall service to fetch the ten most similar papers
-    # recall_data_list_dict = recall_10(title, abst_zh, content)
-    with open("data/rell_json.txt") as f:
-        recall_data_list_dict = eval(f.read())
+    recall_data_list_dict = recall_10(title, abst_zh, content)
+    # Local fixture kept for offline testing (eval here is a debugging
+    # shortcut; json.loads would be safer):
+    # with open("data/rell_json.txt") as f:
+    #     recall_data_list_dict = eval(f.read())

     # Read the paper and convert it into structured data
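
The core of the bookkeeping change in accurate_check_rouge: each successfully red-marked sentence contributes the length of its matched span (paper_dict[i][1]) to sentence_word_nums, and the final rate is that total over the full text length. A minimal standalone sketch; the matched_spans values below are made-up stand-ins for what original_text_marked_red and paper_dict actually produce:

# Made-up matched fragments standing in for original_text_marked_red output.
matched_spans = ["深度学习模型", "在测试集上的准确率"]
text_paper = "本文提出一种深度学习模型，并在测试集上的准确率上验证其效果。"

# Each marked sentence adds the length of its matched span.
sentence_word_nums = sum(len(span) for span in matched_spans)

# Multiply before rounding so the percentage itself is rounded to one
# decimal place, avoiding float noise like 12.299999999999999.
repetition_rate = round(sentence_word_nums / len(text_paper) * 100, 1)
print("{}%".format(repetition_rate))  # -> 50.0%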
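
The new util_content helper relies on a non-greedy regex between the 摘要 marker and the first end marker present. A toy run, assuming the content has already been flattened to one line by util_request_file:

import re

# One-line content, as produced by util_request_file after stripping
# newlines and spaces (toy example).
content = "某某大学硕士学位论文摘要本文研究论文查重的召回与精排方法。关键词查重;召回目录第一章绪论"

# Non-greedy match between "摘要" (abstract) and "关键词" (keywords).
result = re.findall("摘要(.*?)关键词", content)
if result:
    print(result[0])  # -> 本文研究论文查重的召回与精排方法。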
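
The GBK-then-UTF-8 fallback in util_request_file works because decoding UTF-8-encoded Chinese bytes as GBK usually raises UnicodeDecodeError. The same idea as a standalone helper; read_text is a hypothetical name, and the heuristic can still mis-decode edge cases:

def read_text(path):
    """Try GBK first (common for legacy Windows exports), then UTF-8."""
    for encoding in ("gbk", "utf-8"):
        try:
            with open(path, encoding=encoding) as f:
                return f.read()
        except UnicodeDecodeError:
            continue
    raise ValueError("could not decode {!r} as GBK or UTF-8".format(path))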
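
Because handle_query now consumes multipart form data rather than JSON, callers must switch from json= to data= plus files=. A hedged client-side sketch; the host, port, file path, and field values are placeholders, not the deployed configuration:

import requests

# Hypothetical endpoint; adjust host and port to your deployment.
url = "http://127.0.0.1:5000/"

form = {
    "dataBases": "default",
    "minSimilarity": "0.8",
    "minWords": "10",
    "title": "测试论文",
    "author": "张三",
    "token": "xxx",  # placeholder credentials
    "account": "demo",
    "goodsId": "1",
    "callbackUrl": "http://example.com/callback",
}

# The service only accepts .txt uploads (see util_request_file).
with open("paper.txt", "rb") as f:
    resp = requests.post(url, data=form, files={"file": ("paper.txt", f)})
print(resp.json())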