
Fix bugs; align the interface with 飞度

master
majiahui@haimaqingfan.com 2 years ago
parent commit b35679bdd0
1 changed file: flask_check_bert.py (180 lines changed)

flask_check_bert.py

@@ -7,7 +7,7 @@ from rouge_chinese import Rouge
 from Rouge_w import Rouge_w,Rouge_l
 import json
 import pymysql
+import re
 import requests
 from flask import Flask, jsonify
 from flask import request
@@ -105,6 +105,7 @@ def accurate_check_rouge(text_paper, recall_data_list):
     text_paper = str(text_paper).replace("\n", "")
     centent_list.extend(text_paper.split("。"))
     data_zong = []
+    sentence_word_nums = 0

     # duplicate check with the rouge algorithm
     # for text in centent_list:
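For reference, a minimal sketch of the sentence splitting that feeds this check, assuming the separator in text_paper.split(...) is the Chinese full stop (the character itself is not visible in the rendered diff):

text_paper = "第一句话。第二句话。第三句话。"                    # hypothetical paper text
centent_list = []
centent_list.extend(text_paper.replace("\n", "").split("。"))  # same split as in the function above
print(centent_list)  # ['第一句话', '第二句话', '第三句话', '']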
@@ -126,6 +127,7 @@ def accurate_check_rouge(text_paper, recall_data_list):
             bool_check_sentense.append([i,data_zong[i][1]])
     biao_red = biaohong(bool_check_sentense, data_zong, recall_data_list)  # [[[0, 1, 2], [479, 480, 481]], [[3, 4, 5], [481, 482, 483]], [[6, 7, 8], [484, 485, 486]]]

     sentence_0_list = []
     sentence_1_list = []
     sim_paper_name = []
@@ -169,41 +171,97 @@ def accurate_check_rouge(text_paper, recall_data_list):
     for paper_dict_dan_id, sentence_0_dan, sentence_1_dan, sim_paper_name_dan in zip(range(len(paper_dict)), sentence_0_list_new, sentence_1_list_new, sim_paper_name):
         print([sentence_0_dan, sentence_1_dan])
-        original_text_contrast_dict = {}
+        original_text_contrast_dict = {
+            "original_text": "",
+            "similar_content": [
+                {
+                    "content": "",
+                    "thesis_info": "",
+                    "title": "",
+                    "year": "",
+                    "degree": "",
+                    "author": "",
+                }
+            ]
+        }
         similar_content = {"author": ""}
         try:
             sentence_0_bool, sentence_0_dan_red = original_text_marked_red(sentence_0_dan, paper_dict[paper_dict_dan_id][0], paper_dict[paper_dict_dan_id][1])  # text_original, bert_text, bert_text_pre
         except:
             print("报错", [sentence_0_dan, paper_dict[paper_dict_dan_id][0], paper_dict[paper_dict_dan_id][1]])
-            9/0
+            continue
+            # 9/0
         sentence_1_bool, sentence_1_dan_red = original_text_marked_red(sentence_1_dan, paper_dict[paper_dict_dan_id][2], paper_dict[paper_dict_dan_id][3])  # text_original, bert_text, bert_text_pre
         if sentence_0_bool == False or sentence_1_bool == False:
             continue
+        dan_sentence_word_nums = len(paper_dict[paper_dict_dan_id][1])
+        sentence_word_nums += dan_sentence_word_nums
         original_text.append(sentence_0_dan_red)
         original_text_contrast_dict["original_text"] = "此处有 {} 字相似\n".format(
-            len(paper_dict[paper_dict_dan_id][1])) + sentence_0_dan_red
+            dan_sentence_word_nums) + sentence_0_dan_red
-        similar_content["content"] = sentence_1_dan_red
-        similar_content["title"] = sim_paper_name_dan
-        original_text_contrast_dict["similar_content"] = similar_content
+        # similar_content["content"] = sentence_1_dan_red
+        # similar_content["title"] = sim_paper_name_dan
+        # original_text_contrast_dict["similar_content"][0] = similar_content
+        original_text_contrast_dict["similar_content"][0]["content"] = sentence_1_dan_red
+        original_text_contrast_dict["similar_content"][0]["title"] = sim_paper_name_dan
         original_text_contrast.append(original_text_contrast_dict)

     original_text = "".join(original_text)
+    repetition_rate = sentence_word_nums/len(text_paper)
+    repetition_rate = round(repetition_rate, 3) *100
     return {
         "author": "",
         "check_time": "",
-        "section_data": "",
+        "section_data": [
+            {
+                "oneself_repeat_words": sentence_word_nums,
+                "reference_repeat_words": sentence_word_nums,
+                "section_name": "第1部分",
+                "section_oneself_rate": "{}%".format(repetition_rate),
+                "section_repeat_rate": "{}%".format(repetition_rate),
+                "section_repeat_words": sentence_word_nums,
+                "section_words": len(text_paper)
+            }
+        ],
         "section_details": [
             {
                 "end_page_index": 0,
                 "name": "",
+                "repeat_rate": "",
+                "repeat_words": "",
+                "words": "",
                 "original_text": original_text,
+                "original_text_oneself": original_text,
                 "original_text_contrast": original_text_contrast
             }
-        ]
+        ],
+        "time_range": "1900-01-01至2023-08-08",
+        "title": "3",
+        "total_data": {
+            "back_repeat_words": "",
+            "exclude_personal_rate": "{}%".format(repetition_rate),
+            "exclude_quote_rate": "{}%".format(repetition_rate),
+            "foot_end_note": "0",
+            "front_repeat_words": "",
+            "single_max_rate": "",
+            "single_max_repeat_words": "",
+            "suspected_paragraph": "1",
+            "suspected_paragraph_max_repeat_words": "",
+            "suspected_paragraph_min_repeat_words": "",
+            "tables": "0",
+            "total_paragraph": "1",
+            "total_repeat_rate": "{}%".format(repetition_rate),
+            "total_repeat_words": sentence_word_nums,
+            "total_words": len(text_paper)
+        }
     }
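Taken together, the new counter and the enlarged return value make the endpoint report a per-section repetition rate. A minimal, self-contained sketch of that calculation, with hypothetical counts standing in for sentence_word_nums and len(text_paper):

sentence_word_nums = 1234                        # characters flagged as similar (hypothetical)
total_words = 20000                              # characters in the submitted paper (hypothetical)
repetition_rate = round(sentence_word_nums / total_words, 3) * 100   # same formula as in the diff
section_data = [{
    "oneself_repeat_words": sentence_word_nums,
    "reference_repeat_words": sentence_word_nums,
    "section_name": "第1部分",
    "section_oneself_rate": "{}%".format(repetition_rate),
    "section_repeat_rate": "{}%".format(repetition_rate),
    "section_repeat_words": sentence_word_nums,
    "section_words": total_words,
}]
print(section_data[0]["section_repeat_rate"])    # prints something like "6.2%"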
@@ -489,21 +547,111 @@ def recall_10(title, abst_zh, content) -> list:
         "abst_zh": abst_zh,
         "content": content
     }
-    paper_dict = dialog_line_parse("http://192.168.31.145:50002/check", request_json)
+    paper_dict = dialog_line_parse("http://192.168.31.145:50004/check", request_json)

     return paper_dict

+
+def uilt_content(content):
+    zhaiyao_list = ["摘要"]
+    zhaiyao_en_list = ["Abstract", "abstract"]
+    mulu_list = ["目录"]
+    key_word_list = ["关键词"]
+    key_word_bool = False
+    key_word_str = ""
+    zhaiyao_bool = False
+    zhaiyao_en_bool = False
+    zhaiyao_str = ""
+    zhaiyao_en_str = ""
+    mulu_str = ""
+    zhaiyao_text = ""
+    mulu_bool = False
+
+    for i in zhaiyao_list:
+        if i in content:
+            zhaiyao_bool = True
+            zhaiyao_str = i
+            break
+
+    for i in zhaiyao_en_list:
+        if i in content:
+            zhaiyao_en_bool = True
+            zhaiyao_en_str = i
+            break
+
+    for i in mulu_list:
+        if i in content:
+            mulu_str = i
+            mulu_bool = True
+            break
+
+    for i in key_word_list:
+        if i in content:
+            key_word_str = i
+            key_word_bool = True
+            break
+
+    if zhaiyao_bool == True and zhaiyao_en_bool == True:
+        pantten_zhaiyao = "{}(.*?){}".format(zhaiyao_str, zhaiyao_en_str)
+        result_biaoti_list = re.findall(pantten_zhaiyao, content)
+        zhaiyao_text = result_biaoti_list[0]
+    elif zhaiyao_bool == True and key_word_bool == True:
+        pantten_zhaiyao = "{}(.*?){}".format(zhaiyao_str, key_word_str)
+        result_biaoti_list = re.findall(pantten_zhaiyao, content)
+        zhaiyao_text = result_biaoti_list[0]
+    elif zhaiyao_bool == True and mulu_bool == True:
+        pantten_zhaiyao = "{}(.*?){}".format(zhaiyao_str, mulu_str)
+        result_biaoti_list = re.findall(pantten_zhaiyao, content)
+        zhaiyao_text = result_biaoti_list[0]
+
+    return zhaiyao_text
+
+
+def ulit_request_file(file):
+    file_name = file.filename
+    if file_name.split(".")[-1] == "txt":
+        file_name_save = "data/request/{}".format(file_name)
+        file.save(file_name_save)
+        try:
+            with open(file_name_save, encoding="gbk") as f:
+                content = f.read()
+        except:
+            with open(file_name_save, encoding="utf-8") as f:
+                content = f.read()
+
+        content = content.strip().replace("\n", "").replace(" ", "")
+        abst_zh = uilt_content(content)
+
+    return abst_zh, content
+
+
 @app.route("/", methods=["POST"])
 def handle_query():
     print(request.remote_addr)
-    title = request.json["title"]
-    abst_zh = request.json["abst_zh"]  # txt
-    content = request.json["content"]
+    # request.form.get('prompt')
+    dataBases = request.form.get("dataBases")
+    minSimilarity = request.form.get("minSimilarity")  # txt
+    minWords = request.form.get("minWords")
+    title = request.form.get("title")
+    author = request.form.get("author")  # txt
+    file = request.files.get('file')
+    token = request.form.get("token")
+    account = request.form.get("account")
+    goodsId = request.form.get("goodsId")
+    callbackUrl = request.form.get("callbackUrl")
+
+    abst_zh, content = ulit_request_file(file)

     # Call 宇鹏's service to retrieve the 10 most similar papers
-    # recall_data_list_dict = recall_10(title, abst_zh, content)
+    recall_data_list_dict = recall_10(title, abst_zh, content)

-    with open("data/rell_json.txt") as f:
-        recall_data_list_dict = eval(f.read())
+    # with open("data/rell_json.txt") as f:
+    #     recall_data_list_dict = eval(f.read())

     # Read the paper and convert it into structured data
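With this change the handler no longer reads a JSON body; it expects multipart form data plus an uploaded txt file. A hypothetical client call matching the new interface (the host, port, file path, and field values below are assumptions, not values from the repo):

import requests

with open("paper.txt", "rb") as fh:
    resp = requests.post(
        "http://127.0.0.1:5000/",              # assumed host/port for the Flask app
        data={
            "title": "示例论文标题",
            "author": "张三",
            "minSimilarity": "0.8",
            "minWords": "8",
            "dataBases": "",
            "token": "",
            "account": "",
            "goodsId": "",
            "callbackUrl": "",
        },
        files={"file": ("paper.txt", fh, "text/plain")},  # read by request.files.get('file')
    )
print(resp.status_code)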
