Browse Source

修改bug,接口和飞度的对齐

master
majiahui@haimaqingfan.com 2 years ago
parent
commit
b35679bdd0
  1. 180
      flask_check_bert.py

180
flask_check_bert.py

@ -7,7 +7,7 @@ from rouge_chinese import Rouge
from Rouge_w import Rouge_w,Rouge_l
import json
import pymysql
import re
import requests
from flask import Flask, jsonify
from flask import request
@ -105,6 +105,7 @@ def accurate_check_rouge(text_paper, recall_data_list):
text_paper = str(text_paper).replace("\n", "")
centent_list.extend(text_paper.split(""))
data_zong = []
sentence_word_nums = 0
# rouge算法查重
# for text in centent_list:
@ -126,6 +127,7 @@ def accurate_check_rouge(text_paper, recall_data_list):
bool_check_sentense.append([i,data_zong[i][1]])
biao_red = biaohong(bool_check_sentense, data_zong, recall_data_list) # [[[0, 1, 2], [479, 480, 481]], [[3, 4, 5], [481, 482, 483]], [[6, 7, 8], [484, 485, 486]]]
sentence_0_list = []
sentence_1_list = []
sim_paper_name = []
@ -169,41 +171,97 @@ def accurate_check_rouge(text_paper, recall_data_list):
for paper_dict_dan_id, sentence_0_dan, sentence_1_dan, sim_paper_name_dan in zip(range(len(paper_dict)), sentence_0_list_new, sentence_1_list_new, sim_paper_name):
print([sentence_0_dan, sentence_1_dan])
original_text_contrast_dict = {}
original_text_contrast_dict = {
"original_text": "",
"similar_content": [
{
"content": "",
"thesis_info": "",
"title": "",
"year": "",
"degree": "",
"author": "",
}
]
}
similar_content = {"author": ""}
try:
sentence_0_bool, sentence_0_dan_red = original_text_marked_red(sentence_0_dan, paper_dict[paper_dict_dan_id][0], paper_dict[paper_dict_dan_id][1]) # text_original, bert_text, bert_text_pre
except:
print("报错", [sentence_0_dan, paper_dict[paper_dict_dan_id][0], paper_dict[paper_dict_dan_id][1]])
9/0
continue
# 9/0
sentence_1_bool, sentence_1_dan_red = original_text_marked_red(sentence_1_dan, paper_dict[paper_dict_dan_id][2], paper_dict[paper_dict_dan_id][3]) # text_original, bert_text, bert_text_pre
if sentence_0_bool == False or sentence_1_bool == False:
continue
dan_sentence_word_nums = len(paper_dict[paper_dict_dan_id][1])
sentence_word_nums += dan_sentence_word_nums
original_text.append(sentence_0_dan_red)
original_text_contrast_dict["original_text"] = "此处有 {} 字相似\n".format(
len(paper_dict[paper_dict_dan_id][1])) + sentence_0_dan_red
dan_sentence_word_nums) + sentence_0_dan_red
# similar_content["content"] = sentence_1_dan_red
# similar_content["title"] = sim_paper_name_dan
# original_text_contrast_dict["similar_content"][0] = similar_content
similar_content["content"] = sentence_1_dan_red
similar_content["title"] = sim_paper_name_dan
original_text_contrast_dict["similar_content"] = similar_content
original_text_contrast_dict["similar_content"][0]["content"] = sentence_1_dan_red
original_text_contrast_dict["similar_content"][0]["title"] = sim_paper_name_dan
original_text_contrast.append(original_text_contrast_dict)
original_text = "".join(original_text)
repetition_rate = sentence_word_nums/len(text_paper)
repetition_rate = round(repetition_rate, 3) *100
return {
"author": "",
"check_time": "",
"section_data": "",
"section_data": [
{
"oneself_repeat_words": sentence_word_nums,
"reference_repeat_words": sentence_word_nums,
"section_name": "第1部分",
"section_oneself_rate": "{}%".format(repetition_rate),
"section_repeat_rate": "{}%".format(repetition_rate),
"section_repeat_words": sentence_word_nums,
"section_words": len(text_paper)
}
],
"section_details": [
{
"end_page_index": 0,
"name": "",
"repeat_rate": "",
"repeat_words": "",
"words": "",
"original_text": original_text,
"original_text_oneself": original_text,
"original_text_contrast": original_text_contrast
}
]
],
"time_range": "1900-01-01至2023-08-08",
"title": "3",
"total_data": {
"back_repeat_words": "",
"exclude_personal_rate": "{}%".format(repetition_rate),
"exclude_quote_rate": "{}%".format(repetition_rate),
"foot_end_note": "0",
"front_repeat_words": "",
"single_max_rate": "",
"single_max_repeat_words": "",
"suspected_paragraph": "1",
"suspected_paragraph_max_repeat_words": "",
"suspected_paragraph_min_repeat_words": "",
"tables": "0",
"total_paragraph": "1",
"total_repeat_rate": "{}%".format(repetition_rate),
"total_repeat_words": sentence_word_nums,
"total_words": len(text_paper)
}
}
@ -489,21 +547,111 @@ def recall_10(title, abst_zh, content) -> list:
"abst_zh": abst_zh,
"content": content
}
paper_dict = dialog_line_parse("http://192.168.31.145:50002/check", request_json)
paper_dict = dialog_line_parse("http://192.168.31.145:50004/check", request_json)
return paper_dict
def uilt_content(content):
    """Extract the Chinese abstract ("摘要" section) from a paper's full text.

    Finds the "摘要" start marker, then pairs it with the first end marker
    that both occurs in the text and yields a match, tried in the original
    priority order: English abstract marker ("Abstract"/"abstract"), then
    keyword marker ("关键词"), then table-of-contents marker ("目录").

    Args:
        content: full paper text as a single string (the caller strips
            newlines and spaces beforehand).

    Returns:
        The text between the start and end markers, or "" when no abstract
        section can be located.
    """
    zhaiyao_list = ["摘要"]
    zhaiyao_en_list = ["Abstract", "abstract"]
    mulu_list = ["目录"]
    key_word_list = ["关键词"]

    def _first_marker(markers):
        # Return the first marker string present in content, or "" if none is.
        for marker in markers:
            if marker in content:
                return marker
        return ""

    zhaiyao_str = _first_marker(zhaiyao_list)
    zhaiyao_en_str = _first_marker(zhaiyao_en_list)
    key_word_str = _first_marker(key_word_list)
    mulu_str = _first_marker(mulu_list)

    zhaiyao_text = ""
    if zhaiyao_str:
        # Try end markers in the original priority order.  Guarding on the
        # findall result (instead of indexing [0] unconditionally) fixes an
        # IndexError in the original when the end marker appears only BEFORE
        # the "摘要" marker; in that case we fall through to the next marker.
        for end_str in (zhaiyao_en_str, key_word_str, mulu_str):
            if not end_str:
                continue
            # re.escape guards against regex metacharacters in the markers.
            pantten_zhaiyao = "{}(.*?){}".format(re.escape(zhaiyao_str), re.escape(end_str))
            result_biaoti_list = re.findall(pantten_zhaiyao, content)
            if result_biaoti_list:
                zhaiyao_text = result_biaoti_list[0]
                break
    return zhaiyao_text
def ulit_request_file(file):
    """Persist an uploaded .txt file, read its text, and extract the abstract.

    Args:
        file: uploaded file object exposing ``filename`` and ``save(path)``
            (presumably a werkzeug ``FileStorage`` — confirm with caller).

    Returns:
        ``(abst_zh, content)`` for ".txt" uploads, where ``content`` has all
        newlines and spaces stripped.  NOTE(review): for any other extension
        the function implicitly returns ``None``, which the caller unpacks
        into two names and would raise TypeError — kept as-is to avoid an
        interface change, but worth confirming upstream.
    """
    import os

    file_name = file.filename
    if file_name.split(".")[-1] == "txt":
        file_name_save = "data/request/{}".format(file_name)
        # Ensure the target directory exists; the original crashed on a
        # fresh deployment where data/request/ had not been created yet.
        os.makedirs(os.path.dirname(file_name_save), exist_ok=True)
        file.save(file_name_save)
        try:
            # Many Chinese submissions arrive GBK-encoded; try that first.
            with open(file_name_save, encoding="gbk") as f:
                content = f.read()
        except UnicodeDecodeError:
            # Narrowed from a bare except: only fall back to UTF-8 on a
            # decode failure, so genuine I/O errors still surface.
            with open(file_name_save, encoding="utf-8") as f:
                content = f.read()
        content = content.strip().replace("\n", "").replace(" ", "")
        abst_zh = uilt_content(content)
        return abst_zh, content
@app.route("/", methods=["POST"])
def handle_query():
print(request.remote_addr)
title = request.json["title"]
abst_zh = request.json["abst_zh"] # txt
content = request.json["content"]
# request.form.get('prompt')
dataBases = request.form.get("dataBases")
minSimilarity = request.form.get("minSimilarity") # txt
minWords = request.form.get("minWords")
title = request.form.get("title")
author = request.form.get("author") # txt
file = request.files.get('file')
token = request.form.get("token")
account = request.form.get("account")
goodsId = request.form.get("goodsId")
callbackUrl = request.form.get("callbackUrl")
abst_zh, content = ulit_request_file(file)
# 调用宇鹏查询相似十篇
# recall_data_list_dict = recall_10(title, abst_zh, content)
with open("data/rell_json.txt") as f:
recall_data_list_dict = eval(f.read())
recall_data_list_dict = recall_10(title, abst_zh, content)
# with open("data/rell_json.txt") as f:
# recall_data_list_dict = eval(f.read())
# 读取文章转化成格式数据

Loading…
Cancel
Save