
Add detection of a single sentence repeated across multiple passages

master · majiahui@haimaqingfan.com · 2 years ago · commit 1f1bacbd2c
1 changed file: flask_check_bert_test.py (451 changed lines)
@@ -121,6 +121,287 @@ def rouge_pre_m(text, df_train_nuoche):
    return return_list


# Using a single section as an example
def similar_content_func():
    '''
    Sample "duplicated paper" entry (placeholder data)
    :return:
    '''
    return [{
        "content": "重复的内容标红",
        "thesis_info": "论文标题 + 论文作者 + 来源 + 年份日期--敞开式奶牛舍环境控制系统的设计 李晓红,刘晓丽,余泳昌 - 商丘工学院机械工程学院 - 2015-04-01",
        "title": "标题",
        "year": "日期",
        "degree": "来源",
        "author": "作者"
    }]

def original_text_contrast_func(data_sentence_dan, paper_dict):
    '''
    Detailed comparison info for one duplicated original sentence
    :param data_sentence_dan:
    :param paper_dict:
    :return:
    '''
    original_text = ""
    start = len(data_sentence_dan[0][1])
    end = 0
    similar_content = []
    for i in data_sentence_dan:  # there may be several hits; for now assume at least one
        similar_content_dan = {
            "paper_red_len_word": "",
            "content": "重复的内容标红",
            "thesis_info": "论文标题 + 论文作者 + 来源 + 年份日期--敞开式奶牛舍环境控制系统的设计 李晓红,刘晓丽,余泳昌 - 商丘工学院机械工程学院 - 2015-04-01",
            "title": "标题",
            "year": "日期",
            "degree": "来源",
            "author": "作者",
            "paper_len_word": ""
        }
        sentence_0_bool, sentence_0_dan_red = original_text_marked_red(
            i[1], paper_dict[i[0]][0], paper_dict[i[0]][1])  # text_original, bert_text, bert_text_pre
        sentence_1_bool, sentence_1_dan_red = original_text_marked_red(
            i[2], paper_dict[i[0]][2], paper_dict[i[0]][3])  # text_original, bert_text, bert_text_pre

        start_dan = sentence_0_dan_red.index("<red>")
        end_dan = sentence_0_dan_red.index("</red>") - len("<red>")
        if start_dan < start:
            start = start_dan
        if end_dan > end:
            end = end_dan

        if sentence_0_bool == False or sentence_1_bool == False:
            continue

        similar_content_dan["content"] = sentence_1_dan_red
        similar_content_dan["title"] = i[3]["title"]
        similar_content_dan["author"] = i[3]["author"]
        similar_content_dan["degree"] = i[3]["degree"]
        similar_content_dan["year"] = i[3]["year"]
        similar_content_dan["paper_len_word"] = i[3]["paper_len_word"]
        similar_content_dan["paper_red_len_word"] = len(paper_dict[i[0]][3])
        thesis_info = " ".join(
            [similar_content_dan["title"], similar_content_dan["author"],
             similar_content_dan["degree"], similar_content_dan["year"]])
        similar_content_dan["thesis_info"] = thesis_info
        similar_content.append(similar_content_dan)

    # Wrap the merged repeated span of the original sentence in <red> tags
    original_text_list = list(data_sentence_dan[0][1])
    original_text_list.insert(end, "</red>")
    original_text_list.insert(start, "<red>")
    original_text = "".join(original_text_list)

    return_info = {
        "original_text": original_text,
        "dan_sentence_word_nums": end - start,
        "similar_content": similar_content
    }
    return return_info

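# --- Illustrative sketch, not part of the commit ---
# The shapes original_text_contrast_func assumes, inferred from how it indexes
# its arguments; the names and values below are made up. Each row of
# data_sentence_dan describes one recalled hit for the same original sentence:
#     [paper_dict_index, original_sentence, similar_sentence, source_info]
# and paper_dict[paper_dict_index] holds the four fields returned by the marking
# service (original text, predicted repeated fragment of it, similar text,
# predicted repeated fragment of it).
_demo_source_info = {"title": "标题", "author": "作者", "degree": "来源",
                     "year": "2015", "paper_len_word": 5000}
_demo_sentence_dan = [[0, "原文句子", "相似句子", _demo_source_info]]
# original_text_contrast_func(_demo_sentence_dan, paper_dict) would return
#     {"original_text": "...<red>...</red>...",
#      "dan_sentence_word_nums": <length of the marked span>,
#      "similar_content": [...]}
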
def repeat_quote_info_func(original_text_contrast):
    '''
    Repeated-source (citation) info, aggregated per source paper
    :return:
    '''
    chongfuwendang = {}
    for sentence_dan in original_text_contrast:
        for i in sentence_dan["similar_content"]:
            thesis_info = i["thesis_info"]
            if thesis_info not in chongfuwendang:
                chongfuwendang[thesis_info] = {
                    "quote": False,
                    "thesis_author": i["author"],
                    "thesis_date": i["year"],
                    "thesis_info": thesis_info,
                    "thesis_repeat_rate": (i["paper_red_len_word"] / i["paper_len_word"]) * 100,  # round(repetition_rate, 3) * 100
                    "thesis_title": i["title"],
                    "thesis_link": "",
                    "thesis_publish": i["degree"],
                    "thesis_repeat_word": i["paper_red_len_word"],
                    "thesis_teacher": "",
                    "paper_len_word": i["paper_len_word"]
                }
            else:
                chongfuwendang[thesis_info]["thesis_repeat_word"] += i["paper_red_len_word"]
                chongfuwendang[thesis_info]["thesis_repeat_rate"] = (chongfuwendang[thesis_info]["thesis_repeat_word"] /
                                                                     chongfuwendang[thesis_info]["paper_len_word"]) * 100

    chongfuwendang = sorted(chongfuwendang.items(),
                            key=lambda x: x[1]["thesis_repeat_rate"], reverse=False)
    chongfuwendang_list = [i[1] for i in chongfuwendang]
    return chongfuwendang_list

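# --- Illustrative sketch, not part of the commit ---
# Two matched sentences that hit the same source paper are merged into a single
# entry; the numbers below are made up.
_demo_contrast = [
    {"similar_content": [{"thesis_info": "标题 作者 来源 日期", "title": "标题",
                          "author": "作者", "degree": "来源", "year": "日期",
                          "paper_red_len_word": 30, "paper_len_word": 5000}]},
    {"similar_content": [{"thesis_info": "标题 作者 来源 日期", "title": "标题",
                          "author": "作者", "degree": "来源", "year": "日期",
                          "paper_red_len_word": 20, "paper_len_word": 5000}]},
]
# repeat_quote_info_func(_demo_contrast) yields one entry with
# thesis_repeat_word == 50 and thesis_repeat_rate == 50 / 5000 * 100 == 1.0
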
def total_data_func(section_data_list):
    '''
    Overall (whole-document) statistics
    :return:
    '''
    # Each item carries the section_details fields:
    # "end_page_index", "name", "repeat_rate", "repeat_words", "start_page_index",
    # "words", "original_text", "original_text_oneself",
    # "original_text_contrast" (detailed comparison info),
    # "repeat_quote_info" (repeated-source info)
    repeat_words = 0
    words = 0
    for i in section_data_list:
        repeat_words += i["repeat_words"]
        words += i["words"]
    exclude_personal_rate = str(repeat_words / words * 100) + "%"
    exclude_quote_rate = str(repeat_words / words * 100) + "%"
    single_max_rate = section_data_list[0]["repeat_quote_info"][0]["thesis_repeat_rate"]
    single_max_repeat_words = section_data_list[0]["repeat_quote_info"][0]["thesis_repeat_word"]
    total_repeat_rate = str(repeat_words / words * 100) + "%"
    total_repeat_words = repeat_words
    total_words = words

    return {
        "back_repeat_words": "",
        "exclude_personal_rate": exclude_personal_rate,
        "exclude_quote_rate": exclude_quote_rate,
        "front_repeat_words": "",
        "single_max_rate": single_max_rate,
        "single_max_repeat_words": single_max_repeat_words,
        "suspected_paragraph": "",
        "suspected_paragraph_max_repeat_words": "",
        "suspected_paragraph_min_repeat_words": "",
        "total_paragraph": "",
        "total_repeat_rate": total_repeat_rate,
        "total_repeat_words": total_repeat_words,
        "total_words": total_words,
        "tables": 0
    }

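# --- Illustrative sketch, not part of the commit ---
# With a single section of 1000 characters containing 50 repeated characters,
# total_data_func reports a 5.0% overall rate; single_max_rate and
# single_max_repeat_words are read from the first entry of that section's
# repeat_quote_info list.
_demo_section_details = [{
    "repeat_words": 50,
    "words": 1000,
    "repeat_quote_info": [{"thesis_repeat_rate": 1.0, "thesis_repeat_word": 50}],
}]
# total_data_func(_demo_section_details)["total_repeat_rate"] == "5.0%"
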
def section_data_func_dan():
    '''
    Per-section info (single empty template)
    :return:
    '''
    # {
    #     "section_name": section name,
    #     "section_repeat_rate": repeat rate,
    #     "section_repeat_words": repeated word count,
    #     "section_words": section word count,
    #     "oneself_repeat_words": repeated words excluding the author's own papers,
    #     "reference_repeat_words": repeated words excluding quotations,
    #     "section_oneself_rate": repeat rate excluding the author's own papers
    # }
    return {
        "section_name": "",
        "section_repeat_rate": "",
        "section_repeat_words": "",
        "section_words": "",
        "oneself_repeat_words": "",
        "reference_repeat_words": "",
        "section_oneself_rate": ""
    }

def section_data_func(section_details):
    '''
    Per-section summary built from the section details
    :return:
    '''
    # section_details carries: "end_page_index", "name", "repeat_rate",
    # "repeat_words", "start_page_index", "words", "original_text",
    # "original_text_oneself", "original_text_contrast", "repeat_quote_info"
    section_name = section_details["name"]
    section_repeat_rate = section_details["repeat_rate"]
    section_repeat_words = section_details["repeat_words"]
    section_words = section_details["words"]
    oneself_repeat_words = section_details["repeat_words"]
    reference_repeat_words = section_details["repeat_words"]
    section_oneself_rate = section_details["repeat_rate"]

    return {
        "section_name": section_name,
        "section_repeat_rate": section_repeat_rate,
        "section_repeat_words": section_repeat_words,
        "section_words": section_words,
        "oneself_repeat_words": oneself_repeat_words,
        "reference_repeat_words": reference_repeat_words,
        "section_oneself_rate": section_oneself_rate
    }

def section_details_func(data_section_dan, paper_dict):
    '''
    Detailed info for one section
    :param data_section_dan:
    :param paper_dict:
    :return:
    '''
    original_text_contrast = []
    section_repeat_rate = ""
    repeat_words = 0
    section_words = 0
    oneself_repeat_words = ""
    reference_repeat_words = ""
    section_oneself_rate = ""
    original_text_list = []

    for sentence_dan in data_section_dan:
        original_text_contrast_dan = original_text_contrast_func(sentence_dan, paper_dict)
        original_text_contrast.append(original_text_contrast_dan)
        repeat_words += original_text_contrast_dan["dan_sentence_word_nums"]
        original_text_list.append(original_text_contrast_dan["original_text"])
        section_words += len(sentence_dan[0][1])

    original_text = "".join(original_text_list)
    repeat_rate = repeat_words / section_words
    repeat_quote_info = repeat_quote_info_func(original_text_contrast)

    return {
        "end_page_index": 0,
        "name": "第1部分",
        "repeat_rate": repeat_rate,
        "repeat_words": repeat_words,
        "start_page_index": 0,
        "words": section_words,
        "original_text": original_text,
        "original_text_oneself": original_text,
        "original_text_contrast": original_text_contrast,
        "repeat_quote_info": repeat_quote_info
    }

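# --- Illustrative note, not part of the commit ---
# How the helpers above are meant to compose (see accurate_check_rouge below):
#     data = [data_section_dan_1, data_section_dan_2, ...]          # one entry per section
#     section_details_list = [section_details_func(d, paper_dict) for d in data]
#     section_data_list = [section_data_func(s) for s in section_details_list]
#     total_data = total_data_func(section_details_list)
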
def accurate_check_rouge(
        title,
        author,
@@ -220,139 +501,59 @@ def accurate_check_rouge
    chongfuwendang = {}
-    for paper_dict_dan_id, sentence_0_dan, sentence_1_dan, sim_paper_name_dan in zip(range(len(paper_dict)), sentence_0_list_new, sentence_1_list_new, sim_paper_name):
-        print([sentence_0_dan, sentence_1_dan])
-        original_text_contrast_dict = {
-            "original_text": "",
-            "similar_content": [
-                {
-                    "content": "",
-                    "thesis_info": "",
-                    "title": "",
-                    "year": "",
-                    "degree": "",
-                    "author": "",
-                }
-            ]
-        }
-        try:
-            sentence_0_bool, sentence_0_dan_red = original_text_marked_red(sentence_0_dan, paper_dict[paper_dict_dan_id][0], paper_dict[paper_dict_dan_id][1])  # text_original, bert_text, bert_text_pre
-        except:
-            print("报错", [sentence_0_dan, paper_dict[paper_dict_dan_id][0], paper_dict[paper_dict_dan_id][1]])
-            continue
-        # 9/0
-        sentence_1_bool, sentence_1_dan_red = original_text_marked_red(sentence_1_dan, paper_dict[paper_dict_dan_id][2], paper_dict[paper_dict_dan_id][3])  # text_original, bert_text, bert_text_pre
-        if sentence_0_bool == False or sentence_1_bool == False:
-            continue
-        dan_sentence_word_nums = len(paper_dict[paper_dict_dan_id][1])
-        sentence_word_nums += dan_sentence_word_nums
-        original_text.append(sentence_0_dan_red)
-        original_text_contrast_dict["original_text"] = "此处有 {} 字相似\n".format(dan_sentence_word_nums) + sentence_0_dan_red
-        thesis_info = " ".join([sim_paper_name_dan["title"], sim_paper_name_dan["author"], sim_paper_name_dan["degree"], sim_paper_name_dan["year"]])
-        original_text_contrast_dict["similar_content"][0]["content"] = sentence_1_dan_red
-        original_text_contrast_dict["similar_content"][0]["title"] = sim_paper_name_dan["title"]
-        original_text_contrast_dict["similar_content"][0]["author"] = sim_paper_name_dan["author"]
-        original_text_contrast_dict["similar_content"][0]["degree"] = sim_paper_name_dan["degree"]
-        original_text_contrast_dict["similar_content"][0]["year"] = sim_paper_name_dan["year"]
-        original_text_contrast_dict["similar_content"][0]["thesis_info"] = thesis_info
-        original_text_contrast.append(original_text_contrast_dict)
-        # for i in repeat_quote_info:
-        #     if
-        if thesis_info not in chongfuwendang:
-            chongfuwendang[thesis_info] = {
-                "quote": False,
-                "thesis_author": sim_paper_name_dan["author"],
-                "thesis_date": sim_paper_name_dan["year"],
-                "thesis_info": thesis_info,
-                "thesis_repeat_rate": (dan_sentence_word_nums / sim_paper_name_dan["paper_len_word"]) * 100,  # round(repetition_rate, 3) * 100
-                "thesis_title": sim_paper_name_dan["title"],
-                "thesis_link": "",
-                "thesis_publish": sim_paper_name_dan["degree"],
-                "thesis_repeat_word": dan_sentence_word_nums,
-                "thesis_teacher": "",
-                "paper_len_word": sim_paper_name_dan["paper_len_word"]
-            }
-        else:
-            chongfuwendang[thesis_info]["thesis_repeat_word"] += dan_sentence_word_nums
-            chongfuwendang[thesis_info]["thesis_repeat_rate"] = (chongfuwendang[thesis_info]["thesis_repeat_word"] / chongfuwendang[thesis_info]["paper_len_word"]) * 100
-    chongfuwendang = sorted(chongfuwendang.items(),
-                            key=lambda x: x[1]["thesis_repeat_rate"], reverse=False)
-    for i in range(len(chongfuwendang)):
-        repeat_paper_one_info_dict = chongfuwendang[i][1]
-        repeat_paper_one_info_dict.pop("paper_len_word")
-        repeat_paper_one_info_dict["thesis_repeat_rate"] = str(round(repeat_paper_one_info_dict["thesis_repeat_rate"], 1)) + "%"
-        repeat_quote_info.append(repeat_paper_one_info_dict)
-    original_text = "".join(original_text)
-    repetition_rate = sentence_word_nums / len(text_paper)
-    repetition_rate = round(repetition_rate, 3) * 100
+    print("paper_dict", paper_dict)
+    print("sentence_0_list_new", sentence_0_list_new)
+    print("sentence_1_list_new", sentence_1_list_new)
+    print("sim_paper_name", sim_paper_name)
+    similar_content_control = [[]]
+
+    with open("data/10235513_大型商业建筑人员疏散设计研究_沈福禹/paper_dict.json", "w") as f:
+        json.dump(paper_dict, f, ensure_ascii=False)
+
+    # Group consecutive hits that share the same original sentence
+    sentence_0_list_new_cursor = sentence_0_list_new[0]
+    for paper_dict_dan_id, sentence_0_dan, sentence_1_dan, sim_paper_name_dan in zip(range(len(paper_dict)),
+                                                                                     sentence_0_list_new,
+                                                                                     sentence_1_list_new,
+                                                                                     sim_paper_name):
+        if sentence_0_list_new_cursor != sentence_0_dan:
+            similar_content_control.append([[paper_dict_dan_id, sentence_0_dan, sentence_1_dan, sim_paper_name_dan]])
+            sentence_0_list_new_cursor = sentence_0_dan  # advance the cursor so following hits join the new group
+        else:
+            similar_content_control[-1].append([paper_dict_dan_id, sentence_0_dan, sentence_1_dan, sim_paper_name_dan])
+
+    data = [similar_content_control]
+
+    # Simulate multiple sections (a single section for now)
+    section_details_list = []
+    for data_dan in data:
+        data_section_dan = data_dan
+        # Detailed info for this section
+        section_details = section_details_func(data_section_dan, paper_dict)
+        section_details_list.append(section_details)
+
+    # Simulate multiple sections
+    section_data_list = []
+    for section_details in section_details_list:
+        section_data = section_data_func(section_details)
+        section_data_list.append(section_data)  # collect the per-section summaries
+
+    total_data = total_data_func(section_details_list)

    format = '%Y-%m-%d %H:%M:%S'
    value = time.localtime(int(time.time()))
    dt = time.strftime(format, value)
-    return {
-        "author": author,
-        "check_time": dt,
-        "title": title,
-        "time_range": "1900-01-01至2023-08-08",
-        "section_data": [
-            {
-                "oneself_repeat_words": sentence_word_nums,
-                "reference_repeat_words": sentence_word_nums,
-                "section_name": "第1部分",
-                "section_oneself_rate": "{}%".format(repetition_rate),
-                "section_repeat_rate": "{}%".format(repetition_rate),
-                "section_repeat_words": sentence_word_nums,
-                "section_words": len(text_paper)
-            }
-        ],
-        "section_details": [
-            {
-                "end_page_index": 0,
-                "name": "",
-                "repeat_rate": "",
-                "repeat_words": "",
-                "words": "",
-                "original_text": original_text,
-                "original_text_oneself": original_text,
-                "original_text_contrast": original_text_contrast,
-                "repeat_quote_info": repeat_quote_info
-            }
-        ],
-        "total_data": {
-            "back_repeat_words": "",
-            "exclude_personal_rate": "{}%".format(repetition_rate),
-            "exclude_quote_rate": "{}%".format(repetition_rate),
-            "foot_end_note": "0",
-            "front_repeat_words": "",
-            "single_max_rate": "",
-            "single_max_repeat_words": "",
-            "suspected_paragraph": "1",
-            "suspected_paragraph_max_repeat_words": "",
-            "suspected_paragraph_min_repeat_words": "",
-            "tables": "0",
-            "total_paragraph": "1",
-            "total_repeat_rate": "{}%".format(repetition_rate),
-            "total_repeat_words": sentence_word_nums,
-            "total_words": len(text_paper)
-        }
-    }
+    paper_data = {
+        "author": author,
+        "check_time": dt,
+        "time_range": "1900-01-01至2023-08-08",
+        "title": title,
+        "total_data": total_data,
+        "section_data": section_data_list,
+        "section_details": section_details_list
+    }
+
+    return paper_data
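# --- Illustrative note, not part of the commit ---
# Effect of the grouping loop above: with three hits whose first two share the
# same original sentence, e.g. sentence_0_list_new == ["句A", "句A", "句B"],
# similar_content_control becomes
#     [[[0, "句A", ...], [1, "句A", ...]],
#      [[2, "句B", ...]]]
# and each inner group is later handled as one data_sentence_dan by
# original_text_contrast_func via section_details_func.
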
@@ -519,28 +720,8 @@ def biaohong_bert_predict(sentence_0_list, sentence_1_list):
    :return:
    '''
-    # sentence_0_list = []
-    # sentence_1_list = []
-    # sim_paper_name = []
-    #
-    # for i in biaohong_list:
-    #     sentence_0_list.append("。".join([paper_list[i[0][0]], paper_list[i[0][1]], paper_list[i[0][2]]]))
-    #     sentence_1_list.append("。".join([recall_data_list[i[1][1]], recall_data_list[i[1][1]], recall_data_list[i[1][2]]]))
    paper_dict = dialog_line_parse("http://192.168.31.74:16003/", {"sentence_0": sentence_0_list, "sentence_1": sentence_1_list})["resilt"]
-    # paper_dict
-    # print("原文:".format(i), paper_dict[i][0])
-    # print("原文标红:".format(i), paper_dict[i][1])
-    # print("相似:".format(i), paper_dict[i][2])
-    # print("相似标红:".format(i), paper_dict[i][3])
-    # original_text
-    #
-    # for paper_dict_dan, sentence_0_dan, sentence_1_dan in zip(paper_dict, sentence_0_list, sentence_1_list):
-    #     original_text_marked_red
    return paper_dict
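# --- Illustrative note, not part of the commit ---
# Based on how callers index the result, the marking service at
# http://192.168.31.74:16003/ appears to return, under its (misspelled) "resilt"
# key, one four-element entry per sentence pair:
#     paper_dict[i][0]  original sentence sent to the model
#     paper_dict[i][1]  predicted repeated fragment of the original sentence
#     paper_dict[i][2]  similar (recalled) sentence
#     paper_dict[i][3]  predicted repeated fragment of the similar sentence
# This is an inference from the surrounding code, not a documented contract.
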
def ulit_text(title, text):


@@ -626,7 +807,7 @@ def ulit_recall_paper(recall_data_list_dict):
    data = []
-    for i in list(recall_data_list_dict.items()):
+    for i in list(recall_data_list_dict.items())[:10]:
        data_one = processing_one_text(i[0])
        data.extend(data_one)
