
Full version v1.0, consistent with the 飞度 (Feidu) duplicate-check report

master · majiahui@haimaqingfan.com · 2 years ago · commit 875f04a6c9
flask_check_bert_test.py (243 changed lines)

@@ -248,15 +248,16 @@ def original_text_contrast_func(data_sentence_dan, paper_dict, centent_list):
original_text_start = "".join(original_text_list[:start])
original_text_end = "".join(original_text_list[end:])
print(data_sentence_dan)
if data_sentence_dan[0][4][0]-1 < 0:
start_sen = ""
else:
start_sen = centent_list[data_sentence_dan[0][4][0]-1]
if data_sentence_dan[0][4][0]+1 > len(centent_list) -1:
if data_sentence_dan[0][4][-1]+1 >= len(centent_list):
end_sen = ""
else:
end_sen = centent_list[data_sentence_dan[0][4][2]+1]
end_sen = centent_list[data_sentence_dan[0][4][-1]+1]
start_sen = start_sen + original_text_start
end_sen = original_text_end + end_sen
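The hunk above switches the end-of-document guard to the last matched sentence index instead of a fixed offset. A minimal sketch of the intended neighbor lookup, assuming `indices` is the list of matched sentence positions (names are illustrative, not from the diff):

def neighbor_sentences(centent_list, indices):
    # previous sentence of the first hit, or "" at the start of the document
    start_sen = centent_list[indices[0] - 1] if indices[0] - 1 >= 0 else ""
    # sentence after the last hit, or "" at the end of the document
    end_sen = centent_list[indices[-1] + 1] if indices[-1] + 1 < len(centent_list) else ""
    return start_sen, end_sen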
@@ -428,7 +429,7 @@ def section_data_func(section_details):
}
def section_details_func(data_section_dan, paper_dict, num_words, centent_list):
def section_details_func(data_section_dan, paper_dict, num_words, centent_list, index_centent_list_dan):
'''
Chapter detail information
:param original_text_contrast:
@@ -458,7 +459,7 @@ def section_details_func(data_section_dan, paper_dict, num_words, centent_list):
return {
"end_page_index": 0,
"name": "1部分",
"name": "{}部分".format(str(index_centent_list_dan)),
"repeat_rate": repeat_rate,
"repeat_words": repeat_words,
"start_page_index": 0,
@@ -470,7 +471,8 @@ def section_details_func(data_section_dan, paper_dict, num_words, centent_list):
}
def check_dict(similar_content_control, paper_dict, num_words, title, author, centent_list):
def check_dict(similar_content_control_zong, paper_dict_zong, num_words_zong, title, author, chapter_data, index_centent_list):
# similar_content_control, paper_dict, num_words, title, author, centent_list
'''
Build the return dictionary
:param similar_content_control:
@@ -480,16 +482,14 @@ def check_dict(similar_content_control, paper_dict, num_words, title, author, ce
:param author:
:return:
'''
if paper_dict != []:
data = [similar_content_control]
if paper_dict_zong != []:
# Simulate multiple chapters
section_details_list = []
for data_dan in data:
data_section_dan = data_dan
for data_section_dan, paper_dict, num_words, centent_list, index_centent_list_dan in zip(similar_content_control_zong, paper_dict_zong, num_words_zong, chapter_data, index_centent_list):
# Chapter detail information
section_details = section_details_func(data_section_dan, paper_dict, num_words, centent_list)
section_details = section_details_func(data_section_dan, paper_dict, num_words, centent_list, index_centent_list_dan)
section_details_list.append(section_details)
# Simulate multiple chapters
@@ -572,26 +572,37 @@ def check_dict(similar_content_control, paper_dict, num_words, title, author, ce
}
return paper_data
def accurate_check_rouge(
title,
author,
text_paper,
recall_data_list
):
def split_chapter(centent_list):
'''
Precise duplicate check for similar sentences
:param text:
:param recall_data_list: list [[sentence, filename],[sentence, filename],[sentence, filename]]
:return:
:param centent_list:
:return: [[[sentence, sentence, ... sentence], 2000], [[sentence, sentence, ... sentence], 2000]]
'''
# Text processing
# centent_list = []
print("text_paper", len(text_paper))
text_paper = str(text_paper).replace("\n", "")
centent_list_old = text_paper.split("。")
data_zong = []
sentence_word_nums = 0
centent_list_new = []
zishu = 2000
dangqianzishu = 0
i = 0
centent_list_dan = []
while True:
if i >= len(centent_list):
if centent_list_dan != []:
centent_list_new.append([centent_list_dan, dangqianzishu])
break
centent_list_dan.append(centent_list[i])
dangqianzishu += len(centent_list[i])
if dangqianzishu > zishu:
centent_list_new.append([centent_list_dan, dangqianzishu])
dangqianzishu = 0
centent_list_dan = []
i += 1
return centent_list_new
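split_chapter greedily packs consecutive sentences into pseudo-chapters of roughly zishu = 2000 characters and returns each group together with its character count. A small usage sketch under that assumption (sample data is made up):

sentences = ["a" * 1200 + "。", "b" * 1200 + "。", "cc" + "。"]
chapters = split_chapter(sentences)
for sents, char_count in chapters:
    print(len(sents), char_count)
# -> 2 2402   (the first chunk closes as soon as it crosses the 2000-character threshold)
# -> 1 3      (the trailing remainder becomes its own chunk)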
def chapter_check(dan_chapter_data, recall_data_list):
# =============================================================================================
# Multi-process algorithm
# ROUGE-based duplicate check
@@ -599,15 +610,6 @@ def accurate_check_rouge(
# rst = []
# p = Pool(nums_cpus) # the process pool holds n worker subprocesses
#
# # print("centent_list", centent_list)
#
# num_words = 0
# centent_list = []
# for i in centent_list_old:
# num_words += len(i)
# if len(i) < 300:
# centent_list.append(i)
#
# print("num_words", num_words)
# for i in range(len(centent_list)):
# text = centent_list[i]
@@ -624,20 +626,13 @@ def accurate_check_rouge(
# =========================================================================================================
rst = []
num_words = 0
centent_list = []
for i in centent_list_old:
num_words += len(i)
if len(i) < 300:
centent_list.append(i + "。")
if i == "":
continue
for i in range(len(centent_list)):
text = centent_list[i]
for i in range(len(dan_chapter_data)):
text = dan_chapter_data[i]
rst.append(rouge_pre_m(text, recall_data_list))
# ========================================================================================================
data_zong = []
for i in range(len(rst)):
# print(rst[i])
data_zong.append(rst[i])
@@ -670,7 +665,8 @@ def accurate_check_rouge(
# Continue filtering with the ROUGE method
bool_check_sentense = rouge_pre_m_1(bool_check_sentense, centent_list, recall_data_list) # [[[1, 223],[1, 226], [1, 562]],[[2, 243],[2, 226], [2, 561]]]
bool_check_sentense = rouge_pre_m_1(bool_check_sentense, dan_chapter_data,
recall_data_list) # [[[1, 223],[1, 226], [1, 562]],[[2, 243],[2, 226], [2, 561]]]
print("bool_check_sentense", bool_check_sentense)
print("找出相似的句子序号完成")
@@ -692,6 +688,7 @@ def accurate_check_rouge(
for i in range(len(biao_red)):
for j in range(len(biao_red[i])):
print("i,j",i, j)
# if recall_data_list[biao_red[i][j][1][0]][1] == recall_data_list[biao_red[i][j][1][1]][1] == recall_data_list[biao_red[i][j][1][2]][1]:
# sentence_0_list.append("".join([centent_list[biao_red[i][j][0][0]], centent_list[biao_red[i][j][0][1]], centent_list[biao_red[i][j][0][2]]]))
# sentence_1_list.append(
@@ -700,27 +697,31 @@ def accurate_check_rouge(
# else:
# continue
file_name = recall_data_list[biao_red[i][j][1][1]][1]
sentence_0_list_dan = []
sentence_1_list_dan = []
sentence_0_list_dan_index = []
houxuna_file_list = [
[recall_data_list[biao_red[i][j][1][0]][1], centent_list[biao_red[i][j][0][0]], recall_data_list[biao_red[i][j][1][0]][0]],
[recall_data_list[biao_red[i][j][1][1]][1], centent_list[biao_red[i][j][0][1]], recall_data_list[biao_red[i][j][1][1]][0]],
[recall_data_list[biao_red[i][j][1][2]][1], centent_list[biao_red[i][j][0][2]], recall_data_list[biao_red[i][j][1][2]][0]]
]
# houxuna_file_list = [
# [recall_data_list[biao_red[i][j][1][0]][1], dan_chapter_data[biao_red[i][j][0][0]],
# recall_data_list[biao_red[i][j][1][0]][0]],
# [recall_data_list[biao_red[i][j][1][1]][1], dan_chapter_data[biao_red[i][j][0][1]],
# recall_data_list[biao_red[i][j][1][1]][0]],
# [recall_data_list[biao_red[i][j][1][2]][1], dan_chapter_data[biao_red[i][j][0][2]],
# recall_data_list[biao_red[i][j][1][2]][0]]
# ]
sentence_0_list_dan = [dan_chapter_data[biao_red[i][j][0][index_simsentence]] for index_simsentence in range(len(biao_red[i][j][0]))]
houxuna_file_list = [[recall_data_list[biao_red[i][j][1][index_simsentence]][1], recall_data_list[biao_red[i][j][1][index_simsentence]][0]] for index_simsentence in range(len(biao_red[i][j][0]))]
for dan_sen_info in houxuna_file_list:
sentence_0_list_dan.append(dan_sen_info[1])
if dan_sen_info[0] == file_name:
sentence_1_list_dan.append(dan_sen_info[2])
sentence_1_list_dan.append(dan_sen_info[1])
if sentence_0_list_dan != [] and sentence_1_list_dan != []:
sentence_0_list.append("".join(sentence_0_list_dan))
sentence_1_list.append("".join(sentence_1_list_dan))
original_sentence_index.append(biao_red[i][j][0])
sim_paper_name.append(recall_data_list[biao_red[i][j][1][1]][1])
print("待标红句子筛选完成")
sentence_0_list_new = []
sentence_1_list_new = []
@@ -759,21 +760,73 @@ def accurate_check_rouge(
# with open("data/10235513_大型商业建筑人员疏散设计研究_沈福禹/paper_dict.json", "w") as f:
# json.dump(paper_dict, f, ensure_ascii=False)
sentence_0_list_new_cursor = sentence_0_list_new[0]
for paper_dict_dan_id, sentence_0_dan, sentence_1_dan, sim_paper_name_dan, original_sentence_index_dan in zip(range(len(paper_dict)),
for paper_dict_dan_id, sentence_0_dan, sentence_1_dan, sim_paper_name_dan, original_sentence_index_dan in zip(
range(len(paper_dict)),
sentence_0_list_new,
sentence_1_list_new,
sim_paper_name,
original_sentence_index):
if sentence_0_list_new_cursor != sentence_0_dan:
similar_content_control.append([[paper_dict_dan_id, sentence_0_dan, sentence_1_dan, sim_paper_name_dan, original_sentence_index_dan]])
similar_content_control.append(
[[paper_dict_dan_id, sentence_0_dan, sentence_1_dan, sim_paper_name_dan, original_sentence_index_dan]])
sentence_0_list_new_cursor = sentence_0_dan
else:
similar_content_control[-1].append([paper_dict_dan_id, sentence_0_dan, sentence_1_dan, sim_paper_name_dan, original_sentence_index_dan])
similar_content_control[-1].append(
[paper_dict_dan_id, sentence_0_dan, sentence_1_dan, sim_paper_name_dan, original_sentence_index_dan])
return similar_content_control, paper_dict
def accurate_check_rouge(
title,
author,
text_paper,
recall_data_list
):
'''
Precise duplicate check for similar sentences
:param text:
:param recall_data_list: list [[sentence, filename],[sentence, filename],[sentence, filename]]
:return:
'''
# Text processing
# centent_list = []
print("text_paper", len(text_paper))
text_paper = str(text_paper).replace("\n", "")
centent_list_old = text_paper.split("。")
sentence_word_nums = 0
centent_list = []
for i in centent_list_old:
if len(i) < 300:
centent_list.append(i + "。")
if i == "":
continue
paper_data = check_dict(similar_content_control, paper_dict, num_words, title, author, centent_list)
centent_list_zong = split_chapter(centent_list)
# Split into chapters
similar_content_control_zong = []
paper_dict_zong = []
num_words_zong = []
chapter_data = []
index_centent_list = []
for index_centent_list_zong in range(len(centent_list_zong)):
dan_chapter_data, dan_chapter_num_words = centent_list_zong[index_centent_list_zong][0], centent_list_zong[index_centent_list_zong][1]
similar_content_control, paper_dict = chapter_check(dan_chapter_data, recall_data_list)
similar_content_control_zong.append(similar_content_control)
paper_dict_zong.append(paper_dict)
num_words_zong.append(dan_chapter_num_words)
chapter_data.append(dan_chapter_data)
index_centent_list.append(index_centent_list_zong)
paper_data = check_dict(similar_content_control_zong, paper_dict_zong, num_words_zong, title, author, chapter_data, index_centent_list)
# data = [similar_content_control]
#
# # Simulate multiple chapters
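Taken together, the refactor turns accurate_check_rouge into a per-chapter pipeline: split_chapter chunks the sentence list, chapter_check runs the ROUGE-based comparison on each chunk, and check_dict aggregates the per-chapter results. A condensed paraphrase of that control flow (a sketch, not a drop-in replacement for the code in this commit):

def per_chapter_check(title, author, centent_list, recall_data_list):
    chapters = split_chapter(centent_list)  # [[sentences, char_count], ...]
    controls, dicts, word_counts, chapter_sents, chapter_ids = [], [], [], [], []
    for idx, (sentences, char_count) in enumerate(chapters):
        control, paper_dict = chapter_check(sentences, recall_data_list)
        controls.append(control)
        dicts.append(paper_dict)
        word_counts.append(char_count)
        chapter_sents.append(sentences)
        chapter_ids.append(idx)
    return check_dict(controls, dicts, word_counts, title, author, chapter_sents, chapter_ids)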
@@ -833,7 +886,7 @@ def biaohong(bool_check_sentense, data_zong, df_train_nuoche):
for j in bool_check_sentense[i]:
# print("j", j)
if j[0] + 1 >= len(data_zong):
if j[0] + 1 > len(data_zong):
tiaochu = True
break
@@ -864,6 +917,36 @@ def biaohong(bool_check_sentense, data_zong, df_train_nuoche):
# [bool_check_sentense[i+1][j][1] - 1, bool_check_sentense[i+1][j][1], bool_check_sentense[i+1][j][1] + 1]])
# biao_red.append(biao_red_dan)
elif i == len(bool_check_sentense)-1:
if end == bool_check_sentense[i][0][0]:
i += 1
break
elif bool_check_sentense[i][0][0]-1 == end + 1 and bool_check_sentense[i][0][0] == len(data_zong) -1:
index_list = [ii for ii in range(bool_check_sentense[i][0][0]-1, bool_check_sentense[i][0][0] + 1)]
elif bool_check_sentense[i][0][0]-1 == end and bool_check_sentense[i][0][0] == len(data_zong) -1:
index_list = [ii for ii in range(bool_check_sentense[i][0][0], bool_check_sentense[i][0][0] + 1)]
elif bool_check_sentense[i][0][0]-1 > end + 1 and bool_check_sentense[i][0][0] == len(data_zong) -1:
index_list = [ii for ii in range(bool_check_sentense[i][0][0] - 1, bool_check_sentense[i][0][0] + 1)]
else:
index_list = [ii for ii in range(bool_check_sentense[i][0][0] - 1, bool_check_sentense[i][0][0] + 2)]
biaohongset = set()
biao_red_dan = []
for j in range(len(bool_check_sentense[
i])): # bool_check_sentense: [[[1, 223],[1, 226], [1, 562]],[[2, 243],[2, 226], [2, 561]]]
if bool_check_sentense[i][j][1] not in biaohongset:
biao_red_dan.append([index_list,
[bool_check_sentense[i][j][1] - 1, bool_check_sentense[i][j][1],
bool_check_sentense[i][j][1] + 1]])
biaohongset.add(bool_check_sentense[i][j][1] - 1)
biaohongset.add(bool_check_sentense[i][j][1])
biaohongset.add(bool_check_sentense[i][j][1] + 1)
else:
continue
i += 1
biao_red.append(biao_red_dan)
break
elif bool_check_sentense[i][0][0] - 1 == start:
i += 1
@@ -1316,7 +1399,7 @@ def classify_accurate_check():
goodsId = data_dict['goodsId']
callbackUrl = data_dict['callbackUrl']
try:
# try:
print("查找相似的50篇完成")
print(len(content))
@@ -1349,22 +1432,22 @@ def classify_accurate_check():
print(load_result_path)
redis_.set(queue_uuid, load_result_path, 86400)
redis_.srem(db_key_querying, queue_uuid)
except:
return_text = {"resilt": "", "probabilities": None, "status_code": 401}
load_result_path = "./new_data_logs/{}.json".format(queue_uuid)
print("queue_uuid: ", queue_uuid)
print("load_result_path: ", load_result_path)
with open(load_result_path, 'w', encoding='utf8') as f2:
# ensure_ascii=False is required to write Chinese characters; otherwise they come out as Unicode escapes
# indent=2 controls the indentation of the JSON data, for readability
json.dump(return_text, f2, ensure_ascii=False, indent=4)
print(queue_uuid)
print(load_result_path)
redis_.set(queue_uuid, load_result_path, 86400)
redis_.srem(db_key_querying, queue_uuid)
# except:
# return_text = {"resilt": "", "probabilities": None, "status_code": 401}
# load_result_path = "./new_data_logs/{}.json".format(queue_uuid)
#
# print("queue_uuid: ", queue_uuid)
# print("load_result_path: ", load_result_path)
#
# with open(load_result_path, 'w', encoding='utf8') as f2:
# # ensure_ascii=False is required to write Chinese characters; otherwise they come out as Unicode escapes
# # indent=2 controls the indentation of the JSON data, for readability
# json.dump(return_text, f2, ensure_ascii=False, indent=4)
#
# print(queue_uuid)
# print(load_result_path)
# redis_.set(queue_uuid, load_result_path, 86400)
# redis_.srem(db_key_querying, queue_uuid)
@app.route("/", methods=["POST"])
