|
|
@ -248,15 +248,16 @@ def original_text_contrast_func(data_sentence_dan, paper_dict, centent_list): |
|
|
|
original_text_start = "".join(original_text_list[:start]) |
|
|
|
original_text_end = "".join(original_text_list[end:]) |
|
|
|
|
|
|
|
print(data_sentence_dan) |
|
|
|
if data_sentence_dan[0][4][0]-1 < 0: |
|
|
|
start_sen = "" |
|
|
|
else: |
|
|
|
start_sen = centent_list[data_sentence_dan[0][4][0]-1] |
|
|
|
|
|
|
|
if data_sentence_dan[0][4][0]+1 > len(centent_list) -1: |
|
|
|
if data_sentence_dan[0][4][-1]+1 >= len(centent_list): |
|
|
|
end_sen = "" |
|
|
|
else: |
|
|
|
end_sen = centent_list[data_sentence_dan[0][4][2]+1] |
|
|
|
end_sen = centent_list[data_sentence_dan[0][4][-1]+1] |
|
|
|
|
|
|
|
start_sen = start_sen + original_text_start |
|
|
|
end_sen = original_text_end + end_sen |
|
|
@ -428,7 +429,7 @@ def section_data_func(section_details): |
|
|
|
} |
|
|
|
|
|
|
|
|
|
|
|
def section_details_func(data_section_dan, paper_dict, num_words, centent_list): |
|
|
|
def section_details_func(data_section_dan, paper_dict, num_words, centent_list, index_centent_list_dan): |
|
|
|
''' |
|
|
|
章节详细信息 |
|
|
|
:param original_text_contrast: |
|
|
@ -458,7 +459,7 @@ def section_details_func(data_section_dan, paper_dict, num_words, centent_list): |
|
|
|
|
|
|
|
return { |
|
|
|
"end_page_index": 0, |
|
|
|
"name": "第1部分", |
|
|
|
"name": "第{}部分".format(str(index_centent_list_dan)), |
|
|
|
"repeat_rate": repeat_rate, |
|
|
|
"repeat_words": repeat_words, |
|
|
|
"start_page_index": 0, |
|
|
@ -470,7 +471,8 @@ def section_details_func(data_section_dan, paper_dict, num_words, centent_list): |
|
|
|
} |
|
|
|
|
|
|
|
|
|
|
|
def check_dict(similar_content_control, paper_dict, num_words, title, author, centent_list): |
|
|
|
def check_dict(similar_content_control_zong, paper_dict_zong, num_words_zong, title, author, chapter_data, index_centent_list): |
|
|
|
# similar_content_control, paper_dict, num_words, title, author, centent_list |
|
|
|
''' |
|
|
|
生成返回字典 |
|
|
|
:param similar_content_control: |
|
|
@ -480,16 +482,14 @@ def check_dict(similar_content_control, paper_dict, num_words, title, author, ce |
|
|
|
:param author: |
|
|
|
:return: |
|
|
|
''' |
|
|
|
if paper_dict != []: |
|
|
|
data = [similar_content_control] |
|
|
|
if paper_dict_zong != []: |
|
|
|
|
|
|
|
# 模拟多个章节 |
|
|
|
section_details_list = [] |
|
|
|
for data_dan in data: |
|
|
|
data_section_dan = data_dan |
|
|
|
for data_section_dan, paper_dict, num_words, centent_list, index_centent_list_dan in zip(similar_content_control_zong, paper_dict_zong, num_words_zong, chapter_data, index_centent_list): |
|
|
|
|
|
|
|
# 章节详细信息 |
|
|
|
section_details = section_details_func(data_section_dan, paper_dict, num_words, centent_list) |
|
|
|
section_details = section_details_func(data_section_dan, paper_dict, num_words, centent_list, index_centent_list_dan) |
|
|
|
section_details_list.append(section_details) |
|
|
|
|
|
|
|
# 模拟多个章节 |
|
|
@ -572,26 +572,37 @@ def check_dict(similar_content_control, paper_dict, num_words, title, author, ce |
|
|
|
} |
|
|
|
return paper_data |
|
|
|
|
|
|
|
def accurate_check_rouge( |
|
|
|
title, |
|
|
|
author, |
|
|
|
text_paper, |
|
|
|
recall_data_list |
|
|
|
): |
|
|
|
|
|
|
|
def split_chapter(centent_list, zishu=2000):
    '''
    Group consecutive sentences into pseudo-chapters by accumulated length.

    Sentences are added to the current chapter in order. As soon as the
    chapter's running character count becomes strictly greater than
    ``zishu`` the chapter is closed and a new one is started, so every
    closed chapter is longer than ``zishu``. Sentences left over at the
    end form a final chapter that may be shorter than the threshold.

    :param centent_list: sentence strings in document order
    :param zishu: character-count threshold that closes a chapter
        (defaults to 2000)
    :return: [[[sentence, sentence, ... sentence], char_count],
              [[sentence, sentence, ... sentence], char_count]]
    '''
    centent_list_new = []
    centent_list_dan = []   # sentences collected for the chapter being built
    dangqianzishu = 0       # running character count of that chapter

    for sentence in centent_list:
        centent_list_dan.append(sentence)
        dangqianzishu += len(sentence)
        # Close the chapter only after the threshold has been exceeded, so
        # the sentence that crosses the limit stays with the chapter it
        # completes instead of starting the next one.
        if dangqianzishu > zishu:
            centent_list_new.append([centent_list_dan, dangqianzishu])
            centent_list_dan = []
            dangqianzishu = 0

    # Flush the trailing sentences that never reached the threshold.
    if centent_list_dan:
        centent_list_new.append([centent_list_dan, dangqianzishu])

    return centent_list_new
|
|
|
|
|
|
|
|
|
|
|
def chapter_check(dan_chapter_data, recall_data_list): |
|
|
|
# ============================================================================================= |
|
|
|
# 多进程算法 |
|
|
|
# rouge算法查重 |
|
|
@ -599,15 +610,6 @@ def accurate_check_rouge( |
|
|
|
# rst = [] |
|
|
|
# p = Pool(nums_cpus) # 进程池中含有n个子进程 |
|
|
|
# |
|
|
|
# # print("centent_list", centent_list) |
|
|
|
# |
|
|
|
# num_words = 0 |
|
|
|
# centent_list = [] |
|
|
|
# for i in centent_list_old: |
|
|
|
# num_words += len(i) |
|
|
|
# if len(i) < 300: |
|
|
|
# centent_list.append(i) |
|
|
|
# |
|
|
|
# print("num_words", num_words) |
|
|
|
# for i in range(len(centent_list)): |
|
|
|
# text = centent_list[i] |
|
|
@ -624,20 +626,13 @@ def accurate_check_rouge( |
|
|
|
# ========================================================================================================= |
|
|
|
|
|
|
|
rst = [] |
|
|
|
num_words = 0 |
|
|
|
centent_list = [] |
|
|
|
for i in centent_list_old: |
|
|
|
num_words += len(i) |
|
|
|
if len(i) < 300: |
|
|
|
centent_list.append(i + "。") |
|
|
|
if i == "": |
|
|
|
continue |
|
|
|
for i in range(len(centent_list)): |
|
|
|
text = centent_list[i] |
|
|
|
for i in range(len(dan_chapter_data)): |
|
|
|
text = dan_chapter_data[i] |
|
|
|
rst.append(rouge_pre_m(text, recall_data_list)) |
|
|
|
|
|
|
|
# ======================================================================================================== |
|
|
|
|
|
|
|
data_zong = [] |
|
|
|
for i in range(len(rst)): |
|
|
|
# print(rst[i]) |
|
|
|
data_zong.append(rst[i]) |
|
|
@ -670,7 +665,8 @@ def accurate_check_rouge( |
|
|
|
|
|
|
|
# 继续用rouge方法筛选 |
|
|
|
|
|
|
|
bool_check_sentense = rouge_pre_m_1(bool_check_sentense, centent_list, recall_data_list) # [[[1, 223],[1, 226], [1, 562]],[[2, 243],[2, 226], [2, 561]]] |
|
|
|
bool_check_sentense = rouge_pre_m_1(bool_check_sentense, dan_chapter_data, |
|
|
|
recall_data_list) # [[[1, 223],[1, 226], [1, 562]],[[2, 243],[2, 226], [2, 561]]] |
|
|
|
|
|
|
|
print("bool_check_sentense", bool_check_sentense) |
|
|
|
print("找出相似的句子序号完成") |
|
|
@ -692,6 +688,7 @@ def accurate_check_rouge( |
|
|
|
|
|
|
|
for i in range(len(biao_red)): |
|
|
|
for j in range(len(biao_red[i])): |
|
|
|
print("i,j",i, j) |
|
|
|
# if recall_data_list[biao_red[i][j][1][0]][1] == recall_data_list[biao_red[i][j][1][1]][1] == recall_data_list[biao_red[i][j][1][2]][1]: |
|
|
|
# sentence_0_list.append("".join([centent_list[biao_red[i][j][0][0]], centent_list[biao_red[i][j][0][1]], centent_list[biao_red[i][j][0][2]]])) |
|
|
|
# sentence_1_list.append( |
|
|
@ -700,27 +697,31 @@ def accurate_check_rouge( |
|
|
|
# else: |
|
|
|
# continue |
|
|
|
|
|
|
|
|
|
|
|
file_name = recall_data_list[biao_red[i][j][1][1]][1] |
|
|
|
sentence_0_list_dan = [] |
|
|
|
sentence_1_list_dan = [] |
|
|
|
sentence_0_list_dan_index = [] |
|
|
|
houxuna_file_list = [ |
|
|
|
[recall_data_list[biao_red[i][j][1][0]][1], centent_list[biao_red[i][j][0][0]], recall_data_list[biao_red[i][j][1][0]][0]], |
|
|
|
[recall_data_list[biao_red[i][j][1][1]][1], centent_list[biao_red[i][j][0][1]], recall_data_list[biao_red[i][j][1][1]][0]], |
|
|
|
[recall_data_list[biao_red[i][j][1][2]][1], centent_list[biao_red[i][j][0][2]], recall_data_list[biao_red[i][j][1][2]][0]] |
|
|
|
] |
|
|
|
# houxuna_file_list = [ |
|
|
|
# [recall_data_list[biao_red[i][j][1][0]][1], dan_chapter_data[biao_red[i][j][0][0]], |
|
|
|
# recall_data_list[biao_red[i][j][1][0]][0]], |
|
|
|
# [recall_data_list[biao_red[i][j][1][1]][1], dan_chapter_data[biao_red[i][j][0][1]], |
|
|
|
# recall_data_list[biao_red[i][j][1][1]][0]], |
|
|
|
# [recall_data_list[biao_red[i][j][1][2]][1], dan_chapter_data[biao_red[i][j][0][2]], |
|
|
|
# recall_data_list[biao_red[i][j][1][2]][0]] |
|
|
|
# ] |
|
|
|
|
|
|
|
sentence_0_list_dan = [dan_chapter_data[biao_red[i][j][0][index_simsentence]] for index_simsentence in range(len(biao_red[i][j][0]))] |
|
|
|
houxuna_file_list = [[recall_data_list[biao_red[i][j][1][index_simsentence]][1], recall_data_list[biao_red[i][j][1][index_simsentence]][0]] for index_simsentence in range(len(biao_red[i][j][0]))] |
|
|
|
|
|
|
|
for dan_sen_info in houxuna_file_list: |
|
|
|
sentence_0_list_dan.append(dan_sen_info[1]) |
|
|
|
if dan_sen_info[0] == file_name: |
|
|
|
sentence_1_list_dan.append(dan_sen_info[2]) |
|
|
|
|
|
|
|
sentence_1_list_dan.append(dan_sen_info[1]) |
|
|
|
if sentence_0_list_dan != [] and sentence_1_list_dan != []: |
|
|
|
sentence_0_list.append("".join(sentence_0_list_dan)) |
|
|
|
sentence_1_list.append("".join(sentence_1_list_dan)) |
|
|
|
original_sentence_index.append(biao_red[i][j][0]) |
|
|
|
sim_paper_name.append(recall_data_list[biao_red[i][j][1][1]][1]) |
|
|
|
|
|
|
|
|
|
|
|
print("待标红句子筛选完成") |
|
|
|
sentence_0_list_new = [] |
|
|
|
sentence_1_list_new = [] |
|
|
@ -759,21 +760,73 @@ def accurate_check_rouge( |
|
|
|
# with open("data/10235513_大型商业建筑人员疏散设计研究_沈福禹/paper_dict.json", "w") as f: |
|
|
|
# json.dump(paper_dict, f, ensure_ascii=False) |
|
|
|
|
|
|
|
|
|
|
|
sentence_0_list_new_cursor = sentence_0_list_new[0] |
|
|
|
for paper_dict_dan_id, sentence_0_dan, sentence_1_dan, sim_paper_name_dan, original_sentence_index_dan in zip(range(len(paper_dict)), |
|
|
|
for paper_dict_dan_id, sentence_0_dan, sentence_1_dan, sim_paper_name_dan, original_sentence_index_dan in zip( |
|
|
|
range(len(paper_dict)), |
|
|
|
sentence_0_list_new, |
|
|
|
sentence_1_list_new, |
|
|
|
sim_paper_name, |
|
|
|
original_sentence_index): |
|
|
|
|
|
|
|
if sentence_0_list_new_cursor != sentence_0_dan: |
|
|
|
similar_content_control.append([[paper_dict_dan_id, sentence_0_dan, sentence_1_dan, sim_paper_name_dan, original_sentence_index_dan]]) |
|
|
|
similar_content_control.append( |
|
|
|
[[paper_dict_dan_id, sentence_0_dan, sentence_1_dan, sim_paper_name_dan, original_sentence_index_dan]]) |
|
|
|
sentence_0_list_new_cursor = sentence_0_dan |
|
|
|
else: |
|
|
|
similar_content_control[-1].append([paper_dict_dan_id, sentence_0_dan, sentence_1_dan, sim_paper_name_dan, original_sentence_index_dan]) |
|
|
|
similar_content_control[-1].append( |
|
|
|
[paper_dict_dan_id, sentence_0_dan, sentence_1_dan, sim_paper_name_dan, original_sentence_index_dan]) |
|
|
|
return similar_content_control, paper_dict |
|
|
|
|
|
|
|
|
|
|
|
paper_data = check_dict(similar_content_control, paper_dict, num_words, title, author, centent_list) |
|
|
|
def accurate_check_rouge( |
|
|
|
title, |
|
|
|
author, |
|
|
|
text_paper, |
|
|
|
recall_data_list |
|
|
|
): |
|
|
|
''' |
|
|
|
精确查重出相似句子 |
|
|
|
:param text: |
|
|
|
:param recall_data_list: list [[sentence, filename],[sentence, filename],[sentence, filename]] |
|
|
|
:return: |
|
|
|
''' |
|
|
|
# 文本处理 |
|
|
|
# centent_list = [] |
|
|
|
print("text_paper", len(text_paper)) |
|
|
|
text_paper = str(text_paper).replace("。\n", "。") |
|
|
|
centent_list_old = text_paper.split("。") |
|
|
|
|
|
|
|
sentence_word_nums = 0 |
|
|
|
|
|
|
|
centent_list = [] |
|
|
|
for i in centent_list_old: |
|
|
|
if len(i) < 300: |
|
|
|
centent_list.append(i + "。") |
|
|
|
if i == "": |
|
|
|
continue |
|
|
|
|
|
|
|
centent_list_zong = split_chapter(centent_list) |
|
|
|
|
|
|
|
|
|
|
|
# 分章 |
|
|
|
|
|
|
|
similar_content_control_zong = [] |
|
|
|
paper_dict_zong = [] |
|
|
|
num_words_zong = [] |
|
|
|
chapter_data = [] |
|
|
|
index_centent_list = [] |
|
|
|
|
|
|
|
for index_centent_list_zong in range(len(centent_list_zong)): |
|
|
|
dan_chapter_data, dan_chapter_num_words = centent_list_zong[index_centent_list_zong][0], centent_list_zong[index_centent_list_zong][1] |
|
|
|
|
|
|
|
similar_content_control, paper_dict = chapter_check(dan_chapter_data, recall_data_list) |
|
|
|
similar_content_control_zong.append(similar_content_control) |
|
|
|
paper_dict_zong.append(paper_dict) |
|
|
|
num_words_zong.append(dan_chapter_num_words) |
|
|
|
chapter_data.append(dan_chapter_data) |
|
|
|
index_centent_list.append(index_centent_list_zong) |
|
|
|
|
|
|
|
paper_data = check_dict(similar_content_control_zong, paper_dict_zong, num_words_zong, title, author, chapter_data, index_centent_list) |
|
|
|
# data = [similar_content_control] |
|
|
|
# |
|
|
|
# # 模拟多个章节 |
|
|
@ -833,7 +886,7 @@ def biaohong(bool_check_sentense, data_zong, df_train_nuoche): |
|
|
|
|
|
|
|
for j in bool_check_sentense[i]: |
|
|
|
# print("j", j) |
|
|
|
if j[0] + 1 >= len(data_zong): |
|
|
|
if j[0] + 1 > len(data_zong): |
|
|
|
tiaochu = True |
|
|
|
break |
|
|
|
|
|
|
@ -864,6 +917,36 @@ def biaohong(bool_check_sentense, data_zong, df_train_nuoche): |
|
|
|
# [bool_check_sentense[i+1][j][1] - 1, bool_check_sentense[i+1][j][1], bool_check_sentense[i+1][j][1] + 1]]) |
|
|
|
# biao_red.append(biao_red_dan) |
|
|
|
|
|
|
|
elif i == len(bool_check_sentense)-1: |
|
|
|
if end == bool_check_sentense[i][0][0]: |
|
|
|
i += 1 |
|
|
|
break |
|
|
|
elif bool_check_sentense[i][0][0]-1 == end + 1 and bool_check_sentense[i][0][0] == len(data_zong) -1: |
|
|
|
index_list = [ii for ii in range(bool_check_sentense[i][0][0]-1, bool_check_sentense[i][0][0] + 1)] |
|
|
|
elif bool_check_sentense[i][0][0]-1 == end and bool_check_sentense[i][0][0] == len(data_zong) -1: |
|
|
|
index_list = [ii for ii in range(bool_check_sentense[i][0][0], bool_check_sentense[i][0][0] + 1)] |
|
|
|
elif bool_check_sentense[i][0][0]-1 > end + 1 and bool_check_sentense[i][0][0] == len(data_zong) -1: |
|
|
|
index_list = [ii for ii in range(bool_check_sentense[i][0][0] - 1, bool_check_sentense[i][0][0] + 1)] |
|
|
|
else: |
|
|
|
index_list = [ii for ii in range(bool_check_sentense[i][0][0] - 1, bool_check_sentense[i][0][0] + 2)] |
|
|
|
|
|
|
|
biaohongset = set() |
|
|
|
biao_red_dan = [] |
|
|
|
for j in range(len(bool_check_sentense[ |
|
|
|
i])): # bool_check_sentense: [[[1, 223],[1, 226], [1, 562]],[[2, 243],[2, 226], [2, 561]]] |
|
|
|
if bool_check_sentense[i][j][1] not in biaohongset: |
|
|
|
biao_red_dan.append([index_list, |
|
|
|
[bool_check_sentense[i][j][1] - 1, bool_check_sentense[i][j][1], |
|
|
|
bool_check_sentense[i][j][1] + 1]]) |
|
|
|
biaohongset.add(bool_check_sentense[i][j][1] - 1) |
|
|
|
biaohongset.add(bool_check_sentense[i][j][1]) |
|
|
|
biaohongset.add(bool_check_sentense[i][j][1] + 1) |
|
|
|
else: |
|
|
|
continue |
|
|
|
|
|
|
|
i += 1 |
|
|
|
biao_red.append(biao_red_dan) |
|
|
|
break |
|
|
|
|
|
|
|
elif bool_check_sentense[i][0][0] - 1 == start: |
|
|
|
i += 1 |
|
|
@ -1316,7 +1399,7 @@ def classify_accurate_check(): |
|
|
|
goodsId = data_dict['goodsId'] |
|
|
|
callbackUrl = data_dict['callbackUrl'] |
|
|
|
|
|
|
|
try: |
|
|
|
# try: |
|
|
|
print("查找相似的50篇完成") |
|
|
|
print(len(content)) |
|
|
|
|
|
|
@ -1349,22 +1432,22 @@ def classify_accurate_check(): |
|
|
|
print(load_result_path) |
|
|
|
redis_.set(queue_uuid, load_result_path, 86400) |
|
|
|
redis_.srem(db_key_querying, queue_uuid) |
|
|
|
except: |
|
|
|
return_text = {"resilt": "", "probabilities": None, "status_code": 401} |
|
|
|
load_result_path = "./new_data_logs/{}.json".format(queue_uuid) |
|
|
|
|
|
|
|
print("queue_uuid: ", queue_uuid) |
|
|
|
print("load_result_path: ", load_result_path) |
|
|
|
|
|
|
|
with open(load_result_path, 'w', encoding='utf8') as f2: |
|
|
|
# ensure_ascii=False才能输入中文,否则是Unicode字符 |
|
|
|
# indent=2 JSON数据的缩进,美观 |
|
|
|
json.dump(return_text, f2, ensure_ascii=False, indent=4) |
|
|
|
|
|
|
|
print(queue_uuid) |
|
|
|
print(load_result_path) |
|
|
|
redis_.set(queue_uuid, load_result_path, 86400) |
|
|
|
redis_.srem(db_key_querying, queue_uuid) |
|
|
|
# except: |
|
|
|
# return_text = {"resilt": "", "probabilities": None, "status_code": 401} |
|
|
|
# load_result_path = "./new_data_logs/{}.json".format(queue_uuid) |
|
|
|
# |
|
|
|
# print("queue_uuid: ", queue_uuid) |
|
|
|
# print("load_result_path: ", load_result_path) |
|
|
|
# |
|
|
|
# with open(load_result_path, 'w', encoding='utf8') as f2: |
|
|
|
# # ensure_ascii=False才能输入中文,否则是Unicode字符 |
|
|
|
# # indent=2 JSON数据的缩进,美观 |
|
|
|
# json.dump(return_text, f2, ensure_ascii=False, indent=4) |
|
|
|
# |
|
|
|
# print(queue_uuid) |
|
|
|
# print(load_result_path) |
|
|
|
# redis_.set(queue_uuid, load_result_path, 86400) |
|
|
|
# redis_.srem(db_key_querying, queue_uuid) |
|
|
|
|
|
|
|
|
|
|
|
@app.route("/", methods=["POST"]) |
|
|
|