|
@ -182,7 +182,7 @@ def similar_content_func(): |
|
|
}] |
|
|
}] |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
def original_text_contrast_func(data_sentence_dan, paper_dict): |
|
|
def original_text_contrast_func(data_sentence_dan, paper_dict, centent_list): |
|
|
''' |
|
|
''' |
|
|
重复的对比详细信息 |
|
|
重复的对比详细信息 |
|
|
:param similar_content: |
|
|
:param similar_content: |
|
@ -241,9 +241,26 @@ def original_text_contrast_func(data_sentence_dan, paper_dict): |
|
|
similar_content.append(similar_content_dan) |
|
|
similar_content.append(similar_content_dan) |
|
|
|
|
|
|
|
|
original_text_list = list(data_sentence_dan[0][1]) |
|
|
original_text_list = list(data_sentence_dan[0][1]) |
|
|
original_text_list.insert(end, "</red>") |
|
|
# original_text_list.insert(end, "</red>\n") |
|
|
original_text_list.insert(start, "<red>") |
|
|
# original_text_list.insert(start, "\n<red>") |
|
|
original_text = "此处有 {} 字相似\n".format(str(end - start)) + "".join(original_text_list) |
|
|
target_text_str = "".join(["\n<red>"] + original_text_list[start: end] + ["</red>\n"]) |
|
|
|
|
|
|
|
|
|
|
|
original_text_start = "".join(original_text_list[:start]) |
|
|
|
|
|
original_text_end = "".join(original_text_list[end:]) |
|
|
|
|
|
|
|
|
|
|
|
if data_sentence_dan[0][4][0]-1 < 0: |
|
|
|
|
|
start_sen = "" |
|
|
|
|
|
else: |
|
|
|
|
|
start_sen = centent_list[data_sentence_dan[0][4][0]-1] |
|
|
|
|
|
|
|
|
|
|
|
if data_sentence_dan[0][4][0]+1 > len(centent_list) -1: |
|
|
|
|
|
end_sen = "" |
|
|
|
|
|
else: |
|
|
|
|
|
end_sen = centent_list[data_sentence_dan[0][4][2]+1] |
|
|
|
|
|
|
|
|
|
|
|
start_sen = start_sen + original_text_start |
|
|
|
|
|
end_sen = original_text_end + end_sen |
|
|
|
|
|
original_text = "此处有 {} 字相似\n".format(str(end - start)) + start_sen[-60:] + target_text_str + end_sen[:60] |
|
|
|
|
|
|
|
|
return_info = { |
|
|
return_info = { |
|
|
"original_text": original_text, |
|
|
"original_text": original_text, |
|
@ -411,7 +428,7 @@ def section_data_func(section_details): |
|
|
} |
|
|
} |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
def section_details_func(data_section_dan, paper_dict, num_words): |
|
|
def section_details_func(data_section_dan, paper_dict, num_words, centent_list): |
|
|
''' |
|
|
''' |
|
|
章节详细信息 |
|
|
章节详细信息 |
|
|
:param original_text_contrast: |
|
|
:param original_text_contrast: |
|
@ -428,12 +445,12 @@ def section_details_func(data_section_dan, paper_dict, num_words): |
|
|
original_text_list = [] |
|
|
original_text_list = [] |
|
|
|
|
|
|
|
|
for sentence_dan in data_section_dan: |
|
|
for sentence_dan in data_section_dan: |
|
|
original_text_contrast_dan = original_text_contrast_func(sentence_dan, paper_dict) |
|
|
original_text_contrast_dan = original_text_contrast_func(sentence_dan, paper_dict, centent_list) |
|
|
original_text_contrast.append(original_text_contrast_dan) |
|
|
original_text_contrast.append(original_text_contrast_dan) |
|
|
repeat_words += original_text_contrast_dan["dan_sentence_word_nums"] |
|
|
repeat_words += original_text_contrast_dan["dan_sentence_word_nums"] |
|
|
original_text_list.append(original_text_contrast_dan["original_text"]) |
|
|
original_text_list.append(original_text_contrast_dan["original_text"]) |
|
|
|
|
|
|
|
|
original_text = "。".join(original_text_list) |
|
|
original_text = "".join(original_text_list) |
|
|
repeat_rate = (repeat_words / section_words)* 100 |
|
|
repeat_rate = (repeat_words / section_words)* 100 |
|
|
repeat_rate = str(round(repeat_rate, 1)) + "%" |
|
|
repeat_rate = str(round(repeat_rate, 1)) + "%" |
|
|
|
|
|
|
|
@ -453,7 +470,7 @@ def section_details_func(data_section_dan, paper_dict, num_words): |
|
|
} |
|
|
} |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
def check_dict(similar_content_control, paper_dict, num_words, title, author): |
|
|
def check_dict(similar_content_control, paper_dict, num_words, title, author, centent_list): |
|
|
''' |
|
|
''' |
|
|
生成返回字典 |
|
|
生成返回字典 |
|
|
:param similar_content_control: |
|
|
:param similar_content_control: |
|
@ -472,7 +489,7 @@ def check_dict(similar_content_control, paper_dict, num_words, title, author): |
|
|
data_section_dan = data_dan |
|
|
data_section_dan = data_dan |
|
|
|
|
|
|
|
|
# 章节详细信息 |
|
|
# 章节详细信息 |
|
|
section_details = section_details_func(data_section_dan, paper_dict, num_words) |
|
|
section_details = section_details_func(data_section_dan, paper_dict, num_words, centent_list) |
|
|
section_details_list.append(section_details) |
|
|
section_details_list.append(section_details) |
|
|
|
|
|
|
|
|
# 模拟多个章节 |
|
|
# 模拟多个章节 |
|
@ -612,7 +629,9 @@ def accurate_check_rouge( |
|
|
for i in centent_list_old: |
|
|
for i in centent_list_old: |
|
|
num_words += len(i) |
|
|
num_words += len(i) |
|
|
if len(i) < 300: |
|
|
if len(i) < 300: |
|
|
centent_list.append(i) |
|
|
centent_list.append(i + "。") |
|
|
|
|
|
if i == "": |
|
|
|
|
|
continue |
|
|
for i in range(len(centent_list)): |
|
|
for i in range(len(centent_list)): |
|
|
text = centent_list[i] |
|
|
text = centent_list[i] |
|
|
rst.append(rouge_pre_m(text, recall_data_list)) |
|
|
rst.append(rouge_pre_m(text, recall_data_list)) |
|
@ -655,23 +674,51 @@ def accurate_check_rouge( |
|
|
|
|
|
|
|
|
print("bool_check_sentense", bool_check_sentense) |
|
|
print("bool_check_sentense", bool_check_sentense) |
|
|
print("找出相似的句子序号完成") |
|
|
print("找出相似的句子序号完成") |
|
|
|
|
|
|
|
|
biao_red = biaohong(bool_check_sentense, data_zong, |
|
|
biao_red = biaohong(bool_check_sentense, data_zong, |
|
|
recall_data_list) # [[[[0, 1, 2], [479, 480, 481]],[[0, 1, 2], [471, 472, 473]]], [[[3, 4, 5], [481, 482, 483]], [[3, 4, 5], [461, 462, 463]]]] |
|
|
recall_data_list) # [[[[0, 1, 2], [479, 480, 481]],[[0, 1, 2], [471, 472, 473]]], [[[3, 4, 5], [481, 482, 483]], [[3, 4, 5], [461, 462, 463]]]] |
|
|
|
|
|
|
|
|
print("biao_red", str(biao_red)) |
|
|
print("biao_red", str(biao_red)) |
|
|
|
|
|
|
|
|
|
|
|
original_sentence_index = [] |
|
|
|
|
|
# for i in biao_red: |
|
|
|
|
|
# for j in i: |
|
|
|
|
|
# original_sentence_index.append(j[0]) |
|
|
|
|
|
|
|
|
sentence_0_list = [] |
|
|
sentence_0_list = [] |
|
|
sentence_1_list = [] |
|
|
sentence_1_list = [] |
|
|
sim_paper_name = [] |
|
|
sim_paper_name = [] |
|
|
|
|
|
|
|
|
for i in range(len(biao_red)): |
|
|
for i in range(len(biao_red)): |
|
|
for j in range(len(biao_red[i])): |
|
|
for j in range(len(biao_red[i])): |
|
|
if recall_data_list[biao_red[i][j][1][0]][1] == recall_data_list[biao_red[i][j][1][1]][1] == recall_data_list[biao_red[i][j][1][2]][1]: |
|
|
# if recall_data_list[biao_red[i][j][1][0]][1] == recall_data_list[biao_red[i][j][1][1]][1] == recall_data_list[biao_red[i][j][1][2]][1]: |
|
|
sentence_0_list.append("。".join([centent_list[biao_red[i][j][0][0]], centent_list[biao_red[i][j][0][1]], centent_list[biao_red[i][j][0][2]]])) |
|
|
# sentence_0_list.append("".join([centent_list[biao_red[i][j][0][0]], centent_list[biao_red[i][j][0][1]], centent_list[biao_red[i][j][0][2]]])) |
|
|
sentence_1_list.append( |
|
|
# sentence_1_list.append( |
|
|
"".join([recall_data_list[biao_red[i][j][1][0]][0], recall_data_list[biao_red[i][j][1][1]][0], recall_data_list[biao_red[i][j][1][2]][0]])) |
|
|
# "".join([recall_data_list[biao_red[i][j][1][0]][0], recall_data_list[biao_red[i][j][1][1]][0], recall_data_list[biao_red[i][j][1][2]][0]])) |
|
|
sim_paper_name.append(recall_data_list[biao_red[i][j][1][0]][1]) |
|
|
# sim_paper_name.append(recall_data_list[biao_red[i][j][1][0]][1]) |
|
|
else: |
|
|
# else: |
|
|
continue |
|
|
# continue |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
file_name = recall_data_list[biao_red[i][j][1][1]][1] |
|
|
|
|
|
sentence_0_list_dan = [] |
|
|
|
|
|
sentence_1_list_dan = [] |
|
|
|
|
|
sentence_0_list_dan_index = [] |
|
|
|
|
|
houxuna_file_list = [ |
|
|
|
|
|
[recall_data_list[biao_red[i][j][1][0]][1], centent_list[biao_red[i][j][0][0]], recall_data_list[biao_red[i][j][1][0]][0]], |
|
|
|
|
|
[recall_data_list[biao_red[i][j][1][1]][1], centent_list[biao_red[i][j][0][1]], recall_data_list[biao_red[i][j][1][1]][0]], |
|
|
|
|
|
[recall_data_list[biao_red[i][j][1][2]][1], centent_list[biao_red[i][j][0][2]], recall_data_list[biao_red[i][j][1][2]][0]] |
|
|
|
|
|
] |
|
|
|
|
|
for dan_sen_info in houxuna_file_list: |
|
|
|
|
|
sentence_0_list_dan.append(dan_sen_info[1]) |
|
|
|
|
|
if dan_sen_info[0] == file_name: |
|
|
|
|
|
sentence_1_list_dan.append(dan_sen_info[2]) |
|
|
|
|
|
|
|
|
|
|
|
sentence_0_list.append("".join(sentence_0_list_dan)) |
|
|
|
|
|
sentence_1_list.append("".join(sentence_1_list_dan)) |
|
|
|
|
|
original_sentence_index.append(biao_red[i][j][0]) |
|
|
|
|
|
sim_paper_name.append(recall_data_list[biao_red[i][j][1][1]][1]) |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
print("待标红句子筛选完成") |
|
|
print("待标红句子筛选完成") |
|
|
sentence_0_list_new = [] |
|
|
sentence_0_list_new = [] |
|
@ -713,18 +760,19 @@ def accurate_check_rouge( |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
sentence_0_list_new_cursor = sentence_0_list_new[0] |
|
|
sentence_0_list_new_cursor = sentence_0_list_new[0] |
|
|
for paper_dict_dan_id, sentence_0_dan, sentence_1_dan, sim_paper_name_dan in zip(range(len(paper_dict)), |
|
|
for paper_dict_dan_id, sentence_0_dan, sentence_1_dan, sim_paper_name_dan, original_sentence_index_dan in zip(range(len(paper_dict)), |
|
|
sentence_0_list_new, |
|
|
sentence_0_list_new, |
|
|
sentence_1_list_new, |
|
|
sentence_1_list_new, |
|
|
sim_paper_name): |
|
|
sim_paper_name, |
|
|
|
|
|
original_sentence_index): |
|
|
|
|
|
|
|
|
if sentence_0_list_new_cursor != sentence_0_dan: |
|
|
if sentence_0_list_new_cursor != sentence_0_dan: |
|
|
similar_content_control.append([[paper_dict_dan_id, sentence_0_dan, sentence_1_dan, sim_paper_name_dan]]) |
|
|
similar_content_control.append([[paper_dict_dan_id, sentence_0_dan, sentence_1_dan, sim_paper_name_dan, original_sentence_index_dan]]) |
|
|
sentence_0_list_new_cursor = sentence_0_dan |
|
|
sentence_0_list_new_cursor = sentence_0_dan |
|
|
else: |
|
|
else: |
|
|
similar_content_control[-1].append([paper_dict_dan_id, sentence_0_dan, sentence_1_dan, sim_paper_name_dan]) |
|
|
similar_content_control[-1].append([paper_dict_dan_id, sentence_0_dan, sentence_1_dan, sim_paper_name_dan, original_sentence_index_dan]) |
|
|
|
|
|
|
|
|
paper_data = check_dict(similar_content_control, paper_dict, num_words, title, author) |
|
|
paper_data = check_dict(similar_content_control, paper_dict, num_words, title, author, centent_list) |
|
|
# data = [similar_content_control] |
|
|
# data = [similar_content_control] |
|
|
# |
|
|
# |
|
|
# # 模拟多个章节 |
|
|
# # 模拟多个章节 |
|
@ -1227,8 +1275,7 @@ def classify_accurate_check(): |
|
|
goodsId = data_dict['goodsId'] |
|
|
goodsId = data_dict['goodsId'] |
|
|
callbackUrl = data_dict['callbackUrl'] |
|
|
callbackUrl = data_dict['callbackUrl'] |
|
|
|
|
|
|
|
|
|
|
|
try: |
|
|
|
|
|
|
|
|
print("查找相似的50篇完成") |
|
|
print("查找相似的50篇完成") |
|
|
print(len(content)) |
|
|
print(len(content)) |
|
|
|
|
|
|
|
@ -1261,6 +1308,22 @@ def classify_accurate_check(): |
|
|
print(load_result_path) |
|
|
print(load_result_path) |
|
|
redis_.set(queue_uuid, load_result_path, 86400) |
|
|
redis_.set(queue_uuid, load_result_path, 86400) |
|
|
redis_.srem(db_key_querying, queue_uuid) |
|
|
redis_.srem(db_key_querying, queue_uuid) |
|
|
|
|
|
except: |
|
|
|
|
|
return_text = {"resilt": "", "probabilities": None, "status_code": 401} |
|
|
|
|
|
load_result_path = "./new_data_logs/{}.json".format(queue_uuid) |
|
|
|
|
|
|
|
|
|
|
|
print("queue_uuid: ", queue_uuid) |
|
|
|
|
|
print("load_result_path: ", load_result_path) |
|
|
|
|
|
|
|
|
|
|
|
with open(load_result_path, 'w', encoding='utf8') as f2: |
|
|
|
|
|
# ensure_ascii=False才能输入中文,否则是Unicode字符 |
|
|
|
|
|
# indent=2 JSON数据的缩进,美观 |
|
|
|
|
|
json.dump(return_text, f2, ensure_ascii=False, indent=4) |
|
|
|
|
|
|
|
|
|
|
|
print(queue_uuid) |
|
|
|
|
|
print(load_result_path) |
|
|
|
|
|
redis_.set(queue_uuid, load_result_path, 86400) |
|
|
|
|
|
redis_.srem(db_key_querying, queue_uuid) |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
@app.route("/", methods=["POST"]) |
|
|
@app.route("/", methods=["POST"]) |
|
|