|
|
@@ -174,8 +174,8 @@ def original_text_contrast_func(data_sentence_dan, paper_dict):
                                                                         paper_dict[i[0]][4][1])  # text_original, bert_text, bert_text_pre

         sentence_1_bool, sentence_1_dan_red = original_text_marked_red(i[2], paper_dict[i[0]][2],
-                                                                       paper_dict[i[0]][4][0],
-                                                                       paper_dict[i[0]][4][1])  # text_original, bert_text, bert_text_pre
+                                                                       paper_dict[i[0]][4][2],
+                                                                       paper_dict[i[0]][4][3])  # text_original, bert_text, bert_text_pre

         if sentence_0_bool == False or sentence_1_bool == False:
             continue
@@ -194,7 +194,7 @@ def original_text_contrast_func(data_sentence_dan, paper_dict):
         similar_content_dan["degree"] = i[3]["degree"]
         similar_content_dan["year"] = i[3]["year"]
         similar_content_dan["paper_len_word"] = i[3]["paper_len_word"]
-        similar_content_dan["paper_red_len_word"] = len(paper_dict[i[0]][3])
+        similar_content_dan["paper_red_len_word"] = end_dan - start_dan

         thesis_info = " ".join(
             [similar_content_dan["title"], similar_content_dan["author"], similar_content_dan["degree"],
@@ -206,7 +206,7 @@ def original_text_contrast_func(data_sentence_dan, paper_dict):
     original_text_list = list(data_sentence_dan[0][1])
     original_text_list.insert(end, "</red>")
     original_text_list.insert(start, "<red>")
-    original_text = "".join(original_text_list)
+    original_text = "此处有 {} 字相似\n".format(str(end - start)) + "".join(original_text_list)

     return_info = {
         "original_text": original_text,
@@ -216,7 +216,7 @@ def original_text_contrast_func(data_sentence_dan, paper_dict):
     return return_info


-def repeat_quote_info_func(original_text_contrast):
+def repeat_quote_info_func(original_text_contrast, section_words):
     '''
     Repeated citation information
     :return:
@@ -232,7 +232,7 @@ def repeat_quote_info_func(original_text_contrast):
                 "thesis_author": i["author"],
                 "thesis_date": i["year"],
                 "thesis_info": thesis_info,
-                "thesis_repeat_rate": (i["paper_red_len_word"] / i["paper_len_word"]) * 100,
+                "thesis_repeat_rate": (i["paper_red_len_word"] / section_words) * 100,  # str(round(repeat_rate, 1)) + "%"
                 # round(repetition_rate, 3) * 100
                 "thesis_title": i["title"],
                 "thesis_link": "",
@@ -244,11 +244,19 @@ def repeat_quote_info_func(original_text_contrast):
         else:
             chongfuwendang[thesis_info]["thesis_repeat_word"] += i["paper_red_len_word"]
             chongfuwendang[thesis_info]["thesis_repeat_rate"] = (chongfuwendang[thesis_info]["thesis_repeat_word"] /
-                                                                 chongfuwendang[thesis_info][
-                                                                     "paper_len_word"]) * 100
+                                                                 section_words) * 100

     chongfuwendang = sorted(chongfuwendang.items(),
                             key=lambda x: x[1]["thesis_repeat_rate"], reverse=False)
-    chongfuwendang_list = [i[1] for i in chongfuwendang]
+
+    chongfuwendang_list = []
+    for i in chongfuwendang:
+        chongfuwendang_dan = i[1]
+        print(chongfuwendang_dan)
+        chongfuwendang_dan["thesis_repeat_rate"] = str(round(chongfuwendang_dan["thesis_repeat_rate"], 1)) + "%"
+        chongfuwendang_list.append(chongfuwendang_dan)
+
     return chongfuwendang_list
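Note on the rate change above: the per-document repeat rate is now measured against the total word count of the checked section (`section_words`) rather than the source paper's own length, and it is only turned into a percentage string after all documents have been accumulated and sorted. A minimal sketch of that calculation, with made-up numbers:

```python
# Hypothetical values, for illustration only.
section_words = 5000                      # total words in the checked section
doc = {"thesis_repeat_word": 350}         # overlapping words accumulated for one source document

doc["thesis_repeat_rate"] = (doc["thesis_repeat_word"] / section_words) * 100
doc["thesis_repeat_rate"] = str(round(doc["thesis_repeat_rate"], 1)) + "%"
print(doc["thesis_repeat_rate"])          # -> 7.0%
```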
@@ -383,7 +391,6 @@ def section_details_func(data_section_dan, paper_dict, num_words):
     original_text_list = []

     for sentence_dan in data_section_dan:
-        print("sentence_dan", sentence_dan)
         original_text_contrast_dan = original_text_contrast_func(sentence_dan, paper_dict)
         original_text_contrast.append(original_text_contrast_dan)
         repeat_words += original_text_contrast_dan["dan_sentence_word_nums"]
@@ -393,7 +400,7 @@ def section_details_func(data_section_dan, paper_dict, num_words):
     repeat_rate = (repeat_words / section_words)* 100
     repeat_rate = str(round(repeat_rate, 1)) + "%"

-    repeat_quote_info = repeat_quote_info_func(original_text_contrast)
+    repeat_quote_info = repeat_quote_info_func(original_text_contrast, section_words)

     return {
         "end_page_index": 0,
@@ -531,7 +538,10 @@ def accurate_check_rouge(
     data_zong = []
     sentence_word_nums = 0

+    # =============================================================================================
+    # multiprocessing version
     # duplicate check with the ROUGE algorithm
+    t1_0 = time.time()
     rst = []
     p = Pool(nums_cpus)  # the process pool holds n child processes
@@ -552,8 +562,26 @@ def accurate_check_rouge(
     p.close()
     p.join()  # wait for all child processes to finish; close() must be called before join(), and no new Process can be added after close()

+    print("筛选句子完成")
     rst = [i.get() for i in rst]

+    t2_0 = time.time()
+    print(t2_0- t1_0)
+    # =========================================================================================================
+
+    # rst = []
+    # num_words = 0
+    # centent_list = []
+    # for i in centent_list_old:
+    #     num_words += len(i)
+    #     if len(i) < 300:
+    #         centent_list.append(i)
+    # for i in range(len(centent_list)):
+    #     text = centent_list[i]
+    #     rst.append(rouge_pre_m(text, recall_data_list))
+
+    # ========================================================================================================
+
     for i in range(len(rst)):
         # print(rst[i])
         data_zong.append(rst[i])
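For readers unfamiliar with the multiprocessing pattern this hunk relies on: the items collected in `rst` are `AsyncResult` handles, so `.get()` is called only after `close()` and `join()`. A standalone sketch of that pattern, assuming the tasks were submitted with `apply_async` (which is what the `.get()` calls suggest) and using a placeholder worker and placeholder data instead of the real `rouge_pre_m` inputs:

```python
from multiprocessing import Pool

def rouge_pre_m(text, recall_data_list):
    # placeholder worker: the real function scores `text` against the recalled sentences
    return [(0, 0.42)]

if __name__ == "__main__":
    centent_list = ["第一句", "第二句"]      # sentences to check (placeholder)
    recall_data_list = []                    # recalled candidate sentences (placeholder)

    p = Pool(4)                              # pool of 4 worker processes
    rst = [p.apply_async(rouge_pre_m, (text, recall_data_list)) for text in centent_list]
    p.close()                                # no more tasks may be submitted
    p.join()                                 # wait for all workers to finish
    rst = [r.get() for r in rst]             # unwrap the AsyncResult objects
```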
@@ -567,7 +595,8 @@ def accurate_check_rouge(
     original_dict = []

     # find the indices of similar sentences
-    bool_check_sentense = []
+    bool_check_sentense = []  # [[[1, 223],[1, 226], [1, 562]],[[2, 243],[2, 226], [2, 561]]]
+
     # BERT version
     # for i in range(len(data_zong)):
     #     if data_zong[i][0] == 1:
@@ -575,29 +604,35 @@ def accurate_check_rouge(
     # ROUGE version
     for i in range(len(data_zong)):
+        bool_check_sentense_dan = []  # [[1, 223],[1, 226], [1, 562]]
         for j in range(len(data_zong[i])):
-            if data_zong[i][j][1] > 0.35:
-                bool_check_sentense.append([i, data_zong[i][j][0]])
+            if data_zong[i][j][1] > 0.3:
+                # print("data_zong[{}][{}]".format(i,j), data_zong[i][j][0])
+                bool_check_sentense_dan.append([i, data_zong[i][j][0]])
+        if bool_check_sentense_dan != []:
+            bool_check_sentense.append(bool_check_sentense_dan)
+
+    print("bool_check_sentense", bool_check_sentense)
     print("找出相似的句子序号完成")
     biao_red = biaohong(bool_check_sentense, data_zong,
-                        recall_data_list)  # [[[0, 1, 2], [479, 480, 481]], [[3, 4, 5], [481, 482, 483]], [[6, 7, 8], [484, 485, 486]]]
+                        recall_data_list)  # [[[[0, 1, 2], [479, 480, 481]],[[0, 1, 2], [471, 472, 473]]], [[[3, 4, 5], [481, 482, 483]], [[3, 4, 5], [461, 462, 463]]]]

-    print("bert精确查重时间", t1 - t0)
-    print(biao_red)
+    print("biao_red", str(biao_red))
     sentence_0_list = []
     sentence_1_list = []
     sim_paper_name = []

-    for i in biao_red:
-        if recall_data_list[i[1][0]][1] == recall_data_list[i[1][1]][1] == recall_data_list[i[1][2]][1]:
-            sentence_0_list.append("。".join([centent_list[i[0][0]], centent_list[i[0][1]], centent_list[i[0][2]]]))
+    for i in range(len(biao_red)):
+        for j in range(len(biao_red[i])):
+            if recall_data_list[biao_red[i][j][1][0]][1] == recall_data_list[biao_red[i][j][1][1]][1] == recall_data_list[biao_red[i][j][1][2]][1]:
+                sentence_0_list.append("。".join([centent_list[biao_red[i][j][0][0]], centent_list[biao_red[i][j][0][1]], centent_list[biao_red[i][j][0][2]]]))
                 sentence_1_list.append(
-                    "".join([recall_data_list[i[1][0]][0], recall_data_list[i[1][1]][0], recall_data_list[i[1][2]][0]]))
-                sim_paper_name.append(recall_data_list[i[1][0]][1])
+                    "".join([recall_data_list[biao_red[i][j][1][0]][0], recall_data_list[biao_red[i][j][1][1]][0], recall_data_list[biao_red[i][j][1][2]][0]]))
+                sim_paper_name.append(recall_data_list[biao_red[i][j][1][0]][1])
             else:
                 continue

     print("待标红句子筛选完成")
     sentence_0_list_new = []
     sentence_1_list_new = []
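The loop above does two things: it lowers the similarity threshold from 0.35 to 0.3, and it groups all matches of one checked sentence into `bool_check_sentense_dan` instead of appending them individually. A small illustration with invented scores, assuming each `data_zong[i][j]` is `[candidate_index, rouge_score]` as the indexing above suggests:

```python
# Invented ROUGE scores for checked sentence 0 against recalled candidates.
data_zong_i = [[223, 0.41], [226, 0.32], [562, 0.12]]

bool_check_sentense = []
bool_check_sentense_dan = []
for candidate_index, score in data_zong_i:
    if score > 0.3:                              # lowered threshold (previously 0.35)
        bool_check_sentense_dan.append([0, candidate_index])
if bool_check_sentense_dan != []:
    bool_check_sentense.append(bool_check_sentense_dan)

print(bool_check_sentense)                       # -> [[[0, 223], [0, 226]]]
```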
@@ -610,12 +645,16 @@ def accurate_check_rouge(
             continue
     t2 = time.time()

+    print()
+    for i in sentence_0_list_new:
+        print("sentence_0_list_new", i)
+    if sentence_0_list_new == sentence_1_list_new == []:
+        paper_dict = []
+    else:
+        paper_dict = biaohong_bert_predict(sentence_0_list_new, sentence_1_list_new)

     t3 = time.time()
     print("标红完成")
     print("标红时间", t3 - t2)
     original_text = []
     original_text_contrast = []
@@ -631,6 +670,7 @@ def accurate_check_rouge(
     # with open("data/10235513_大型商业建筑人员疏散设计研究_沈福禹/paper_dict.json", "w") as f:
     #     json.dump(paper_dict, f, ensure_ascii=False)

+    sentence_0_list_new_cursor = sentence_0_list_new[0]
     for paper_dict_dan_id, sentence_0_dan, sentence_1_dan, sim_paper_name_dan in zip(range(len(paper_dict)),
                                                                                      sentence_0_list_new,
@@ -639,6 +679,7 @@ def accurate_check_rouge(

         if sentence_0_list_new_cursor != sentence_0_dan:
             similar_content_control.append([[paper_dict_dan_id, sentence_0_dan, sentence_1_dan, sim_paper_name_dan]])
+            sentence_0_list_new_cursor = sentence_0_dan
         else:
             similar_content_control[-1].append([paper_dict_dan_id, sentence_0_dan, sentence_1_dan, sim_paper_name_dan])
@@ -682,38 +723,69 @@ def accurate_check_rouge(
 def biaohong(bool_check_sentense, data_zong, df_train_nuoche):
     '''
     Indices to mark red [[0,1,2],[3,4,5]]
-    :param bool_check_sentense:
-    :return: list
+    :param bool_check_sentense: # [[[1, 223],[1, 226], [1, 562]],[[2, 243],[2, 226], [2, 561]]]
+    :return: list # [[[[0, 1, 2], [479, 480, 481]],[[0, 1, 2], [471, 472, 473]]], [[[3, 4, 5], [481, 482, 483]], [[3, 4, 5], [461, 462, 463]]]]
     '''

+    # print("bool_check_sentense", bool_check_sentense)
     biao_red = []
     i = 0
     start = -1
     end = -1
+    tiaochu = False
     while True:
-        if i >= len(bool_check_sentense) or bool_check_sentense[i][0] + 1 >= len(data_zong) or bool_check_sentense[i][1] \
-                + 1 >= len(df_train_nuoche):
+        # if i >= len(bool_check_sentense) or bool_check_sentense[i][0] + 1 >= len(data_zong) or bool_check_sentense[i][1] \
+        #         + 1 >= len(df_train_nuoche):
+        #     break
+
+        if i >= len(bool_check_sentense):
             break
+
+        for j in bool_check_sentense[i]:
+            # print("j", j)
+            if j[0] + 1 >= len(data_zong):
+                tiaochu = True
+                break
+
+        for j in bool_check_sentense[i]:
+            if j[1] + 1 >= len(df_train_nuoche):
+                tiaochu = True
+                break
+
-        elif bool_check_sentense[i][0] - 1 == start:
+        if tiaochu == True:
+            break
+
+        elif bool_check_sentense[i][0][0] - 1 == start:
             i += 1
             continue
-        elif bool_check_sentense[i][0] == end:
+        elif bool_check_sentense[i][0][0] == end:
             i += 1
             continue
-        elif bool_check_sentense[i][0] - 1 == end:
+        elif bool_check_sentense[i][0][0] - 1 == end:
             i += 1
             continue
         else:
             biao_red_dan = []
-            biao_red_dan.append([bool_check_sentense[i][0] - 1, bool_check_sentense[i][1] - 1])
-            biao_red_dan.append([bool_check_sentense[i][0], bool_check_sentense[i][1]])
-            biao_red_dan.append([bool_check_sentense[i][0] + 1, bool_check_sentense[i][1] + 1])
-            biao_red.append([[bool_check_sentense[i][0] - 1, bool_check_sentense[i][0], bool_check_sentense[i][0] + 1],
-                             [bool_check_sentense[i][1] - 1, bool_check_sentense[i][1], bool_check_sentense[i][1] + 1]])
-            start = bool_check_sentense[i][0] - 1
-            end = bool_check_sentense[i][0] + 1
+            for j in range(len(bool_check_sentense[i])):  # bool_check_sentense: [[[1, 223],[1, 226], [1, 562]],[[2, 243],[2, 226], [2, 561]]]
+                # biao_red_dan.append([bool_check_sentense[i][j][0] - 1, bool_check_sentense[i][j][1] - 1])
+                # biao_red_dan.append([bool_check_sentense[i][j][0], bool_check_sentense[i][j][1]])
+                # biao_red_dan.append([bool_check_sentense[i][j][0] + 1, bool_check_sentense[i][j][1] + 1])
+                # biao_red.append([[bool_check_sentense[i][0] - 1, bool_check_sentense[i][0], bool_check_sentense[i][0] + 1],
+                #                  [bool_check_sentense[i][1] - 1, bool_check_sentense[i][1], bool_check_sentense[i][1] + 1]])
+                # start = bool_check_sentense[i][0] - 1
+                # end = bool_check_sentense[i][0] + 1
+                # i += 1
+                # print("i:{}, j:{}".format(i, j), )
+                # print(bool_check_sentense)
+                # print([bool_check_sentense[i][j][0] - 1, bool_check_sentense[i][j][0], bool_check_sentense[i][j][0] + 1])
+                biao_red_dan.append([[bool_check_sentense[i][j][0] - 1, bool_check_sentense[i][j][0], bool_check_sentense[i][j][0] + 1],
+                                     [bool_check_sentense[i][j][1] - 1, bool_check_sentense[i][j][1], bool_check_sentense[i][j][1] + 1]])
+            start = bool_check_sentense[i][0][0] - 1
+            end = bool_check_sentense[i][0][0] + 1
             i += 1
+            biao_red.append(biao_red_dan)

-    return biao_red
+    return biao_red  # [[[[0, 1, 2], [479, 480, 481]],[[0, 1, 2], [471, 472, 473]]], [[[3, 4, 5], [481, 482, 483]], [[3, 4, 5], [461, 462, 463]]]]
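The revised docstring and return comment describe a deeper nesting than before: instead of one `[checked indices, candidate indices]` pair per match, `biaohong` now returns one group per checked sentence, each holding all of that sentence's window pairs. A small sketch of consuming that structure, using the example value from the docstring:

```python
# Example value copied from the docstring above.
biao_red = [
    [[[0, 1, 2], [479, 480, 481]], [[0, 1, 2], [471, 472, 473]]],
    [[[3, 4, 5], [481, 482, 483]], [[3, 4, 5], [461, 462, 463]]],
]

for group in biao_red:                           # one group per checked sentence
    for checked_window, candidate_window in group:
        print(checked_window, "->", candidate_window)
```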


def dialog_line_parse(url, text):