Browse Source

完整版v1.0

master
majiahui@haimaqingfan.com 2 years ago
parent
commit
7493ebf9af
  1. 156
      flask_check_bert_test.py

156
flask_check_bert_test.py

@ -174,8 +174,8 @@ def original_text_contrast_func(data_sentence_dan, paper_dict):
paper_dict[i[0]][4][1]) # text_original, bert_text, bert_text_pre paper_dict[i[0]][4][1]) # text_original, bert_text, bert_text_pre
sentence_1_bool, sentence_1_dan_red = original_text_marked_red(i[2], paper_dict[i[0]][2], sentence_1_bool, sentence_1_dan_red = original_text_marked_red(i[2], paper_dict[i[0]][2],
paper_dict[i[0]][4][0], paper_dict[i[0]][4][2],
paper_dict[i[0]][4][1]) # text_original, bert_text, bert_text_pre paper_dict[i[0]][4][3]) # text_original, bert_text, bert_text_pre
if sentence_0_bool == False or sentence_1_bool == False: if sentence_0_bool == False or sentence_1_bool == False:
continue continue
@ -194,7 +194,7 @@ def original_text_contrast_func(data_sentence_dan, paper_dict):
similar_content_dan["degree"] = i[3]["degree"] similar_content_dan["degree"] = i[3]["degree"]
similar_content_dan["year"] = i[3]["year"] similar_content_dan["year"] = i[3]["year"]
similar_content_dan["paper_len_word"] = i[3]["paper_len_word"] similar_content_dan["paper_len_word"] = i[3]["paper_len_word"]
similar_content_dan["paper_red_len_word"] = len(paper_dict[i[0]][3]) similar_content_dan["paper_red_len_word"] = end_dan - start_dan
thesis_info = " ".join( thesis_info = " ".join(
[similar_content_dan["title"], similar_content_dan["author"], similar_content_dan["degree"], [similar_content_dan["title"], similar_content_dan["author"], similar_content_dan["degree"],
@ -206,7 +206,7 @@ def original_text_contrast_func(data_sentence_dan, paper_dict):
original_text_list = list(data_sentence_dan[0][1]) original_text_list = list(data_sentence_dan[0][1])
original_text_list.insert(end, "</red>") original_text_list.insert(end, "</red>")
original_text_list.insert(start, "<red>") original_text_list.insert(start, "<red>")
original_text = "".join(original_text_list) original_text = "此处有 {} 字相似\n".format(str(end - start)) + "".join(original_text_list)
return_info = { return_info = {
"original_text": original_text, "original_text": original_text,
@ -216,7 +216,7 @@ def original_text_contrast_func(data_sentence_dan, paper_dict):
return return_info return return_info
def repeat_quote_info_func(original_text_contrast): def repeat_quote_info_func(original_text_contrast, section_words):
''' '''
重复的引用信息 重复的引用信息
:return: :return:
@ -232,7 +232,7 @@ def repeat_quote_info_func(original_text_contrast):
"thesis_author": i["author"], "thesis_author": i["author"],
"thesis_date": i["year"], "thesis_date": i["year"],
"thesis_info": thesis_info, "thesis_info": thesis_info,
"thesis_repeat_rate": (i["paper_red_len_word"] / i["paper_len_word"]) * 100, "thesis_repeat_rate": (i["paper_red_len_word"] / section_words) * 100, # str(round(repeat_rate, 1)) + "%"
# round(repetition_rate, 3) * 100 # round(repetition_rate, 3) * 100
"thesis_title": i["title"], "thesis_title": i["title"],
"thesis_link": "", "thesis_link": "",
@ -244,11 +244,19 @@ def repeat_quote_info_func(original_text_contrast):
else: else:
chongfuwendang[thesis_info]["thesis_repeat_word"] += i["paper_red_len_word"] chongfuwendang[thesis_info]["thesis_repeat_word"] += i["paper_red_len_word"]
chongfuwendang[thesis_info]["thesis_repeat_rate"] = (chongfuwendang[thesis_info]["thesis_repeat_word"] / chongfuwendang[thesis_info]["thesis_repeat_rate"] = (chongfuwendang[thesis_info]["thesis_repeat_word"] /
chongfuwendang[thesis_info][ section_words) * 100
"paper_len_word"]) * 100
chongfuwendang = sorted(chongfuwendang.items(), chongfuwendang = sorted(chongfuwendang.items(),
key=lambda x: x[1]["thesis_repeat_rate"], reverse=False) key=lambda x: x[1]["thesis_repeat_rate"], reverse=False)
chongfuwendang_list = [i[1] for i in chongfuwendang]
chongfuwendang_list = []
for i in chongfuwendang:
chongfuwendang_dan = i[1]
print(chongfuwendang_dan)
chongfuwendang_dan["thesis_repeat_rate"] = str(round(chongfuwendang_dan["thesis_repeat_rate"], 1)) + "%"
chongfuwendang_list.append(chongfuwendang_dan)
return chongfuwendang_list return chongfuwendang_list
@ -383,7 +391,6 @@ def section_details_func(data_section_dan, paper_dict, num_words):
original_text_list = [] original_text_list = []
for sentence_dan in data_section_dan: for sentence_dan in data_section_dan:
print("sentence_dan", sentence_dan)
original_text_contrast_dan = original_text_contrast_func(sentence_dan, paper_dict) original_text_contrast_dan = original_text_contrast_func(sentence_dan, paper_dict)
original_text_contrast.append(original_text_contrast_dan) original_text_contrast.append(original_text_contrast_dan)
repeat_words += original_text_contrast_dan["dan_sentence_word_nums"] repeat_words += original_text_contrast_dan["dan_sentence_word_nums"]
@ -393,7 +400,7 @@ def section_details_func(data_section_dan, paper_dict, num_words):
repeat_rate = (repeat_words / section_words)* 100 repeat_rate = (repeat_words / section_words)* 100
repeat_rate = str(round(repeat_rate, 1)) + "%" repeat_rate = str(round(repeat_rate, 1)) + "%"
repeat_quote_info = repeat_quote_info_func(original_text_contrast) repeat_quote_info = repeat_quote_info_func(original_text_contrast, section_words)
return { return {
"end_page_index": 0, "end_page_index": 0,
@ -531,7 +538,10 @@ def accurate_check_rouge(
data_zong = [] data_zong = []
sentence_word_nums = 0 sentence_word_nums = 0
# =============================================================================================
# 多进程算法
# rouge算法查重 # rouge算法查重
t1_0 = time.time()
rst = [] rst = []
p = Pool(nums_cpus) # 进程池中含有n个子进程 p = Pool(nums_cpus) # 进程池中含有n个子进程
@ -552,8 +562,26 @@ def accurate_check_rouge(
p.close() p.close()
p.join() # 等待所有子进程执行完毕。调用join()之前必须先调用close(),调用close()之后就不能继续添加新的Process了。 p.join() # 等待所有子进程执行完毕。调用join()之前必须先调用close(),调用close()之后就不能继续添加新的Process了。
print("筛选句子完成")
rst = [i.get() for i in rst] rst = [i.get() for i in rst]
t2_0 = time.time()
print(t2_0- t1_0)
# =========================================================================================================
# rst = []
# num_words = 0
# centent_list = []
# for i in centent_list_old:
# num_words += len(i)
# if len(i) < 300:
# centent_list.append(i)
# for i in range(len(centent_list)):
# text = centent_list[i]
# rst.append(rouge_pre_m(text, recall_data_list))
# ========================================================================================================
for i in range(len(rst)): for i in range(len(rst)):
# print(rst[i]) # print(rst[i])
data_zong.append(rst[i]) data_zong.append(rst[i])
@ -567,7 +595,8 @@ def accurate_check_rouge(
original_dict = [] original_dict = []
# 找出相似的句子序号 # 找出相似的句子序号
bool_check_sentense = [] bool_check_sentense = [] # [[[1, 223],[1, 226], [1, 562]],[[2, 243],[2, 226], [2, 561]]]
# bert算法 # bert算法
# for i in range(len(data_zong)): # for i in range(len(data_zong)):
# if data_zong[i][0] == 1: # if data_zong[i][0] == 1:
@ -575,29 +604,35 @@ def accurate_check_rouge(
# rouge算法 # rouge算法
for i in range(len(data_zong)): for i in range(len(data_zong)):
bool_check_sentense_dan = [] # [[1, 223],[1, 226], [1, 562]]
for j in range(len(data_zong[i])): for j in range(len(data_zong[i])):
if data_zong[i][j][1] > 0.35: if data_zong[i][j][1] > 0.3:
bool_check_sentense.append([i, data_zong[i][j][0]]) # print("data_zong[{}][{}]".format(i,j), data_zong[i][j][0])
bool_check_sentense_dan.append([i, data_zong[i][j][0]])
if bool_check_sentense_dan != []:
bool_check_sentense.append(bool_check_sentense_dan)
print("bool_check_sentense", bool_check_sentense)
print("找出相似的句子序号完成")
biao_red = biaohong(bool_check_sentense, data_zong, biao_red = biaohong(bool_check_sentense, data_zong,
recall_data_list) # [[[0, 1, 2], [479, 480, 481]], [[3, 4, 5], [481, 482, 483]], [[6, 7, 8], [484, 485, 486]]] recall_data_list) # [[[[0, 1, 2], [479, 480, 481]],[[0, 1, 2], [471, 472, 473]]], [[[3, 4, 5], [481, 482, 483]], [[3, 4, 5], [461, 462, 463]]]]
print("bert精确查重时间", t1 - t0)
print(biao_red)
print("biao_red", str(biao_red))
sentence_0_list = [] sentence_0_list = []
sentence_1_list = [] sentence_1_list = []
sim_paper_name = [] sim_paper_name = []
for i in biao_red: for i in range(len(biao_red)):
if recall_data_list[i[1][0]][1] == recall_data_list[i[1][1]][1] == recall_data_list[i[1][2]][1]: for j in range(len(biao_red[i])):
sentence_0_list.append("".join([centent_list[i[0][0]], centent_list[i[0][1]], centent_list[i[0][2]]])) if recall_data_list[biao_red[i][j][1][0]][1] == recall_data_list[biao_red[i][j][1][1]][1] == recall_data_list[biao_red[i][j][1][2]][1]:
sentence_1_list.append( sentence_0_list.append("".join([centent_list[biao_red[i][j][0][0]], centent_list[biao_red[i][j][0][1]], centent_list[biao_red[i][j][0][2]]]))
"".join([recall_data_list[i[1][0]][0], recall_data_list[i[1][1]][0], recall_data_list[i[1][2]][0]])) sentence_1_list.append(
sim_paper_name.append(recall_data_list[i[1][0]][1]) "".join([recall_data_list[biao_red[i][j][1][0]][0], recall_data_list[biao_red[i][j][1][1]][0], recall_data_list[biao_red[i][j][1][2]][0]]))
else: sim_paper_name.append(recall_data_list[biao_red[i][j][1][0]][1])
continue else:
continue
print("待标红句子筛选完成")
sentence_0_list_new = [] sentence_0_list_new = []
sentence_1_list_new = [] sentence_1_list_new = []
@ -610,12 +645,16 @@ def accurate_check_rouge(
continue continue
t2 = time.time() t2 = time.time()
print()
for i in sentence_0_list_new:
print("sentence_0_list_new", i)
if sentence_0_list_new == sentence_1_list_new == []: if sentence_0_list_new == sentence_1_list_new == []:
paper_dict = [] paper_dict = []
else: else:
paper_dict = biaohong_bert_predict(sentence_0_list_new, sentence_1_list_new) paper_dict = biaohong_bert_predict(sentence_0_list_new, sentence_1_list_new)
t3 = time.time() t3 = time.time()
print("标红完成")
print("标红时间", t3 - t2) print("标红时间", t3 - t2)
original_text = [] original_text = []
original_text_contrast = [] original_text_contrast = []
@ -631,6 +670,7 @@ def accurate_check_rouge(
# with open("data/10235513_大型商业建筑人员疏散设计研究_沈福禹/paper_dict.json", "w") as f: # with open("data/10235513_大型商业建筑人员疏散设计研究_沈福禹/paper_dict.json", "w") as f:
# json.dump(paper_dict, f, ensure_ascii=False) # json.dump(paper_dict, f, ensure_ascii=False)
sentence_0_list_new_cursor = sentence_0_list_new[0] sentence_0_list_new_cursor = sentence_0_list_new[0]
for paper_dict_dan_id, sentence_0_dan, sentence_1_dan, sim_paper_name_dan in zip(range(len(paper_dict)), for paper_dict_dan_id, sentence_0_dan, sentence_1_dan, sim_paper_name_dan in zip(range(len(paper_dict)),
sentence_0_list_new, sentence_0_list_new,
@ -639,6 +679,7 @@ def accurate_check_rouge(
if sentence_0_list_new_cursor != sentence_0_dan: if sentence_0_list_new_cursor != sentence_0_dan:
similar_content_control.append([[paper_dict_dan_id, sentence_0_dan, sentence_1_dan, sim_paper_name_dan]]) similar_content_control.append([[paper_dict_dan_id, sentence_0_dan, sentence_1_dan, sim_paper_name_dan]])
sentence_0_list_new_cursor = sentence_0_dan
else: else:
similar_content_control[-1].append([paper_dict_dan_id, sentence_0_dan, sentence_1_dan, sim_paper_name_dan]) similar_content_control[-1].append([paper_dict_dan_id, sentence_0_dan, sentence_1_dan, sim_paper_name_dan])
@ -682,38 +723,69 @@ def accurate_check_rouge(
def biaohong(bool_check_sentense, data_zong, df_train_nuoche):
    '''
    Build the three-item index windows that should be marked red.

    :param bool_check_sentense: list of candidate groups. Each group is a
        non-empty list of ``[sentence_index, recall_index]`` pairs, e.g.
        ``[[[1, 223], [1, 226], [1, 562]], [[2, 243], [2, 226], [2, 561]]]``.
        Only the first pair of a group supplies the sentence index used for
        the overlap test below.
    :param data_zong: only ``len(data_zong)`` is read; it bounds
        ``sentence_index + 1``.
    :param df_train_nuoche: only ``len(df_train_nuoche)`` is read; it bounds
        ``recall_index + 1``.
    :return: list with one entry per kept group; each entry holds, for every
        pair of that group, ``[[s - 1, s, s + 1], [r - 1, r, r + 1]]``, e.g.
        ``[[[[0, 1, 2], [479, 480, 481]], [[0, 1, 2], [471, 472, 473]]],
        [[[3, 4, 5], [481, 482, 483]], [[3, 4, 5], [461, 462, 463]]]]``.
    :raises IndexError: if a group is empty (its first pair is read
        unconditionally once the boundary check has passed).
    '''
    sentence_total = len(data_zong)
    recall_total = len(df_train_nuoche)

    biao_red = []
    # Bounds of the most recently emitted sentence window. Both start at -1,
    # so a group whose sentence index is 0 is skipped (0 - 1 == start) and no
    # window ever begins at sentence -1.
    start = -1
    end = -1

    for group in bool_check_sentense:
        # Stop at the first group containing a pair whose "+ 1" neighbour
        # would fall outside either sequence.
        # NOTE(review): this abandons every later group as well, including
        # when only a recall-side index hit the boundary - confirm that
        # stopping (rather than skipping that one pair) is intended.
        if any(pair[0] + 1 >= sentence_total or pair[1] + 1 >= recall_total
               for pair in group):
            break

        anchor = group[0][0]
        # Skip groups centred on the second item, the last item, or the item
        # right after the last emitted window.
        if anchor - 1 == start or anchor == end or anchor - 1 == end:
            continue

        # NOTE(review): a pair with recall_index == 0 produces -1 as the first
        # recall entry; no lower-bound check exists here - verify how callers
        # index with it.
        biao_red.append([
            [[pair[0] - 1, pair[0], pair[0] + 1],
             [pair[1] - 1, pair[1], pair[1] + 1]]
            for pair in group
        ])
        start = anchor - 1
        end = anchor + 1

    return biao_red
def dialog_line_parse(url, text): def dialog_line_parse(url, text):

Loading…
Cancel
Save