|
|
@@ -83,7 +83,20 @@ def rouge_value_self(data_1, data_2):
    for sen_1, sen_2 in zip(data_1, data_2):
        sen_1 = sen_1.split(" ")
        sen_2 = sen_2.split(" ")
        # rouge_l_score = rouge_l_model.score(sen_1, sen_2)
        rouge_l_score = rouge_l_model.score(sen_1, sen_2)
        rouge_l_list.append(rouge_l_score)

    return "", "", rouge_l_list


def strsim_value(data_1, data_2):
    data_1 = [' '.join(i) for i in data_1]
    data_2 = [' '.join(i) for i in data_2]
    rouge_l_list = []

    for sen_1, sen_2 in zip(data_1, data_2):
        sen_1 = sen_1.split(" ")
        sen_2 = sen_2.split(" ")
        rouge_l_score = jaccard_similarity(sen_1, sen_2)
        rouge_l_list.append(rouge_l_score)
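
# Neither rouge_l_model nor jaccard_similarity is defined in this hunk; both are presumably
# defined elsewhere in the file. As illustration only, hypothetical helpers along these
# lines would match how they are called above (sketches, not this repo's implementations):

def rouge_l_f1_sketch(tokens_1, tokens_2):
    # ROUGE-L style F1 over two token lists, based on the longest common subsequence.
    m, n = len(tokens_1), len(tokens_2)
    dp = [[0] * (n + 1) for _ in range(m + 1)]
    for i in range(m):
        for j in range(n):
            if tokens_1[i] == tokens_2[j]:
                dp[i + 1][j + 1] = dp[i][j] + 1
            else:
                dp[i + 1][j + 1] = max(dp[i][j + 1], dp[i + 1][j])
    lcs = dp[m][n]
    if lcs == 0:
        return 0.0
    recall, precision = lcs / m, lcs / n
    return 2 * precision * recall / (precision + recall)


def jaccard_similarity_sketch(tokens_1, tokens_2):
    # Token-set intersection over union; two empty token lists score 0.0.
    set_1, set_2 = set(tokens_1), set(tokens_2)
    union = set_1 | set_2
    return len(set_1 & set_2) / len(union) if union else 0.0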
|
|
|
|
|
|
@@ -119,7 +132,7 @@ def rouge_pre_m(text, df_train_nuoche):
    data_list = []
    for data_dan in df_train_nuoche:
        data_list.append(data_dan[0])
    rouge_1, rouge_2, rouge_l = rouge_value_self(text_list, data_list)
    rouge_1, rouge_2, rouge_l = strsim_value(text_list, data_list)
    index_rouge_list.extend(rouge_l)

    re1 = [(i[0], i[1]) for i in sorted(list(enumerate(index_rouge_list)), key=lambda x: x[1], reverse=True)]
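    # For example, if index_rouge_list were [0.20, 0.90, 0.50], re1 would be
    # [(1, 0.90), (2, 0.50), (0, 0.20)]: positions in index_rouge_list paired with
    # their scores, ordered from highest to lowest similarity.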
|
|
@@ -129,6 +142,30 @@ def rouge_pre_m(text, df_train_nuoche):
    return return_list


def rouge_pre_m_1(bool_check_sentense, centent_list, recall_data_list):
    # bool_check_sentense [[[1, 223],[1, 226], [1, 562]],[[2, 243],[2, 226], [2, 561]]]

    bool_check_sentense_new = []
    for bool_check_sentense_dan in bool_check_sentense:
        bool_check_sentense_new_dan = []

        text_list = []
        data_list = []
        linshi = []
        for i in bool_check_sentense_dan:
            text1 = centent_list[i[0]]
            text2 = recall_data_list[i[1]][0]
            linshi.append([i[0], i[1]])
            text_list.append(text1)
            data_list.append(text2)
        _, _, rouge_l_list = rouge_value_self(text_list, data_list)
        for i in range(len(rouge_l_list)):
            if rouge_l_list[i] > 0.47:
                bool_check_sentense_new_dan.append(linshi[i])
        if bool_check_sentense_new_dan != []:
            bool_check_sentense_new.append(bool_check_sentense_new_dan)
    return bool_check_sentense_new
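
# Hypothetical usage of rouge_pre_m_1 (indices are illustrative only): given candidate
# pairs such as [[[1, 223], [1, 226]]], each (sentence, recalled passage) pair is
# re-scored with rouge_value_self and only pairs whose ROUGE-L score exceeds 0.47 are
# kept, so a possible result would be [[[1, 223]]].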
|
|
|
|
|
|
|
# Take a single chapter as an example
def similar_content_func():
    '''
|
|
@@ -541,44 +578,44 @@ def accurate_check_rouge(
    # =============================================================================================
    # Multiprocessing algorithm
    # Duplicate checking with the ROUGE algorithm
    t1_0 = time.time()
    rst = []
    p = Pool(nums_cpus)  # process pool with n worker subprocesses

    # print("centent_list", centent_list)

    num_words = 0
    centent_list = []
    for i in centent_list_old:
        num_words += len(i)
        if len(i) < 300:
            centent_list.append(i)

    print("num_words", num_words)
    for i in range(len(centent_list)):
        text = centent_list[i]
        a = p.apply_async(rouge_pre_m, args=(text, recall_data_list,))
        rst.append(a)
    p.close()
    p.join()  # Wait for all subprocesses to finish. close() must be called before join(); after close(), no new Process can be added.

    print("筛选句子完成")
    rst = [i.get() for i in rst]

    t2_0 = time.time()
    print(t2_0 - t1_0)

    # =========================================================================================================
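
    # The pool above assumes `from multiprocessing import Pool` and a `nums_cpus` value set
    # earlier in the file (e.g. multiprocessing.cpu_count()); apply_async submits one
    # rouge_pre_m call per text segment, and the later .get() calls collect the results in
    # submission order.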
|
|
|
|
|
|
|
    # t1_0 = time.time()
    # rst = []
    # p = Pool(nums_cpus)  # process pool with n worker subprocesses
    #
    # # print("centent_list", centent_list)
    #
    # num_words = 0
    # centent_list = []
    # for i in centent_list_old:
    #     num_words += len(i)
    #     if len(i) < 300:
    #         centent_list.append(i)
    #
    # print("num_words", num_words)
    # for i in range(len(centent_list)):
    #     text = centent_list[i]
    #     rst.append(rouge_pre_m(text, recall_data_list))
    #     a = p.apply_async(rouge_pre_m, args=(text, recall_data_list,))
    #     rst.append(a)
    # p.close()
    # p.join()  # Wait for all subprocesses to finish. close() must be called before join(); after close(), no new Process can be added.
    #
    # print("筛选句子完成")
    # rst = [i.get() for i in rst]
    #
    # t2_0 = time.time()
    # print(t2_0 - t1_0)
    # =========================================================================================================
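
    # Note: rst is reset below, so any results gathered by the multiprocessing block above
    # are recomputed and overwritten by the sequential loop.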
|
|
|
|
|
|
|
    rst = []
    num_words = 0
    centent_list = []
    for i in centent_list_old:
        num_words += len(i)
        if len(i) < 300:
            centent_list.append(i)
    for i in range(len(centent_list)):
        text = centent_list[i]
        rst.append(rouge_pre_m(text, recall_data_list))

    # ========================================================================================================
|
|
|
|
|
|
@@ -612,6 +649,10 @@ def accurate_check_rouge(
        if bool_check_sentense_dan != []:
            bool_check_sentense.append(bool_check_sentense_dan)

    # Continue filtering with the ROUGE method
    bool_check_sentense = rouge_pre_m_1(bool_check_sentense, centent_list, recall_data_list)  # [[[1, 223],[1, 226], [1, 562]],[[2, 243],[2, 226], [2, 561]]]

    print("bool_check_sentense", bool_check_sentense)
    print("找出相似的句子序号完成")
    biao_red = biaohong(bool_check_sentense, data_zong,
|
|
|