Browse Source

完整版v1.0

master
majiahui@haimaqingfan.com 2 years ago
parent
commit
dae45a2aa4
  1. 103
      flask_check_bert_test.py

103
flask_check_bert_test.py

@ -83,7 +83,20 @@ def rouge_value_self(data_1, data_2):
for sen_1, sen_2 in zip(data_1, data_2):
sen_1 = sen_1.split(" ")
sen_2 = sen_2.split(" ")
# rouge_l_score = rouge_l_model.score(sen_1, sen_2)
rouge_l_score = rouge_l_model.score(sen_1, sen_2)
rouge_l_list.append(rouge_l_score)
return "", "", rouge_l_list
def strsim_value(data_1, data_2):
data_1 = [' '.join(i) for i in data_1]
data_2 = [' '.join(i) for i in data_2]
rouge_l_list = []
for sen_1, sen_2 in zip(data_1, data_2):
sen_1 = sen_1.split(" ")
sen_2 = sen_2.split(" ")
rouge_l_score = jaccard_similarity(sen_1, sen_2)
rouge_l_list.append(rouge_l_score)
@ -119,7 +132,7 @@ def rouge_pre_m(text, df_train_nuoche):
data_list = []
for data_dan in df_train_nuoche:
data_list.append(data_dan[0])
rouge_1, rouge_2, rouge_l = rouge_value_self(text_list, data_list)
rouge_1, rouge_2, rouge_l = strsim_value(text_list, data_list)
index_rouge_list.extend(rouge_l)
re1 = [(i[0], i[1]) for i in sorted(list(enumerate(index_rouge_list)), key=lambda x: x[1], reverse=True)]
@ -129,6 +142,30 @@ def rouge_pre_m(text, df_train_nuoche):
return return_list
def rouge_pre_m_1(bool_check_sentense, centent_list, recall_data_list,
                  threshold=0.47):
    """Keep only the candidate sentence pairs whose score exceeds ``threshold``.

    ``bool_check_sentense`` is a list of groups; each group is a list of
    ``[content_index, recall_index]`` pairs, for example
    ``[[[1, 223], [1, 226], [1, 562]], [[2, 243], [2, 226], [2, 561]]]``.
    For every pair, ``centent_list[content_index]`` is compared with
    ``recall_data_list[recall_index][0]`` by a single batched call to
    ``rouge_value_self`` per group; the third value that call returns is
    used as the per-pair score list.

    Args:
        bool_check_sentense: Groups of ``[content_index, recall_index]`` pairs.
        centent_list: Sequence indexed by the first element of each pair.
        recall_data_list: Sequence indexed by the second element of each
            pair; only item ``[0]`` of the selected entry is used.
        threshold: Exclusive lower bound a score must exceed for its pair to
            be kept. Defaults to 0.47, the value that used to be hard-coded.

    Returns:
        A list of groups in input order. Each group holds freshly built
        ``[content_index, recall_index]`` lists for the pairs that passed;
        groups in which no pair passed are omitted.
    """
    filtered_groups = []
    for group in bool_check_sentense:
        # Copy the two indices so the result never aliases the caller's lists.
        pairs = [[pair[0], pair[1]] for pair in group]
        if not pairs:
            # Nothing to score; the group would be dropped below anyway.
            continue

        texts = [centent_list[content_idx] for content_idx, _ in pairs]
        candidates = [recall_data_list[recall_idx][0] for _, recall_idx in pairs]

        _, _, scores = rouge_value_self(texts, candidates)

        kept = [pair for pair, score in zip(pairs, scores) if score > threshold]
        if kept:
            filtered_groups.append(kept)
    return filtered_groups
# 以单个章节为例
def similar_content_func():
'''
@ -541,44 +578,44 @@ def accurate_check_rouge(
# =============================================================================================
# 多进程算法
# rouge算法查重
t1_0 = time.time()
rst = []
p = Pool(nums_cpus) # 进程池中含有n个子进程
# print("centent_list", centent_list)
num_words = 0
centent_list = []
for i in centent_list_old:
num_words += len(i)
if len(i) < 300:
centent_list.append(i)
print("num_words", num_words)
for i in range(len(centent_list)):
text = centent_list[i]
a = p.apply_async(rouge_pre_m, args=(text, recall_data_list,))
rst.append(a)
p.close()
p.join() # 等待所有子进程执行完毕。调用join()之前必须先调用close(),调用close()之后就不能继续添加新的Process了。
print("筛选句子完成")
rst = [i.get() for i in rst]
t2_0 = time.time()
print(t2_0- t1_0)
# =========================================================================================================
# t1_0 = time.time()
# rst = []
# p = Pool(nums_cpus) # 进程池中含有n个子进程
#
# # print("centent_list", centent_list)
#
# num_words = 0
# centent_list = []
# for i in centent_list_old:
# num_words += len(i)
# if len(i) < 300:
# centent_list.append(i)
#
# print("num_words", num_words)
# for i in range(len(centent_list)):
# text = centent_list[i]
# rst.append(rouge_pre_m(text, recall_data_list))
# a = p.apply_async(rouge_pre_m, args=(text, recall_data_list,))
# rst.append(a)
# p.close()
# p.join() # 等待所有子进程执行完毕。调用join()之前必须先调用close(),调用close()之后就不能继续添加新的Process了。
#
# print("筛选句子完成")
# rst = [i.get() for i in rst]
#
# t2_0 = time.time()
# print(t2_0- t1_0)
# =========================================================================================================
rst = []
num_words = 0
centent_list = []
for i in centent_list_old:
num_words += len(i)
if len(i) < 300:
centent_list.append(i)
for i in range(len(centent_list)):
text = centent_list[i]
rst.append(rouge_pre_m(text, recall_data_list))
# ========================================================================================================
@ -612,6 +649,10 @@ def accurate_check_rouge(
if bool_check_sentense_dan != []:
bool_check_sentense.append(bool_check_sentense_dan)
# 继续用rouge方法筛选
bool_check_sentense = rouge_pre_m_1(bool_check_sentense, centent_list, recall_data_list) # [[[1, 223],[1, 226], [1, 562]],[[2, 243],[2, 226], [2, 561]]]
print("bool_check_sentense", bool_check_sentense)
print("找出相似的句子序号完成")
biao_red = biaohong(bool_check_sentense, data_zong,

Loading…
Cancel
Save