From dae45a2aa417e4dd7cf0d576f14b5b0cbd5604ff Mon Sep 17 00:00:00 2001 From: "majiahui@haimaqingfan.com" Date: Wed, 20 Sep 2023 18:45:38 +0800 Subject: [PATCH] =?UTF-8?q?=E5=AE=8C=E6=95=B4=E7=89=88v1.0?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- flask_check_bert_test.py | 103 +++++++++++++++++++++++++++++++++-------------- 1 file changed, 72 insertions(+), 31 deletions(-) diff --git a/flask_check_bert_test.py b/flask_check_bert_test.py index c12e800..c86d33a 100644 --- a/flask_check_bert_test.py +++ b/flask_check_bert_test.py @@ -83,7 +83,20 @@ def rouge_value_self(data_1, data_2): for sen_1, sen_2 in zip(data_1, data_2): sen_1 = sen_1.split(" ") sen_2 = sen_2.split(" ") - # rouge_l_score = rouge_l_model.score(sen_1, sen_2) + rouge_l_score = rouge_l_model.score(sen_1, sen_2) + rouge_l_list.append(rouge_l_score) + + return "", "", rouge_l_list + + +def strsim_value(data_1, data_2): + data_1 = [' '.join(i) for i in data_1] + data_2 = [' '.join(i) for i in data_2] + rouge_l_list = [] + + for sen_1, sen_2 in zip(data_1, data_2): + sen_1 = sen_1.split(" ") + sen_2 = sen_2.split(" ") rouge_l_score = jaccard_similarity(sen_1, sen_2) rouge_l_list.append(rouge_l_score) @@ -119,7 +132,7 @@ def rouge_pre_m(text, df_train_nuoche): data_list = [] for data_dan in df_train_nuoche: data_list.append(data_dan[0]) - rouge_1, rouge_2, rouge_l = rouge_value_self(text_list, data_list) + rouge_1, rouge_2, rouge_l = strsim_value(text_list, data_list) index_rouge_list.extend(rouge_l) re1 = [(i[0], i[1]) for i in sorted(list(enumerate(index_rouge_list)), key=lambda x: x[1], reverse=True)] @@ -129,6 +142,30 @@ def rouge_pre_m(text, df_train_nuoche): return return_list +def rouge_pre_m_1(bool_check_sentense, centent_list, recall_data_list): + # bool_check_sentense [[[1, 223],[1, 226], [1, 562]],[[2, 243],[2, 226], [2, 561]]] + + bool_check_sentense_new = [] + for bool_check_sentense_dan in bool_check_sentense: + bool_check_sentense_new_dan = [] + + text_list = [] + data_list = [] + linshi = [] + for i in bool_check_sentense_dan: + text1 = centent_list[i[0]] + text2 = recall_data_list[i[1]][0] + linshi.append([i[0], i[1]]) + text_list.append(text1) + data_list.append(text2) + _, _, rouge_l_list = rouge_value_self(text_list, data_list) + for i in range(len(rouge_l_list)): + if rouge_l_list[i] > 0.47: + bool_check_sentense_new_dan.append(linshi[i]) + if bool_check_sentense_new_dan != []: + bool_check_sentense_new.append(bool_check_sentense_new_dan) + return bool_check_sentense_new + # 以单个章节为例 def similar_content_func(): ''' @@ -541,44 +578,44 @@ def accurate_check_rouge( # ============================================================================================= # 多进程算法 # rouge算法查重 - t1_0 = time.time() - rst = [] - p = Pool(nums_cpus) # 进程池中含有n个子进程 - - # print("centent_list", centent_list) - - num_words = 0 - centent_list = [] - for i in centent_list_old: - num_words += len(i) - if len(i) < 300: - centent_list.append(i) - - print("num_words", num_words) - for i in range(len(centent_list)): - text = centent_list[i] - a = p.apply_async(rouge_pre_m, args=(text, recall_data_list,)) - rst.append(a) - p.close() - p.join() # 等待所有子进程执行完毕。调用join()之前必须先调用close(),调用close()之后就不能继续添加新的Process了。 - - print("筛选句子完成") - rst = [i.get() for i in rst] - - t2_0 = time.time() - print(t2_0- t1_0) - # ========================================================================================================= - + # t1_0 = time.time() # rst = [] + # p = Pool(nums_cpus) # 进程池中含有n个子进程 + # + # # print("centent_list", centent_list) + # # num_words = 0 # centent_list = [] # for i in centent_list_old: # num_words += len(i) # if len(i) < 300: # centent_list.append(i) + # + # print("num_words", num_words) # for i in range(len(centent_list)): # text = centent_list[i] - # rst.append(rouge_pre_m(text, recall_data_list)) + # a = p.apply_async(rouge_pre_m, args=(text, recall_data_list,)) + # rst.append(a) + # p.close() + # p.join() # 等待所有子进程执行完毕。调用join()之前必须先调用close(),调用close()之后就不能继续添加新的Process了。 + # + # print("筛选句子完成") + # rst = [i.get() for i in rst] + # + # t2_0 = time.time() + # print(t2_0- t1_0) + # ========================================================================================================= + + rst = [] + num_words = 0 + centent_list = [] + for i in centent_list_old: + num_words += len(i) + if len(i) < 300: + centent_list.append(i) + for i in range(len(centent_list)): + text = centent_list[i] + rst.append(rouge_pre_m(text, recall_data_list)) # ======================================================================================================== @@ -612,6 +649,10 @@ def accurate_check_rouge( if bool_check_sentense_dan != []: bool_check_sentense.append(bool_check_sentense_dan) + # 继续用rouge方法筛选 + + bool_check_sentense = rouge_pre_m_1(bool_check_sentense, centent_list, recall_data_list) # [[[1, 223],[1, 226], [1, 562]],[[2, 243],[2, 226], [2, 561]]] + print("bool_check_sentense", bool_check_sentense) print("找出相似的句子序号完成") biao_red = biaohong(bool_check_sentense, data_zong,