From 43dc9d1d8a5704a611f9b66b2180fc0df3485db0 Mon Sep 17 00:00:00 2001
From: "majiahui@haimaqingfan.com"
Date: Fri, 4 Aug 2023 12:21:06 +0800
Subject: [PATCH] Implement ROUGE similarity detection from scratch
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

---
 Rouge_w.py        | 192 ++++++++++++++++++++++++++++++++++++++++++++++++++++++
 rouge_weight_2.py | 157 ++++++++++++++++++++++++++++++++++++++++++++
 2 files changed, 349 insertions(+)
 create mode 100644 Rouge_w.py
 create mode 100644 rouge_weight_2.py

diff --git a/Rouge_w.py b/Rouge_w.py
new file mode 100644
index 0000000..9a57d13
--- /dev/null
+++ b/Rouge_w.py
@@ -0,0 +1,192 @@
+# -*- coding: utf-8 -*-
+
+"""
+@Time    : 2023/3/16 11:03
+@Author  :
+@FileName: Rouge_w.py
+@Software:
+@Describe: Self-implemented ROUGE-W and ROUGE-L similarity scoring.
+"""
+from copy import deepcopy  # used by the commented-out Ngrams helper below
+
+from rouge import Rouge
+
+rouge = Rouge()
+
+
+class Rouge_w:
+    def __init__(self):
+        self.k = 0.1   # coefficient of the quadratic weighting f(a) = k * a^2
+        self.ki = 1.2  # slope of the (currently unused) linear weighting fi_
+        self.p = 1.0   # beta of the F-measure in score()
+
+    def fi_(self, a):
+        # Linear weighting variant (currently unused).
+        return a * self.ki
+
+    def f(self, a):
+        # Quadratic weighting: rewards longer runs of consecutive matches.
+        return self.k * (a ** 2)
+
+    def f_1(self, k):
+        # Inverse of the quadratic weighting (up to the constant k).
+        return k ** 0.5
+
+    def f_(self, k):
+        return k ** 2
+
+    def WLCS(self, X, Y, f):
+        # Weighted longest common subsequence, as in ROUGE-W (Lin, 2004):
+        # c[i][j] holds the WLCS score, w[i][j] the length of the run of
+        # consecutive matches ending at (i, j). The 10.0 is a scaling factor
+        # applied to each weighted match.
+        m = len(X)
+        n = len(Y)
+        c = [[0 for j in range(n + 1)] for i in range(m + 1)]
+        w = [[0 for j in range(n + 1)] for i in range(m + 1)]
+
+        for i in range(1, m + 1):
+            for j in range(1, n + 1):
+                if X[i - 1] == Y[j - 1]:
+                    k = w[i - 1][j - 1]
+                    c[i][j] = c[i - 1][j - 1] + 10.0 * (f(k + 1) - f(k))
+                    w[i][j] = k + 1
+                else:
+                    if c[i - 1][j] > c[i][j - 1]:
+                        c[i][j] = c[i - 1][j]
+                    else:
+                        c[i][j] = c[i][j - 1]
+                    w[i][j] = 0
+
+        return c[m][n]
+
+    def score(self, p, r):
+        m = len(p)
+        n = len(r)
+        wlcs = self.WLCS(p, r, self.f)
+        p_wlcs = self.f_1(wlcs / self.f_(m))
+        r_wlcs = self.f_1(wlcs / self.f_(n))
+        f_lcs = (1 + self.p ** 2) * (p_wlcs * r_wlcs) / (p_wlcs + (self.p ** 2) * r_wlcs + 1e-8)
+        return f_lcs
+
+
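+# A minimal usage sketch for Rouge_w (added for illustration; not part of the
+# original patch). Identical sequences score near 1.0, while the same matches
+# broken up by gaps score lower, because the quadratic weight favors
+# consecutive runs:
+#
+#   rouge_w = Rouge_w()
+#   rouge_w.score(["A", "B", "C", "D"], ["A", "B", "C", "D"])  # ~1.0
+#   rouge_w.score(["A", "B", "C", "D"], ["A", "x", "B", "x", "C", "x", "D"])  # lower
+
+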
+class Rouge_l:
+    def __init__(self):
+        self.b = 3  # beta of the F-measure in score()
+
+    def LCS(self, X, Y):
+        m = len(X)
+        n = len(Y)
+        # Create a 2D table to store intermediate results
+        dp = [[0] * (n + 1) for _ in range(m + 1)]
+
+        # Fill the dp table with dynamic programming
+        for i in range(1, m + 1):
+            for j in range(1, n + 1):
+                if X[i - 1] == Y[j - 1]:
+                    dp[i][j] = dp[i - 1][j - 1] + 1
+                else:
+                    dp[i][j] = max(dp[i - 1][j], dp[i][j - 1])
+
+        return dp[m][n]
+
+    def score(self, p, r):
+        m = len(p)
+        n = len(r)
+        lcs = self.LCS(p, r)
+        p_lcs = lcs / m
+        r_lcs = lcs / n
+        f_lcs = (1 + self.b ** 2) * (p_lcs * r_lcs) / (p_lcs + self.b ** 2 * r_lcs + 1e-8)
+        return f_lcs
+
+
+# class Ngrams(object):
+#     """
+#     Ngrams datastructure based on `set` or `list`
+#     depending on `exclusive`
+#     """
+#
+#     def __init__(self, ngrams={}, exclusive=True):
+#         if exclusive:
+#             self._ngrams = set(ngrams)
+#         else:
+#             self._ngrams = list(ngrams)
+#         self.exclusive = exclusive
+#
+#     def add(self, o):
+#         if self.exclusive:
+#             self._ngrams.add(o)
+#         else:
+#             self._ngrams.append(o)
+#
+#     def __len__(self):
+#         return len(self._ngrams)
+#
+#     def intersection(self, o):
+#         if self.exclusive:
+#             inter_set = self._ngrams.intersection(o._ngrams)
+#             return Ngrams(inter_set, exclusive=True)
+#         else:
+#             other_list = deepcopy(o._ngrams)
+#             inter_list = []
+#
+#             for e in self._ngrams:
+#                 try:
+#                     i = other_list.index(e)
+#                 except ValueError:
+#                     continue
+#                 other_list.pop(i)
+#                 inter_list.append(e)
+#             return Ngrams(inter_list, exclusive=False)
+#
+#     def union(self, *ngrams):
+#         if self.exclusive:
+#             union_set = self._ngrams
+#             for o in ngrams:
+#                 union_set = union_set.union(o._ngrams)
+#             return Ngrams(union_set, exclusive=True)
+#         else:
+#             union_list = deepcopy(self._ngrams)
+#             for o in ngrams:
+#                 union_list.extend(o._ngrams)
+#             return Ngrams(union_list, exclusive=False)
+#
+# class Rouge_l:
+#     def __init__(self):
+#
+#     def score(self, evaluated_sentences, reference_sentences, raw_results=False, exclusive=True, **_):
+#         if len(evaluated_sentences) <= 0 or len(reference_sentences) <= 0:
+#             raise ValueError("Collections must contain at least 1 sentence.")
+#
+#         # total number of words in reference sentences
+#         m = len(
+#             Ngrams(
+#                 _split_into_words(reference_sentences),
+#                 exclusive=exclusive))
+#
+#         # total number of words in evaluated sentences
+#         n = len(
+#             Ngrams(
+#                 _split_into_words(evaluated_sentences),
+#                 exclusive=exclusive))
+#
+#         # print("m,n %d %d" % (m, n))
+#         union_lcs_sum_across_all_references = 0
+#         union = Ngrams(exclusive=exclusive)
+#         for ref_s in reference_sentences:
+#             lcs_count, union = _union_lcs(evaluated_sentences,
+#                                           ref_s,
+#                                           prev_union=union,
+#                                           exclusive=exclusive)
+#             union_lcs_sum_across_all_references += lcs_count
+#
+#         llcs = union_lcs_sum_across_all_references
+#         r_lcs = llcs / m
+#         p_lcs = llcs / n
+#
+#         f_lcs = 2.0 * ((p_lcs * r_lcs) / (p_lcs + r_lcs + 1e-8))
+
+if __name__ == '__main__':
+
+    rouge_model = Rouge_l()
+    X = ["A", "B", "C", "D", "u", "u", "u", "u", "u", "u"]
+    Y1 = ["A", "B", "C", "D", "H", "I", "K", "K", "K", "K", "K", "K"]
+    Y2 = ["A", "H", "B", "K", "C", "I", "K", "K", "K", "K", "K", "K"]
+    # X = "我爱你"
+    # Y = "我他爱"
+    print(rouge_model.score(X, Y1))
\ No newline at end of file
diff --git a/rouge_weight_2.py b/rouge_weight_2.py
new file mode 100644
index 0000000..7c41d71
--- /dev/null
+++ b/rouge_weight_2.py
@@ -0,0 +1,157 @@
+# -*- coding: utf-8 -*-
+
+"""
+@Time    : 2023/3/9 18:36
+@Author  :
+@FileName: rouge_weight_2.py
+@Software:
+@Describe: Batch ROUGE-L similarity scoring against a reference corpus.
+"""
+import os
+
+os.environ["CUDA_DEVICE_ORDER"] = "PCI_BUS_ID"
+os.environ["CUDA_VISIBLE_DEVICES"] = "0"
+
+import jieba
+import tensorflow as tf
+from src import basemodel
+from src import simscemodel
+import numpy as np
+from numpy.linalg import norm
+import pandas as pd
+# from rouge import Rouge
+from rouge_chinese import Rouge
+from Rouge_w import Rouge_w, Rouge_l
+from tqdm import tqdm
+
+rouge = Rouge()
+rouge_model = Rouge_w()
+rouge_l_model = Rouge_l()
+
+
+def cos_sim(a, b):
+    # Cosine similarity between two vectors.
+    A = np.array(a)
+    B = np.array(b)
+    cosine = np.dot(A, B) / (norm(A) * norm(B))
+    return cosine
+
+
+# def rouge_value(data_1, data_2):
+#     data_1 = [' '.join(i) for i in data_1]
+#     data_2 = [' '.join(i) for i in data_2]
+#     scores = rouge.get_scores(hyps=data_1, refs=data_2)
+#
+#     rouge_1_list = []
+#     rouge_2_list = []
+#     rouge_l_list = []
+#
+#     for i in range(len(scores)):
+#         rouge_1 = scores[i]['rouge-1']['f']
+#         rouge_2 = scores[i]['rouge-2']['f']
+#         rouge_l = scores[i]['rouge-l']['f']
+#         rouge_1_list.append(rouge_1)
+#         rouge_2_list.append(rouge_2)
+#         rouge_l_list.append(rouge_l)
+#
+#     return rouge_1_list, rouge_2_list, rouge_l_list
+
+
+def rouge_value_dan(data_1, data_2):
+    # Score a single candidate/reference pair with rouge_chinese after
+    # jieba word segmentation.
+    hypothesis = ' '.join(jieba.cut(data_1))
+    reference = ' '.join(jieba.cut(data_2))
+    scores = rouge.get_scores(hypothesis, reference)
+    rouge_1 = scores[0]['rouge-1']['f']
+    rouge_2 = scores[0]['rouge-2']['f']
+    rouge_l = scores[0]['rouge-l']['f']
+
+    return rouge_1, rouge_2, rouge_l
+
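+# Illustrative single-pair call (a sketch added for clarity; the texts are
+# made up and not part of the original patch):
+#
+#   r1, r2, rl = rouge_value_dan("商业建筑人员疏散设计研究", "大型商业建筑的疏散设计")
+#   # r1, r2, rl are the ROUGE-1/ROUGE-2/ROUGE-L F-scores in [0, 1]
+
+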
+def rouge_value(data_1, data_2):
+    # Pair-wise ROUGE over two equal-length lists; only ROUGE-L is kept.
+    rouge_l_list = []
+
+    for data_1_dan, data_2_dan in zip(data_1, data_2):
+        rouge_1, rouge_2, rouge_l = rouge_value_dan(data_1_dan, data_2_dan)
+        # rouge_l = weight_lenw(data_1_dan, data_2_dan, rouge_l)
+        rouge_l_list.append(rouge_l)
+
+    return "", "", rouge_l_list
+
+
+def rouge_value_self(data_1, data_2):
+    # Character-level ROUGE-L using the self-implemented Rouge_l: each text
+    # is split into single characters before scoring.
+    data_1 = [' '.join(i) for i in data_1]
+    data_2 = [' '.join(i) for i in data_2]
+
+    rouge_l_list = []
+
+    for sen_1, sen_2 in zip(data_1, data_2):
+        sen_1 = sen_1.split(" ")
+        sen_2 = sen_2.split(" ")
+        rouge_l_score = rouge_l_model.score(sen_1, sen_2)
+        rouge_l_list.append(rouge_l_score)
+
+    return "", "", rouge_l_list
+
+
+def rouge_w_value(data_1, data_2):
+    score = rouge_model.score(data_1, data_2)
+    return score
+
+
+def weight_lenw(text_1, text_2, weight):
+    # Scale the score up when the reference is longer than the candidate.
+    if len(text_2) <= len(text_1):
+        return weight
+
+    x = len(text_2) / len(text_1)
+    k = 0.08
+    b = 0.92
+    y = k * x + b
+
+    return weight * y
+
+
+def rouge_pre(text, df_train_nuoche):
+    # Compare one sentence against every row of the reference corpus and
+    # return [best_score, best_matching_text, source_filename].
+    return_list = []
+    index_rouge_list = []
+    text_list = [text] * len(df_train_nuoche)
+
+    data_list = [data_dan[0] for data_dan in df_train_nuoche]
+    rouge_1, rouge_2, rouge_l = rouge_value_self(text_list, data_list)
+    index_rouge_list.extend(rouge_l)
+
+    # index_rouge_list = [weight_lenw(text_1, text_2, w) for text_1, text_2, w in zip(text_list, data_list, index_rouge_list)]
+
+    re1 = sorted(enumerate(index_rouge_list), key=lambda x: x[1], reverse=True)
+
+    return_list.append(re1[0][1])
+    return_list.append(df_train_nuoche[re1[0][0]][0])
+    filename = df_train_nuoche[re1[0][0]][1].split("\\")[-1]
+    return_list.append(filename)
+
+    return return_list
+
+
+if __name__ == '__main__':
+    load_weights_path = r"E:\pycharm_workspace\premodel\keras\simscemodel/my_model_4.weights"
+    df_train_nuoche = pd.read_csv("data/10235513_大型商业建筑人员疏散设计研究_沈福禹/查重.csv", encoding="utf-8").values.tolist()
+
+    data_zong = []
+    path_csv = "data/10235513_大型商业建筑人员疏散设计研究_沈福禹/大型商业建筑人员疏散设计研究_全文对照.csv"
+    path_excel = "data/10235513_大型商业建筑人员疏散设计研究_沈福禹/大型商业建筑人员疏散设计研究_全文对照_acc_13_self_rouge.xlsx"
+
+    content_list = pd.read_csv(path_csv, encoding="gbk").values.tolist()
+    for text in tqdm(content_list):
+        # Label: 0 if the second column is "##", else 1.
+        if text[1] == "##":
+            true_bool = 0
+        else:
+            true_bool = 1
+        rouge_pre_list = rouge_pre(text[0], df_train_nuoche)
+        data_zong.append([text[0], text[1], true_bool] + rouge_pre_list)
+    pd.DataFrame(data_zong).to_excel(path_excel, index=False)
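+
+# Hypothetical quick check (illustration only; the corpus row and texts below
+# are made up, not taken from the original data files). rouge_pre takes a raw
+# sentence plus rows of [reference_text, source_path]:
+#
+#   corpus = [["大型商业建筑的人员疏散设计", "refs\\10235513.txt"]]
+#   print(rouge_pre("商业建筑人员疏散设计研究", corpus))
+#   # -> [best_score, best_matching_text, "10235513.txt"]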