
First commit: non-batch prediction version

master
majiahui@haimaqingfan.com, 2 years ago
commit 8a56f2ffe6
  1. data_do/yy数据处理.py (+74)
  2. data_do/yy训练数据处理.py (+26)
  3. data_do/合并数据.py (+31)
  4. data_do/处理11篇文本.py (+102)
  5. data_do/处理11篇顺序输入.py (+114)
  6. data_do/处理yy数据原始数据.py (+87)
  7. data_do/汇总.py (+33)
  8. data_do/筛选训练数据.py (+209)
  9. data_do/筛选训练数据_str_sim.py (+209)
  10. data_do/筛选训练数据strsim.py (+206)
  11. data_do/统计数据的_str_sim_值.py (+83)
  12. data_do/统计数据的strsim值.txt (+276853)
  13. data_do/统计非中文字符.py (+40)
  14. data_do/读取docx.py (+77)
  15. data_do/进一步处理降重数据.py (+43)
  16. 优化点/优化点.txt (+55)
  17. 改进方案/改进方案.txt (+0)
  18. 测试range_1.py (+70)

74
data_do/yy数据处理.py

@@ -0,0 +1,74 @@
# -*- coding: utf-8 -*-
"""
@Time : 2022/12/20 10:35
@Author :
@FileName:
@Software:
@Describe:
"""
import os
from bs4 import BeautifulSoup
import pandas as pd
import re
# walk the directory tree
path_list = []
def walkFile(file):
for root, dirs, files in os.walk(file):
# root: path of the directory currently being visited
# dirs: list of subdirectory names under root
# files: list of file names under root
# iterate over the files
for f in files:
name = str(f).split("_")[0]
path_list.append(name)
walkFile("../data/yy_reduce_data")
path_list = list(set(path_list))
print(path_list)
data = []
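# Each document id i has a pair of HTML files under ../data/yy_reduce_data: "{i}_source" (original)
# and "{i}_result" (rewritten). Sentences are aligned via <p><em> nodes that share the same id,
# and a pair is kept only when the result node carries class "similar".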
for i in path_list:
soup_source = BeautifulSoup(open("../data/yy_reduce_data/{}_source".format(i), encoding='utf-8'),
"html.parser")
soup_result = BeautifulSoup(open("../data/yy_reduce_data/{}_result".format(i), encoding='utf-8'),
"html.parser")
source_sentence_list = soup_source.select('p > em')
result_sentence_list = soup_result.select('p > em')
for sentence_index in range(len(source_sentence_list)):
try:
if source_sentence_list[sentence_index]["id"] == result_sentence_list[sentence_index]["id"] \
and result_sentence_list[sentence_index]["class"] == ['similar']:
# if source_sentence_list[sentence_index]["id"] == result_sentence_list[sentence_index]["id"]:
source_text = source_sentence_list[sentence_index].string
result_text = result_sentence_list[sentence_index].string
if source_text != None and result_text != None:
data.append([source_sentence_list[sentence_index].string, result_sentence_list[sentence_index].string])
except:
print(i,sentence_index)
# print(data)
def data_clean(text):
# strip characters that are illegal in Excel cells: rare non-printable control characters such as backspace, bell, etc.
ILLEGAL_CHARACTERS_RE = re.compile(r'[\000-\010]|[\013-\014]|[\016-\037]')
text = ILLEGAL_CHARACTERS_RE.sub(r'', text)
return text
df = pd.DataFrame(data,columns=["原文","yy降重"])
for col in df.columns:
df[col] = df[col].apply(lambda x: data_clean(x))
df.to_excel("../data/论文_yy_小说.xlsx",index=None)

26
data_do/yy训练数据处理.py

@@ -0,0 +1,26 @@
# -*- coding: utf-8 -*-
"""
@Time : 2022/12/23 16:00
@Author :
@FileName:
@Software:
@Describe:
"""
import pandas as pd
path = "../data/论文_yy_小说_3.xlsx"
df_list = pd.read_excel(path).values.tolist()
df_list_new = []
print(len(df_list))
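# each output line has the form: original<TAB>to<TAB>rewritten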
for i in df_list:
a = i[0]
b = i[1]
df_list_new.append("\t".join([a, "to", b]))
with open("../data/train_yy_1.txt", "w", encoding='utf-8') as file:
for i in df_list_new:
file.write(i + '\n')
file.close()

31
data_do/合并数据.py

@@ -0,0 +1,31 @@
# -*- coding: utf-8 -*-
"""
@Time : 2023/2/2 11:29
@Author :
@FileName:
@Software:
@Describe:
"""
def read_text(file):
try:
with open(file, 'r', encoding="utf-8") as f:
lines = [x.strip() for x in f if x.strip() != '']
except:
with open(file, 'r', encoding="gbk") as f:
lines = [x.strip() for x in f if x.strip() != '']
return lines
if __name__ == '__main__':
data = []
path_list = ["train_yy_sim_10.txt", "train_yy_1_sim_10.txt"]
for i in path_list:
data += read_text(i)
fileName = '../data/train_yy_sim.txt'
with open(fileName, 'w', encoding='utf-8') as file:
for i in data:
file.write(str(i) + '\n')
file.close()

102
data_do/处理11篇文本.py

@@ -0,0 +1,102 @@
# -*- coding: utf-8 -*-
"""
@Time : 2023/2/14 14:19
@Author :
@FileName:
@Software:
@Describe:
"""
import os
import pandas as pd
import docx
import win32com.client as wc
import operator
def is_chinese(uchar):
"""
Check whether a single unicode character is a Chinese (CJK) character.
:param uchar:
:return:
"""
if uchar >= u'\u4e00' and uchar<=u'\u9fa5':
return True
else:
return False
def snetence(text):
bool_ = True
for i in text:
bool_1 = is_chinese(i)
if bool_1 == True:
continue
else:
if i in fuhao:
continue
else:
bool_ = False
break
return bool_
fuhao = ["。", "!", ",", "?"]  # punctuation allowed in a "pure Chinese" sentence (the exact characters were garbled in the source; common CJK marks assumed)
path = '../data/11篇'
path_list = []
for file_name in os.listdir(path):
path_list.append(file_name)
# print(path_list)
# path = "../data/11篇/13139551_于丰源_在线考试系统-原文.docx"
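# For every .docx under ../data/11篇: read its paragraphs, skip very short ones,
# split the rest into sentence-level segments of 10-120 characters, and write one CSV per document.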
for docx_name in path_list:
data = []
data_new = []
file_name = docx_name.split(".")[0]
file_type = docx_name.split(".")[1]
if file_type == "docx":
document = docx.Document(path + "/" + docx_name)
else:
continue
# get all paragraphs
all_paragraphs = document.paragraphs
for paragraph in all_paragraphs:
# collect the text of each paragraph
data.append(paragraph.text)
data = sorted(data,key=lambda x:len(x))
for data_dan in data:
if data_dan == "":
continue
for i in data_dan:
if i == "":
continue
if len(data_dan) < 15:
continue
# else:
# bool_ = snetence(data_dan)
# if bool_ == True:
# data_new.append(data_dan)
else:
data_list = str(data_dan).split("。")  # split on the Chinese full stop (separator was garbled in the source; "。" assumed)
for data_dan_short in data_list:
if data_dan_short == "":
continue
for i in data_dan_short:
if i == "":
continue
if len(data_dan_short) < 10:
continue
if len(data_dan_short) > 120:
continue
data_new.append(data_dan_short)
data_new = sorted(data_new,key=lambda x:len(x))
data_df = []
for i in data_new:
data_df.append([i])
pd.DataFrame(data_df).to_csv("../data/11篇csv/" + file_name + ".csv", index=False)

114
data_do/处理11篇顺序输入.py

@@ -0,0 +1,114 @@
# -*- coding: utf-8 -*-
"""
@Time : 2023/2/14 14:19
@Author :
@FileName:
@Software:
@Describe:
"""
import os
import pandas as pd
import docx
import win32com.client as wc
import operator
def is_chinese(uchar):
"""
Check whether a single unicode character is a Chinese (CJK) character.
:param uchar:
:return:
"""
if uchar >= u'\u4e00' and uchar<=u'\u9fa5':
return True
else:
return False
def snetence(text):
bool_ = True
for i in text:
bool_1 = is_chinese(i)
if bool_1 == True:
continue
else:
if i in fuhao:
continue
else:
bool_ = False
break
return bool_
fuhao = ["。", "!", ",", "?"]  # punctuation allowed in a "pure Chinese" sentence (the exact characters were garbled in the source; common CJK marks assumed)
path = '../data/11篇'
path_list = []
for file_name in os.listdir(path):
path_list.append(file_name)
# print(path_list)
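# chulichangju_2 recursively splits a segment longer than 120 characters: it cuts at the last
# punctuation mark inside the first 120 characters, keeps that piece, prepends the leftover to the
# remainder, and recurses; if no punctuation is found, the 120-character block is kept as is.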
def chulichangju_2(text, chulipangban_return_list):
fuhao = [",", "、", "!", "?"]  # cut points for long segments (garbled in the source; clause-level CJK marks assumed)
text_1 = text[:120]
text_2 = text[120:]
text_1_new = ""
for i in range(len(text_1)-1, -1, -1):
if text_1[i] in fuhao:
text_1_new = text_1[:i]
text_1_new += text_1[i]
chulipangban_return_list.append(text_1_new)
if text_2 != "":
if i+1 != 120:
text_2 = text_1[i+1:] + text_2
break
# else:
# chulipangban_return_list.append(text_1)
if text_1_new == "":
chulipangban_return_list.append(text_1)
if text_2 != "":
chulipangban_return_list = chulichangju_2(text_2, chulipangban_return_list)
return chulipangban_return_list
# path = "../data/11篇/13139551_于丰源_在线考试系统-原文.docx"
for docx_name in path_list:
data = []
data_new = []
file_name = docx_name.split(".")[0]
file_type = docx_name.split(".")[1]
if file_type == "docx":
document = docx.Document(path + "/" + docx_name)
else:
continue
# get all paragraphs
all_paragraphs = document.paragraphs
for paragraph in all_paragraphs:
# collect the text of each paragraph
data.append(paragraph.text)
for data_dan in data:
if data_dan == "":
continue
# else:
# bool_ = snetence(data_dan)
# if bool_ == True:
# data_new.append(data_dan)
else:
data_list = str(data_dan).split("。")  # split on the Chinese full stop (separator was garbled in the source; "。" assumed)
for data_dan_short in data_list:
if data_dan_short == "":
continue
if len(data_dan_short) > 120:
dan_list = chulichangju_2(data_dan_short, [])
dan_list[-1] += "。"  # restore the sentence-final mark dropped by split (assumed "。"; garbled in the source)
data_new.extend(dan_list)
else:
data_dan_short += "。"  # restore the sentence-final mark dropped by split (assumed "。"; garbled in the source)
data_new.append(data_dan_short)
with open("../data/11篇txt/{}.txt".format(file_name), "w", encoding='utf-8') as file:
for i in data_new:
file.write(i + '\n')
file.close()

87
data_do/处理yy数据原始数据.py

@@ -0,0 +1,87 @@
# -*- coding: utf-8 -*-
"""
@Time : 2023/2/1 19:18
@Author :
@FileName:
@Software:
@Describe:
"""
import os
from bs4 import BeautifulSoup
import pandas as pd
import re
from tqdm import tqdm
# walk the directory tree
data_path_list = []
def walkFile(file):
for root, dirs, files in os.walk(file):
# root: path of the directory currently being visited
# dirs: list of subdirectory names under root
# files: list of file names under root
# iterate over the files
for f in files:
# print(os.path.join(root, f))
data_path_list.append(os.path.join(root, f))
# walk all the subdirectories as well
# for d in dirs:
# print(os.path.join(root, d))
def main():
walkFile("../data/yy_reduce_data_20221219-20230131")
main()
data = []
rootpath_list = []
for i in data_path_list:
danpath_list = str(i).split("\\")
rootpath_list.append("\\".join(danpath_list[:-1]))
print(len(rootpath_list))
rootpath_list = list(set(rootpath_list))
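# Each leaf directory holds a "source" and a "result" HTML file for one document.
# Aligned <p><em> nodes with the same id, where the result node has class "similar",
# give the (original, rewritten) sentence pairs.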
for i in tqdm(rootpath_list):
try:
soup_source = BeautifulSoup(open("{}\\source".format(i), encoding='utf-8'),
"html.parser")
soup_result = BeautifulSoup(open("{}\\result".format(i), encoding='utf-8'),
"html.parser")
except:
continue
source_sentence_list = soup_source.select('p > em')
result_sentence_list = soup_result.select('p > em')
for sentence_index in range(len(source_sentence_list)):
try:
if source_sentence_list[sentence_index]["id"] == result_sentence_list[sentence_index]["id"] \
and result_sentence_list[sentence_index]["class"] == ['similar']:
# if source_sentence_list[sentence_index]["id"] == result_sentence_list[sentence_index]["id"]:
source_text = source_sentence_list[sentence_index].string
result_text = result_sentence_list[sentence_index].string
if source_text != None and result_text != None:
data.append([source_sentence_list[sentence_index].string, result_sentence_list[sentence_index].string])
except:
pass
# print(i,sentence_index)
# print(data)
def data_clean(text):
# strip characters that are illegal in Excel cells: rare non-printable control characters such as backspace, bell, etc.
ILLEGAL_CHARACTERS_RE = re.compile(r'[\000-\010]|[\013-\014]|[\016-\037]')
text = ILLEGAL_CHARACTERS_RE.sub(r'', text)
return text
df = pd.DataFrame(data,columns=["原文","yy降重"])
for col in df.columns:
df[col] = df[col].apply(lambda x: data_clean(x))
df.to_excel("../data/论文_yy_小说_1.xlsx",index=None)

33
data_do/汇总.py

@@ -0,0 +1,33 @@
# -*- coding: utf-8 -*-
"""
@Time : 2023/2/15 14:13
@Author :
@FileName:
@Software:
@Describe:
"""
import os
import pandas as pd
path_1 = '../data/11篇excel'
path_2 = "../data/11篇临时拼接"
path_3 = "../data/11篇临时拼接2"
path_list = []
for file_name in os.listdir(path_1):
path_list.append(file_name)
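# For each document, merge the three per-method Excel outputs row by row into one summary workbook
# with columns 原文 / simbert / simbert_datasim07 / bertsim_simsim.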
for file_name in path_list:
data_new = []
data_1 = pd.read_excel(path_1 + "/" + file_name).values.tolist()
data_2 = pd.read_excel(path_2 + "/" + file_name).values.tolist()
file_name_0 = file_name.split(".")[0]
file_name_1 = file_name.split(".")[1]
file_name_ = file_name_0 + "_." + file_name_1
data_3 = pd.read_excel(path_3 + "/" + file_name_).values.tolist()
for i in range(len(data_1)):
data_new.append(data_1[i] + [data_2[i][1]] + [data_3[i][1]])
df = pd.DataFrame(data_new,columns=["原文","simbert","simbert_datasim07","bertsim_simsim"])
df.to_excel("../data/11篇测试excel_汇总_1/{}.xlsx".format(file_name_0), index=None)

209
data_do/筛选训练数据.py

@@ -0,0 +1,209 @@
# -*- coding: utf-8 -*-
"""
@Time : 2023/1/31 19:02
@Author :
@FileName:
@Software:
@Describe:
"""
import os
# os.environ["TF_KERAS"] = "1"
os.environ["CUDA_VISIBLE_DEVICES"] = "0"
import json
import numpy as np
from bert4keras.backend import keras, set_gelu
from bert4keras.tokenizers import Tokenizer, load_vocab
from bert4keras.models import build_transformer_model
from bert4keras.optimizers import Adam, extend_with_piecewise_linear_lr
from bert4keras.snippets import sequence_padding, DataGenerator
from bert4keras.snippets import open
from keras.layers import Lambda, Dense
import tensorflow as tf
from keras.backend import set_session
from sklearn.metrics.pairwise import cosine_similarity
from rouge import Rouge # pip install rouge
from nltk.translate.bleu_score import sentence_bleu, SmoothingFunction
from tqdm import tqdm
import jieba
from gensim.models import KeyedVectors, word2vec, Word2Vec
import random
config = tf.ConfigProto()
config.gpu_options.allow_growth = True
set_session(tf.Session(config=config))  # this part differs (TF1-style session setup)
class Word2vecModel:
def __init__(self):
self.path = "E:\pycharm_workspace\查重分析\word2vec_model\\word2vec_add_new_18.model"
self.model = Word2Vec.load(self.path)
def word2vec_res(self,seg_0_list, seg_1_list):
sentence_0_list = []
sentence_1_list = []
for i in seg_0_list:
a = self.model.wv[i]
sentence_0_list.append(a)
for i in seg_1_list:
a = self.model.wv[i]
sentence_1_list.append(a)
return sentence_0_list, sentence_1_list
class Evaluator(keras.callbacks.Callback):
"""评估与保存
"""
def __init__(self):
self.rouge = Rouge()
self.smooth = SmoothingFunction().method1
self.best_bleu = 0.
# def on_epoch_end(self, epoch, logs=None):
# metrics = self.evaluate(valid_data)  # evaluate the model
# if metrics['bleu'] > self.best_bleu:
# self.best_bleu = metrics['bleu']
# model.save_weights('./best_model.weights')  # save the model
# metrics['best_bleu'] = self.best_bleu
# print('valid_data:', metrics)
def evaluate_t(self, data_1, data_2, topk=1):
total = 0
rouge_1, rouge_2, rouge_l, bleu = 0, 0, 0, 0
scores = self.rouge.get_scores(hyps=[data_1], refs=[data_2])
rouge_1 += scores[0]['rouge-1']['f']
rouge_2 += scores[0]['rouge-2']['f']
rouge_l += scores[0]['rouge-l']['f']
bleu += sentence_bleu(
references=[data_1.split(' ')],
hypothesis=data_2.split(' '),
smoothing_function=self.smooth
)
# rouge_1 /= total
# rouge_2 /= total
# rouge_l /= total
# bleu /= total
return [rouge_1, rouge_2, rouge_l, bleu]
class bertModel:
def __init__(self):
self.config_path = '../chinese_roberta_wwm_ext_L-12_H-768_A-12/bert_config.json'
self.checkpoint_path = '../chinese_roberta_wwm_ext_L-12_H-768_A-12/bert_model.ckpt'
self.dict_path = '../chinese_roberta_wwm_ext_L-12_H-768_A-12/vocab.txt'
self.token_dict, self.keep_tokens = load_vocab(
dict_path=self.dict_path,
simplified=True,
startswith=['[PAD]', '[UNK]', '[CLS]', '[SEP]'],
)
self.tokenizer = Tokenizer(self.token_dict, do_lower_case=True)
self.buildmodel()
def buildmodel(self):
bert = build_transformer_model(
config_path=self.config_path,
checkpoint_path=self.checkpoint_path,
return_keras_model=False,
)
output = Lambda(lambda x: x[:, 0], name='CLS-token')(bert.model.output)
self.model = keras.models.Model(bert.model.input, output)
self.model.summary()
def predict(self,text):
batch_token_ids, batch_segment_ids = [], []
token_ids, segment_ids = self.tokenizer.encode(text, maxlen=256)
batch_token_ids.append(token_ids)
batch_segment_ids.append(segment_ids)
return self.model.predict([batch_token_ids, batch_segment_ids])
def simbert(data_1, data_2):
pass
def word2vec():
pass
def bleu():
pass
if __name__ == '__main__':
file = "../data/train_yy_1.txt"
model = bertModel()
eval_class = Evaluator()
# word2vecmodel = Word2vecModel()
try:
with open(file, 'r', encoding="utf-8") as f:
lines = [x.strip() for x in f if x.strip() != '']
except:
with open(file, 'r', encoding="gbk") as f:
lines = [x.strip() for x in f if x.strip() != '']
bertsim_list = []
bleusim_list = []
word2vecsim_list = []
data_train_text = []
random.shuffle(lines)
print(len(lines))
for txt in tqdm(lines):
text = txt.split('\t')
if len(text) == 3:
data_1 = text[0]
data_2 = text[2]
y1 = model.predict(data_1)[0]
y2 = model.predict(data_2)[0]
cos_sim = cosine_similarity(y1.reshape(1, -1), y2.reshape(1, -1))
bertsim_list.append(cos_sim[0][0])
bertsim_value = cos_sim[0][0]
eval_list = eval_class.evaluate_t(' '.join(data_1), ' '.join(data_2))
bleusim_list.append(eval_list[3])
bleusim_value = eval_list[3]
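# Keep only pairs that were rewritten enough: CLS-embedding cosine similarity <= 0.94
# and character-level BLEU <= 0.4; near-identical pairs are dropped from the training set.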
if bertsim_value <= 0.94 and bleusim_value <= 0.4:
data_train_text.append("\t".join([data_1, "to", data_2]))
# eval_list = eval_class.evaluate_t(' '.join(data_1), ' '.join(data_2))
# bleusim_list.append(eval_list[3])
# word2vec
# seg_0_list = jieba.cut(data_1, cut_all=False)
# seg_1_list = jieba.cut(data_2, cut_all=False)
# seg_0_list = [char for char in seg_0_list]
# seg_1_list = [char for char in seg_1_list]
#
# sentence_0_list, sentence_1_list = word2vecmodel.word2vec_res(seg_0_list, seg_1_list)
# sentence_0_result = np.array(sentence_0_list)
# sentence_1_result = np.array(sentence_1_list)
# sentence_0_array = sentence_0_result.sum(axis=0)
# sentence_1_array = sentence_1_result.sum(axis=0)
# print(sentence_1_array)
# print(sentence_0_array)
# cos_sim = cosine_similarity(sentence_0_array.reshape(1, -1), sentence_1_array.reshape(1, -1))
# word2vecsim_list.append(cos_sim[0][0])
# bertsim_list = sorted(bertsim_list)
# zong_num = len(bertsim_list)
# print(bertsim_list)
# print(bertsim_list[int(zong_num/2)])
# print(sum(bertsim_list)/zong_num)
# bleusim_list = sorted(bleusim_list)
# zong_num = len(bleusim_list)
# print(bleusim_list)
# print(bleusim_list[int(zong_num / 2)])
# print(sum(bleusim_list) / zong_num)
fileName = 'train_yy_1_sim_09.txt'
with open(fileName, 'w', encoding='utf-8') as file:
for i in data_train_text:
file.write(str(i) + '\n')
file.close()

209
data_do/筛选训练数据_str_sim.py

@@ -0,0 +1,209 @@
# -*- coding: utf-8 -*-
"""
@Time : 2023/1/31 19:02
@Author :
@FileName:
@Software:
@Describe:
"""
import os
# os.environ["TF_KERAS"] = "1"
os.environ["CUDA_VISIBLE_DEVICES"] = "0"
import json
import numpy as np
from bert4keras.backend import keras, set_gelu
from bert4keras.tokenizers import Tokenizer, load_vocab
from bert4keras.models import build_transformer_model
from bert4keras.optimizers import Adam, extend_with_piecewise_linear_lr
from bert4keras.snippets import sequence_padding, DataGenerator
from bert4keras.snippets import open
from keras.layers import Lambda, Dense
import tensorflow as tf
from keras.backend import set_session
from sklearn.metrics.pairwise import cosine_similarity
from rouge import Rouge # pip install rouge
from nltk.translate.bleu_score import sentence_bleu, SmoothingFunction
from tqdm import tqdm
import jieba
from gensim.models import KeyedVectors, word2vec, Word2Vec
import random
config = tf.ConfigProto()
config.gpu_options.allow_growth = True
set_session(tf.Session(config=config))  # this part differs (TF1-style session setup)
class Word2vecModel:
def __init__(self):
self.path = "E:\pycharm_workspace\查重分析\word2vec_model\\word2vec_add_new_18.model"
self.model = Word2Vec.load(self.path)
def word2vec_res(self,seg_0_list, seg_1_list):
sentence_0_list = []
sentence_1_list = []
for i in seg_0_list:
a = self.model.wv[i]
sentence_0_list.append(a)
for i in seg_1_list:
a = self.model.wv[i]
sentence_1_list.append(a)
return sentence_0_list, sentence_1_list
class Evaluator(keras.callbacks.Callback):
"""评估与保存
"""
def __init__(self):
self.rouge = Rouge()
self.smooth = SmoothingFunction().method1
self.best_bleu = 0.
# def on_epoch_end(self, epoch, logs=None):
# metrics = self.evaluate(valid_data)  # evaluate the model
# if metrics['bleu'] > self.best_bleu:
# self.best_bleu = metrics['bleu']
# model.save_weights('./best_model.weights')  # save the model
# metrics['best_bleu'] = self.best_bleu
# print('valid_data:', metrics)
def evaluate_t(self, data_1, data_2, topk=1):
total = 0
rouge_1, rouge_2, rouge_l, bleu = 0, 0, 0, 0
scores = self.rouge.get_scores(hyps=[data_1], refs=[data_2])
rouge_1 += scores[0]['rouge-1']['f']
rouge_2 += scores[0]['rouge-2']['f']
rouge_l += scores[0]['rouge-l']['f']
bleu += sentence_bleu(
references=[data_1.split(' ')],
hypothesis=data_2.split(' '),
smoothing_function=self.smooth
)
# rouge_1 /= total
# rouge_2 /= total
# rouge_l /= total
# bleu /= total
return [rouge_1, rouge_2, rouge_l, bleu]
class bertModel:
def __init__(self):
self.config_path = '../chinese_roberta_wwm_ext_L-12_H-768_A-12/bert_config.json'
self.checkpoint_path = '../chinese_roberta_wwm_ext_L-12_H-768_A-12/bert_model.ckpt'
self.dict_path = '../chinese_roberta_wwm_ext_L-12_H-768_A-12/vocab.txt'
self.token_dict, self.keep_tokens = load_vocab(
dict_path=self.dict_path,
simplified=True,
startswith=['[PAD]', '[UNK]', '[CLS]', '[SEP]'],
)
self.tokenizer = Tokenizer(self.token_dict, do_lower_case=True)
self.buildmodel()
def buildmodel(self):
bert = build_transformer_model(
config_path=self.config_path,
checkpoint_path=self.checkpoint_path,
return_keras_model=False,
)
output = Lambda(lambda x: x[:, 0], name='CLS-token')(bert.model.output)
self.model = keras.models.Model(bert.model.input, output)
self.model.summary()
def predict(self,text):
batch_token_ids, batch_segment_ids = [], []
token_ids, segment_ids = self.tokenizer.encode(text, maxlen=256)
batch_token_ids.append(token_ids)
batch_segment_ids.append(segment_ids)
return self.model.predict([batch_token_ids, batch_segment_ids])
def simbert(data_1, data_2):
pass
def word2vec():
pass
def bleu():
pass
if __name__ == '__main__':
file = "../data/train_yy_1.txt"
model = bertModel()
eval_class = Evaluator()
# word2vecmodel = Word2vecModel()
try:
with open(file, 'r', encoding="utf-8") as f:
lines = [x.strip() for x in f if x.strip() != '']
except:
with open(file, 'r', encoding="gbk") as f:
lines = [x.strip() for x in f if x.strip() != '']
bertsim_list = []
bleusim_list = []
word2vecsim_list = []
data_train_text = []
random.shuffle(lines)
print(len(lines))
for txt in tqdm(lines):
text = txt.split('\t')
if len(text) == 3:
data_1 = text[0]
data_2 = text[2]
y1 = model.predict(data_1)[0]
y2 = model.predict(data_2)[0]
cos_sim = cosine_similarity(y1.reshape(1, -1), y2.reshape(1, -1))
bertsim_list.append(cos_sim[0][0])
bertsim_value = cos_sim[0][0]
eval_list = eval_class.evaluate_t(' '.join(data_1), ' '.join(data_2))
bleusim_list.append(eval_list[3])
bleusim_value = eval_list[3]
if bertsim_value <= 0.94 and bleusim_value <= 0.4:
data_train_text.append("\t".join([data_1, "to", data_2]))
# eval_list = eval_class.evaluate_t(' '.join(data_1), ' '.join(data_2))
# bleusim_list.append(eval_list[3])
# word2vec
# seg_0_list = jieba.cut(data_1, cut_all=False)
# seg_1_list = jieba.cut(data_2, cut_all=False)
# seg_0_list = [char for char in seg_0_list]
# seg_1_list = [char for char in seg_1_list]
#
# sentence_0_list, sentence_1_list = word2vecmodel.word2vec_res(seg_0_list, seg_1_list)
# sentence_0_result = np.array(sentence_0_list)
# sentence_1_result = np.array(sentence_1_list)
# sentence_0_array = sentence_0_result.sum(axis=0)
# sentence_1_array = sentence_1_result.sum(axis=0)
# print(sentence_1_array)
# print(sentence_0_array)
# cos_sim = cosine_similarity(sentence_0_array.reshape(1, -1), sentence_1_array.reshape(1, -1))
# word2vecsim_list.append(cos_sim[0][0])
# bertsim_list = sorted(bertsim_list)
# zong_num = len(bertsim_list)
# print(bertsim_list)
# print(bertsim_list[int(zong_num/2)])
# print(sum(bertsim_list)/zong_num)
# bleusim_list = sorted(bleusim_list)
# zong_num = len(bleusim_list)
# print(bleusim_list)
# print(bleusim_list[int(zong_num / 2)])
# print(sum(bleusim_list) / zong_num)
fileName = 'train_yy_1_sim_09.txt'
with open(fileName, 'w', encoding='utf-8') as file:
for i in data_train_text:
file.write(str(i) + '\n')
file.close()

206
data_do/筛选训练数据strsim.py

@@ -0,0 +1,206 @@
# -*- coding: utf-8 -*-
"""
@Time : 2023/1/31 19:02
@Author :
@FileName:
@Software:
@Describe:
"""
import os
# os.environ["TF_KERAS"] = "1"
os.environ["CUDA_VISIBLE_DEVICES"] = "0"
import json
import numpy as np
from bert4keras.backend import keras, set_gelu
from bert4keras.tokenizers import Tokenizer, load_vocab
from bert4keras.models import build_transformer_model
from bert4keras.optimizers import Adam, extend_with_piecewise_linear_lr
from bert4keras.snippets import sequence_padding, DataGenerator
from bert4keras.snippets import open
from keras.layers import Lambda, Dense
import tensorflow as tf
from keras.backend import set_session
from sklearn.metrics.pairwise import cosine_similarity
from rouge import Rouge # pip install rouge
from nltk.translate.bleu_score import sentence_bleu, SmoothingFunction
from tqdm import tqdm
import jieba
from gensim.models import KeyedVectors, word2vec, Word2Vec
import random
import difflib
config = tf.ConfigProto()
config.gpu_options.allow_growth = True
set_session(tf.Session(config=config))  # this part differs (TF1-style session setup)
class Word2vecModel:
def __init__(self):
self.path = "E:\pycharm_workspace\查重分析\word2vec_model\\word2vec_add_new_18.model"
self.model = Word2Vec.load(self.path)
def word2vec_res(self,seg_0_list, seg_1_list):
sentence_0_list = []
sentence_1_list = []
for i in seg_0_list:
a = self.model.wv[i]
sentence_0_list.append(a)
for i in seg_1_list:
a = self.model.wv[i]
sentence_1_list.append(a)
return sentence_0_list, sentence_1_list
class Evaluator(keras.callbacks.Callback):
"""评估与保存
"""
def __init__(self):
self.rouge = Rouge()
self.smooth = SmoothingFunction().method1
self.best_bleu = 0.
# def on_epoch_end(self, epoch, logs=None):
# metrics = self.evaluate(valid_data)  # evaluate the model
# if metrics['bleu'] > self.best_bleu:
# self.best_bleu = metrics['bleu']
# model.save_weights('./best_model.weights')  # save the model
# metrics['best_bleu'] = self.best_bleu
# print('valid_data:', metrics)
def evaluate_t(self, data_1, data_2, topk=1):
total = 0
rouge_1, rouge_2, rouge_l, bleu = 0, 0, 0, 0
scores = self.rouge.get_scores(hyps=[data_1], refs=[data_2])
rouge_1 += scores[0]['rouge-1']['f']
rouge_2 += scores[0]['rouge-2']['f']
rouge_l += scores[0]['rouge-l']['f']
bleu += sentence_bleu(
references=[data_1.split(' ')],
hypothesis=data_2.split(' '),
smoothing_function=self.smooth
)
# rouge_1 /= total
# rouge_2 /= total
# rouge_l /= total
# bleu /= total
return [rouge_1, rouge_2, rouge_l, bleu]
class bertModel:
def __init__(self):
self.config_path = '../chinese_roberta_wwm_ext_L-12_H-768_A-12/bert_config.json'
self.checkpoint_path = '../chinese_roberta_wwm_ext_L-12_H-768_A-12/bert_model.ckpt'
self.dict_path = '../chinese_roberta_wwm_ext_L-12_H-768_A-12/vocab.txt'
self.token_dict, self.keep_tokens = load_vocab(
dict_path=self.dict_path,
simplified=True,
startswith=['[PAD]', '[UNK]', '[CLS]', '[SEP]'],
)
self.tokenizer = Tokenizer(self.token_dict, do_lower_case=True)
self.buildmodel()
def buildmodel(self):
bert = build_transformer_model(
config_path=self.config_path,
checkpoint_path=self.checkpoint_path,
return_keras_model=False,
)
output = Lambda(lambda x: x[:, 0], name='CLS-token')(bert.model.output)
self.model = keras.models.Model(bert.model.input, output)
self.model.summary()
def predict(self,text):
batch_token_ids, batch_segment_ids = [], []
token_ids, segment_ids = self.tokenizer.encode(text, maxlen=256)
batch_token_ids.append(token_ids)
batch_segment_ids.append(segment_ids)
return self.model.predict([batch_token_ids, batch_segment_ids])
def simbert(data_1, data_2):
pass
def word2vec():
pass
def bleu():
pass
if __name__ == '__main__':
file = "../data/train_yy_1.txt"
model = bertModel()
eval_class = Evaluator()
# word2vecmodel = Word2vecModel()
try:
with open(file, 'r', encoding="utf-8") as f:
lines = [x.strip() for x in f if x.strip() != '']
except:
with open(file, 'r', encoding="gbk") as f:
lines = [x.strip() for x in f if x.strip() != '']
bertsim_list = []
bleusim_list = []
word2vecsim_list = []
data_train_text = []
random.shuffle(lines)
print(len(lines))
for txt in tqdm(lines):
text = txt.split('\t')
if len(text) == 3:
data_1 = text[0]
data_2 = text[2]
str_sim_value = difflib.SequenceMatcher(None, data_1, data_2).quick_ratio()
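# If the rewrite is shorter than the original but still more than 80% of its length, the score is
# remapped to 1 - str_sim * (1 - len ratio), which always stays above 0.8, so such pairs never pass
# the < 0.70 filter below.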
if len(data_2) - len(data_1) < 0 and len(data_2) / len(data_1) > 0.8:
num_yu = 1 - len(data_2) / len(data_1)
str_sim_value = 1 - str_sim_value * num_yu
if str_sim_value < 0.70:
data_train_text.append("\t".join([data_1, "to", data_2]))
# eval_list = eval_class.evaluate_t(' '.join(data_1), ' '.join(data_2))
# bleusim_list.append(eval_list[3])
# word2vec
# seg_0_list = jieba.cut(data_1, cut_all=False)
# seg_1_list = jieba.cut(data_2, cut_all=False)
# seg_0_list = [char for char in seg_0_list]
# seg_1_list = [char for char in seg_1_list]
#
# sentence_0_list, sentence_1_list = word2vecmodel.word2vec_res(seg_0_list, seg_1_list)
# sentence_0_result = np.array(sentence_0_list)
# sentence_1_result = np.array(sentence_1_list)
# sentence_0_array = sentence_0_result.sum(axis=0)
# sentence_1_array = sentence_1_result.sum(axis=0)
# print(sentence_1_array)
# print(sentence_0_array)
# cos_sim = cosine_similarity(sentence_0_array.reshape(1, -1), sentence_1_array.reshape(1, -1))
# word2vecsim_list.append(cos_sim[0][0])
# bertsim_list = sorted(bertsim_list)
# zong_num = len(bertsim_list)
# print(bertsim_list)
# print(bertsim_list[int(zong_num/2)])
# print(sum(bertsim_list)/zong_num)
# bleusim_list = sorted(bleusim_list)
# zong_num = len(bleusim_list)
# print(bleusim_list)
# print(bleusim_list[int(zong_num / 2)])
# print(sum(bleusim_list) / zong_num)
print(len(data_train_text))
fileName = 'train_yy_1_sim_10.txt'
with open(fileName, 'w', encoding='utf-8') as file:
for i in data_train_text:
file.write(str(i) + '\n')
file.close()

83
data_do/统计数据的_str_sim_值.py

@@ -0,0 +1,83 @@
# -*- coding: utf-8 -*-
"""
@Time : 2023/1/31 19:02
@Author :
@FileName:
@Software:
@Describe:
"""
import os
# os.environ["TF_KERAS"] = "1"
os.environ["CUDA_VISIBLE_DEVICES"] = "0"
import json
import numpy as np
from bert4keras.backend import keras, set_gelu
from bert4keras.tokenizers import Tokenizer, load_vocab
from bert4keras.models import build_transformer_model
from bert4keras.optimizers import Adam, extend_with_piecewise_linear_lr
from bert4keras.snippets import sequence_padding, DataGenerator
from bert4keras.snippets import open
from keras.layers import Lambda, Dense
import tensorflow as tf
from keras.backend import set_session
from sklearn.metrics.pairwise import cosine_similarity
from rouge import Rouge # pip install rouge
from nltk.translate.bleu_score import sentence_bleu, SmoothingFunction
from tqdm import tqdm
import jieba
from gensim.models import KeyedVectors, word2vec, Word2Vec
import random
import difflib
config = tf.ConfigProto()
config.gpu_options.allow_growth = True
set_session(tf.Session(config=config))  # this part differs (TF1-style session setup)
if __name__ == '__main__':
file = "../data/train_yy.txt"
# word2vecmodel = Word2vecModel()
try:
with open(file, 'r', encoding="utf-8") as f:
lines = [x.strip() for x in f if x.strip() != '']
except:
with open(file, 'r', encoding="gbk") as f:
lines = [x.strip() for x in f if x.strip() != '']
bertsim_list = []
bleusim_list = []
strsim_list = []
word2vecsim_list = []
data_train_text = []
random.shuffle(lines)
print(len(lines))
for txt in tqdm(lines):
text = txt.split('\t')
if len(text) == 3:
data_1 = text[0]
data_2 = text[2]
# difflib.SequenceMatcher(None, one_data[0], one_data_simbert[j]).quick_ratio()
str_sim_value = difflib.SequenceMatcher(None, data_1, data_2).quick_ratio()
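# When the rewrite is shorter than the original, remap the score:
# str_sim -> 1 - str_sim * (1 - len(data_2)/len(data_1)), then record it in the distribution.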
if len(data_2) - len(data_1) < 0:
num_yu = 1- len(data_2) / len(data_1)
str_sim_value = 1 - str_sim_value * num_yu
strsim_list.append(str_sim_value)
strsim_list = sorted(strsim_list)
zong_num = len(strsim_list)
print(strsim_list)
print(strsim_list[int(zong_num / 2)])
print(sum(strsim_list) / zong_num)
fileName = '统计数据的strsim值.txt'
with open(fileName, 'w', encoding='utf-8') as file:
for i in strsim_list:
file.write(str(i) + '\n')
file.close()

276853
data_do/统计数据的strsim值.txt

File diff suppressed because it is too large

40
data_do/统计非中文字符.py

@@ -0,0 +1,40 @@
# -*- coding: utf-8 -*-
"""
@Time : 2022/12/20 16:20
@Author :
@FileName:
@Software:
@Describe:
"""
import pandas as pd
from tqdm import tqdm
import json
path = "../data/论文_yy_小说.xlsx"
df_list = pd.read_excel(path).values.tolist()
fuhao = {}
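# count every character outside the CJK range \u4e00-\u9fff across all cells,
# then dump the sorted counts to ../data/fuhao.json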
for i in tqdm(df_list):
for word in i:
word = str(word)
if word == "nan":
continue
for ch in word:
if u'\u4e00' <= ch <= u'\u9fff':
continue
else:
if ch in fuhao:
fuhao[ch] += 1
else:
fuhao[ch] = 1
test_1 = sorted(fuhao.items(),key=lambda x:x[1],reverse=True)
fuhao_new = {}
for i in test_1:
fuhao_new[i[0]] = i[1]
json_data = json.dumps(fuhao_new, ensure_ascii=False, indent=2)
with open('../data/fuhao.json', 'w', encoding="utf-8") as f_six:
f_six.write(json_data)

77
data_do/读取docx.py

@@ -0,0 +1,77 @@
# -*- coding: utf-8 -*-
"""
@Time : 2023/2/14 14:29
@Author :
@FileName:
@Software:
@Describe:
"""
# from zipfile import ZipFile
# from bs4 import BeautifulSoup
#
# document=ZipFile("../data/11篇/13139551_于丰源_在线考试系统-原文.docx")
# xml=document.read("word/document.xml")
# wordObj=BeautifulSoup(xml.decode("utf-8"))
# texts=wordObj.findAll("w:t")
# for text in texts:
# print(text.text)
import docx
import win32com.client as wc
import operator
# save .doc files as .docx
# path = "E:\pycharm_workspace\drop_weight_rewrite\data\\11篇\\13138572_李菊_谈小学语文教学难点与创新-原文.doc"
# path_new = "E:\pycharm_workspace\drop_weight_rewrite\data\\11篇\\13138572_李菊_谈小学语文教学难点与创新-原文.docx"
# word = wc.Dispatch("Word.Application")
# doc = word.Documents.Open(path)
# # 12 means the output is a .docx file
# doc.SaveAs(path_new, 12)
# doc.Close
# word.Quit
#
# # read the converted .docx
#
# file = docx.Document(path_new)
# for p in file.paragraphs:
# print(p.text)
# from win32com import client as wc
# w = wc.Dispatch('Word.Application')
# # or use the following to start a separate Word process:
# # w = wc.DispatchEx('Word.Application')
# doc=w.Documents.Open(path)
# doc.SaveAs(path_new,16)  # the 16 argument is required, otherwise it errors
import os
from win32com import client as wc
def save_doc_to_docx(rawpath): # convert .doc files to .docx
'''
:param rawpath: folder path used for both input and output
:return: None
'''
word = wc.Dispatch("Word.Application")
# relative paths do not work here; use an absolute path
# directory that holds the files to be converted
filenamelist = os.listdir(rawpath)
for i in os.listdir(rawpath):
# pick files that end in .doc and do not start with ~$ (~$ marks Word temp files)
if i.endswith('.doc') and not i.startswith('~$'):
print(i)
# try
# open the file
doc = word.Documents.Open(rawpath + i)
# # split the file name from its extension
rename = os.path.splitext(i)
# save the file as .docx
doc.SaveAs(pathls + rename[0] + '.docx', 12) # 12 means docx format
doc.Close()
word.Quit()
if __name__ == '__main__':
pathls = "E:\\pycharm_workspace\\drop_weight_rewrite\\data\\11篇\\"
save_doc_to_docx(pathls)

43
data_do/进一步处理降重数据.py

@@ -0,0 +1,43 @@
# -*- coding: utf-8 -*-
"""
@Time : 2022/12/20 17:56
@Author :
@FileName:
@Software:
@Describe:
"""
import pandas as pd
from tqdm import tqdm
import json
path = "../data/论文_yy_小说_1.xlsx"
df_list = pd.read_excel(path).values.tolist()
def sentence_do(source,result):
source = str(source)
result = str(result)
if source == "nan" or result == "nan":
return False, source,result
if len(source) > 120 or len(result) > 120:
return False, source,result
else:
source = str(source).replace("\t", "").replace(" ", "").replace("‚", "")
result = str(result).replace("\t", "").replace(" ", "").replace("‚", "")
return True, source, result
df_list_new = []
for i in df_list:
source = i[0]
result = i[1]
ok, source, result = sentence_do(source, result)
if ok == False:
continue
else:
df_list_new.append([source,result])
df = pd.DataFrame(df_list_new, columns=["原文","yy降重"])
df.to_excel("../data/论文_yy_小说_3.xlsx",index=None)

55
优化点/优化点.txt

@ -0,0 +1,55 @@
Plan 1
1. Handle person-name replacement
2. Use a translation model to generate candidate sentences, then filter them manually
3. Train on the filtered sentences
Training option 1
Standard training setup
Training option 2
Train in the simbert style
Translation-model plan
1. Register a Baidu account (for its translation API)
2. https://www.oschina.net/news/170812/meta-open-source-wmt-21
3. Train a t5 model
Rewriting corpora
1. Existing corpus: 580k pairs
2. The lcqmc corpus
Patterns observed in 写作猫
1.
It is almost certainly model-based: the same short clause comes out as a completely different sentence depending on whether or not something is appended to it.
伴随着风硕哀嚎的声音,白策笑嘻嘻的离开了。 --> 在冯硕的惨叫声中,白策嘿嘿一笑,转身离去。
伴随着风硕哀嚎的声音 --> 伴随着风硕的惨叫声
2.
Person names get special handling
3. (Tentative: the few samples tried came out roughly as expected, so this is not certain)
Mostly it merges sentences, and the preceding or following clause usually modifies the central clause
Examples where the rewrite produces more short clauses
对方无声哭泣着再次说了一遍。 --> 对方带着哭腔,又重复了一遍。
足足等到叶澜歌换好了衣服 --> 一直到叶澜歌穿好衣服,这才停了下来。
Example where short clauses are merged
林天宇瞬间就察觉到问题,对着她,轻声问道,"你是不是察觉到什么?" --> 林天宇立刻意识到了不对劲,小声的问了一句,"有没有发现?"
Plan:
1. Possibly replace adjectives first (using word2vec), then run zh -> en -> zh round-trip translation (which also tends to fill in pronouns); see the sketch below
张三尖叫的喊道
张三尖叫的嚷嚷道 --> 张三大吼一声。 --> 张三咆哮起来。 --> 张三大吼一声。
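
A minimal sketch of the round-trip translation step (assumption: translate_fn is a placeholder for whatever translation client ends up being used, e.g. the Baidu API above; the helper below is illustrative and not part of this repo):

# -*- coding: utf-8 -*-
def back_translate(sentence, translate_fn):
    # translate_fn(text, src, dst) is an assumed placeholder, not a real client in this repo
    english = translate_fn(sentence, src="zh", dst="en")  # zh -> en
    return translate_fn(english, src="en", dst="zh")      # en -> zh, ideally a paraphrase of the input

# usage once a real translation client is plugged in:
# print(back_translate("张三尖叫的喊道", my_translate))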

0
改进方案/改进方案.txt

70
测试range_1.py

@@ -0,0 +1,70 @@
# -*- coding: utf-8 -*-
"""
@Time : 2023/2/3 17:27
@Author :
@FileName:
@Software:
@Describe:
"""
import os
# os.environ["TF_KERAS"] = "1"
os.environ["CUDA_VISIBLE_DEVICES"] = "0"
from bert4keras.backend import keras, set_gelu
import numpy as np
from rouge import Rouge # pip install rouge
from nltk.translate.bleu_score import sentence_bleu, SmoothingFunction
class Evaluator(keras.callbacks.Callback):
"""评估与保存
"""
def __init__(self):
self.rouge = Rouge()
self.smooth = SmoothingFunction().method1
self.best_bleu = 0.
# def on_epoch_end(self, epoch, logs=None):
# metrics = self.evaluate(valid_data)  # evaluate the model
# if metrics['bleu'] > self.best_bleu:
# self.best_bleu = metrics['bleu']
# model.save_weights('./best_model.weights')  # save the model
# metrics['best_bleu'] = self.best_bleu
# print('valid_data:', metrics)
def evaluate_t(self, data_1, data_2, topk=1):
total = 0
rouge_1, rouge_2, rouge_l, bleu = 0, 0, 0, 0
scores = self.rouge.get_scores(hyps=[data_1], refs=[data_2])
rouge_1 += scores[0]['rouge-1']['f']
rouge_2 += scores[0]['rouge-2']['f']
rouge_l += scores[0]['rouge-l']['f']
bleu += sentence_bleu(
references=[data_1.split(' ')],
hypothesis=data_2.split(' '),
smoothing_function=self.smooth
)
# rouge_1 /= total
# rouge_2 /= total
# rouge_l /= total
# bleu /= total
return [rouge_1, rouge_2, rouge_l, bleu]
eval_class = Evaluator()
data_1 = "上海中心大厦"
data_2 = "上海"
eval_list = eval_class.evaluate_t(' '.join(data_1), ' '.join(data_2))
print(eval_list)
# scratch notes on a possible length penalty (length ratio / exp of the length difference)
a = len(data_2) - len(data_1)
if a < 0:
a = len(data_2) / len(data_1)
np.exp(len(data_2) - len(data_1))