
18 changed files with 278312 additions and 0 deletions
@@ -0,0 +1,74 @@
# -*- coding: utf-8 -*-

"""
@Time    : 2022/12/20 10:35
@Author  :
@FileName:
@Software:
@Describe:
"""
import os
from bs4 import BeautifulSoup
import pandas as pd
import re

# Walk the data directory


path_list = []


def walkFile(file):
    for root, dirs, files in os.walk(file):
        # root is the path of the directory currently being visited
        # dirs is the list of sub-directory names inside it
        # files is the list of file names inside it
        # walk the files
        for f in files:
            name = str(f).split("_")[0]
            path_list.append(name)


walkFile("../data/yy_reduce_data")


path_list = list(set(path_list))
print(path_list)


data = []
for i in path_list:

    soup_source = BeautifulSoup(open("../data/yy_reduce_data/{}_source".format(i), encoding='utf-8'),
                                "html.parser")

    soup_result = BeautifulSoup(open("../data/yy_reduce_data/{}_result".format(i), encoding='utf-8'),
                                "html.parser")

    source_sentence_list = soup_source.select('p > em')
    result_sentence_list = soup_result.select('p > em')
    for sentence_index in range(len(source_sentence_list)):
        try:
            if source_sentence_list[sentence_index]["id"] == result_sentence_list[sentence_index]["id"] \
                    and result_sentence_list[sentence_index]["class"] == ['similar']:
                # if source_sentence_list[sentence_index]["id"] == result_sentence_list[sentence_index]["id"]:
                source_text = source_sentence_list[sentence_index].string
                result_text = result_sentence_list[sentence_index].string
                if source_text != None and result_text != None:
                    data.append([source_text, result_text])
        except:
            print(i, sentence_index)

# print(data)


def data_clean(text):
    # Strip characters that are illegal in Excel: uncommon non-printable control
    # characters such as backspace and bell.
    ILLEGAL_CHARACTERS_RE = re.compile(r'[\000-\010]|[\013-\014]|[\016-\037]')
    text = ILLEGAL_CHARACTERS_RE.sub(r'', text)
    return text


df = pd.DataFrame(data, columns=["原文", "yy降重"])
for col in df.columns:
    df[col] = df[col].apply(lambda x: data_clean(x))

df.to_excel("../data/论文_yy_小说.xlsx", index=None)
@@ -0,0 +1,26 @@
# -*- coding: utf-8 -*-

"""
@Time    : 2022/12/23 16:00
@Author  :
@FileName:
@Software:
@Describe:
"""
import pandas as pd


path = "../data/论文_yy_小说_3.xlsx"
df_list = pd.read_excel(path).values.tolist()

df_list_new = []
print(len(df_list))
for i in df_list:
    a = i[0]
    b = i[1]
    df_list_new.append("\t".join([a, "to", b]))

with open("../data/train_yy_1.txt", "w", encoding='utf-8') as file:
    for i in df_list_new:
        file.write(i + '\n')
    file.close()
@@ -0,0 +1,31 @@
# -*- coding: utf-8 -*-

"""
@Time    : 2023/2/2 11:29
@Author  :
@FileName:
@Software:
@Describe:
"""


def read_text(file):
    try:
        with open(file, 'r', encoding="utf-8") as f:
            lines = [x.strip() for x in f if x.strip() != '']
    except:
        with open(file, 'r', encoding="gbk") as f:
            lines = [x.strip() for x in f if x.strip() != '']
    return lines


if __name__ == '__main__':

    data = []
    path_list = ["train_yy_sim_10.txt", "train_yy_1_sim_10.txt"]
    for i in path_list:
        data += read_text(i)
    fileName = '../data/train_yy_sim.txt'
    with open(fileName, 'w', encoding='utf-8') as file:
        for i in data:
            file.write(str(i) + '\n')
        file.close()
@@ -0,0 +1,102 @@
# -*- coding: utf-8 -*-

"""
@Time    : 2023/2/14 14:19
@Author  :
@FileName:
@Software:
@Describe:
"""
import os
import pandas as pd
import docx
import win32com.client as wc
import operator


def is_chinese(uchar):
    """
    Check whether a single unicode character is a Chinese character.
    :param uchar:
    :return:
    """
    if uchar >= u'\u4e00' and uchar <= u'\u9fa5':
        return True
    else:
        return False


def snetence(text):
    bool_ = True
    for i in text:
        bool_1 = is_chinese(i)
        if bool_1 == True:
            continue
        else:
            if i in fuhao:
                continue
            else:
                bool_ = False
                break
    return bool_


fuhao = [",", "。", ",", "、"]
path = '../data/11篇'
path_list = []
for file_name in os.listdir(path):
    path_list.append(file_name)
# print(path_list)


# path = "../data/11篇/13139551_于丰源_在线考试系统-原文.docx"
for docx_name in path_list:
    data = []
    data_new = []
    file_name = docx_name.split(".")[0]
    file_type = docx_name.split(".")[1]
    if file_type == "docx":
        document = docx.Document(path + "/" + docx_name)
    else:
        continue
    # Get all paragraphs
    all_paragraphs = document.paragraphs
    for paragraph in all_paragraphs:
        # Collect the text of each paragraph
        data.append(paragraph.text)
    data = sorted(data, key=lambda x: len(x))
    for data_dan in data:
        if data_dan == "":
            continue
        for i in data_dan:
            if i == "章":
                continue
        if len(data_dan) < 15:
            continue
        # else:
        #     bool_ = snetence(data_dan)
        #     if bool_ == True:
        #         data_new.append(data_dan)
        else:
            data_list = str(data_dan).split("。")
            for data_dan_short in data_list:
                if data_dan_short == "":
                    continue
                for i in data_dan_short:
                    if i == "章":
                        continue
                if len(data_dan_short) < 10:
                    continue
                if len(data_dan_short) > 120:
                    continue
                data_new.append(data_dan_short)
    data_new = sorted(data_new, key=lambda x: len(x))
    data_df = []
    for i in data_new:
        data_df.append([i])

    pd.DataFrame(data_df).to_csv("../data/11篇csv/" + file_name + ".csv", index=False)
@@ -0,0 +1,114 @@
# -*- coding: utf-8 -*-

"""
@Time    : 2023/2/14 14:19
@Author  :
@FileName:
@Software:
@Describe:
"""
import os
import pandas as pd
import docx
import win32com.client as wc
import operator


def is_chinese(uchar):
    """
    Check whether a single unicode character is a Chinese character.
    :param uchar:
    :return:
    """
    if uchar >= u'\u4e00' and uchar <= u'\u9fa5':
        return True
    else:
        return False


def snetence(text):
    bool_ = True
    for i in text:
        bool_1 = is_chinese(i)
        if bool_1 == True:
            continue
        else:
            if i in fuhao:
                continue
            else:
                bool_ = False
                break
    return bool_


fuhao = [",", "。", ",", "、"]
path = '../data/11篇'
path_list = []
for file_name in os.listdir(path):
    path_list.append(file_name)
# print(path_list)


def chulichangju_2(text, chulipangban_return_list):
    fuhao = [",", "?", "!", "…"]
    text_1 = text[:120]
    text_2 = text[120:]
    text_1_new = ""
    for i in range(len(text_1) - 1, -1, -1):
        if text_1[i] in fuhao:
            text_1_new = text_1[:i]
            text_1_new += text_1[i]
            chulipangban_return_list.append(text_1_new)
            if text_2 != "":
                if i + 1 != 120:
                    text_2 = text_1[i + 1:] + text_2
            break
    # else:
    #     chulipangban_return_list.append(text_1)
    if text_1_new == "":
        chulipangban_return_list.append(text_1)
    if text_2 != "":
        chulipangban_return_list = chulichangju_2(text_2, chulipangban_return_list)
    return chulipangban_return_list


# path = "../data/11篇/13139551_于丰源_在线考试系统-原文.docx"
for docx_name in path_list:
    data = []
    data_new = []
    file_name = docx_name.split(".")[0]
    file_type = docx_name.split(".")[1]
    if file_type == "docx":
        document = docx.Document(path + "/" + docx_name)
    else:
        continue
    # Get all paragraphs
    all_paragraphs = document.paragraphs
    for paragraph in all_paragraphs:
        # Collect the text of each paragraph
        data.append(paragraph.text)
    for data_dan in data:
        if data_dan == "":
            continue
        # else:
        #     bool_ = snetence(data_dan)
        #     if bool_ == True:
        #         data_new.append(data_dan)
        else:
            data_list = str(data_dan).split("。")
            for data_dan_short in data_list:
                if data_dan_short == "":
                    continue
                if len(data_dan_short) > 120:
                    dan_list = chulichangju_2(data_dan_short, [])
                    dan_list[-1] += "。"
                    data_new.extend(dan_list)
                else:
                    data_dan_short += "。"
                    data_new.append(data_dan_short)


    with open("../data/11篇txt/{}.txt".format(file_name), "w", encoding='utf-8') as file:
        for i in data_new:
            file.write(i + '\n')
        file.close()
@@ -0,0 +1,87 @@
# -*- coding: utf-8 -*-

"""
@Time    : 2023/2/1 19:18
@Author  :
@FileName:
@Software:
@Describe:
"""

import os
from bs4 import BeautifulSoup
import pandas as pd
import re
from tqdm import tqdm

# Walk the data directory


data_path_list = []


def walkFile(file):
    for root, dirs, files in os.walk(file):
        # root is the path of the directory currently being visited
        # dirs is the list of sub-directory names inside it
        # files is the list of file names inside it
        # walk the files
        for f in files:
            # print(os.path.join(root, f))
            data_path_list.append(os.path.join(root, f))
        # walk every sub-directory
        # for d in dirs:
        #     print(os.path.join(root, d))


def main():
    walkFile("../data/yy_reduce_data_20221219-20230131")


main()


data = []

rootpath_list = []
for i in data_path_list:
    danpath_list = str(i).split("\\")
    rootpath_list.append("\\".join(danpath_list[:-1]))

print(len(rootpath_list))
rootpath_list = list(set(rootpath_list))
for i in tqdm(rootpath_list):
    try:
        soup_source = BeautifulSoup(open("{}\\source".format(i), encoding='utf-8'),
                                    "html.parser")
        soup_result = BeautifulSoup(open("{}\\result".format(i), encoding='utf-8'),
                                    "html.parser")
    except:
        continue

    source_sentence_list = soup_source.select('p > em')
    result_sentence_list = soup_result.select('p > em')
    for sentence_index in range(len(source_sentence_list)):
        try:
            if source_sentence_list[sentence_index]["id"] == result_sentence_list[sentence_index]["id"] \
                    and result_sentence_list[sentence_index]["class"] == ['similar']:
                # if source_sentence_list[sentence_index]["id"] == result_sentence_list[sentence_index]["id"]:
                source_text = source_sentence_list[sentence_index].string
                result_text = result_sentence_list[sentence_index].string
                if source_text != None and result_text != None:
                    data.append([source_text, result_text])
        except:
            pass
            # print(i, sentence_index)

# print(data)


def data_clean(text):
    # Strip characters that are illegal in Excel: uncommon non-printable control
    # characters such as backspace and bell.
    ILLEGAL_CHARACTERS_RE = re.compile(r'[\000-\010]|[\013-\014]|[\016-\037]')
    text = ILLEGAL_CHARACTERS_RE.sub(r'', text)
    return text


df = pd.DataFrame(data, columns=["原文", "yy降重"])
for col in df.columns:
    df[col] = df[col].apply(lambda x: data_clean(x))

df.to_excel("../data/论文_yy_小说_1.xlsx", index=None)
@@ -0,0 +1,33 @@
# -*- coding: utf-8 -*-

"""
@Time    : 2023/2/15 14:13
@Author  :
@FileName:
@Software:
@Describe:
"""
import os
import pandas as pd

path_1 = '../data/11篇excel'
path_2 = "../data/11篇临时拼接"
path_3 = "../data/11篇临时拼接2"
path_list = []
for file_name in os.listdir(path_1):
    path_list.append(file_name)


for file_name in path_list:
    data_new = []
    data_1 = pd.read_excel(path_1 + "/" + file_name).values.tolist()
    data_2 = pd.read_excel(path_2 + "/" + file_name).values.tolist()
    file_name_0 = file_name.split(".")[0]
    file_name_1 = file_name.split(".")[1]
    file_name_ = file_name_0 + "_." + file_name_1
    data_3 = pd.read_excel(path_3 + "/" + file_name_).values.tolist()
    for i in range(len(data_1)):
        data_new.append(data_1[i] + [data_2[i][1]] + [data_3[i][1]])

    df = pd.DataFrame(data_new, columns=["原文", "simbert", "simbert_datasim07", "bertsim_simsim"])
    df.to_excel("../data/11篇测试excel_汇总_1/{}.xlsx".format(file_name_0), index=None)
@@ -0,0 +1,209 @@
# -*- coding: utf-8 -*-

"""
@Time    : 2023/1/31 19:02
@Author  :
@FileName:
@Software:
@Describe:
"""
import os
# os.environ["TF_KERAS"] = "1"
os.environ["CUDA_VISIBLE_DEVICES"] = "0"
import json
import numpy as np
from bert4keras.backend import keras, set_gelu
from bert4keras.tokenizers import Tokenizer, load_vocab
from bert4keras.models import build_transformer_model
from bert4keras.optimizers import Adam, extend_with_piecewise_linear_lr
from bert4keras.snippets import sequence_padding, DataGenerator
from bert4keras.snippets import open
from keras.layers import Lambda, Dense
import tensorflow as tf
from keras.backend import set_session
from sklearn.metrics.pairwise import cosine_similarity
from rouge import Rouge  # pip install rouge
from nltk.translate.bleu_score import sentence_bleu, SmoothingFunction
from tqdm import tqdm
import jieba
from gensim.models import KeyedVectors, word2vec, Word2Vec
import random


config = tf.ConfigProto()
config.gpu_options.allow_growth = True
set_session(tf.Session(config=config))  # this part differs


class Word2vecModel:
    def __init__(self):
        self.path = "E:\\pycharm_workspace\\查重分析\\word2vec_model\\word2vec_add_new_18.model"
        self.model = Word2Vec.load(self.path)

    def word2vec_res(self, seg_0_list, seg_1_list):
        sentence_0_list = []
        sentence_1_list = []
        for i in seg_0_list:
            a = self.model.wv[i]
            sentence_0_list.append(a)

        for i in seg_1_list:
            a = self.model.wv[i]
            sentence_1_list.append(a)

        return sentence_0_list, sentence_1_list


class Evaluator(keras.callbacks.Callback):
    """Evaluate and save the model.
    """

    def __init__(self):
        self.rouge = Rouge()
        self.smooth = SmoothingFunction().method1
        self.best_bleu = 0.

    # def on_epoch_end(self, epoch, logs=None):
    #     metrics = self.evaluate(valid_data)  # evaluate the model
    #     if metrics['bleu'] > self.best_bleu:
    #         self.best_bleu = metrics['bleu']
    #         model.save_weights('./best_model.weights')  # save the model
    #     metrics['best_bleu'] = self.best_bleu
    #     print('valid_data:', metrics)

    def evaluate_t(self, data_1, data_2, topk=1):
        total = 0
        rouge_1, rouge_2, rouge_l, bleu = 0, 0, 0, 0

        scores = self.rouge.get_scores(hyps=[data_1], refs=[data_2])
        rouge_1 += scores[0]['rouge-1']['f']
        rouge_2 += scores[0]['rouge-2']['f']
        rouge_l += scores[0]['rouge-l']['f']
        bleu += sentence_bleu(
            references=[data_1.split(' ')],
            hypothesis=data_2.split(' '),
            smoothing_function=self.smooth
        )
        # rouge_1 /= total
        # rouge_2 /= total
        # rouge_l /= total
        # bleu /= total
        return [rouge_1, rouge_2, rouge_l, bleu]


class bertModel:
    def __init__(self):
        self.config_path = '../chinese_roberta_wwm_ext_L-12_H-768_A-12/bert_config.json'
        self.checkpoint_path = '../chinese_roberta_wwm_ext_L-12_H-768_A-12/bert_model.ckpt'
        self.dict_path = '../chinese_roberta_wwm_ext_L-12_H-768_A-12/vocab.txt'
        self.token_dict, self.keep_tokens = load_vocab(
            dict_path=self.dict_path,
            simplified=True,
            startswith=['[PAD]', '[UNK]', '[CLS]', '[SEP]'],
        )
        self.tokenizer = Tokenizer(self.token_dict, do_lower_case=True)
        self.buildmodel()

    def buildmodel(self):
        bert = build_transformer_model(
            config_path=self.config_path,
            checkpoint_path=self.checkpoint_path,
            return_keras_model=False,
        )

        output = Lambda(lambda x: x[:, 0], name='CLS-token')(bert.model.output)
        self.model = keras.models.Model(bert.model.input, output)
        self.model.summary()

    def predict(self, text):
        batch_token_ids, batch_segment_ids = [], []
        token_ids, segment_ids = self.tokenizer.encode(text, maxlen=256)
        batch_token_ids.append(token_ids)
        batch_segment_ids.append(segment_ids)
        return self.model.predict([batch_token_ids, batch_segment_ids])


def simbert(data_1, data_2):
    pass


def word2vec():
    pass


def bleu():
    pass


if __name__ == '__main__':
    file = "../data/train_yy_1.txt"
    model = bertModel()
    eval_class = Evaluator()
    # word2vecmodel = Word2vecModel()
    try:
        with open(file, 'r', encoding="utf-8") as f:
            lines = [x.strip() for x in f if x.strip() != '']
    except:
        with open(file, 'r', encoding="gbk") as f:
            lines = [x.strip() for x in f if x.strip() != '']

    bertsim_list = []
    bleusim_list = []
    word2vecsim_list = []
    data_train_text = []

    random.shuffle(lines)
    print(len(lines))
    for txt in tqdm(lines):
        text = txt.split('\t')
        if len(text) == 3:
            data_1 = text[0]
            data_2 = text[2]
            y1 = model.predict(data_1)[0]
            y2 = model.predict(data_2)[0]
            cos_sim = cosine_similarity(y1.reshape(1, -1), y2.reshape(1, -1))
            bertsim_list.append(cos_sim[0][0])
            bertsim_value = cos_sim[0][0]

            eval_list = eval_class.evaluate_t(' '.join(data_1), ' '.join(data_2))
            bleusim_list.append(eval_list[3])
            bleusim_value = eval_list[3]

            if bertsim_value <= 0.94 and bleusim_value <= 0.4:
                data_train_text.append("\t".join([data_1, "to", data_2]))

            # eval_list = eval_class.evaluate_t(' '.join(data_1), ' '.join(data_2))
            # bleusim_list.append(eval_list[3])

            # word2vec
            # seg_0_list = jieba.cut(data_1, cut_all=False)
            # seg_1_list = jieba.cut(data_2, cut_all=False)
            # seg_0_list = [char for char in seg_0_list]
            # seg_1_list = [char for char in seg_1_list]
            #
            # sentence_0_list, sentence_1_list = word2vecmodel.word2vec_res(seg_0_list, seg_1_list)
            # sentence_0_result = np.array(sentence_0_list)
            # sentence_1_result = np.array(sentence_1_list)
            # sentence_0_array = sentence_0_result.sum(axis=0)
            # sentence_1_array = sentence_1_result.sum(axis=0)
            # print(sentence_1_array)
            # print(sentence_0_array)
            # cos_sim = cosine_similarity(sentence_0_array.reshape(1, -1), sentence_1_array.reshape(1, -1))
            # word2vecsim_list.append(cos_sim[0][0])

    # bertsim_list = sorted(bertsim_list)
    # zong_num = len(bertsim_list)
    # print(bertsim_list)
    # print(bertsim_list[int(zong_num/2)])
    # print(sum(bertsim_list)/zong_num)

    # bleusim_list = sorted(bleusim_list)
    # zong_num = len(bleusim_list)
    # print(bleusim_list)
    # print(bleusim_list[int(zong_num / 2)])
    # print(sum(bleusim_list) / zong_num)

    fileName = 'train_yy_1_sim_09.txt'
    with open(fileName, 'w', encoding='utf-8') as file:
        for i in data_train_text:
            file.write(str(i) + '\n')
        file.close()
@@ -0,0 +1,209 @@
# -*- coding: utf-8 -*-

"""
@Time    : 2023/1/31 19:02
@Author  :
@FileName:
@Software:
@Describe:
"""
import os
# os.environ["TF_KERAS"] = "1"
os.environ["CUDA_VISIBLE_DEVICES"] = "0"
import json
import numpy as np
from bert4keras.backend import keras, set_gelu
from bert4keras.tokenizers import Tokenizer, load_vocab
from bert4keras.models import build_transformer_model
from bert4keras.optimizers import Adam, extend_with_piecewise_linear_lr
from bert4keras.snippets import sequence_padding, DataGenerator
from bert4keras.snippets import open
from keras.layers import Lambda, Dense
import tensorflow as tf
from keras.backend import set_session
from sklearn.metrics.pairwise import cosine_similarity
from rouge import Rouge  # pip install rouge
from nltk.translate.bleu_score import sentence_bleu, SmoothingFunction
from tqdm import tqdm
import jieba
from gensim.models import KeyedVectors, word2vec, Word2Vec
import random


config = tf.ConfigProto()
config.gpu_options.allow_growth = True
set_session(tf.Session(config=config))  # this part differs


class Word2vecModel:
    def __init__(self):
        self.path = "E:\\pycharm_workspace\\查重分析\\word2vec_model\\word2vec_add_new_18.model"
        self.model = Word2Vec.load(self.path)

    def word2vec_res(self, seg_0_list, seg_1_list):
        sentence_0_list = []
        sentence_1_list = []
        for i in seg_0_list:
            a = self.model.wv[i]
            sentence_0_list.append(a)

        for i in seg_1_list:
            a = self.model.wv[i]
            sentence_1_list.append(a)

        return sentence_0_list, sentence_1_list


class Evaluator(keras.callbacks.Callback):
    """Evaluate and save the model.
    """

    def __init__(self):
        self.rouge = Rouge()
        self.smooth = SmoothingFunction().method1
        self.best_bleu = 0.

    # def on_epoch_end(self, epoch, logs=None):
    #     metrics = self.evaluate(valid_data)  # evaluate the model
    #     if metrics['bleu'] > self.best_bleu:
    #         self.best_bleu = metrics['bleu']
    #         model.save_weights('./best_model.weights')  # save the model
    #     metrics['best_bleu'] = self.best_bleu
    #     print('valid_data:', metrics)

    def evaluate_t(self, data_1, data_2, topk=1):
        total = 0
        rouge_1, rouge_2, rouge_l, bleu = 0, 0, 0, 0

        scores = self.rouge.get_scores(hyps=[data_1], refs=[data_2])
        rouge_1 += scores[0]['rouge-1']['f']
        rouge_2 += scores[0]['rouge-2']['f']
        rouge_l += scores[0]['rouge-l']['f']
        bleu += sentence_bleu(
            references=[data_1.split(' ')],
            hypothesis=data_2.split(' '),
            smoothing_function=self.smooth
        )
        # rouge_1 /= total
        # rouge_2 /= total
        # rouge_l /= total
        # bleu /= total
        return [rouge_1, rouge_2, rouge_l, bleu]


class bertModel:
    def __init__(self):
        self.config_path = '../chinese_roberta_wwm_ext_L-12_H-768_A-12/bert_config.json'
        self.checkpoint_path = '../chinese_roberta_wwm_ext_L-12_H-768_A-12/bert_model.ckpt'
        self.dict_path = '../chinese_roberta_wwm_ext_L-12_H-768_A-12/vocab.txt'
        self.token_dict, self.keep_tokens = load_vocab(
            dict_path=self.dict_path,
            simplified=True,
            startswith=['[PAD]', '[UNK]', '[CLS]', '[SEP]'],
        )
        self.tokenizer = Tokenizer(self.token_dict, do_lower_case=True)
        self.buildmodel()

    def buildmodel(self):
        bert = build_transformer_model(
            config_path=self.config_path,
            checkpoint_path=self.checkpoint_path,
            return_keras_model=False,
        )

        output = Lambda(lambda x: x[:, 0], name='CLS-token')(bert.model.output)
        self.model = keras.models.Model(bert.model.input, output)
        self.model.summary()

    def predict(self, text):
        batch_token_ids, batch_segment_ids = [], []
        token_ids, segment_ids = self.tokenizer.encode(text, maxlen=256)
        batch_token_ids.append(token_ids)
        batch_segment_ids.append(segment_ids)
        return self.model.predict([batch_token_ids, batch_segment_ids])


def simbert(data_1, data_2):
    pass


def word2vec():
    pass


def bleu():
    pass


if __name__ == '__main__':
    file = "../data/train_yy_1.txt"
    model = bertModel()
    eval_class = Evaluator()
    # word2vecmodel = Word2vecModel()
    try:
        with open(file, 'r', encoding="utf-8") as f:
            lines = [x.strip() for x in f if x.strip() != '']
    except:
        with open(file, 'r', encoding="gbk") as f:
            lines = [x.strip() for x in f if x.strip() != '']

    bertsim_list = []
    bleusim_list = []
    word2vecsim_list = []
    data_train_text = []

    random.shuffle(lines)
    print(len(lines))
    for txt in tqdm(lines):
        text = txt.split('\t')
        if len(text) == 3:
            data_1 = text[0]
            data_2 = text[2]
            y1 = model.predict(data_1)[0]
            y2 = model.predict(data_2)[0]
            cos_sim = cosine_similarity(y1.reshape(1, -1), y2.reshape(1, -1))
            bertsim_list.append(cos_sim[0][0])
            bertsim_value = cos_sim[0][0]

            eval_list = eval_class.evaluate_t(' '.join(data_1), ' '.join(data_2))
            bleusim_list.append(eval_list[3])
            bleusim_value = eval_list[3]

            if bertsim_value <= 0.94 and bleusim_value <= 0.4:
                data_train_text.append("\t".join([data_1, "to", data_2]))

            # eval_list = eval_class.evaluate_t(' '.join(data_1), ' '.join(data_2))
            # bleusim_list.append(eval_list[3])

            # word2vec
            # seg_0_list = jieba.cut(data_1, cut_all=False)
            # seg_1_list = jieba.cut(data_2, cut_all=False)
            # seg_0_list = [char for char in seg_0_list]
            # seg_1_list = [char for char in seg_1_list]
            #
            # sentence_0_list, sentence_1_list = word2vecmodel.word2vec_res(seg_0_list, seg_1_list)
            # sentence_0_result = np.array(sentence_0_list)
            # sentence_1_result = np.array(sentence_1_list)
            # sentence_0_array = sentence_0_result.sum(axis=0)
            # sentence_1_array = sentence_1_result.sum(axis=0)
            # print(sentence_1_array)
            # print(sentence_0_array)
            # cos_sim = cosine_similarity(sentence_0_array.reshape(1, -1), sentence_1_array.reshape(1, -1))
            # word2vecsim_list.append(cos_sim[0][0])

    # bertsim_list = sorted(bertsim_list)
    # zong_num = len(bertsim_list)
    # print(bertsim_list)
    # print(bertsim_list[int(zong_num/2)])
    # print(sum(bertsim_list)/zong_num)

    # bleusim_list = sorted(bleusim_list)
    # zong_num = len(bleusim_list)
    # print(bleusim_list)
    # print(bleusim_list[int(zong_num / 2)])
    # print(sum(bleusim_list) / zong_num)

    fileName = 'train_yy_1_sim_09.txt'
    with open(fileName, 'w', encoding='utf-8') as file:
        for i in data_train_text:
            file.write(str(i) + '\n')
        file.close()
@@ -0,0 +1,206 @@
# -*- coding: utf-8 -*-

"""
@Time    : 2023/1/31 19:02
@Author  :
@FileName:
@Software:
@Describe:
"""
import os
# os.environ["TF_KERAS"] = "1"
os.environ["CUDA_VISIBLE_DEVICES"] = "0"
import json
import numpy as np
from bert4keras.backend import keras, set_gelu
from bert4keras.tokenizers import Tokenizer, load_vocab
from bert4keras.models import build_transformer_model
from bert4keras.optimizers import Adam, extend_with_piecewise_linear_lr
from bert4keras.snippets import sequence_padding, DataGenerator
from bert4keras.snippets import open
from keras.layers import Lambda, Dense
import tensorflow as tf
from keras.backend import set_session
from sklearn.metrics.pairwise import cosine_similarity
from rouge import Rouge  # pip install rouge
from nltk.translate.bleu_score import sentence_bleu, SmoothingFunction
from tqdm import tqdm
import jieba
from gensim.models import KeyedVectors, word2vec, Word2Vec
import random
import difflib


config = tf.ConfigProto()
config.gpu_options.allow_growth = True
set_session(tf.Session(config=config))  # this part differs


class Word2vecModel:
    def __init__(self):
        self.path = "E:\\pycharm_workspace\\查重分析\\word2vec_model\\word2vec_add_new_18.model"
        self.model = Word2Vec.load(self.path)

    def word2vec_res(self, seg_0_list, seg_1_list):
        sentence_0_list = []
        sentence_1_list = []
        for i in seg_0_list:
            a = self.model.wv[i]
            sentence_0_list.append(a)

        for i in seg_1_list:
            a = self.model.wv[i]
            sentence_1_list.append(a)

        return sentence_0_list, sentence_1_list


class Evaluator(keras.callbacks.Callback):
    """Evaluate and save the model.
    """

    def __init__(self):
        self.rouge = Rouge()
        self.smooth = SmoothingFunction().method1
        self.best_bleu = 0.

    # def on_epoch_end(self, epoch, logs=None):
    #     metrics = self.evaluate(valid_data)  # evaluate the model
    #     if metrics['bleu'] > self.best_bleu:
    #         self.best_bleu = metrics['bleu']
    #         model.save_weights('./best_model.weights')  # save the model
    #     metrics['best_bleu'] = self.best_bleu
    #     print('valid_data:', metrics)

    def evaluate_t(self, data_1, data_2, topk=1):
        total = 0
        rouge_1, rouge_2, rouge_l, bleu = 0, 0, 0, 0

        scores = self.rouge.get_scores(hyps=[data_1], refs=[data_2])
        rouge_1 += scores[0]['rouge-1']['f']
        rouge_2 += scores[0]['rouge-2']['f']
        rouge_l += scores[0]['rouge-l']['f']
        bleu += sentence_bleu(
            references=[data_1.split(' ')],
            hypothesis=data_2.split(' '),
            smoothing_function=self.smooth
        )
        # rouge_1 /= total
        # rouge_2 /= total
        # rouge_l /= total
        # bleu /= total
        return [rouge_1, rouge_2, rouge_l, bleu]


class bertModel:
    def __init__(self):
        self.config_path = '../chinese_roberta_wwm_ext_L-12_H-768_A-12/bert_config.json'
        self.checkpoint_path = '../chinese_roberta_wwm_ext_L-12_H-768_A-12/bert_model.ckpt'
        self.dict_path = '../chinese_roberta_wwm_ext_L-12_H-768_A-12/vocab.txt'
        self.token_dict, self.keep_tokens = load_vocab(
            dict_path=self.dict_path,
            simplified=True,
            startswith=['[PAD]', '[UNK]', '[CLS]', '[SEP]'],
        )
        self.tokenizer = Tokenizer(self.token_dict, do_lower_case=True)
        self.buildmodel()

    def buildmodel(self):
        bert = build_transformer_model(
            config_path=self.config_path,
            checkpoint_path=self.checkpoint_path,
            return_keras_model=False,
        )

        output = Lambda(lambda x: x[:, 0], name='CLS-token')(bert.model.output)
        self.model = keras.models.Model(bert.model.input, output)
        self.model.summary()

    def predict(self, text):
        batch_token_ids, batch_segment_ids = [], []
        token_ids, segment_ids = self.tokenizer.encode(text, maxlen=256)
        batch_token_ids.append(token_ids)
        batch_segment_ids.append(segment_ids)
        return self.model.predict([batch_token_ids, batch_segment_ids])


def simbert(data_1, data_2):
    pass


def word2vec():
    pass


def bleu():
    pass


if __name__ == '__main__':
    file = "../data/train_yy_1.txt"
    model = bertModel()
    eval_class = Evaluator()
    # word2vecmodel = Word2vecModel()
    try:
        with open(file, 'r', encoding="utf-8") as f:
            lines = [x.strip() for x in f if x.strip() != '']
    except:
        with open(file, 'r', encoding="gbk") as f:
            lines = [x.strip() for x in f if x.strip() != '']

    bertsim_list = []
    bleusim_list = []
    word2vecsim_list = []
    data_train_text = []

    random.shuffle(lines)
    print(len(lines))
    for txt in tqdm(lines):
        text = txt.split('\t')
        if len(text) == 3:
            data_1 = text[0]
            data_2 = text[2]
            str_sim_value = difflib.SequenceMatcher(None, data_1, data_2).quick_ratio()
            if len(data_2) - len(data_1) < 0 and len(data_2) / len(data_1) > 0.8:
                num_yu = 1 - len(data_2) / len(data_1)
                str_sim_value = 1 - str_sim_value * num_yu

            if str_sim_value < 0.70:
                data_train_text.append("\t".join([data_1, "to", data_2]))

            # eval_list = eval_class.evaluate_t(' '.join(data_1), ' '.join(data_2))
            # bleusim_list.append(eval_list[3])

            # word2vec
            # seg_0_list = jieba.cut(data_1, cut_all=False)
            # seg_1_list = jieba.cut(data_2, cut_all=False)
            # seg_0_list = [char for char in seg_0_list]
            # seg_1_list = [char for char in seg_1_list]
            #
            # sentence_0_list, sentence_1_list = word2vecmodel.word2vec_res(seg_0_list, seg_1_list)
            # sentence_0_result = np.array(sentence_0_list)
            # sentence_1_result = np.array(sentence_1_list)
            # sentence_0_array = sentence_0_result.sum(axis=0)
            # sentence_1_array = sentence_1_result.sum(axis=0)
            # print(sentence_1_array)
            # print(sentence_0_array)
            # cos_sim = cosine_similarity(sentence_0_array.reshape(1, -1), sentence_1_array.reshape(1, -1))
            # word2vecsim_list.append(cos_sim[0][0])

    # bertsim_list = sorted(bertsim_list)
    # zong_num = len(bertsim_list)
    # print(bertsim_list)
    # print(bertsim_list[int(zong_num/2)])
    # print(sum(bertsim_list)/zong_num)

    # bleusim_list = sorted(bleusim_list)
    # zong_num = len(bleusim_list)
    # print(bleusim_list)
    # print(bleusim_list[int(zong_num / 2)])
    # print(sum(bleusim_list) / zong_num)

    print(len(data_train_text))
    fileName = 'train_yy_1_sim_10.txt'
    with open(fileName, 'w', encoding='utf-8') as file:
        for i in data_train_text:
            file.write(str(i) + '\n')
        file.close()
@@ -0,0 +1,83 @@
# -*- coding: utf-8 -*-

"""
@Time    : 2023/1/31 19:02
@Author  :
@FileName:
@Software:
@Describe:
"""
import os
# os.environ["TF_KERAS"] = "1"
os.environ["CUDA_VISIBLE_DEVICES"] = "0"
import json
import numpy as np
from bert4keras.backend import keras, set_gelu
from bert4keras.tokenizers import Tokenizer, load_vocab
from bert4keras.models import build_transformer_model
from bert4keras.optimizers import Adam, extend_with_piecewise_linear_lr
from bert4keras.snippets import sequence_padding, DataGenerator
from bert4keras.snippets import open
from keras.layers import Lambda, Dense
import tensorflow as tf
from keras.backend import set_session
from sklearn.metrics.pairwise import cosine_similarity
from rouge import Rouge  # pip install rouge
from nltk.translate.bleu_score import sentence_bleu, SmoothingFunction
from tqdm import tqdm
import jieba
from gensim.models import KeyedVectors, word2vec, Word2Vec
import random
import difflib


config = tf.ConfigProto()
config.gpu_options.allow_growth = True
set_session(tf.Session(config=config))  # this part differs


if __name__ == '__main__':
    file = "../data/train_yy.txt"
    # word2vecmodel = Word2vecModel()
    try:
        with open(file, 'r', encoding="utf-8") as f:
            lines = [x.strip() for x in f if x.strip() != '']
    except:
        with open(file, 'r', encoding="gbk") as f:
            lines = [x.strip() for x in f if x.strip() != '']

    bertsim_list = []
    bleusim_list = []
    strsim_list = []
    word2vecsim_list = []
    data_train_text = []

    random.shuffle(lines)
    print(len(lines))
    for txt in tqdm(lines):
        text = txt.split('\t')
        if len(text) == 3:
            data_1 = text[0]
            data_2 = text[2]
            # difflib.SequenceMatcher(None, one_data[0], one_data_simbert[j]).quick_ratio()
            str_sim_value = difflib.SequenceMatcher(None, data_1, data_2).quick_ratio()

            if len(data_2) - len(data_1) < 0:
                num_yu = 1 - len(data_2) / len(data_1)
                str_sim_value = 1 - str_sim_value * num_yu
            strsim_list.append(str_sim_value)

    strsim_list = sorted(strsim_list)
    zong_num = len(strsim_list)
    print(strsim_list)
    print(strsim_list[int(zong_num / 2)])
    print(sum(strsim_list) / zong_num)


    fileName = '统计数据的strsim值.txt'
    with open(fileName, 'w', encoding='utf-8') as file:
        for i in strsim_list:
            file.write(str(i) + '\n')
        file.close()
File diff suppressed because it is too large
@@ -0,0 +1,40 @@
# -*- coding: utf-8 -*-

"""
@Time    : 2022/12/20 16:20
@Author  :
@FileName:
@Software:
@Describe:
"""
import pandas as pd
from tqdm import tqdm
import json


path = "../data/论文_yy_小说.xlsx"
df_list = pd.read_excel(path).values.tolist()

fuhao = {}
for i in tqdm(df_list):
    for word in i:
        word = str(word)
        if word == "nan":
            continue
        for ch in word:
            if u'\u4e00' <= ch <= u'\u9fff':
                continue
            else:
                if ch in fuhao:
                    fuhao[ch] += 1
                else:
                    fuhao[ch] = 1

test_1 = sorted(fuhao.items(), key=lambda x: x[1], reverse=True)
fuhao_new = {}
for i in test_1:
    fuhao_new[i[0]] = i[1]

json_data = json.dumps(fuhao_new, ensure_ascii=False, indent=2)
with open('../data/fuhao.json', 'w', encoding="utf-8") as f_six:
    f_six.write(json_data)
@@ -0,0 +1,77 @@
# -*- coding: utf-8 -*-

"""
@Time    : 2023/2/14 14:29
@Author  :
@FileName:
@Software:
@Describe:
"""
# from zipfile import ZipFile
# from bs4 import BeautifulSoup
#
# document=ZipFile("../data/11篇/13139551_于丰源_在线考试系统-原文.docx")
# xml=document.read("word/document.xml")
# wordObj=BeautifulSoup(xml.decode("utf-8"))
# texts=wordObj.findAll("w:t")
# for text in texts:
#     print(text.text)

import docx
import win32com.client as wc
import operator
# Re-save .doc files as .docx

# path = "E:\pycharm_workspace\drop_weight_rewrite\data\\11篇\\13138572_李菊_谈小学语文教学难点与创新-原文.doc"
# path_new = "E:\pycharm_workspace\drop_weight_rewrite\data\\11篇\\13138572_李菊_谈小学语文教学难点与创新-原文.docx"
# word = wc.Dispatch("Word.Application")
# doc = word.Documents.Open(path)
# # 12 means the converted file is in docx format
# doc.SaveAs(path_new, 12)
# doc.Close
# word.Quit
#
# # Read the converted docx
#
# file = docx.Document(path_new)
# for p in file.paragraphs:
#     print(p.text)

# from win32com import client as wc
# w = wc.Dispatch('Word.Application')
# # Or use the following to start an independent process:
# # w = wc.DispatchEx('Word.Application')
# doc=w.Documents.Open(path)
# doc.SaveAs(path_new,16)  # the argument 16 is required, otherwise it fails


import os
from win32com import client as wc


def save_doc_to_docx(rawpath):  # convert .doc to .docx
    '''
    :param rawpath: path of the folder used for both input and output
    :return: None
    '''
    word = wc.Dispatch("Word.Application")
    # Relative paths do not work here; use absolute paths.
    # Directory that holds the files to convert.
    filenamelist = os.listdir(rawpath)
    for i in os.listdir(rawpath):
        # Pick files that end with .doc and do not start with ~$ (~$ filters out Word temp files)
        if i.endswith('.doc') and not i.startswith('~$'):
            print(i)
            # try
            # Open the file
            doc = word.Documents.Open(rawpath + i)
            # # Split the file name from its extension
            rename = os.path.splitext(i)
            # Save the file as .docx
            doc.SaveAs(pathls + rename[0] + '.docx', 12)  # 12 means docx format
            doc.Close()
    word.Quit()


if __name__ == '__main__':
    pathls = "E:\\pycharm_workspace\\drop_weight_rewrite\\data\\11篇\\"
    save_doc_to_docx(pathls)
@@ -0,0 +1,43 @@
# -*- coding: utf-8 -*-

"""
@Time    : 2022/12/20 17:56
@Author  :
@FileName:
@Software:
@Describe:
"""
import pandas as pd
from tqdm import tqdm
import json


path = "../data/论文_yy_小说_1.xlsx"
df_list = pd.read_excel(path).values.tolist()


def sentence_do(source, result):
    source = str(source)
    result = str(result)
    if source == "nan" or result == "nan":
        return False, source, result
    if len(source) > 120 or len(result) > 120:
        return False, source, result
    else:
        source = str(source).replace("\t", "").replace(" ", "").replace("", "").replace("", "")
        result = str(result).replace("\t", "").replace(" ", "").replace("", "").replace("", "")
        return True, source, result


df_list_new = []
for i in df_list:
    source = i[0]
    result = i[1]
    bool, source, result = sentence_do(source, result)
    if bool == False:
        continue
    else:
        df_list_new.append([source, result])

df = pd.DataFrame(df_list_new, columns=["原文", "yy降重"])
df.to_excel("../data/论文_yy_小说_3.xlsx", index=None)
@@ -0,0 +1,55 @@
Plan 1
1. Handle person-name replacement
2. Use a translation model to produce candidate sentences, then filter them manually
3. Train on the filtered sentences
Training option 1
    a conventional training setup
Training option 2
    train in the simbert style


Translation-model options
1. Register a Baidu account (for its translation API)
2. https://www.oschina.net/news/170812/meta-open-source-wmt-21
3. Train a t5 model


Rewriting corpora
1. The existing corpus: 580,000 pairs
2. The lcqmc corpus


Patterns observed in the 写作猫 rewriting tool
1.
What seems certain is that it is model-based: for the same short clause, adding something after it or not changes the whole output sentence.

伴随着风硕哀嚎的声音,白策笑嘻嘻的离开了。 --> 在冯硕的惨叫声中,白策嘿嘿一笑,转身离去。
伴随着风硕哀嚎的声音 --> 伴随着风硕的惨叫声
2.
Person names get special handling.
3. (Tentative: the few samples tried matched my expectation, but that is not enough to be sure)
It mostly merges sentences, and usually the preceding or following clause modifies the central clause.


Examples where the rewrite produces more short clauses
对方无声哭泣着再次说了一遍。 --> 对方带着哭腔,又重复了一遍。
足足等到叶澜歌换好了衣服 --> 一直到叶澜歌穿好衣服,这才停了下来。
Example where the rewrite merges short clauses
林天宇瞬间就察觉到问题,对着她,轻声问道,“你是不是察觉到什么?” --> 林天宇立刻意识到了不对劲,小声的问了一句,“有没有发现?”


Plan:
1. Possibly replace adjectives first (using word2vec), then run Chinese->English->Chinese round-trip translation (which also tends to fill in pronouns); a sketch of this step follows below.

张三尖叫的喊道
张三尖叫的嚷嚷道 --> 张三大吼一声。 --> 张三咆哮起来。 --> 张三大吼一声。
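A minimal, hypothetical sketch of the adjective-replacement step above, assuming jieba part-of-speech tags and a gensim Word2Vec model such as the word2vec_add_new_18.model loaded elsewhere in this repo; W2V_PATH and topn are placeholder values, and the Chinese->English->Chinese round-trip step is left out because it depends on an external translation service.

# -*- coding: utf-8 -*-
# Sketch only: swap adjectives for their nearest word2vec neighbours.
import jieba.posseg as pseg
from gensim.models import Word2Vec

W2V_PATH = "word2vec_add_new_18.model"  # placeholder path; point it at the real model


def replace_adjectives(sentence, model, topn=3):
    out = []
    for w in pseg.cut(sentence):
        # jieba tags adjectives with POS flags starting with "a"
        if w.flag.startswith("a") and w.word in model.wv:
            candidates = model.wv.most_similar(w.word, topn=topn)
            out.append(candidates[0][0] if candidates else w.word)
        else:
            out.append(w.word)
    return "".join(out)


if __name__ == '__main__':
    w2v = Word2Vec.load(W2V_PATH)
    print(replace_adjectives("伴随着风硕哀嚎的声音,白策笑嘻嘻的离开了。", w2v))

When no adjective is recognized, the sentence comes back unchanged, which is the safe default before handing it to the round-trip translation step.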
@@ -0,0 +1,70 @@
# -*- coding: utf-8 -*-

"""
@Time    : 2023/2/3 17:27
@Author  :
@FileName:
@Software:
@Describe:
"""

import os
# os.environ["TF_KERAS"] = "1"
os.environ["CUDA_VISIBLE_DEVICES"] = "0"
from bert4keras.backend import keras, set_gelu
import numpy as np
from rouge import Rouge  # pip install rouge
from nltk.translate.bleu_score import sentence_bleu, SmoothingFunction


class Evaluator(keras.callbacks.Callback):
    """Evaluate and save the model.
    """

    def __init__(self):
        self.rouge = Rouge()
        self.smooth = SmoothingFunction().method1
        self.best_bleu = 0.

    # def on_epoch_end(self, epoch, logs=None):
    #     metrics = self.evaluate(valid_data)  # evaluate the model
    #     if metrics['bleu'] > self.best_bleu:
    #         self.best_bleu = metrics['bleu']
    #         model.save_weights('./best_model.weights')  # save the model
    #     metrics['best_bleu'] = self.best_bleu
    #     print('valid_data:', metrics)

    def evaluate_t(self, data_1, data_2, topk=1):
        total = 0
        rouge_1, rouge_2, rouge_l, bleu = 0, 0, 0, 0

        scores = self.rouge.get_scores(hyps=[data_1], refs=[data_2])
        rouge_1 += scores[0]['rouge-1']['f']
        rouge_2 += scores[0]['rouge-2']['f']
        rouge_l += scores[0]['rouge-l']['f']
        bleu += sentence_bleu(
            references=[data_1.split(' ')],
            hypothesis=data_2.split(' '),
            smoothing_function=self.smooth
        )
        # rouge_1 /= total
        # rouge_2 /= total
        # rouge_l /= total
        # bleu /= total
        return [rouge_1, rouge_2, rouge_l, bleu]


eval_class = Evaluator()
data_1 = "上海中心大厦"
data_2 = "上海"
eval_list = eval_class.evaluate_t(' '.join(data_1), ' '.join(data_2))
print(eval_list)


# Scratch experiments with a length-based penalty.
a = len(data_2) - len(data_1)
if a < 0:
    a *= -1  # assumed completion of an unfinished line: take the absolute length difference

a = len(data_2) / len(data_1)
np.exp(len(data_2) - len(data_1))