Standard version duplicate-rate reduction (普通版降重)
# -*- coding: utf-8 -*-
"""
@Time : 2023/1/31 19:02
@Author :
@FileName:
@Software:
@Describe:
"""
import os
# os.environ["TF_KERAS"] = "1"
os.environ["CUDA_VISIBLE_DEVICES"] = "0"
import json
import numpy as np
from bert4keras.backend import keras, set_gelu
from bert4keras.tokenizers import Tokenizer, load_vocab
from bert4keras.models import build_transformer_model
from bert4keras.optimizers import Adam, extend_with_piecewise_linear_lr
from bert4keras.snippets import sequence_padding, DataGenerator
from bert4keras.snippets import open
from keras.layers import Lambda, Dense
import tensorflow as tf
from keras.backend import set_session
from sklearn.metrics.pairwise import cosine_similarity
from rouge import Rouge # pip install rouge
from nltk.translate.bleu_score import sentence_bleu, SmoothingFunction
from tqdm import tqdm
import jieba
from gensim.models import KeyedVectors, word2vec, Word2Vec
import random
import difflib
# TF 1.x-style GPU setup: allocate GPU memory on demand instead of grabbing it all up front
config = tf.ConfigProto()
config.gpu_options.allow_growth = True
set_session(tf.Session(config=config))  # note: this part differs from the training script
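
# Note: ConfigProto / Session are TensorFlow 1.x APIs. On TensorFlow 2.x a rough
# equivalent (untested here) would be tf.compat.v1.ConfigProto() and
# tf.compat.v1.Session() with v2 behavior disabled.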

class Word2vecModel:
    def __init__(self):
        # Raw string so the Windows path backslashes are not interpreted as escapes
        self.path = r"E:\pycharm_workspace\查重分析\word2vec_model\word2vec_add_new_18.model"
        self.model = Word2Vec.load(self.path)

    def word2vec_res(self, seg_0_list, seg_1_list):
        # Look up the word vector of every token in the two segmented sentences
        sentence_0_list = [self.model.wv[w] for w in seg_0_list]
        sentence_1_list = [self.model.wv[w] for w in seg_1_list]
        return sentence_0_list, sentence_1_list
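
# A minimal sketch (hypothetical helper, following the commented-out word2vec branch in
# the __main__ block): segment two sentences with jieba, sum the word vectors returned by
# Word2vecModel, and compare the sentence vectors with cosine similarity. Assumes every
# token is in the model's vocabulary; an out-of-vocabulary token would raise a KeyError.
def _word2vec_similarity_demo(w2v_model, data_1, data_2):
    seg_0_list = [w for w in jieba.cut(data_1, cut_all=False)]
    seg_1_list = [w for w in jieba.cut(data_2, cut_all=False)]
    sentence_0_list, sentence_1_list = w2v_model.word2vec_res(seg_0_list, seg_1_list)
    sentence_0_array = np.array(sentence_0_list).sum(axis=0)  # sentence vector = sum of word vectors
    sentence_1_array = np.array(sentence_1_list).sum(axis=0)
    cos_sim = cosine_similarity(sentence_0_array.reshape(1, -1),
                                sentence_1_array.reshape(1, -1))
    return cos_sim[0][0]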

class Evaluator(keras.callbacks.Callback):
    """Evaluation and checkpoint saving (ROUGE / BLEU)."""

    def __init__(self):
        self.rouge = Rouge()
        self.smooth = SmoothingFunction().method1
        self.best_bleu = 0.

    # def on_epoch_end(self, epoch, logs=None):
    #     metrics = self.evaluate(valid_data)  # evaluate the model
    #     if metrics['bleu'] > self.best_bleu:
    #         self.best_bleu = metrics['bleu']
    #         model.save_weights('./best_model.weights')  # save the model
    #     metrics['best_bleu'] = self.best_bleu
    #     print('valid_data:', metrics)

    def evaluate_t(self, data_1, data_2, topk=1):
        # Score a single (reference, hypothesis) pair; both strings must be whitespace-tokenized
        total = 0
        rouge_1, rouge_2, rouge_l, bleu = 0, 0, 0, 0
        scores = self.rouge.get_scores(hyps=[data_1], refs=[data_2])
        rouge_1 += scores[0]['rouge-1']['f']
        rouge_2 += scores[0]['rouge-2']['f']
        rouge_l += scores[0]['rouge-l']['f']
        bleu += sentence_bleu(
            references=[data_1.split(' ')],
            hypothesis=data_2.split(' '),
            smoothing_function=self.smooth
        )
        # rouge_1 /= total
        # rouge_2 /= total
        # rouge_l /= total
        # bleu /= total
        return [rouge_1, rouge_2, rouge_l, bleu]
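
# A minimal usage sketch (hypothetical, mirroring the commented-out call in the __main__
# block): evaluate_t expects whitespace-tokenized strings, so Chinese text is split into
# space-separated characters before scoring.
def _evaluate_t_demo(eval_class, data_1, data_2):
    rouge_1, rouge_2, rouge_l, bleu = eval_class.evaluate_t(' '.join(data_1), ' '.join(data_2))
    return bleu  # the __main__ block only keeps the BLEU score (eval_list[3])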

class bertModel:
    def __init__(self):
        self.config_path = '../chinese_roberta_wwm_ext_L-12_H-768_A-12/bert_config.json'
        self.checkpoint_path = '../chinese_roberta_wwm_ext_L-12_H-768_A-12/bert_model.ckpt'
        self.dict_path = '../chinese_roberta_wwm_ext_L-12_H-768_A-12/vocab.txt'
        # Simplify the vocabulary, keeping only the special tokens listed in `startswith`
        self.token_dict, self.keep_tokens = load_vocab(
            dict_path=self.dict_path,
            simplified=True,
            startswith=['[PAD]', '[UNK]', '[CLS]', '[SEP]'],
        )
        self.tokenizer = Tokenizer(self.token_dict, do_lower_case=True)
        self.buildmodel()

    def buildmodel(self):
        bert = build_transformer_model(
            config_path=self.config_path,
            checkpoint_path=self.checkpoint_path,
            return_keras_model=False,
        )
        # Use the [CLS] position of the last hidden layer as the sentence embedding
        output = Lambda(lambda x: x[:, 0], name='CLS-token')(bert.model.output)
        self.model = keras.models.Model(bert.model.input, output)
        self.model.summary()

    def predict(self, text):
        batch_token_ids, batch_segment_ids = [], []
        token_ids, segment_ids = self.tokenizer.encode(text, maxlen=256)
        batch_token_ids.append(token_ids)
        batch_segment_ids.append(segment_ids)
        return self.model.predict([batch_token_ids, batch_segment_ids])

# Placeholder hooks, left unimplemented in this script
def simbert(data_1, data_2):
    pass

def word2vec():
    pass

def bleu():
    pass
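
# A hedged sketch of what simbert() presumably intends (an assumption, not confirmed by
# the script): encode both sentences with bertModel, take the CLS vectors returned by
# predict(), and compare them with cosine similarity.
def _bert_cls_similarity_demo(bert_model, data_1, data_2):
    vec_1 = bert_model.predict(data_1)  # (1, 768) CLS embedding
    vec_2 = bert_model.predict(data_2)
    return cosine_similarity(vec_1, vec_2)[0][0]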

if __name__ == '__main__':
    file = "../data/train_yy_1.txt"
    model = bertModel()
    eval_class = Evaluator()
    # word2vecmodel = Word2vecModel()

    # The training file may be UTF-8 or GBK encoded; try UTF-8 first
    try:
        with open(file, 'r', encoding="utf-8") as f:
            lines = [x.strip() for x in f if x.strip() != '']
    except UnicodeDecodeError:
        with open(file, 'r', encoding="gbk") as f:
            lines = [x.strip() for x in f if x.strip() != '']

    bertsim_list = []
    bleusim_list = []
    word2vecsim_list = []
    data_train_text = []
    random.shuffle(lines)
    print(len(lines))

    for txt in tqdm(lines):
        # Each line is expected to hold three tab-separated fields: source, separator, target
        text = txt.split('\t')
        if len(text) == 3:
            data_1 = text[0]
            data_2 = text[2]
            # Character-level similarity between the source and the rewritten sentence
            str_sim_value = difflib.SequenceMatcher(None, data_1, data_2).quick_ratio()
            # If the rewrite is shorter than the source but still over 80% of its length,
            # push the similarity score up so such pairs are dropped by the threshold below
            if len(data_2) - len(data_1) < 0 and len(data_2) / len(data_1) > 0.8:
                num_yu = 1 - len(data_2) / len(data_1)
                str_sim_value = 1 - str_sim_value * num_yu
            # Keep only pairs that differ enough from each other
            if str_sim_value < 0.70:
                data_train_text.append("\t".join([data_1, "to", data_2]))
            # eval_list = eval_class.evaluate_t(' '.join(data_1), ' '.join(data_2))
            # bleusim_list.append(eval_list[3])

            # word2vec
            # seg_0_list = jieba.cut(data_1, cut_all=False)
            # seg_1_list = jieba.cut(data_2, cut_all=False)
            # seg_0_list = [char for char in seg_0_list]
            # seg_1_list = [char for char in seg_1_list]
            #
            # sentence_0_list, sentence_1_list = word2vecmodel.word2vec_res(seg_0_list, seg_1_list)
            # sentence_0_result = np.array(sentence_0_list)
            # sentence_1_result = np.array(sentence_1_list)
            # sentence_0_array = sentence_0_result.sum(axis=0)
            # sentence_1_array = sentence_1_result.sum(axis=0)
            # print(sentence_1_array)
            # print(sentence_0_array)
            # cos_sim = cosine_similarity(sentence_0_array.reshape(1, -1), sentence_1_array.reshape(1, -1))
            # word2vecsim_list.append(cos_sim[0][0])

    # bertsim_list = sorted(bertsim_list)
    # zong_num = len(bertsim_list)
    # print(bertsim_list)
    # print(bertsim_list[int(zong_num/2)])
    # print(sum(bertsim_list)/zong_num)

    # bleusim_list = sorted(bleusim_list)
    # zong_num = len(bleusim_list)
    # print(bleusim_list)
    # print(bleusim_list[int(zong_num / 2)])
    # print(sum(bleusim_list) / zong_num)

    print(len(data_train_text))
    fileName = 'train_yy_1_sim_10.txt'
    with open(fileName, 'w', encoding='utf-8') as fout:
        for i in data_train_text:
            fout.write(str(i) + '\n')