# -*- coding: utf-8 -*-
"""
@Time : 2023/1/31 19:02
@Author :
@FileName:
@Software:
@Describe:
"""
import os

# os.environ["TF_KERAS"] = "1"
# Must be set before TensorFlow is imported so the device mask takes effect.
os.environ["CUDA_VISIBLE_DEVICES"] = "0"
import json
import numpy as np
from bert4keras.backend import keras, set_gelu
from bert4keras.tokenizers import Tokenizer, load_vocab
from bert4keras.models import build_transformer_model
from bert4keras.optimizers import Adam, extend_with_piecewise_linear_lr
from bert4keras.snippets import sequence_padding, DataGenerator
from bert4keras.snippets import open
from keras.layers import Lambda, Dense
import tensorflow as tf
from keras.backend import set_session
from sklearn.metrics.pairwise import cosine_similarity
from rouge import Rouge  # pip install rouge
from nltk.translate.bleu_score import sentence_bleu, SmoothingFunction
from tqdm import tqdm
import jieba
from gensim.models import KeyedVectors, word2vec, Word2Vec
import random
import difflib

# Let TensorFlow grow GPU memory on demand instead of reserving it all upfront.
config = tf.ConfigProto()
config.gpu_options.allow_growth = True
set_session(tf.Session(config=config))  # original note: "this part differs"

# Tab-separated input; only rows with exactly three fields are scored.
INPUT_PATH = "../data/train_yy.txt"
# One similarity score per line, in ascending order.
OUTPUT_PATH = '统计数据的strsim值.txt'


def _load_stripped_lines(path, encoding):
    """Read ``path`` with ``encoding`` and return its stripped, non-blank lines."""
    with open(path, 'r', encoding=encoding) as f:
        stripped = (raw.strip() for raw in f)
        return [line for line in stripped if line != '']


def _read_nonblank_lines(path):
    """Return the stripped, non-blank lines of ``path``.

    The file is decoded as UTF-8 first and re-read as GBK only when UTF-8
    decoding fails. Any other error (missing file, permissions, ...)
    propagates unchanged instead of being silently retried.
    """
    try:
        return _load_stripped_lines(path, "utf-8")
    except UnicodeDecodeError:
        return _load_stripped_lines(path, "gbk")


def _length_adjusted_similarity(data_1, data_2):
    """Return the string-similarity score used for one (data_1, data_2) pair.

    The base score is ``difflib.SequenceMatcher(...).quick_ratio()``. When
    ``data_2`` is shorter than ``data_1`` the score is replaced by
    ``1 - ratio * (1 - len(data_2) / len(data_1))``.
    """
    ratio = difflib.SequenceMatcher(None, data_1, data_2).quick_ratio()
    # data_1 is necessarily non-empty in this branch, so the division is safe.
    if len(data_2) < len(data_1):
        shortfall = 1 - len(data_2) / len(data_1)
        # NOTE(review): this turns a high ratio into a lower score for
        # shortened text; formula kept exactly as written -- confirm intent.
        ratio = 1 - ratio * shortfall
    return ratio


def main():
    """Score every 3-field row of INPUT_PATH and dump the sorted scores.

    Prints the number of non-blank lines, the sorted score list, the
    upper-middle element and the mean, then writes one score per line to
    OUTPUT_PATH.

    Raises:
        SystemExit: if no row has exactly three tab-separated fields, since
            the middle element and mean are undefined for an empty list.
    """
    lines = _read_nonblank_lines(INPUT_PATH)
    # Kept from the original script; the scores are sorted afterwards, so the
    # processing order does not influence the reported statistics.
    random.shuffle(lines)
    print(len(lines))

    strsim_list = []
    for txt in tqdm(lines):
        fields = txt.split('\t')
        if len(fields) != 3:
            continue
        # The first and third columns are the two texts being compared.
        strsim_list.append(_length_adjusted_similarity(fields[0], fields[2]))

    strsim_list.sort()
    zong_num = len(strsim_list)
    print(strsim_list)
    if zong_num == 0:
        raise SystemExit(
            "no rows with exactly 3 tab-separated fields in %s" % INPUT_PATH
        )

    print(strsim_list[zong_num // 2])
    print(sum(strsim_list) / zong_num)

    # The context manager closes the file; no explicit close() is needed.
    with open(OUTPUT_PATH, 'w', encoding='utf-8') as out:
        for score in strsim_list:
            out.write(str(score) + '\n')


if __name__ == '__main__':
    main()