You can not select more than 25 topics
Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.
83 lines
2.4 KiB
83 lines
2.4 KiB
![]()
2 years ago
|
# -*- coding: utf-8 -*-
|
||
|
|
||
|
"""
|
||
|
@Time : 2023/1/31 19:02
|
||
|
@Author :
|
||
|
@FileName:
|
||
|
@Software:
|
||
|
@Describe:
|
||
|
"""
|
||
|
import os
|
||
|
# os.environ["TF_KERAS"] = "1"
|
||
|
os.environ["CUDA_VISIBLE_DEVICES"] = "0"
|
||
|
import json
|
||
|
import numpy as np
|
||
|
from bert4keras.backend import keras, set_gelu
|
||
|
from bert4keras.tokenizers import Tokenizer, load_vocab
|
||
|
from bert4keras.models import build_transformer_model
|
||
|
from bert4keras.optimizers import Adam, extend_with_piecewise_linear_lr
|
||
|
from bert4keras.snippets import sequence_padding, DataGenerator
|
||
|
from bert4keras.snippets import open
|
||
|
from keras.layers import Lambda, Dense
|
||
|
import tensorflow as tf
|
||
|
from keras.backend import set_session
|
||
|
from sklearn.metrics.pairwise import cosine_similarity
|
||
|
from rouge import Rouge # pip install rouge
|
||
|
from nltk.translate.bleu_score import sentence_bleu, SmoothingFunction
|
||
|
from tqdm import tqdm
|
||
|
import jieba
|
||
|
from gensim.models import KeyedVectors, word2vec, Word2Vec
|
||
|
import random
|
||
|
import difflib
|
||
|
|
||
|
# Configure the TensorFlow session handed to Keras so that GPU memory is
# allocated on demand (allow_growth) rather than all at once.
config = tf.ConfigProto(gpu_options=tf.GPUOptions(allow_growth=True))
set_session(tf.Session(config=config))
|
||
|
|
||
|
|
||
|
|
||
|
|
||
|
def read_nonempty_lines(path):
    """Return the stripped, non-empty lines of the text file at *path*.

    The file is decoded as UTF-8 first. If that raises a decoding error the
    file is read again as GBK. Any other error (for example a missing file)
    propagates to the caller.
    """
    try:
        with open(path, 'r', encoding="utf-8") as f:
            stripped = [x.strip() for x in f]
    except UnicodeDecodeError:
        # Only a decoding failure justifies the GBK retry; a bare ``except``
        # would also swallow KeyboardInterrupt and unrelated errors.
        with open(path, 'r', encoding="gbk") as f:
            stripped = [x.strip() for x in f]
    return [x for x in stripped if x != '']


def length_penalized_similarity(data_1, data_2):
    """Return the string-similarity score used for the statistics.

    The base score is ``difflib.SequenceMatcher.quick_ratio()`` of the two
    strings. When *data_2* is shorter than *data_1* the score is replaced by
    ``1 - ratio * (1 - len(data_2) / len(data_1))``.

    NOTE(review): the shortened-text branch is kept exactly as the original
    computed it; confirm that this adjustment is the intended metric.
    """
    score = difflib.SequenceMatcher(None, data_1, data_2).quick_ratio()
    # len(data_2) < len(data_1) guarantees len(data_1) >= 1, so the
    # division below cannot hit zero.
    if len(data_2) < len(data_1):
        num_yu = 1 - len(data_2) / len(data_1)
        score = 1 - score * num_yu
    return score


if __name__ == '__main__':
    input_path = "../data/train_yy.txt"
    lines = read_nonempty_lines(input_path)

    random.shuffle(lines)
    print(len(lines))

    # Only rows with exactly three tab-separated columns are scored; the
    # first and third columns are the two texts being compared.
    strsim_list = []
    for txt in tqdm(lines):
        text = txt.split('\t')
        if len(text) != 3:
            continue
        strsim_list.append(length_penalized_similarity(text[0], text[2]))

    strsim_list.sort()
    zong_num = len(strsim_list)
    if zong_num == 0:
        # Without this guard the median/mean below fail with an opaque
        # IndexError / ZeroDivisionError.
        raise SystemExit(
            "no rows with exactly three tab-separated columns in " + input_path
        )

    print(strsim_list)
    print(strsim_list[zong_num // 2])  # upper median of the sorted scores
    print(sum(strsim_list) / zong_num)  # mean score

    fileName = '统计数据的strsim值.txt'
    # The ``with`` block closes the file; no explicit close() is needed.
    with open(fileName, 'w', encoding='utf-8') as out:
        for i in strsim_list:
            out.write(str(i) + '\n')
|