
18 changed files with 278312 additions and 0 deletions
@@ -0,0 +1,74 @@
# -*- coding: utf-8 -*-

"""
@Time    : 2022/12/20 10:35
@Author  :
@FileName:
@Software:
@Describe:
"""
import os
from bs4 import BeautifulSoup
import pandas as pd
import re

# Walk the data directory


path_list = []


def walkFile(file):
    for root, dirs, files in os.walk(file):
        # root is the path of the directory currently being visited
        # dirs is the list of sub-directory names inside it
        # files is the list of file names inside it
        # walk the files
        for f in files:
            name = str(f).split("_")[0]
            path_list.append(name)


walkFile("../data/yy_reduce_data")


path_list = list(set(path_list))
print(path_list)


data = []
for i in path_list:

    soup_source = BeautifulSoup(open("../data/yy_reduce_data/{}_source".format(i), encoding='utf-8'),
                                "html.parser")

    soup_result = BeautifulSoup(open("../data/yy_reduce_data/{}_result".format(i), encoding='utf-8'),
                                "html.parser")

    source_sentence_list = soup_source.select('p > em')
    result_sentence_list = soup_result.select('p > em')
    for sentence_index in range(len(source_sentence_list)):
        try:
            if source_sentence_list[sentence_index]["id"] == result_sentence_list[sentence_index]["id"] \
                    and result_sentence_list[sentence_index]["class"] == ['similar']:
                # if source_sentence_list[sentence_index]["id"] == result_sentence_list[sentence_index]["id"]:
                source_text = source_sentence_list[sentence_index].string
                result_text = result_sentence_list[sentence_index].string
                if source_text != None and result_text != None:
                    data.append([source_text, result_text])
        except:
            print(i, sentence_index)

# print(data)


def data_clean(text):
    # Strip characters that are illegal in Excel: uncommon non-printable control
    # characters such as backspace and bell.
    ILLEGAL_CHARACTERS_RE = re.compile(r'[\000-\010]|[\013-\014]|[\016-\037]')
    text = ILLEGAL_CHARACTERS_RE.sub(r'', text)
    return text


df = pd.DataFrame(data, columns=["原文", "yy降重"])
for col in df.columns:
    df[col] = df[col].apply(lambda x: data_clean(x))

df.to_excel("../data/论文_yy_小说.xlsx", index=None)
@@ -0,0 +1,26 @@
# -*- coding: utf-8 -*-

"""
@Time    : 2022/12/23 16:00
@Author  :
@FileName:
@Software:
@Describe:
"""
import pandas as pd


path = "../data/论文_yy_小说_3.xlsx"
df_list = pd.read_excel(path).values.tolist()

df_list_new = []
print(len(df_list))
for i in df_list:
    a = i[0]
    b = i[1]
    df_list_new.append("\t".join([a, "to", b]))

with open("../data/train_yy_1.txt", "w", encoding='utf-8') as file:
    for i in df_list_new:
        file.write(i + '\n')
    file.close()
@@ -0,0 +1,31 @@
# -*- coding: utf-8 -*-

"""
@Time    : 2023/2/2 11:29
@Author  :
@FileName:
@Software:
@Describe:
"""


def read_text(file):
    try:
        with open(file, 'r', encoding="utf-8") as f:
            lines = [x.strip() for x in f if x.strip() != '']
    except:
        with open(file, 'r', encoding="gbk") as f:
            lines = [x.strip() for x in f if x.strip() != '']
    return lines


if __name__ == '__main__':

    data = []
    path_list = ["train_yy_sim_10.txt", "train_yy_1_sim_10.txt"]
    for i in path_list:
        data += read_text(i)
    fileName = '../data/train_yy_sim.txt'
    with open(fileName, 'w', encoding='utf-8') as file:
        for i in data:
            file.write(str(i) + '\n')
        file.close()
@@ -0,0 +1,102 @@
# -*- coding: utf-8 -*-

"""
@Time    : 2023/2/14 14:19
@Author  :
@FileName:
@Software:
@Describe:
"""
import os
import pandas as pd
import docx
import win32com.client as wc
import operator


def is_chinese(uchar):
    """
    Check whether a single unicode character is a Chinese character.
    :param uchar:
    :return:
    """
    if uchar >= u'\u4e00' and uchar <= u'\u9fa5':
        return True
    else:
        return False


def snetence(text):
    bool_ = True
    for i in text:
        bool_1 = is_chinese(i)
        if bool_1 == True:
            continue
        else:
            if i in fuhao:
                continue
            else:
                bool_ = False
                break
    return bool_


fuhao = [",", "。", ",", "、"]
path = '../data/11篇'
path_list = []
for file_name in os.listdir(path):
    path_list.append(file_name)
# print(path_list)


# path = "../data/11篇/13139551_于丰源_在线考试系统-原文.docx"
for docx_name in path_list:
    data = []
    data_new = []
    file_name = docx_name.split(".")[0]
    file_type = docx_name.split(".")[1]
    if file_type == "docx":
        document = docx.Document(path + "/" + docx_name)
    else:
        continue
    # Get all paragraphs
    all_paragraphs = document.paragraphs
    for paragraph in all_paragraphs:
        # Collect the text of each paragraph
        data.append(paragraph.text)
    data = sorted(data, key=lambda x: len(x))
    for data_dan in data:
        if data_dan == "":
            continue
        for i in data_dan:
            if i == "章":
                continue
        if len(data_dan) < 15:
            continue
        # else:
        #     bool_ = snetence(data_dan)
        #     if bool_ == True:
        #         data_new.append(data_dan)
        else:
            data_list = str(data_dan).split("。")
            for data_dan_short in data_list:
                if data_dan_short == "":
                    continue
                for i in data_dan_short:
                    if i == "章":
                        continue
                if len(data_dan_short) < 10:
                    continue
                if len(data_dan_short) > 120:
                    continue
                data_new.append(data_dan_short)
    data_new = sorted(data_new, key=lambda x: len(x))
    data_df = []
    for i in data_new:
        data_df.append([i])

    pd.DataFrame(data_df).to_csv("../data/11篇csv/" + file_name + ".csv", index=False)
@@ -0,0 +1,114 @@
# -*- coding: utf-8 -*-

"""
@Time    : 2023/2/14 14:19
@Author  :
@FileName:
@Software:
@Describe:
"""
import os
import pandas as pd
import docx
import win32com.client as wc
import operator


def is_chinese(uchar):
    """
    Check whether a single unicode character is a Chinese character.
    :param uchar:
    :return:
    """
    if uchar >= u'\u4e00' and uchar <= u'\u9fa5':
        return True
    else:
        return False


def snetence(text):
    bool_ = True
    for i in text:
        bool_1 = is_chinese(i)
        if bool_1 == True:
            continue
        else:
            if i in fuhao:
                continue
            else:
                bool_ = False
                break
    return bool_


fuhao = [",", "。", ",", "、"]
path = '../data/11篇'
path_list = []
for file_name in os.listdir(path):
    path_list.append(file_name)
# print(path_list)


def chulichangju_2(text, chulipangban_return_list):
    fuhao = [",", "?", "!", "…"]
    text_1 = text[:120]
    text_2 = text[120:]
    text_1_new = ""
    for i in range(len(text_1) - 1, -1, -1):
        if text_1[i] in fuhao:
            text_1_new = text_1[:i]
            text_1_new += text_1[i]
            chulipangban_return_list.append(text_1_new)
            if text_2 != "":
                if i + 1 != 120:
                    text_2 = text_1[i + 1:] + text_2
            break
    # else:
    #     chulipangban_return_list.append(text_1)
    if text_1_new == "":
        chulipangban_return_list.append(text_1)
    if text_2 != "":
        chulipangban_return_list = chulichangju_2(text_2, chulipangban_return_list)
    return chulipangban_return_list


# path = "../data/11篇/13139551_于丰源_在线考试系统-原文.docx"
for docx_name in path_list:
    data = []
    data_new = []
    file_name = docx_name.split(".")[0]
    file_type = docx_name.split(".")[1]
    if file_type == "docx":
        document = docx.Document(path + "/" + docx_name)
    else:
        continue
    # Get all paragraphs
    all_paragraphs = document.paragraphs
    for paragraph in all_paragraphs:
        # Collect the text of each paragraph
        data.append(paragraph.text)
    for data_dan in data:
        if data_dan == "":
            continue
        # else:
        #     bool_ = snetence(data_dan)
        #     if bool_ == True:
        #         data_new.append(data_dan)
        else:
            data_list = str(data_dan).split("。")
            for data_dan_short in data_list:
                if data_dan_short == "":
                    continue
                if len(data_dan_short) > 120:
                    dan_list = chulichangju_2(data_dan_short, [])
                    dan_list[-1] += "。"
                    data_new.extend(dan_list)
                else:
                    data_dan_short += "。"
                    data_new.append(data_dan_short)


    with open("../data/11篇txt/{}.txt".format(file_name), "w", encoding='utf-8') as file:
        for i in data_new:
            file.write(i + '\n')
        file.close()
@@ -0,0 +1,87 @@
# -*- coding: utf-8 -*-

"""
@Time    : 2023/2/1 19:18
@Author  :
@FileName:
@Software:
@Describe:
"""

import os
from bs4 import BeautifulSoup
import pandas as pd
import re
from tqdm import tqdm

# Walk the data directory


data_path_list = []


def walkFile(file):
    for root, dirs, files in os.walk(file):
        # root is the path of the directory currently being visited
        # dirs is the list of sub-directory names inside it
        # files is the list of file names inside it
        # walk the files
        for f in files:
            # print(os.path.join(root, f))
            data_path_list.append(os.path.join(root, f))
        # walk every sub-directory
        # for d in dirs:
        #     print(os.path.join(root, d))


def main():
    walkFile("../data/yy_reduce_data_20221219-20230131")


main()


data = []

rootpath_list = []
for i in data_path_list:
    danpath_list = str(i).split("\\")
    rootpath_list.append("\\".join(danpath_list[:-1]))

print(len(rootpath_list))
rootpath_list = list(set(rootpath_list))
for i in tqdm(rootpath_list):
    try:
        soup_source = BeautifulSoup(open("{}\\source".format(i), encoding='utf-8'),
                                    "html.parser")
        soup_result = BeautifulSoup(open("{}\\result".format(i), encoding='utf-8'),
                                    "html.parser")
    except:
        continue

    source_sentence_list = soup_source.select('p > em')
    result_sentence_list = soup_result.select('p > em')
    for sentence_index in range(len(source_sentence_list)):
        try:
            if source_sentence_list[sentence_index]["id"] == result_sentence_list[sentence_index]["id"] \
                    and result_sentence_list[sentence_index]["class"] == ['similar']:
                # if source_sentence_list[sentence_index]["id"] == result_sentence_list[sentence_index]["id"]:
                source_text = source_sentence_list[sentence_index].string
                result_text = result_sentence_list[sentence_index].string
                if source_text != None and result_text != None:
                    data.append([source_text, result_text])
        except:
            pass
            # print(i, sentence_index)

# print(data)


def data_clean(text):
    # Strip characters that are illegal in Excel: uncommon non-printable control
    # characters such as backspace and bell.
    ILLEGAL_CHARACTERS_RE = re.compile(r'[\000-\010]|[\013-\014]|[\016-\037]')
    text = ILLEGAL_CHARACTERS_RE.sub(r'', text)
    return text


df = pd.DataFrame(data, columns=["原文", "yy降重"])
for col in df.columns:
    df[col] = df[col].apply(lambda x: data_clean(x))

df.to_excel("../data/论文_yy_小说_1.xlsx", index=None)
@@ -0,0 +1,33 @@
# -*- coding: utf-8 -*-

"""
@Time    : 2023/2/15 14:13
@Author  :
@FileName:
@Software:
@Describe:
"""
import os
import pandas as pd

path_1 = '../data/11篇excel'
path_2 = "../data/11篇临时拼接"
path_3 = "../data/11篇临时拼接2"
path_list = []
for file_name in os.listdir(path_1):
    path_list.append(file_name)


for file_name in path_list:
    data_new = []
    data_1 = pd.read_excel(path_1 + "/" + file_name).values.tolist()
    data_2 = pd.read_excel(path_2 + "/" + file_name).values.tolist()
    file_name_0 = file_name.split(".")[0]
    file_name_1 = file_name.split(".")[1]
    file_name_ = file_name_0 + "_." + file_name_1
    data_3 = pd.read_excel(path_3 + "/" + file_name_).values.tolist()
    for i in range(len(data_1)):
        data_new.append(data_1[i] + [data_2[i][1]] + [data_3[i][1]])

    df = pd.DataFrame(data_new, columns=["原文", "simbert", "simbert_datasim07", "bertsim_simsim"])
    df.to_excel("../data/11篇测试excel_汇总_1/{}.xlsx".format(file_name_0), index=None)
@@ -0,0 +1,209 @@
# -*- coding: utf-8 -*-

"""
@Time    : 2023/1/31 19:02
@Author  :
@FileName:
@Software:
@Describe:
"""
import os
# os.environ["TF_KERAS"] = "1"
os.environ["CUDA_VISIBLE_DEVICES"] = "0"
import json
import numpy as np
from bert4keras.backend import keras, set_gelu
from bert4keras.tokenizers import Tokenizer, load_vocab
from bert4keras.models import build_transformer_model
from bert4keras.optimizers import Adam, extend_with_piecewise_linear_lr
from bert4keras.snippets import sequence_padding, DataGenerator
from bert4keras.snippets import open
from keras.layers import Lambda, Dense
import tensorflow as tf
from keras.backend import set_session
from sklearn.metrics.pairwise import cosine_similarity
from rouge import Rouge  # pip install rouge
from nltk.translate.bleu_score import sentence_bleu, SmoothingFunction
from tqdm import tqdm
import jieba
from gensim.models import KeyedVectors, word2vec, Word2Vec
import random


config = tf.ConfigProto()
config.gpu_options.allow_growth = True
set_session(tf.Session(config=config))  # this part differs


class Word2vecModel:
    def __init__(self):
        self.path = "E:\\pycharm_workspace\\查重分析\\word2vec_model\\word2vec_add_new_18.model"
        self.model = Word2Vec.load(self.path)

    def word2vec_res(self, seg_0_list, seg_1_list):
        sentence_0_list = []
        sentence_1_list = []
        for i in seg_0_list:
            a = self.model.wv[i]
            sentence_0_list.append(a)

        for i in seg_1_list:
            a = self.model.wv[i]
            sentence_1_list.append(a)

        return sentence_0_list, sentence_1_list


class Evaluator(keras.callbacks.Callback):
    """Evaluate and save the model.
    """

    def __init__(self):
        self.rouge = Rouge()
        self.smooth = SmoothingFunction().method1
        self.best_bleu = 0.

    # def on_epoch_end(self, epoch, logs=None):
    #     metrics = self.evaluate(valid_data)  # evaluate the model
    #     if metrics['bleu'] > self.best_bleu:
    #         self.best_bleu = metrics['bleu']
    #         model.save_weights('./best_model.weights')  # save the model
    #     metrics['best_bleu'] = self.best_bleu
    #     print('valid_data:', metrics)

    def evaluate_t(self, data_1, data_2, topk=1):
        total = 0
        rouge_1, rouge_2, rouge_l, bleu = 0, 0, 0, 0

        scores = self.rouge.get_scores(hyps=[data_1], refs=[data_2])
        rouge_1 += scores[0]['rouge-1']['f']
        rouge_2 += scores[0]['rouge-2']['f']
        rouge_l += scores[0]['rouge-l']['f']
        bleu += sentence_bleu(
            references=[data_1.split(' ')],
            hypothesis=data_2.split(' '),
            smoothing_function=self.smooth
        )
        # rouge_1 /= total
        # rouge_2 /= total
        # rouge_l /= total
        # bleu /= total
        return [rouge_1, rouge_2, rouge_l, bleu]


class bertModel:
    def __init__(self):
        self.config_path = '../chinese_roberta_wwm_ext_L-12_H-768_A-12/bert_config.json'
        self.checkpoint_path = '../chinese_roberta_wwm_ext_L-12_H-768_A-12/bert_model.ckpt'
        self.dict_path = '../chinese_roberta_wwm_ext_L-12_H-768_A-12/vocab.txt'
        self.token_dict, self.keep_tokens = load_vocab(
            dict_path=self.dict_path,
            simplified=True,
            startswith=['[PAD]', '[UNK]', '[CLS]', '[SEP]'],
        )
        self.tokenizer = Tokenizer(self.token_dict, do_lower_case=True)
        self.buildmodel()

    def buildmodel(self):
        bert = build_transformer_model(
            config_path=self.config_path,
            checkpoint_path=self.checkpoint_path,
            return_keras_model=False,
        )

        output = Lambda(lambda x: x[:, 0], name='CLS-token')(bert.model.output)
        self.model = keras.models.Model(bert.model.input, output)
        self.model.summary()

    def predict(self, text):
        batch_token_ids, batch_segment_ids = [], []
        token_ids, segment_ids = self.tokenizer.encode(text, maxlen=256)
        batch_token_ids.append(token_ids)
        batch_segment_ids.append(segment_ids)
        return self.model.predict([batch_token_ids, batch_segment_ids])


def simbert(data_1, data_2):
    pass


def word2vec():
    pass


def bleu():
    pass


if __name__ == '__main__':
    file = "../data/train_yy_1.txt"
    model = bertModel()
    eval_class = Evaluator()
    # word2vecmodel = Word2vecModel()
    try:
        with open(file, 'r', encoding="utf-8") as f:
            lines = [x.strip() for x in f if x.strip() != '']
    except:
        with open(file, 'r', encoding="gbk") as f:
            lines = [x.strip() for x in f if x.strip() != '']

    bertsim_list = []
    bleusim_list = []
    word2vecsim_list = []
    data_train_text = []

    random.shuffle(lines)
    print(len(lines))
    for txt in tqdm(lines):
        text = txt.split('\t')
        if len(text) == 3:
            data_1 = text[0]
            data_2 = text[2]
            y1 = model.predict(data_1)[0]
            y2 = model.predict(data_2)[0]
            cos_sim = cosine_similarity(y1.reshape(1, -1), y2.reshape(1, -1))
            bertsim_list.append(cos_sim[0][0])
            bertsim_value = cos_sim[0][0]

            eval_list = eval_class.evaluate_t(' '.join(data_1), ' '.join(data_2))
            bleusim_list.append(eval_list[3])
            bleusim_value = eval_list[3]

            if bertsim_value <= 0.94 and bleusim_value <= 0.4:
                data_train_text.append("\t".join([data_1, "to", data_2]))

            # eval_list = eval_class.evaluate_t(' '.join(data_1), ' '.join(data_2))
            # bleusim_list.append(eval_list[3])

            # word2vec
            # seg_0_list = jieba.cut(data_1, cut_all=False)
            # seg_1_list = jieba.cut(data_2, cut_all=False)
            # seg_0_list = [char for char in seg_0_list]
            # seg_1_list = [char for char in seg_1_list]
            #
            # sentence_0_list, sentence_1_list = word2vecmodel.word2vec_res(seg_0_list, seg_1_list)
            # sentence_0_result = np.array(sentence_0_list)
            # sentence_1_result = np.array(sentence_1_list)
            # sentence_0_array = sentence_0_result.sum(axis=0)
            # sentence_1_array = sentence_1_result.sum(axis=0)
            # print(sentence_1_array)
            # print(sentence_0_array)
            # cos_sim = cosine_similarity(sentence_0_array.reshape(1, -1), sentence_1_array.reshape(1, -1))
            # word2vecsim_list.append(cos_sim[0][0])

    # bertsim_list = sorted(bertsim_list)
    # zong_num = len(bertsim_list)
    # print(bertsim_list)
    # print(bertsim_list[int(zong_num/2)])
    # print(sum(bertsim_list)/zong_num)

    # bleusim_list = sorted(bleusim_list)
    # zong_num = len(bleusim_list)
    # print(bleusim_list)
    # print(bleusim_list[int(zong_num / 2)])
    # print(sum(bleusim_list) / zong_num)

    fileName = 'train_yy_1_sim_09.txt'
    with open(fileName, 'w', encoding='utf-8') as file:
        for i in data_train_text:
            file.write(str(i) + '\n')
        file.close()
@@ -0,0 +1,209 @@
# -*- coding: utf-8 -*-

"""
@Time    : 2023/1/31 19:02
@Author  :
@FileName:
@Software:
@Describe:
"""
import os
# os.environ["TF_KERAS"] = "1"
os.environ["CUDA_VISIBLE_DEVICES"] = "0"
import json
import numpy as np
from bert4keras.backend import keras, set_gelu
from bert4keras.tokenizers import Tokenizer, load_vocab
from bert4keras.models import build_transformer_model
from bert4keras.optimizers import Adam, extend_with_piecewise_linear_lr
from bert4keras.snippets import sequence_padding, DataGenerator
from bert4keras.snippets import open
from keras.layers import Lambda, Dense
import tensorflow as tf
from keras.backend import set_session
from sklearn.metrics.pairwise import cosine_similarity
from rouge import Rouge  # pip install rouge
from nltk.translate.bleu_score import sentence_bleu, SmoothingFunction
from tqdm import tqdm
import jieba
from gensim.models import KeyedVectors, word2vec, Word2Vec
import random


config = tf.ConfigProto()
config.gpu_options.allow_growth = True
set_session(tf.Session(config=config))  # this part differs


class Word2vecModel:
    def __init__(self):
        self.path = "E:\\pycharm_workspace\\查重分析\\word2vec_model\\word2vec_add_new_18.model"
        self.model = Word2Vec.load(self.path)

    def word2vec_res(self, seg_0_list, seg_1_list):
        sentence_0_list = []
        sentence_1_list = []
        for i in seg_0_list:
            a = self.model.wv[i]
            sentence_0_list.append(a)

        for i in seg_1_list:
            a = self.model.wv[i]
            sentence_1_list.append(a)

        return sentence_0_list, sentence_1_list


class Evaluator(keras.callbacks.Callback):
    """Evaluate and save the model.
    """

    def __init__(self):
        self.rouge = Rouge()
        self.smooth = SmoothingFunction().method1
        self.best_bleu = 0.

    # def on_epoch_end(self, epoch, logs=None):
    #     metrics = self.evaluate(valid_data)  # evaluate the model
    #     if metrics['bleu'] > self.best_bleu:
    #         self.best_bleu = metrics['bleu']
    #         model.save_weights('./best_model.weights')  # save the model
    #     metrics['best_bleu'] = self.best_bleu
    #     print('valid_data:', metrics)

    def evaluate_t(self, data_1, data_2, topk=1):
        total = 0
        rouge_1, rouge_2, rouge_l, bleu = 0, 0, 0, 0

        scores = self.rouge.get_scores(hyps=[data_1], refs=[data_2])
        rouge_1 += scores[0]['rouge-1']['f']
        rouge_2 += scores[0]['rouge-2']['f']
        rouge_l += scores[0]['rouge-l']['f']
        bleu += sentence_bleu(
            references=[data_1.split(' ')],
            hypothesis=data_2.split(' '),
            smoothing_function=self.smooth
        )
        # rouge_1 /= total
        # rouge_2 /= total
        # rouge_l /= total
        # bleu /= total
        return [rouge_1, rouge_2, rouge_l, bleu]


class bertModel:
    def __init__(self):
        self.config_path = '../chinese_roberta_wwm_ext_L-12_H-768_A-12/bert_config.json'
        self.checkpoint_path = '../chinese_roberta_wwm_ext_L-12_H-768_A-12/bert_model.ckpt'
        self.dict_path = '../chinese_roberta_wwm_ext_L-12_H-768_A-12/vocab.txt'
        self.token_dict, self.keep_tokens = load_vocab(
            dict_path=self.dict_path,
            simplified=True,
            startswith=['[PAD]', '[UNK]', '[CLS]', '[SEP]'],
        )
        self.tokenizer = Tokenizer(self.token_dict, do_lower_case=True)
        self.buildmodel()

    def buildmodel(self):
        bert = build_transformer_model(
            config_path=self.config_path,
            checkpoint_path=self.checkpoint_path,
            return_keras_model=False,
        )

        output = Lambda(lambda x: x[:, 0], name='CLS-token')(bert.model.output)
        self.model = keras.models.Model(bert.model.input, output)
        self.model.summary()

    def predict(self, text):
        batch_token_ids, batch_segment_ids = [], []
        token_ids, segment_ids = self.tokenizer.encode(text, maxlen=256)
        batch_token_ids.append(token_ids)
        batch_segment_ids.append(segment_ids)
        return self.model.predict([batch_token_ids, batch_segment_ids])


def simbert(data_1, data_2):
    pass


def word2vec():
    pass


def bleu():
    pass


if __name__ == '__main__':
    file = "../data/train_yy_1.txt"
    model = bertModel()
    eval_class = Evaluator()
    # word2vecmodel = Word2vecModel()
    try:
        with open(file, 'r', encoding="utf-8") as f:
            lines = [x.strip() for x in f if x.strip() != '']
    except:
        with open(file, 'r', encoding="gbk") as f:
            lines = [x.strip() for x in f if x.strip() != '']

    bertsim_list = []
    bleusim_list = []
    word2vecsim_list = []
    data_train_text = []

    random.shuffle(lines)
    print(len(lines))
    for txt in tqdm(lines):
        text = txt.split('\t')
        if len(text) == 3:
            data_1 = text[0]
            data_2 = text[2]
            y1 = model.predict(data_1)[0]
            y2 = model.predict(data_2)[0]
            cos_sim = cosine_similarity(y1.reshape(1, -1), y2.reshape(1, -1))
            bertsim_list.append(cos_sim[0][0])
            bertsim_value = cos_sim[0][0]

            eval_list = eval_class.evaluate_t(' '.join(data_1), ' '.join(data_2))
            bleusim_list.append(eval_list[3])
            bleusim_value = eval_list[3]

            if bertsim_value <= 0.94 and bleusim_value <= 0.4:
                data_train_text.append("\t".join([data_1, "to", data_2]))

            # eval_list = eval_class.evaluate_t(' '.join(data_1), ' '.join(data_2))
            # bleusim_list.append(eval_list[3])

            # word2vec
            # seg_0_list = jieba.cut(data_1, cut_all=False)
            # seg_1_list = jieba.cut(data_2, cut_all=False)
            # seg_0_list = [char for char in seg_0_list]
            # seg_1_list = [char for char in seg_1_list]
            #
            # sentence_0_list, sentence_1_list = word2vecmodel.word2vec_res(seg_0_list, seg_1_list)
            # sentence_0_result = np.array(sentence_0_list)
            # sentence_1_result = np.array(sentence_1_list)
            # sentence_0_array = sentence_0_result.sum(axis=0)
            # sentence_1_array = sentence_1_result.sum(axis=0)
            # print(sentence_1_array)
            # print(sentence_0_array)
            # cos_sim = cosine_similarity(sentence_0_array.reshape(1, -1), sentence_1_array.reshape(1, -1))
            # word2vecsim_list.append(cos_sim[0][0])

    # bertsim_list = sorted(bertsim_list)
    # zong_num = len(bertsim_list)
    # print(bertsim_list)
    # print(bertsim_list[int(zong_num/2)])
    # print(sum(bertsim_list)/zong_num)

    # bleusim_list = sorted(bleusim_list)
    # zong_num = len(bleusim_list)
    # print(bleusim_list)
    # print(bleusim_list[int(zong_num / 2)])
    # print(sum(bleusim_list) / zong_num)

    fileName = 'train_yy_1_sim_09.txt'
    with open(fileName, 'w', encoding='utf-8') as file:
        for i in data_train_text:
            file.write(str(i) + '\n')
        file.close()
@@ -0,0 +1,206 @@
# -*- coding: utf-8 -*-

"""
@Time    : 2023/1/31 19:02
@Author  :
@FileName:
@Software:
@Describe:
"""
import os
# os.environ["TF_KERAS"] = "1"
os.environ["CUDA_VISIBLE_DEVICES"] = "0"
import json
import numpy as np
from bert4keras.backend import keras, set_gelu
from bert4keras.tokenizers import Tokenizer, load_vocab
from bert4keras.models import build_transformer_model
from bert4keras.optimizers import Adam, extend_with_piecewise_linear_lr
from bert4keras.snippets import sequence_padding, DataGenerator
from bert4keras.snippets import open
from keras.layers import Lambda, Dense
import tensorflow as tf
from keras.backend import set_session
from sklearn.metrics.pairwise import cosine_similarity
from rouge import Rouge  # pip install rouge
from nltk.translate.bleu_score import sentence_bleu, SmoothingFunction
from tqdm import tqdm
import jieba
from gensim.models import KeyedVectors, word2vec, Word2Vec
import random
import difflib


config = tf.ConfigProto()
config.gpu_options.allow_growth = True
set_session(tf.Session(config=config))  # this part differs


class Word2vecModel:
    def __init__(self):
        self.path = "E:\\pycharm_workspace\\查重分析\\word2vec_model\\word2vec_add_new_18.model"
        self.model = Word2Vec.load(self.path)

    def word2vec_res(self, seg_0_list, seg_1_list):
        sentence_0_list = []
        sentence_1_list = []
        for i in seg_0_list:
            a = self.model.wv[i]
            sentence_0_list.append(a)

        for i in seg_1_list:
            a = self.model.wv[i]
            sentence_1_list.append(a)

        return sentence_0_list, sentence_1_list


class Evaluator(keras.callbacks.Callback):
    """Evaluate and save the model.
    """

    def __init__(self):
        self.rouge = Rouge()
        self.smooth = SmoothingFunction().method1
        self.best_bleu = 0.

    # def on_epoch_end(self, epoch, logs=None):
    #     metrics = self.evaluate(valid_data)  # evaluate the model
    #     if metrics['bleu'] > self.best_bleu:
    #         self.best_bleu = metrics['bleu']
    #         model.save_weights('./best_model.weights')  # save the model
    #     metrics['best_bleu'] = self.best_bleu
    #     print('valid_data:', metrics)

    def evaluate_t(self, data_1, data_2, topk=1):
        total = 0
        rouge_1, rouge_2, rouge_l, bleu = 0, 0, 0, 0

        scores = self.rouge.get_scores(hyps=[data_1], refs=[data_2])
        rouge_1 += scores[0]['rouge-1']['f']
        rouge_2 += scores[0]['rouge-2']['f']
        rouge_l += scores[0]['rouge-l']['f']
        bleu += sentence_bleu(
            references=[data_1.split(' ')],
            hypothesis=data_2.split(' '),
            smoothing_function=self.smooth
        )
        # rouge_1 /= total
        # rouge_2 /= total
        # rouge_l /= total
        # bleu /= total
        return [rouge_1, rouge_2, rouge_l, bleu]


class bertModel:
    def __init__(self):
        self.config_path = '../chinese_roberta_wwm_ext_L-12_H-768_A-12/bert_config.json'
        self.checkpoint_path = '../chinese_roberta_wwm_ext_L-12_H-768_A-12/bert_model.ckpt'
        self.dict_path = '../chinese_roberta_wwm_ext_L-12_H-768_A-12/vocab.txt'
        self.token_dict, self.keep_tokens = load_vocab(
            dict_path=self.dict_path,
            simplified=True,
            startswith=['[PAD]', '[UNK]', '[CLS]', '[SEP]'],
        )
        self.tokenizer = Tokenizer(self.token_dict, do_lower_case=True)
        self.buildmodel()

    def buildmodel(self):
        bert = build_transformer_model(
            config_path=self.config_path,
            checkpoint_path=self.checkpoint_path,
            return_keras_model=False,
        )

        output = Lambda(lambda x: x[:, 0], name='CLS-token')(bert.model.output)
        self.model = keras.models.Model(bert.model.input, output)
        self.model.summary()

    def predict(self, text):
        batch_token_ids, batch_segment_ids = [], []
        token_ids, segment_ids = self.tokenizer.encode(text, maxlen=256)
        batch_token_ids.append(token_ids)
        batch_segment_ids.append(segment_ids)
        return self.model.predict([batch_token_ids, batch_segment_ids])


def simbert(data_1, data_2):
    pass


def word2vec():
    pass


def bleu():
    pass


if __name__ == '__main__':
    file = "../data/train_yy_1.txt"
    model = bertModel()
    eval_class = Evaluator()
    # word2vecmodel = Word2vecModel()
    try:
        with open(file, 'r', encoding="utf-8") as f:
            lines = [x.strip() for x in f if x.strip() != '']
    except:
        with open(file, 'r', encoding="gbk") as f:
            lines = [x.strip() for x in f if x.strip() != '']

    bertsim_list = []
    bleusim_list = []
    word2vecsim_list = []
    data_train_text = []

    random.shuffle(lines)
    print(len(lines))
    for txt in tqdm(lines):
        text = txt.split('\t')
        if len(text) == 3:
            data_1 = text[0]
            data_2 = text[2]
            str_sim_value = difflib.SequenceMatcher(None, data_1, data_2).quick_ratio()
            if len(data_2) - len(data_1) < 0 and len(data_2) / len(data_1) > 0.8:
                num_yu = 1 - len(data_2) / len(data_1)
                str_sim_value = 1 - str_sim_value * num_yu

            if str_sim_value < 0.70:
                data_train_text.append("\t".join([data_1, "to", data_2]))

            # eval_list = eval_class.evaluate_t(' '.join(data_1), ' '.join(data_2))
            # bleusim_list.append(eval_list[3])

            # word2vec
            # seg_0_list = jieba.cut(data_1, cut_all=False)
            # seg_1_list = jieba.cut(data_2, cut_all=False)
            # seg_0_list = [char for char in seg_0_list]
            # seg_1_list = [char for char in seg_1_list]
            #
            # sentence_0_list, sentence_1_list = word2vecmodel.word2vec_res(seg_0_list, seg_1_list)
            # sentence_0_result = np.array(sentence_0_list)
            # sentence_1_result = np.array(sentence_1_list)
            # sentence_0_array = sentence_0_result.sum(axis=0)
            # sentence_1_array = sentence_1_result.sum(axis=0)
            # print(sentence_1_array)
            # print(sentence_0_array)
            # cos_sim = cosine_similarity(sentence_0_array.reshape(1, -1), sentence_1_array.reshape(1, -1))
            # word2vecsim_list.append(cos_sim[0][0])

    # bertsim_list = sorted(bertsim_list)
    # zong_num = len(bertsim_list)
    # print(bertsim_list)
    # print(bertsim_list[int(zong_num/2)])
    # print(sum(bertsim_list)/zong_num)

    # bleusim_list = sorted(bleusim_list)
    # zong_num = len(bleusim_list)
    # print(bleusim_list)
    # print(bleusim_list[int(zong_num / 2)])
    # print(sum(bleusim_list) / zong_num)

    print(len(data_train_text))
    fileName = 'train_yy_1_sim_10.txt'
    with open(fileName, 'w', encoding='utf-8') as file:
        for i in data_train_text:
            file.write(str(i) + '\n')
        file.close()
@@ -0,0 +1,83 @@
# -*- coding: utf-8 -*-

"""
@Time    : 2023/1/31 19:02
@Author  :
@FileName:
@Software:
@Describe:
"""
import os
# os.environ["TF_KERAS"] = "1"
os.environ["CUDA_VISIBLE_DEVICES"] = "0"
import json
import numpy as np
from bert4keras.backend import keras, set_gelu
from bert4keras.tokenizers import Tokenizer, load_vocab
from bert4keras.models import build_transformer_model
from bert4keras.optimizers import Adam, extend_with_piecewise_linear_lr
from bert4keras.snippets import sequence_padding, DataGenerator
from bert4keras.snippets import open
from keras.layers import Lambda, Dense
import tensorflow as tf
from keras.backend import set_session
from sklearn.metrics.pairwise import cosine_similarity
from rouge import Rouge  # pip install rouge
from nltk.translate.bleu_score import sentence_bleu, SmoothingFunction
from tqdm import tqdm
import jieba
from gensim.models import KeyedVectors, word2vec, Word2Vec
import random
import difflib


config = tf.ConfigProto()
config.gpu_options.allow_growth = True
set_session(tf.Session(config=config))  # this part differs


if __name__ == '__main__':
    file = "../data/train_yy.txt"
    # word2vecmodel = Word2vecModel()
    try:
        with open(file, 'r', encoding="utf-8") as f:
            lines = [x.strip() for x in f if x.strip() != '']
    except:
        with open(file, 'r', encoding="gbk") as f:
            lines = [x.strip() for x in f if x.strip() != '']

    bertsim_list = []
    bleusim_list = []
    strsim_list = []
    word2vecsim_list = []
    data_train_text = []

    random.shuffle(lines)
    print(len(lines))
    for txt in tqdm(lines):
        text = txt.split('\t')
        if len(text) == 3:
            data_1 = text[0]
            data_2 = text[2]
            # difflib.SequenceMatcher(None, one_data[0], one_data_simbert[j]).quick_ratio()
            str_sim_value = difflib.SequenceMatcher(None, data_1, data_2).quick_ratio()

            if len(data_2) - len(data_1) < 0:
                num_yu = 1 - len(data_2) / len(data_1)
                str_sim_value = 1 - str_sim_value * num_yu
            strsim_list.append(str_sim_value)

    strsim_list = sorted(strsim_list)
    zong_num = len(strsim_list)
    print(strsim_list)
    print(strsim_list[int(zong_num / 2)])
    print(sum(strsim_list) / zong_num)


    fileName = '统计数据的strsim值.txt'
    with open(fileName, 'w', encoding='utf-8') as file:
        for i in strsim_list:
            file.write(str(i) + '\n')
        file.close()
File diff suppressed because it is too large
@@ -0,0 +1,40 @@
# -*- coding: utf-8 -*-

"""
@Time    : 2022/12/20 16:20
@Author  :
@FileName:
@Software:
@Describe:
"""
import pandas as pd
from tqdm import tqdm
import json


path = "../data/论文_yy_小说.xlsx"
df_list = pd.read_excel(path).values.tolist()

fuhao = {}
for i in tqdm(df_list):
    for word in i:
        word = str(word)
        if word == "nan":
            continue
        for ch in word:
            if u'\u4e00' <= ch <= u'\u9fff':
                continue
            else:
                if ch in fuhao:
                    fuhao[ch] += 1
                else:
                    fuhao[ch] = 1

test_1 = sorted(fuhao.items(), key=lambda x: x[1], reverse=True)
fuhao_new = {}
for i in test_1:
    fuhao_new[i[0]] = i[1]

json_data = json.dumps(fuhao_new, ensure_ascii=False, indent=2)
with open('../data/fuhao.json', 'w', encoding="utf-8") as f_six:
    f_six.write(json_data)
@@ -0,0 +1,77 @@
# -*- coding: utf-8 -*-

"""
@Time    : 2023/2/14 14:29
@Author  :
@FileName:
@Software:
@Describe:
"""
# from zipfile import ZipFile
# from bs4 import BeautifulSoup
#
# document=ZipFile("../data/11篇/13139551_于丰源_在线考试系统-原文.docx")
# xml=document.read("word/document.xml")
# wordObj=BeautifulSoup(xml.decode("utf-8"))
# texts=wordObj.findAll("w:t")
# for text in texts:
#     print(text.text)

import docx
import win32com.client as wc
import operator
# Re-save .doc files as .docx

# path = "E:\pycharm_workspace\drop_weight_rewrite\data\\11篇\\13138572_李菊_谈小学语文教学难点与创新-原文.doc"
# path_new = "E:\pycharm_workspace\drop_weight_rewrite\data\\11篇\\13138572_李菊_谈小学语文教学难点与创新-原文.docx"
# word = wc.Dispatch("Word.Application")
# doc = word.Documents.Open(path)
# # 12 means the converted file is in docx format
# doc.SaveAs(path_new, 12)
# doc.Close
# word.Quit
#
# # Read the converted docx
#
# file = docx.Document(path_new)
# for p in file.paragraphs:
#     print(p.text)

# from win32com import client as wc
# w = wc.Dispatch('Word.Application')
# # Or use the following to start an independent process:
# # w = wc.DispatchEx('Word.Application')
# doc=w.Documents.Open(path)
# doc.SaveAs(path_new,16)  # the argument 16 is required, otherwise it fails


import os
from win32com import client as wc


def save_doc_to_docx(rawpath):  # convert .doc to .docx
    '''
    :param rawpath: path of the folder used for both input and output
    :return: None
    '''
    word = wc.Dispatch("Word.Application")
    # Relative paths do not work here; use absolute paths.
    # Directory that holds the files to convert.
    filenamelist = os.listdir(rawpath)
    for i in os.listdir(rawpath):
        # Pick files that end with .doc and do not start with ~$ (~$ filters out Word temp files)
        if i.endswith('.doc') and not i.startswith('~$'):
            print(i)
            # try
            # Open the file
            doc = word.Documents.Open(rawpath + i)
            # # Split the file name from its extension
            rename = os.path.splitext(i)
            # Save the file as .docx
            doc.SaveAs(pathls + rename[0] + '.docx', 12)  # 12 means docx format
            doc.Close()
    word.Quit()


if __name__ == '__main__':
    pathls = "E:\\pycharm_workspace\\drop_weight_rewrite\\data\\11篇\\"
    save_doc_to_docx(pathls)
@@ -0,0 +1,43 @@
# -*- coding: utf-8 -*-

"""
@Time    : 2022/12/20 17:56
@Author  :
@FileName:
@Software:
@Describe:
"""
import pandas as pd
from tqdm import tqdm
import json


path = "../data/论文_yy_小说_1.xlsx"
df_list = pd.read_excel(path).values.tolist()


def sentence_do(source, result):
    source = str(source)
    result = str(result)
    if source == "nan" or result == "nan":
        return False, source, result
    if len(source) > 120 or len(result) > 120:
        return False, source, result
    else:
        source = str(source).replace("\t", "").replace(" ", "").replace("", "").replace("", "")
        result = str(result).replace("\t", "").replace(" ", "").replace("", "").replace("", "")
        return True, source, result


df_list_new = []
for i in df_list:
    source = i[0]
    result = i[1]
    bool, source, result = sentence_do(source, result)
    if bool == False:
        continue
    else:
        df_list_new.append([source, result])

df = pd.DataFrame(df_list_new, columns=["原文", "yy降重"])
df.to_excel("../data/论文_yy_小说_3.xlsx", index=None)
@@ -0,0 +1,55 @@
Plan 1
1. Handle person-name replacement
2. Use a translation model to produce candidate sentences, then filter them manually
3. Train on the filtered sentences
Training option 1
    a conventional training setup
Training option 2
    train in the simbert style


Translation-model options
1. Register a Baidu account (for its translation API)
2. https://www.oschina.net/news/170812/meta-open-source-wmt-21
3. Train a t5 model


Rewriting corpora
1. The existing corpus: 580,000 pairs
2. The lcqmc corpus


Patterns observed in the 写作猫 rewriting tool
1.
What seems certain is that it is model-based: for the same short clause, adding something after it or not changes the whole output sentence.

伴随着风硕哀嚎的声音,白策笑嘻嘻的离开了。 --> 在冯硕的惨叫声中,白策嘿嘿一笑,转身离去。
伴随着风硕哀嚎的声音 --> 伴随着风硕的惨叫声
2.
Person names get special handling.
3. (Tentative: the few samples tried matched my expectation, but that is not enough to be sure)
It mostly merges sentences, and usually the preceding or following clause modifies the central clause.


Examples where the rewrite produces more short clauses
对方无声哭泣着再次说了一遍。 --> 对方带着哭腔,又重复了一遍。
足足等到叶澜歌换好了衣服 --> 一直到叶澜歌穿好衣服,这才停了下来。
Example where the rewrite merges short clauses
林天宇瞬间就察觉到问题,对着她,轻声问道,“你是不是察觉到什么?” --> 林天宇立刻意识到了不对劲,小声的问了一句,“有没有发现?”


Plan:
1. Possibly replace adjectives first (using word2vec), then run Chinese->English->Chinese round-trip translation (which also tends to fill in pronouns); a sketch of this step follows below.

张三尖叫的喊道
张三尖叫的嚷嚷道 --> 张三大吼一声。 --> 张三咆哮起来。 --> 张三大吼一声。
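A minimal, hypothetical sketch of the adjective-replacement step above, assuming jieba part-of-speech tags and a gensim Word2Vec model such as the word2vec_add_new_18.model loaded elsewhere in this repo; W2V_PATH and topn are placeholder values, and the Chinese->English->Chinese round-trip step is left out because it depends on an external translation service.

# -*- coding: utf-8 -*-
# Sketch only: swap adjectives for their nearest word2vec neighbours.
import jieba.posseg as pseg
from gensim.models import Word2Vec

W2V_PATH = "word2vec_add_new_18.model"  # placeholder path; point it at the real model


def replace_adjectives(sentence, model, topn=3):
    out = []
    for w in pseg.cut(sentence):
        # jieba tags adjectives with POS flags starting with "a"
        if w.flag.startswith("a") and w.word in model.wv:
            candidates = model.wv.most_similar(w.word, topn=topn)
            out.append(candidates[0][0] if candidates else w.word)
        else:
            out.append(w.word)
    return "".join(out)


if __name__ == '__main__':
    w2v = Word2Vec.load(W2V_PATH)
    print(replace_adjectives("伴随着风硕哀嚎的声音,白策笑嘻嘻的离开了。", w2v))

When no adjective is recognized, the sentence comes back unchanged, which is the safe default before handing it to the round-trip translation step.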
@@ -0,0 +1,70 @@
# -*- coding: utf-8 -*-

"""
@Time    : 2023/2/3 17:27
@Author  :
@FileName:
@Software:
@Describe:
"""

import os
# os.environ["TF_KERAS"] = "1"
os.environ["CUDA_VISIBLE_DEVICES"] = "0"
from bert4keras.backend import keras, set_gelu
import numpy as np
from rouge import Rouge  # pip install rouge
from nltk.translate.bleu_score import sentence_bleu, SmoothingFunction


class Evaluator(keras.callbacks.Callback):
    """Evaluate and save the model.
    """

    def __init__(self):
        self.rouge = Rouge()
        self.smooth = SmoothingFunction().method1
        self.best_bleu = 0.

    # def on_epoch_end(self, epoch, logs=None):
    #     metrics = self.evaluate(valid_data)  # evaluate the model
    #     if metrics['bleu'] > self.best_bleu:
    #         self.best_bleu = metrics['bleu']
    #         model.save_weights('./best_model.weights')  # save the model
    #     metrics['best_bleu'] = self.best_bleu
    #     print('valid_data:', metrics)

    def evaluate_t(self, data_1, data_2, topk=1):
        total = 0
        rouge_1, rouge_2, rouge_l, bleu = 0, 0, 0, 0

        scores = self.rouge.get_scores(hyps=[data_1], refs=[data_2])
        rouge_1 += scores[0]['rouge-1']['f']
        rouge_2 += scores[0]['rouge-2']['f']
        rouge_l += scores[0]['rouge-l']['f']
        bleu += sentence_bleu(
            references=[data_1.split(' ')],
            hypothesis=data_2.split(' '),
            smoothing_function=self.smooth
        )
        # rouge_1 /= total
        # rouge_2 /= total
        # rouge_l /= total
        # bleu /= total
        return [rouge_1, rouge_2, rouge_l, bleu]


eval_class = Evaluator()
data_1 = "上海中心大厦"
data_2 = "上海"
eval_list = eval_class.evaluate_t(' '.join(data_1), ' '.join(data_2))
print(eval_list)


# Scratch experiments with a length-based penalty.
a = len(data_2) - len(data_1)
if a < 0:
    a *= -1  # assumed completion of an unfinished line: take the absolute length difference

a = len(data_2) / len(data_1)
np.exp(len(data_2) - len(data_1))