
18 changed files with 278312 additions and 0 deletions
@@ -0,0 +1,74 @@
# -*- coding: utf-8 -*-

"""
@Time    : 2022/12/20 10:35
@Author  :
@FileName:
@Software:
@Describe:
"""
import os
from bs4 import BeautifulSoup
import pandas as pd
import re

# Walk the data directory and collect the id prefix of every file
# (file names look like "<id>_source" / "<id>_result").
path_list = []


def walkFile(file):
    for root, dirs, files in os.walk(file):
        # root: the directory currently being visited
        # dirs: list of its sub-directories
        # files: list of its files
        for f in files:
            name = str(f).split("_")[0]
            path_list.append(name)


walkFile("../data/yy_reduce_data")

path_list = list(set(path_list))
print(path_list)

# For every id, parse the original ("source") and rewritten ("result") HTML and
# keep sentence pairs whose <em> ids match and whose result is tagged "similar".
data = []
for i in path_list:
    soup_source = BeautifulSoup(open("../data/yy_reduce_data/{}_source".format(i), encoding='utf-8'),
                                "html.parser")
    soup_result = BeautifulSoup(open("../data/yy_reduce_data/{}_result".format(i), encoding='utf-8'),
                                "html.parser")

    source_sentence_list = soup_source.select('p > em')
    result_sentence_list = soup_result.select('p > em')
    for sentence_index in range(len(source_sentence_list)):
        try:
            if source_sentence_list[sentence_index]["id"] == result_sentence_list[sentence_index]["id"] \
                    and result_sentence_list[sentence_index]["class"] == ['similar']:
                # if source_sentence_list[sentence_index]["id"] == result_sentence_list[sentence_index]["id"]:
                source_text = source_sentence_list[sentence_index].string
                result_text = result_sentence_list[sentence_index].string
                if source_text is not None and result_text is not None:
                    data.append([source_text, result_text])
        except Exception:
            print(i, sentence_index)

# print(data)


def data_clean(text):
    # Strip characters that are illegal in Excel cells (rare non-printable
    # control characters such as backspace or bell).
    ILLEGAL_CHARACTERS_RE = re.compile(r'[\000-\010]|[\013-\014]|[\016-\037]')
    text = ILLEGAL_CHARACTERS_RE.sub(r'', text)
    return text


df = pd.DataFrame(data, columns=["原文", "yy降重"])
for col in df.columns:
    df[col] = df[col].apply(lambda x: data_clean(x))

df.to_excel("../data/论文_yy_小说.xlsx", index=None)
@@ -0,0 +1,26 @@
# -*- coding: utf-8 -*-

"""
@Time    : 2022/12/23 16:00
@Author  :
@FileName:
@Software:
@Describe:
"""
import pandas as pd


path = "../data/论文_yy_小说_3.xlsx"
df_list = pd.read_excel(path).values.tolist()

# Turn each [original, rewritten] pair into one tab-separated training line:
# "<original>\tto\t<rewritten>".
df_list_new = []
print(len(df_list))
for i in df_list:
    a = i[0]
    b = i[1]
    df_list_new.append("\t".join([a, "to", b]))

with open("../data/train_yy_1.txt", "w", encoding='utf-8') as file:
    for i in df_list_new:
        file.write(i + '\n')
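The filtering scripts later in this diff split these lines on tabs and expect exactly three fields; a minimal illustration with placeholder sentences:

# Placeholder illustration of the "<source>\tto\t<target>" line format written above.
line = "这是原句\tto\t这是改写后的句子"
parts = line.split("\t")
assert len(parts) == 3      # [source, "to", target], as the downstream scripts expect
source, _, target = parts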
@@ -0,0 +1,31 @@
# -*- coding: utf-8 -*-

"""
@Time    : 2023/2/2 11:29
@Author  :
@FileName:
@Software:
@Describe:
"""


def read_text(file):
    # Read a text file, falling back to GBK if it is not valid UTF-8.
    try:
        with open(file, 'r', encoding="utf-8") as f:
            lines = [x.strip() for x in f if x.strip() != '']
    except UnicodeDecodeError:
        with open(file, 'r', encoding="gbk") as f:
            lines = [x.strip() for x in f if x.strip() != '']
    return lines


if __name__ == '__main__':
    # Concatenate the two filtered training files into one.
    data = []
    path_list = ["train_yy_sim_10.txt", "train_yy_1_sim_10.txt"]
    for i in path_list:
        data += read_text(i)
    fileName = '../data/train_yy_sim.txt'
    with open(fileName, 'w', encoding='utf-8') as file:
        for i in data:
            file.write(str(i) + '\n')
@@ -0,0 +1,102 @@
# -*- coding: utf-8 -*-

"""
@Time    : 2023/2/14 14:19
@Author  :
@FileName:
@Software:
@Describe:
"""
import os
import pandas as pd
import docx


def is_chinese(uchar):
    """
    Return True if the unicode character is a CJK ideograph.
    :param uchar:
    :return:
    """
    return u'\u4e00' <= uchar <= u'\u9fa5'


def snetence(text):
    # Return True if the text consists only of Chinese characters and the
    # punctuation marks listed in `fuhao`.
    bool_ = True
    for i in text:
        bool_1 = is_chinese(i)
        if bool_1 == True:
            continue
        else:
            if i in fuhao:
                continue
            else:
                bool_ = False
                break
    return bool_


fuhao = [",", "。", ",", "、"]
path = '../data/11篇'
path_list = []
for file_name in os.listdir(path):
    path_list.append(file_name)
# print(path_list)


# path = "../data/11篇/13139551_于丰源_在线考试系统-原文.docx"
for docx_name in path_list:
    data = []
    data_new = []
    file_name = docx_name.split(".")[0]
    file_type = docx_name.split(".")[1]
    if file_type == "docx":
        document = docx.Document(path + "/" + docx_name)
    else:
        continue
    # Collect the text of every paragraph in the document.
    all_paragraphs = document.paragraphs
    for paragraph in all_paragraphs:
        data.append(paragraph.text)
    data = sorted(data, key=lambda x: len(x))
    for data_dan in data:
        if data_dan == "":
            continue
        # Skip paragraphs that contain "章" (chapter headings) and very short paragraphs.
        if "章" in data_dan:
            continue
        if len(data_dan) < 15:
            continue
        # else:
        #     bool_ = snetence(data_dan)
        #     if bool_ == True:
        #         data_new.append(data_dan)
        else:
            # Split the paragraph into sentences on "。" and keep sentences of
            # moderate length (10 to 120 characters).
            data_list = str(data_dan).split("。")
            for data_dan_short in data_list:
                if data_dan_short == "":
                    continue
                if "章" in data_dan_short:
                    continue
                if len(data_dan_short) < 10:
                    continue
                if len(data_dan_short) > 120:
                    continue
                data_new.append(data_dan_short)
    data_new = sorted(data_new, key=lambda x: len(x))
    data_df = []
    for i in data_new:
        data_df.append([i])

    pd.DataFrame(data_df).to_csv("../data/11篇csv/" + file_name + ".csv", index=False)
@@ -0,0 +1,114 @@
# -*- coding: utf-8 -*-

"""
@Time    : 2023/2/14 14:19
@Author  :
@FileName:
@Software:
@Describe:
"""
import os
import pandas as pd
import docx


def is_chinese(uchar):
    """
    Return True if the unicode character is a CJK ideograph.
    :param uchar:
    :return:
    """
    return u'\u4e00' <= uchar <= u'\u9fa5'


def snetence(text):
    # Return True if the text consists only of Chinese characters and the
    # punctuation marks listed in `fuhao`.
    bool_ = True
    for i in text:
        bool_1 = is_chinese(i)
        if bool_1 == True:
            continue
        else:
            if i in fuhao:
                continue
            else:
                bool_ = False
                break
    return bool_


fuhao = [",", "。", ",", "、"]
path = '../data/11篇'
path_list = []
for file_name in os.listdir(path):
    path_list.append(file_name)
# print(path_list)


def chulichangju_2(text, chulipangban_return_list):
    # Recursively split an over-long sentence into chunks of at most 120
    # characters, cutting at the last minor punctuation mark before the
    # 120-character boundary and carrying the remainder into the next chunk.
    fuhao = [",", "?", "!", "…"]
    text_1 = text[:120]
    text_2 = text[120:]
    text_1_new = ""
    for i in range(len(text_1) - 1, -1, -1):
        if text_1[i] in fuhao:
            text_1_new = text_1[:i]
            text_1_new += text_1[i]
            chulipangban_return_list.append(text_1_new)
            if text_2 != "":
                if i + 1 != 120:
                    text_2 = text_1[i + 1:] + text_2
            break
    # else:
    #     chulipangban_return_list.append(text_1)
    if text_1_new == "":
        # No punctuation found: keep the 120-character slice as one chunk.
        chulipangban_return_list.append(text_1)
    if text_2 != "":
        chulipangban_return_list = chulichangju_2(text_2, chulipangban_return_list)
    return chulipangban_return_list


# path = "../data/11篇/13139551_于丰源_在线考试系统-原文.docx"
for docx_name in path_list:
    data = []
    data_new = []
    file_name = docx_name.split(".")[0]
    file_type = docx_name.split(".")[1]
    if file_type == "docx":
        document = docx.Document(path + "/" + docx_name)
    else:
        continue
    # Collect the text of every paragraph in the document.
    all_paragraphs = document.paragraphs
    for paragraph in all_paragraphs:
        data.append(paragraph.text)
    for data_dan in data:
        if data_dan == "":
            continue
        # else:
        #     bool_ = snetence(data_dan)
        #     if bool_ == True:
        #         data_new.append(data_dan)
        else:
            # Split the paragraph into sentences on "。"; over-long sentences
            # are further split with chulichangju_2.
            data_list = str(data_dan).split("。")
            for data_dan_short in data_list:
                if data_dan_short == "":
                    continue
                if len(data_dan_short) > 120:
                    dan_list = chulichangju_2(data_dan_short, [])
                    dan_list[-1] += "。"
                    data_new.extend(dan_list)
                else:
                    data_dan_short += "。"
                    data_new.append(data_dan_short)

    with open("../data/11篇txt/{}.txt".format(file_name), "w", encoding='utf-8') as file:
        for i in data_new:
            file.write(i + '\n')
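A quick, hypothetical illustration of how chulichangju_2 splits an over-long sentence (the sentence is a synthetic placeholder; real chunk boundaries depend on where the last listed punctuation mark before each 120-character limit falls):

# Synthetic example: 161 characters with a single full-width comma at position 81.
long_sentence = "第一部分" * 20 + "," + "第二部分" * 20
chunks = chulichangju_2(long_sentence, [])
for c in chunks:
    print(len(c), c[:10])   # every chunk is at most 120 characters; the first ends at the comma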
@@ -0,0 +1,87 @@
# -*- coding: utf-8 -*-

"""
@Time    : 2023/2/1 19:18
@Author  :
@FileName:
@Software:
@Describe:
"""

import os
from bs4 import BeautifulSoup
import pandas as pd
import re
from tqdm import tqdm

# Walk the dump directory and collect every file path; each record lives in its
# own sub-directory containing a "source" and a "result" HTML file.
data_path_list = []


def walkFile(file):
    for root, dirs, files in os.walk(file):
        # root: the directory currently being visited
        # dirs: list of its sub-directories
        # files: list of its files
        for f in files:
            # print(os.path.join(root, f))
            data_path_list.append(os.path.join(root, f))
        # To walk the directories instead:
        # for d in dirs:
        #     print(os.path.join(root, d))


def main():
    walkFile("../data/yy_reduce_data_20221219-20230131")


main()

data = []

rootpath_list = []
for i in data_path_list:
    danpath_list = str(i).split("\\")
    rootpath_list.append("\\".join(danpath_list[:-1]))

print(len(rootpath_list))
rootpath_list = list(set(rootpath_list))
for i in tqdm(rootpath_list):
    try:
        soup_source = BeautifulSoup(open("{}\\source".format(i), encoding='utf-8'),
                                    "html.parser")
        soup_result = BeautifulSoup(open("{}\\result".format(i), encoding='utf-8'),
                                    "html.parser")
    except Exception:
        continue

    source_sentence_list = soup_source.select('p > em')
    result_sentence_list = soup_result.select('p > em')
    for sentence_index in range(len(source_sentence_list)):
        try:
            if source_sentence_list[sentence_index]["id"] == result_sentence_list[sentence_index]["id"] \
                    and result_sentence_list[sentence_index]["class"] == ['similar']:
                # if source_sentence_list[sentence_index]["id"] == result_sentence_list[sentence_index]["id"]:
                source_text = source_sentence_list[sentence_index].string
                result_text = result_sentence_list[sentence_index].string
                if source_text is not None and result_text is not None:
                    data.append([source_text, result_text])
        except Exception:
            pass
            # print(i, sentence_index)

# print(data)


def data_clean(text):
    # Strip characters that are illegal in Excel cells (rare non-printable
    # control characters such as backspace or bell).
    ILLEGAL_CHARACTERS_RE = re.compile(r'[\000-\010]|[\013-\014]|[\016-\037]')
    text = ILLEGAL_CHARACTERS_RE.sub(r'', text)
    return text


df = pd.DataFrame(data, columns=["原文", "yy降重"])
for col in df.columns:
    df[col] = df[col].apply(lambda x: data_clean(x))

df.to_excel("../data/论文_yy_小说_1.xlsx", index=None)
@@ -0,0 +1,33 @@
# -*- coding: utf-8 -*-

"""
@Time    : 2023/2/15 14:13
@Author  :
@FileName:
@Software:
@Describe:
"""
import os
import pandas as pd

# For each document, merge the outputs of three rewriting runs into one Excel
# file: the original sentence plus the three candidate rewrites.
path_1 = '../data/11篇excel'
path_2 = "../data/11篇临时拼接"
path_3 = "../data/11篇临时拼接2"
path_list = []
for file_name in os.listdir(path_1):
    path_list.append(file_name)


for file_name in path_list:
    data_new = []
    data_1 = pd.read_excel(path_1 + "/" + file_name).values.tolist()
    data_2 = pd.read_excel(path_2 + "/" + file_name).values.tolist()
    file_name_0 = file_name.split(".")[0]
    file_name_1 = file_name.split(".")[1]
    file_name_ = file_name_0 + "_." + file_name_1
    data_3 = pd.read_excel(path_3 + "/" + file_name_).values.tolist()
    for i in range(len(data_1)):
        data_new.append(data_1[i] + [data_2[i][1]] + [data_3[i][1]])

    df = pd.DataFrame(data_new, columns=["原文", "simbert", "simbert_datasim07", "bertsim_simsim"])
    df.to_excel("../data/11篇测试excel_汇总_1/{}.xlsx".format(file_name_0), index=None)
@@ -0,0 +1,209 @@
# -*- coding: utf-8 -*-

"""
@Time    : 2023/1/31 19:02
@Author  :
@FileName:
@Software:
@Describe:
"""
import os
# os.environ["TF_KERAS"] = "1"
os.environ["CUDA_VISIBLE_DEVICES"] = "0"
import json
import numpy as np
from bert4keras.backend import keras, set_gelu
from bert4keras.tokenizers import Tokenizer, load_vocab
from bert4keras.models import build_transformer_model
from bert4keras.optimizers import Adam, extend_with_piecewise_linear_lr
from bert4keras.snippets import sequence_padding, DataGenerator
from bert4keras.snippets import open
from keras.layers import Lambda, Dense
import tensorflow as tf
from keras.backend import set_session
from sklearn.metrics.pairwise import cosine_similarity
from rouge import Rouge  # pip install rouge
from nltk.translate.bleu_score import sentence_bleu, SmoothingFunction
from tqdm import tqdm
import jieba
from gensim.models import KeyedVectors, word2vec, Word2Vec
import random

config = tf.ConfigProto()
config.gpu_options.allow_growth = True
set_session(tf.Session(config=config))  # TF1-style session setup


class Word2vecModel:
    def __init__(self):
        self.path = "E:\\pycharm_workspace\\查重分析\\word2vec_model\\word2vec_add_new_18.model"
        self.model = Word2Vec.load(self.path)

    def word2vec_res(self, seg_0_list, seg_1_list):
        # Look up the word vector of every token in the two segmented sentences.
        sentence_0_list = []
        sentence_1_list = []
        for i in seg_0_list:
            a = self.model.wv[i]
            sentence_0_list.append(a)

        for i in seg_1_list:
            a = self.model.wv[i]
            sentence_1_list.append(a)

        return sentence_0_list, sentence_1_list


class Evaluator(keras.callbacks.Callback):
    """Evaluation and checkpointing callback."""

    def __init__(self):
        self.rouge = Rouge()
        self.smooth = SmoothingFunction().method1
        self.best_bleu = 0.

    # def on_epoch_end(self, epoch, logs=None):
    #     metrics = self.evaluate(valid_data)  # evaluate the model
    #     if metrics['bleu'] > self.best_bleu:
    #         self.best_bleu = metrics['bleu']
    #         model.save_weights('./best_model.weights')  # save the model
    #     metrics['best_bleu'] = self.best_bleu
    #     print('valid_data:', metrics)

    def evaluate_t(self, data_1, data_2, topk=1):
        # Compute ROUGE-1/2/L and BLEU for a single pair of space-tokenized strings.
        total = 0
        rouge_1, rouge_2, rouge_l, bleu = 0, 0, 0, 0

        scores = self.rouge.get_scores(hyps=[data_1], refs=[data_2])
        rouge_1 += scores[0]['rouge-1']['f']
        rouge_2 += scores[0]['rouge-2']['f']
        rouge_l += scores[0]['rouge-l']['f']
        bleu += sentence_bleu(
            references=[data_1.split(' ')],
            hypothesis=data_2.split(' '),
            smoothing_function=self.smooth
        )
        # rouge_1 /= total
        # rouge_2 /= total
        # rouge_l /= total
        # bleu /= total
        return [rouge_1, rouge_2, rouge_l, bleu]


class bertModel:
    def __init__(self):
        self.config_path = '../chinese_roberta_wwm_ext_L-12_H-768_A-12/bert_config.json'
        self.checkpoint_path = '../chinese_roberta_wwm_ext_L-12_H-768_A-12/bert_model.ckpt'
        self.dict_path = '../chinese_roberta_wwm_ext_L-12_H-768_A-12/vocab.txt'
        self.token_dict, self.keep_tokens = load_vocab(
            dict_path=self.dict_path,
            simplified=True,
            startswith=['[PAD]', '[UNK]', '[CLS]', '[SEP]'],
        )
        self.tokenizer = Tokenizer(self.token_dict, do_lower_case=True)
        self.buildmodel()

    def buildmodel(self):
        bert = build_transformer_model(
            config_path=self.config_path,
            checkpoint_path=self.checkpoint_path,
            return_keras_model=False,
        )

        output = Lambda(lambda x: x[:, 0], name='CLS-token')(bert.model.output)
        self.model = keras.models.Model(bert.model.input, output)
        self.model.summary()

    def predict(self, text):
        # Encode a single sentence and return its [CLS] vector.
        batch_token_ids, batch_segment_ids = [], []
        token_ids, segment_ids = self.tokenizer.encode(text, maxlen=256)
        batch_token_ids.append(token_ids)
        batch_segment_ids.append(segment_ids)
        return self.model.predict([batch_token_ids, batch_segment_ids])


def simbert(data_1, data_2):
    pass


def word2vec():
    pass


def bleu():
    pass


if __name__ == '__main__':
    file = "../data/train_yy_1.txt"
    model = bertModel()
    eval_class = Evaluator()
    # word2vecmodel = Word2vecModel()
    try:
        with open(file, 'r', encoding="utf-8") as f:
            lines = [x.strip() for x in f if x.strip() != '']
    except UnicodeDecodeError:
        with open(file, 'r', encoding="gbk") as f:
            lines = [x.strip() for x in f if x.strip() != '']

    bertsim_list = []
    bleusim_list = []
    word2vecsim_list = []
    data_train_text = []

    random.shuffle(lines)
    print(len(lines))
    for txt in tqdm(lines):
        text = txt.split('\t')
        if len(text) == 3:
            data_1 = text[0]
            data_2 = text[2]
            y1 = model.predict(data_1)[0]
            y2 = model.predict(data_2)[0]
            cos_sim = cosine_similarity(y1.reshape(1, -1), y2.reshape(1, -1))
            bertsim_list.append(cos_sim[0][0])
            bertsim_value = cos_sim[0][0]

            # Character-level BLEU: one token per character.
            eval_list = eval_class.evaluate_t(' '.join(data_1), ' '.join(data_2))
            bleusim_list.append(eval_list[3])
            bleusim_value = eval_list[3]

            # Keep only pairs where the rewrite differs enough from the original.
            if bertsim_value <= 0.94 and bleusim_value <= 0.4:
                data_train_text.append("\t".join([data_1, "to", data_2]))

            # eval_list = eval_class.evaluate_t(' '.join(data_1), ' '.join(data_2))
            # bleusim_list.append(eval_list[3])

            # word2vec similarity (unused):
            # seg_0_list = jieba.cut(data_1, cut_all=False)
            # seg_1_list = jieba.cut(data_2, cut_all=False)
            # seg_0_list = [char for char in seg_0_list]
            # seg_1_list = [char for char in seg_1_list]
            #
            # sentence_0_list, sentence_1_list = word2vecmodel.word2vec_res(seg_0_list, seg_1_list)
            # sentence_0_result = np.array(sentence_0_list)
            # sentence_1_result = np.array(sentence_1_list)
            # sentence_0_array = sentence_0_result.sum(axis=0)
            # sentence_1_array = sentence_1_result.sum(axis=0)
            # print(sentence_1_array)
            # print(sentence_0_array)
            # cos_sim = cosine_similarity(sentence_0_array.reshape(1, -1), sentence_1_array.reshape(1, -1))
            # word2vecsim_list.append(cos_sim[0][0])

    # bertsim_list = sorted(bertsim_list)
    # zong_num = len(bertsim_list)
    # print(bertsim_list)
    # print(bertsim_list[int(zong_num/2)])
    # print(sum(bertsim_list)/zong_num)

    # bleusim_list = sorted(bleusim_list)
    # zong_num = len(bleusim_list)
    # print(bleusim_list)
    # print(bleusim_list[int(zong_num / 2)])
    # print(sum(bleusim_list) / zong_num)

    fileName = 'train_yy_1_sim_09.txt'
    with open(fileName, 'w', encoding='utf-8') as file:
        for i in data_train_text:
            file.write(str(i) + '\n')
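One detail worth noting in the script above: data_1 and data_2 are space-joined before being passed to evaluate_t, so BLEU and ROUGE are computed over characters rather than words and no Chinese word segmentation is needed. A minimal illustration:

# ' '.join turns a Chinese string into space-separated characters, so
# sentence_bleu / Rouge see one token per character.
s = "上海中心大厦"
print(' '.join(s))             # 上 海 中 心 大 厦
print(' '.join(s).split(' '))  # ['上', '海', '中', '心', '大', '厦']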
@@ -0,0 +1,209 @@
# -*- coding: utf-8 -*-

"""
@Time    : 2023/1/31 19:02
@Author  :
@FileName:
@Software:
@Describe:
"""
import os
# os.environ["TF_KERAS"] = "1"
os.environ["CUDA_VISIBLE_DEVICES"] = "0"
import json
import numpy as np
from bert4keras.backend import keras, set_gelu
from bert4keras.tokenizers import Tokenizer, load_vocab
from bert4keras.models import build_transformer_model
from bert4keras.optimizers import Adam, extend_with_piecewise_linear_lr
from bert4keras.snippets import sequence_padding, DataGenerator
from bert4keras.snippets import open
from keras.layers import Lambda, Dense
import tensorflow as tf
from keras.backend import set_session
from sklearn.metrics.pairwise import cosine_similarity
from rouge import Rouge  # pip install rouge
from nltk.translate.bleu_score import sentence_bleu, SmoothingFunction
from tqdm import tqdm
import jieba
from gensim.models import KeyedVectors, word2vec, Word2Vec
import random

config = tf.ConfigProto()
config.gpu_options.allow_growth = True
set_session(tf.Session(config=config))  # TF1-style session setup


class Word2vecModel:
    def __init__(self):
        self.path = "E:\\pycharm_workspace\\查重分析\\word2vec_model\\word2vec_add_new_18.model"
        self.model = Word2Vec.load(self.path)

    def word2vec_res(self, seg_0_list, seg_1_list):
        # Look up the word vector of every token in the two segmented sentences.
        sentence_0_list = []
        sentence_1_list = []
        for i in seg_0_list:
            a = self.model.wv[i]
            sentence_0_list.append(a)

        for i in seg_1_list:
            a = self.model.wv[i]
            sentence_1_list.append(a)

        return sentence_0_list, sentence_1_list


class Evaluator(keras.callbacks.Callback):
    """Evaluation and checkpointing callback."""

    def __init__(self):
        self.rouge = Rouge()
        self.smooth = SmoothingFunction().method1
        self.best_bleu = 0.

    # def on_epoch_end(self, epoch, logs=None):
    #     metrics = self.evaluate(valid_data)  # evaluate the model
    #     if metrics['bleu'] > self.best_bleu:
    #         self.best_bleu = metrics['bleu']
    #         model.save_weights('./best_model.weights')  # save the model
    #     metrics['best_bleu'] = self.best_bleu
    #     print('valid_data:', metrics)

    def evaluate_t(self, data_1, data_2, topk=1):
        # Compute ROUGE-1/2/L and BLEU for a single pair of space-tokenized strings.
        total = 0
        rouge_1, rouge_2, rouge_l, bleu = 0, 0, 0, 0

        scores = self.rouge.get_scores(hyps=[data_1], refs=[data_2])
        rouge_1 += scores[0]['rouge-1']['f']
        rouge_2 += scores[0]['rouge-2']['f']
        rouge_l += scores[0]['rouge-l']['f']
        bleu += sentence_bleu(
            references=[data_1.split(' ')],
            hypothesis=data_2.split(' '),
            smoothing_function=self.smooth
        )
        # rouge_1 /= total
        # rouge_2 /= total
        # rouge_l /= total
        # bleu /= total
        return [rouge_1, rouge_2, rouge_l, bleu]


class bertModel:
    def __init__(self):
        self.config_path = '../chinese_roberta_wwm_ext_L-12_H-768_A-12/bert_config.json'
        self.checkpoint_path = '../chinese_roberta_wwm_ext_L-12_H-768_A-12/bert_model.ckpt'
        self.dict_path = '../chinese_roberta_wwm_ext_L-12_H-768_A-12/vocab.txt'
        self.token_dict, self.keep_tokens = load_vocab(
            dict_path=self.dict_path,
            simplified=True,
            startswith=['[PAD]', '[UNK]', '[CLS]', '[SEP]'],
        )
        self.tokenizer = Tokenizer(self.token_dict, do_lower_case=True)
        self.buildmodel()

    def buildmodel(self):
        bert = build_transformer_model(
            config_path=self.config_path,
            checkpoint_path=self.checkpoint_path,
            return_keras_model=False,
        )

        output = Lambda(lambda x: x[:, 0], name='CLS-token')(bert.model.output)
        self.model = keras.models.Model(bert.model.input, output)
        self.model.summary()

    def predict(self, text):
        # Encode a single sentence and return its [CLS] vector.
        batch_token_ids, batch_segment_ids = [], []
        token_ids, segment_ids = self.tokenizer.encode(text, maxlen=256)
        batch_token_ids.append(token_ids)
        batch_segment_ids.append(segment_ids)
        return self.model.predict([batch_token_ids, batch_segment_ids])


def simbert(data_1, data_2):
    pass


def word2vec():
    pass


def bleu():
    pass


if __name__ == '__main__':
    file = "../data/train_yy_1.txt"
    model = bertModel()
    eval_class = Evaluator()
    # word2vecmodel = Word2vecModel()
    try:
        with open(file, 'r', encoding="utf-8") as f:
            lines = [x.strip() for x in f if x.strip() != '']
    except UnicodeDecodeError:
        with open(file, 'r', encoding="gbk") as f:
            lines = [x.strip() for x in f if x.strip() != '']

    bertsim_list = []
    bleusim_list = []
    word2vecsim_list = []
    data_train_text = []

    random.shuffle(lines)
    print(len(lines))
    for txt in tqdm(lines):
        text = txt.split('\t')
        if len(text) == 3:
            data_1 = text[0]
            data_2 = text[2]
            y1 = model.predict(data_1)[0]
            y2 = model.predict(data_2)[0]
            cos_sim = cosine_similarity(y1.reshape(1, -1), y2.reshape(1, -1))
            bertsim_list.append(cos_sim[0][0])
            bertsim_value = cos_sim[0][0]

            # Character-level BLEU: one token per character.
            eval_list = eval_class.evaluate_t(' '.join(data_1), ' '.join(data_2))
            bleusim_list.append(eval_list[3])
            bleusim_value = eval_list[3]

            # Keep only pairs where the rewrite differs enough from the original.
            if bertsim_value <= 0.94 and bleusim_value <= 0.4:
                data_train_text.append("\t".join([data_1, "to", data_2]))

            # eval_list = eval_class.evaluate_t(' '.join(data_1), ' '.join(data_2))
            # bleusim_list.append(eval_list[3])

            # word2vec similarity (unused):
            # seg_0_list = jieba.cut(data_1, cut_all=False)
            # seg_1_list = jieba.cut(data_2, cut_all=False)
            # seg_0_list = [char for char in seg_0_list]
            # seg_1_list = [char for char in seg_1_list]
            #
            # sentence_0_list, sentence_1_list = word2vecmodel.word2vec_res(seg_0_list, seg_1_list)
            # sentence_0_result = np.array(sentence_0_list)
            # sentence_1_result = np.array(sentence_1_list)
            # sentence_0_array = sentence_0_result.sum(axis=0)
            # sentence_1_array = sentence_1_result.sum(axis=0)
            # print(sentence_1_array)
            # print(sentence_0_array)
            # cos_sim = cosine_similarity(sentence_0_array.reshape(1, -1), sentence_1_array.reshape(1, -1))
            # word2vecsim_list.append(cos_sim[0][0])

    # bertsim_list = sorted(bertsim_list)
    # zong_num = len(bertsim_list)
    # print(bertsim_list)
    # print(bertsim_list[int(zong_num/2)])
    # print(sum(bertsim_list)/zong_num)

    # bleusim_list = sorted(bleusim_list)
    # zong_num = len(bleusim_list)
    # print(bleusim_list)
    # print(bleusim_list[int(zong_num / 2)])
    # print(sum(bleusim_list) / zong_num)

    fileName = 'train_yy_1_sim_09.txt'
    with open(fileName, 'w', encoding='utf-8') as file:
        for i in data_train_text:
            file.write(str(i) + '\n')
@@ -0,0 +1,206 @@
# -*- coding: utf-8 -*-

"""
@Time    : 2023/1/31 19:02
@Author  :
@FileName:
@Software:
@Describe:
"""
import os
# os.environ["TF_KERAS"] = "1"
os.environ["CUDA_VISIBLE_DEVICES"] = "0"
import json
import numpy as np
from bert4keras.backend import keras, set_gelu
from bert4keras.tokenizers import Tokenizer, load_vocab
from bert4keras.models import build_transformer_model
from bert4keras.optimizers import Adam, extend_with_piecewise_linear_lr
from bert4keras.snippets import sequence_padding, DataGenerator
from bert4keras.snippets import open
from keras.layers import Lambda, Dense
import tensorflow as tf
from keras.backend import set_session
from sklearn.metrics.pairwise import cosine_similarity
from rouge import Rouge  # pip install rouge
from nltk.translate.bleu_score import sentence_bleu, SmoothingFunction
from tqdm import tqdm
import jieba
from gensim.models import KeyedVectors, word2vec, Word2Vec
import random
import difflib

config = tf.ConfigProto()
config.gpu_options.allow_growth = True
set_session(tf.Session(config=config))  # TF1-style session setup


class Word2vecModel:
    def __init__(self):
        self.path = "E:\\pycharm_workspace\\查重分析\\word2vec_model\\word2vec_add_new_18.model"
        self.model = Word2Vec.load(self.path)

    def word2vec_res(self, seg_0_list, seg_1_list):
        # Look up the word vector of every token in the two segmented sentences.
        sentence_0_list = []
        sentence_1_list = []
        for i in seg_0_list:
            a = self.model.wv[i]
            sentence_0_list.append(a)

        for i in seg_1_list:
            a = self.model.wv[i]
            sentence_1_list.append(a)

        return sentence_0_list, sentence_1_list


class Evaluator(keras.callbacks.Callback):
    """Evaluation and checkpointing callback."""

    def __init__(self):
        self.rouge = Rouge()
        self.smooth = SmoothingFunction().method1
        self.best_bleu = 0.

    # def on_epoch_end(self, epoch, logs=None):
    #     metrics = self.evaluate(valid_data)  # evaluate the model
    #     if metrics['bleu'] > self.best_bleu:
    #         self.best_bleu = metrics['bleu']
    #         model.save_weights('./best_model.weights')  # save the model
    #     metrics['best_bleu'] = self.best_bleu
    #     print('valid_data:', metrics)

    def evaluate_t(self, data_1, data_2, topk=1):
        # Compute ROUGE-1/2/L and BLEU for a single pair of space-tokenized strings.
        total = 0
        rouge_1, rouge_2, rouge_l, bleu = 0, 0, 0, 0

        scores = self.rouge.get_scores(hyps=[data_1], refs=[data_2])
        rouge_1 += scores[0]['rouge-1']['f']
        rouge_2 += scores[0]['rouge-2']['f']
        rouge_l += scores[0]['rouge-l']['f']
        bleu += sentence_bleu(
            references=[data_1.split(' ')],
            hypothesis=data_2.split(' '),
            smoothing_function=self.smooth
        )
        # rouge_1 /= total
        # rouge_2 /= total
        # rouge_l /= total
        # bleu /= total
        return [rouge_1, rouge_2, rouge_l, bleu]


class bertModel:
    def __init__(self):
        self.config_path = '../chinese_roberta_wwm_ext_L-12_H-768_A-12/bert_config.json'
        self.checkpoint_path = '../chinese_roberta_wwm_ext_L-12_H-768_A-12/bert_model.ckpt'
        self.dict_path = '../chinese_roberta_wwm_ext_L-12_H-768_A-12/vocab.txt'
        self.token_dict, self.keep_tokens = load_vocab(
            dict_path=self.dict_path,
            simplified=True,
            startswith=['[PAD]', '[UNK]', '[CLS]', '[SEP]'],
        )
        self.tokenizer = Tokenizer(self.token_dict, do_lower_case=True)
        self.buildmodel()

    def buildmodel(self):
        bert = build_transformer_model(
            config_path=self.config_path,
            checkpoint_path=self.checkpoint_path,
            return_keras_model=False,
        )

        output = Lambda(lambda x: x[:, 0], name='CLS-token')(bert.model.output)
        self.model = keras.models.Model(bert.model.input, output)
        self.model.summary()

    def predict(self, text):
        # Encode a single sentence and return its [CLS] vector.
        batch_token_ids, batch_segment_ids = [], []
        token_ids, segment_ids = self.tokenizer.encode(text, maxlen=256)
        batch_token_ids.append(token_ids)
        batch_segment_ids.append(segment_ids)
        return self.model.predict([batch_token_ids, batch_segment_ids])


def simbert(data_1, data_2):
    pass


def word2vec():
    pass


def bleu():
    pass


if __name__ == '__main__':
    file = "../data/train_yy_1.txt"
    model = bertModel()
    eval_class = Evaluator()
    # word2vecmodel = Word2vecModel()
    try:
        with open(file, 'r', encoding="utf-8") as f:
            lines = [x.strip() for x in f if x.strip() != '']
    except UnicodeDecodeError:
        with open(file, 'r', encoding="gbk") as f:
            lines = [x.strip() for x in f if x.strip() != '']

    bertsim_list = []
    bleusim_list = []
    word2vecsim_list = []
    data_train_text = []

    random.shuffle(lines)
    print(len(lines))
    for txt in tqdm(lines):
        text = txt.split('\t')
        if len(text) == 3:
            data_1 = text[0]
            data_2 = text[2]
            str_sim_value = difflib.SequenceMatcher(None, data_1, data_2).quick_ratio()
            # Length-penalty remap: if the rewrite is shorter than the original
            # but still more than 80% of its length, remap the similarity so the
            # pair scores high and is filtered out below (near-truncations are
            # not useful training pairs).
            if len(data_2) - len(data_1) < 0 and len(data_2) / len(data_1) > 0.8:
                num_yu = 1 - len(data_2) / len(data_1)
                str_sim_value = 1 - str_sim_value * num_yu

            if str_sim_value < 0.70:
                data_train_text.append("\t".join([data_1, "to", data_2]))

            # eval_list = eval_class.evaluate_t(' '.join(data_1), ' '.join(data_2))
            # bleusim_list.append(eval_list[3])

            # word2vec similarity (unused):
            # seg_0_list = jieba.cut(data_1, cut_all=False)
            # seg_1_list = jieba.cut(data_2, cut_all=False)
            # seg_0_list = [char for char in seg_0_list]
            # seg_1_list = [char for char in seg_1_list]
            #
            # sentence_0_list, sentence_1_list = word2vecmodel.word2vec_res(seg_0_list, seg_1_list)
            # sentence_0_result = np.array(sentence_0_list)
            # sentence_1_result = np.array(sentence_1_list)
            # sentence_0_array = sentence_0_result.sum(axis=0)
            # sentence_1_array = sentence_1_result.sum(axis=0)
            # print(sentence_1_array)
            # print(sentence_0_array)
            # cos_sim = cosine_similarity(sentence_0_array.reshape(1, -1), sentence_1_array.reshape(1, -1))
            # word2vecsim_list.append(cos_sim[0][0])

    # bertsim_list = sorted(bertsim_list)
    # zong_num = len(bertsim_list)
    # print(bertsim_list)
    # print(bertsim_list[int(zong_num/2)])
    # print(sum(bertsim_list)/zong_num)

    # bleusim_list = sorted(bleusim_list)
    # zong_num = len(bleusim_list)
    # print(bleusim_list)
    # print(bleusim_list[int(zong_num / 2)])
    # print(sum(bleusim_list) / zong_num)

    print(len(data_train_text))
    fileName = 'train_yy_1_sim_10.txt'
    with open(fileName, 'w', encoding='utf-8') as file:
        for i in data_train_text:
            file.write(str(i) + '\n')
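A worked example of the length-penalty remap used above (all numbers made up):

# len(data_1) = 100, len(data_2) = 90  ->  ratio 0.9 > 0.8, so the branch fires
# num_yu        = 1 - 90 / 100         = 0.10
# quick_ratio   = 0.85 (say)
# str_sim_value = 1 - 0.85 * 0.10      = 0.915  ->  not < 0.70, so the pair is dropped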
@@ -0,0 +1,83 @@
# -*- coding: utf-8 -*-

"""
@Time    : 2023/1/31 19:02
@Author  :
@FileName:
@Software:
@Describe:
"""
import os
# os.environ["TF_KERAS"] = "1"
os.environ["CUDA_VISIBLE_DEVICES"] = "0"
import json
import numpy as np
from bert4keras.backend import keras, set_gelu
from bert4keras.tokenizers import Tokenizer, load_vocab
from bert4keras.models import build_transformer_model
from bert4keras.optimizers import Adam, extend_with_piecewise_linear_lr
from bert4keras.snippets import sequence_padding, DataGenerator
from bert4keras.snippets import open
from keras.layers import Lambda, Dense
import tensorflow as tf
from keras.backend import set_session
from sklearn.metrics.pairwise import cosine_similarity
from rouge import Rouge  # pip install rouge
from nltk.translate.bleu_score import sentence_bleu, SmoothingFunction
from tqdm import tqdm
import jieba
from gensim.models import KeyedVectors, word2vec, Word2Vec
import random
import difflib

config = tf.ConfigProto()
config.gpu_options.allow_growth = True
set_session(tf.Session(config=config))  # TF1-style session setup


if __name__ == '__main__':
    # Compute and dump the distribution of difflib string similarity over the
    # training pairs.
    file = "../data/train_yy.txt"
    # word2vecmodel = Word2vecModel()
    try:
        with open(file, 'r', encoding="utf-8") as f:
            lines = [x.strip() for x in f if x.strip() != '']
    except UnicodeDecodeError:
        with open(file, 'r', encoding="gbk") as f:
            lines = [x.strip() for x in f if x.strip() != '']

    bertsim_list = []
    bleusim_list = []
    strsim_list = []
    word2vecsim_list = []
    data_train_text = []

    random.shuffle(lines)
    print(len(lines))
    for txt in tqdm(lines):
        text = txt.split('\t')
        if len(text) == 3:
            data_1 = text[0]
            data_2 = text[2]
            # difflib.SequenceMatcher(None, one_data[0], one_data_simbert[j]).quick_ratio()
            str_sim_value = difflib.SequenceMatcher(None, data_1, data_2).quick_ratio()

            if len(data_2) - len(data_1) < 0:
                num_yu = 1 - len(data_2) / len(data_1)
                str_sim_value = 1 - str_sim_value * num_yu
            strsim_list.append(str_sim_value)

    # Report the median and the mean similarity.
    strsim_list = sorted(strsim_list)
    zong_num = len(strsim_list)
    print(strsim_list)
    print(strsim_list[int(zong_num / 2)])
    print(sum(strsim_list) / zong_num)

    fileName = '统计数据的strsim值.txt'
    with open(fileName, 'w', encoding='utf-8') as file:
        for i in strsim_list:
            file.write(str(i) + '\n')
File diff suppressed because it is too large
@@ -0,0 +1,40 @@
# -*- coding: utf-8 -*-

"""
@Time    : 2022/12/20 16:20
@Author  :
@FileName:
@Software:
@Describe:
"""
import pandas as pd
from tqdm import tqdm
import json

# Count every non-Chinese character (punctuation, digits, latin letters, ...)
# that appears in the extracted sentence pairs.
path = "../data/论文_yy_小说.xlsx"
df_list = pd.read_excel(path).values.tolist()

fuhao = {}
for i in tqdm(df_list):
    for word in i:
        word = str(word)
        if word == "nan":
            continue
        for ch in word:
            if u'\u4e00' <= ch <= u'\u9fff':
                continue
            else:
                if ch in fuhao:
                    fuhao[ch] += 1
                else:
                    fuhao[ch] = 1

# Sort the counts in descending order and dump them as JSON.
test_1 = sorted(fuhao.items(), key=lambda x: x[1], reverse=True)
fuhao_new = {}
for i in test_1:
    fuhao_new[i[0]] = i[1]

json_data = json.dumps(fuhao_new, ensure_ascii=False, indent=2)
with open('../data/fuhao.json', 'w', encoding="utf-8") as f_six:
    f_six.write(json_data)
@@ -0,0 +1,77 @@
# -*- coding: utf-8 -*-

"""
@Time    : 2023/2/14 14:29
@Author  :
@FileName:
@Software:
@Describe:
"""
# from zipfile import ZipFile
# from bs4 import BeautifulSoup
#
# document = ZipFile("../data/11篇/13139551_于丰源_在线考试系统-原文.docx")
# xml = document.read("word/document.xml")
# wordObj = BeautifulSoup(xml.decode("utf-8"))
# texts = wordObj.findAll("w:t")
# for text in texts:
#     print(text.text)

import docx
import win32com.client as wc
import operator
# Save .doc files as .docx via the Word COM interface.

# path = "E:\pycharm_workspace\drop_weight_rewrite\data\\11篇\\13138572_李菊_谈小学语文教学难点与创新-原文.doc"
# path_new = "E:\pycharm_workspace\drop_weight_rewrite\data\\11篇\\13138572_李菊_谈小学语文教学难点与创新-原文.docx"
# word = wc.Dispatch("Word.Application")
# doc = word.Documents.Open(path)
# # 12 means the output format is docx
# doc.SaveAs(path_new, 12)
# doc.Close
# word.Quit
#
# # Read the converted docx
#
# file = docx.Document(path_new)
# for p in file.paragraphs:
#     print(p.text)

# from win32com import client as wc
# w = wc.Dispatch('Word.Application')
# # Or, to start an independent process:
# # w = wc.DispatchEx('Word.Application')
# doc = w.Documents.Open(path)
# doc.SaveAs(path_new, 16)  # the format parameter 16 is required here, otherwise it errors out


import os
from win32com import client as wc


def save_doc_to_docx(rawpath):  # convert .doc to .docx
    '''
    :param rawpath: folder used for both input and output
    :return: None
    '''
    word = wc.Dispatch("Word.Application")
    # Relative paths do not work with the COM API; use absolute paths.
    filenamelist = os.listdir(rawpath)
    for i in os.listdir(rawpath):
        # Pick files that end with .doc and do not start with ~$ (~$ marks Word temp files).
        if i.endswith('.doc') and not i.startswith('~$'):
            print(i)
            # Open the file ...
            doc = word.Documents.Open(rawpath + i)
            # ... split the name from its extension ...
            rename = os.path.splitext(i)
            # ... and save it again as .docx (12 = docx format).
            doc.SaveAs(rawpath + rename[0] + '.docx', 12)
            doc.Close()
    word.Quit()


if __name__ == '__main__':
    pathls = "E:\\pycharm_workspace\\drop_weight_rewrite\\data\\11篇\\"
    save_doc_to_docx(pathls)
@@ -0,0 +1,43 @@
# -*- coding: utf-8 -*-

"""
@Time    : 2022/12/20 17:56
@Author  :
@FileName:
@Software:
@Describe:
"""
import pandas as pd
from tqdm import tqdm
import json


path = "../data/论文_yy_小说_1.xlsx"
df_list = pd.read_excel(path).values.tolist()


def sentence_do(source, result):
    # Drop pairs with missing values or sentences longer than 120 characters;
    # strip tabs, spaces and stray invisible characters from the rest.
    source = str(source)
    result = str(result)
    if source == "nan" or result == "nan":
        return False, source, result
    if len(source) > 120 or len(result) > 120:
        return False, source, result
    else:
        source = str(source).replace("\t", "").replace(" ", "").replace("", "").replace("", "")
        result = str(result).replace("\t", "").replace(" ", "").replace("", "").replace("", "")
        return True, source, result


df_list_new = []
for i in df_list:
    source = i[0]
    result = i[1]
    bool_, source, result = sentence_do(source, result)
    if bool_ == False:
        continue
    else:
        df_list_new.append([source, result])

df = pd.DataFrame(df_list_new, columns=["原文", "yy降重"])
df.to_excel("../data/论文_yy_小说_3.xlsx", index=None)
@@ -0,0 +1,55 @@
Plan 1
1. Replace person names as a preprocessing step.
2. Use the translation model to generate candidate sentences, then screen them manually.
3. Train on the screened sentences.
Training option 1
Conventional training.
Training option 2
Train in the simbert style.


Translation-model options
1. Use Baidu (register an account).
2. https://www.oschina.net/news/170812/meta-open-source-wmt-21
3. Train with the t5 model.


Rewriting corpora
1. The existing corpus: about 580,000 entries.
2. The lcqmc corpus.


Patterns observed in 写作猫
1. It is almost certainly model-based: the same short clause comes out as a different sentence depending on whether something is appended after it.

伴随着风硕哀嚎的声音,白策笑嘻嘻的离开了。 --> 在冯硕的惨叫声中,白策嘿嘿一笑,转身离去。
伴随着风硕哀嚎的声音 --> 伴随着风硕的惨叫声
2. Person names get special handling.
3. (Tentative: the few sentences I tried came out roughly as I expected, so I cannot be sure.)
It mostly merges clauses, and usually the preceding or following clause ends up modifying the central clause.


Examples where the rewrite produces more short clauses
对方无声哭泣着再次说了一遍。 --> 对方带着哭腔,又重复了一遍。
足足等到叶澜歌换好了衣服 --> 一直到叶澜歌穿好衣服,这才停了下来。
Examples where the rewrite merges short clauses
林天宇瞬间就察觉到问题,对着她,轻声问道,"你是不是察觉到什么?" --> 林天宇立刻意识到了不对劲,小声的问了一句,"有没有发现?"


Plan:
1. Possibly substitute adjectives first (using word2vec), then round-trip translate Chinese -> English -> Chinese (this also tends to fill in pronouns).

张三尖叫的喊道
张三尖叫的嚷嚷道 --> 张三大吼一声。 --> 张三咆哮起来。 --> 张三大吼一声。
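A minimal, hypothetical sketch of step 1 of this plan (word2vec-based adjective substitution before the round-trip translation). The model file name is the one used by the other scripts in this diff; translate() is a stand-in for whichever translation service ends up being used, and the POS filter and topn value are illustrative assumptions, not decisions made here:

# -*- coding: utf-8 -*-
# Hypothetical sketch only: adjective substitution via word2vec, then a
# round-trip translation.  translate() is a placeholder, not a real API.
import jieba.posseg as pseg
from gensim.models import Word2Vec

model = Word2Vec.load("word2vec_model/word2vec_add_new_18.model")  # path assumed from the other scripts


def substitute_adjectives(sentence, topn=1):
    # Replace adjectives (POS tags starting with "a") that the model knows
    # with their nearest word2vec neighbour.
    out = []
    for pair in pseg.cut(sentence):
        if pair.flag.startswith("a") and pair.word in model.wv:
            out.append(model.wv.most_similar(pair.word, topn=topn)[0][0])
        else:
            out.append(pair.word)
    return "".join(out)


def round_trip(sentence, translate):
    # translate(text, src, tgt) is assumed to wrap the chosen translation service.
    return translate(translate(sentence, "zh", "en"), "en", "zh")


# Usage sketch (some_mt_client is hypothetical):
# augmented = round_trip(substitute_adjectives("张三尖叫的喊道"), translate=some_mt_client)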
@@ -0,0 +1,70 @@
# -*- coding: utf-8 -*-

"""
@Time    : 2023/2/3 17:27
@Author  :
@FileName:
@Software:
@Describe:
"""

import os
# os.environ["TF_KERAS"] = "1"
os.environ["CUDA_VISIBLE_DEVICES"] = "0"
from bert4keras.backend import keras, set_gelu
import numpy as np
from rouge import Rouge  # pip install rouge
from nltk.translate.bleu_score import sentence_bleu, SmoothingFunction


class Evaluator(keras.callbacks.Callback):
    """Evaluation and checkpointing callback."""

    def __init__(self):
        self.rouge = Rouge()
        self.smooth = SmoothingFunction().method1
        self.best_bleu = 0.

    # def on_epoch_end(self, epoch, logs=None):
    #     metrics = self.evaluate(valid_data)  # evaluate the model
    #     if metrics['bleu'] > self.best_bleu:
    #         self.best_bleu = metrics['bleu']
    #         model.save_weights('./best_model.weights')  # save the model
    #     metrics['best_bleu'] = self.best_bleu
    #     print('valid_data:', metrics)

    def evaluate_t(self, data_1, data_2, topk=1):
        # Compute ROUGE-1/2/L and BLEU for a single pair of space-tokenized strings.
        total = 0
        rouge_1, rouge_2, rouge_l, bleu = 0, 0, 0, 0

        scores = self.rouge.get_scores(hyps=[data_1], refs=[data_2])
        rouge_1 += scores[0]['rouge-1']['f']
        rouge_2 += scores[0]['rouge-2']['f']
        rouge_l += scores[0]['rouge-l']['f']
        bleu += sentence_bleu(
            references=[data_1.split(' ')],
            hypothesis=data_2.split(' '),
            smoothing_function=self.smooth
        )
        # rouge_1 /= total
        # rouge_2 /= total
        # rouge_l /= total
        # bleu /= total
        return [rouge_1, rouge_2, rouge_l, bleu]


eval_class = Evaluator()
data_1 = "上海中心大厦"
data_2 = "上海"
eval_list = eval_class.evaluate_t(' '.join(data_1), ' '.join(data_2))
print(eval_list)

# Scratch notes on a length penalty.  The next branch was left unfinished in
# the original; the absolute-difference completion below is an assumption.
a = len(data_2) - len(data_1)
if a < 0:
    a *= -1

a = len(data_2) / len(data_1)
print(np.exp(len(data_2) - len(data_1)))  # exponential length penalty