
First commit: non-batch prediction version

master
majiahui@haimaqingfan.com, 2 years ago
commit 8a56f2ffe6
  1. data_do/yy数据处理.py (+74)
  2. data_do/yy训练数据处理.py (+26)
  3. data_do/合并数据.py (+31)
  4. data_do/处理11篇文本.py (+102)
  5. data_do/处理11篇顺序输入.py (+114)
  6. data_do/处理yy数据原始数据.py (+87)
  7. data_do/汇总.py (+33)
  8. data_do/筛选训练数据.py (+209)
  9. data_do/筛选训练数据_str_sim.py (+209)
  10. data_do/筛选训练数据strsim.py (+206)
  11. data_do/统计数据的_str_sim_值.py (+83)
  12. data_do/统计数据的strsim值.txt (+276853)
  13. data_do/统计非中文字符.py (+40)
  14. data_do/读取docx.py (+77)
  15. data_do/进一步处理降重数据.py (+43)
  16. 优化点/优化点.txt (+55)
  17. 改进方案/改进方案.txt (+0)
  18. 测试range_1.py (+70)

74
data_do/yy数据处理.py

@@ -0,0 +1,74 @@
# -*- coding: utf-8 -*-
"""
@Time : 2022/12/20 10:35
@Author :
@FileName:
@Software:
@Describe:
"""
import os
from bs4 import BeautifulSoup
import pandas as pd
import re
# walk the directory tree
path_list = []
def walkFile(file):
for root, dirs, files in os.walk(file):
# root: path of the directory currently being visited
# dirs: list of subdirectory names under root
# files: list of file names under root
# iterate over the files
for f in files:
name = str(f).split("_")[0]
path_list.append(name)
walkFile("../data/yy_reduce_data")
path_list = list(set(path_list))
print(path_list)
data = []
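# Each document id i has a pair of HTML files under ../data/yy_reduce_data: "{i}_source" (original)
# and "{i}_result" (rewritten). Sentences are aligned via <p><em> nodes that share the same id,
# and a pair is kept only when the result node carries class "similar".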
for i in path_list:
soup_source = BeautifulSoup(open("../data/yy_reduce_data/{}_source".format(i), encoding='utf-8'),
"html.parser")
soup_result = BeautifulSoup(open("../data/yy_reduce_data/{}_result".format(i), encoding='utf-8'),
"html.parser")
source_sentence_list = soup_source.select('p > em')
result_sentence_list = soup_result.select('p > em')
for sentence_index in range(len(source_sentence_list)):
try:
if source_sentence_list[sentence_index]["id"] == result_sentence_list[sentence_index]["id"] \
and result_sentence_list[sentence_index]["class"] == ['similar']:
# if source_sentence_list[sentence_index]["id"] == result_sentence_list[sentence_index]["id"]:
source_text = source_sentence_list[sentence_index].string
result_text = result_sentence_list[sentence_index].string
if source_text != None and result_text != None:
data.append([source_sentence_list[sentence_index].string, result_sentence_list[sentence_index].string])
except:
print(i,sentence_index)
# print(data)
def data_clean(text):
# strip characters that are illegal in Excel cells: rare non-printable control characters such as backspace, bell, etc.
ILLEGAL_CHARACTERS_RE = re.compile(r'[\000-\010]|[\013-\014]|[\016-\037]')
text = ILLEGAL_CHARACTERS_RE.sub(r'', text)
return text
df = pd.DataFrame(data,columns=["原文","yy降重"])
for col in df.columns:
df[col] = df[col].apply(lambda x: data_clean(x))
df.to_excel("../data/论文_yy_小说.xlsx",index=None)

26
data_do/yy训练数据处理.py

@@ -0,0 +1,26 @@
# -*- coding: utf-8 -*-
"""
@Time : 2022/12/23 16:00
@Author :
@FileName:
@Software:
@Describe:
"""
import pandas as pd
path = "../data/论文_yy_小说_3.xlsx"
df_list = pd.read_excel(path).values.tolist()
df_list_new = []
print(len(df_list))
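# each output line has the form: original<TAB>to<TAB>rewritten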
for i in df_list:
a = i[0]
b = i[1]
df_list_new.append("\t".join([a, "to", b]))
with open("../data/train_yy_1.txt", "w", encoding='utf-8') as file:
for i in df_list_new:
file.write(i + '\n')
file.close()

31
data_do/合并数据.py

@@ -0,0 +1,31 @@
# -*- coding: utf-8 -*-
"""
@Time : 2023/2/2 11:29
@Author :
@FileName:
@Software:
@Describe:
"""
def read_text(file):
try:
with open(file, 'r', encoding="utf-8") as f:
lines = [x.strip() for x in f if x.strip() != '']
except:
with open(file, 'r', encoding="gbk") as f:
lines = [x.strip() for x in f if x.strip() != '']
return lines
if __name__ == '__main__':
data = []
path_list = ["train_yy_sim_10.txt", "train_yy_1_sim_10.txt"]
for i in path_list:
data += read_text(i)
fileName = '../data/train_yy_sim.txt'
with open(fileName, 'w', encoding='utf-8') as file:
for i in data:
file.write(str(i) + '\n')
file.close()

102
data_do/处理11篇文本.py

@@ -0,0 +1,102 @@
# -*- coding: utf-8 -*-
"""
@Time : 2023/2/14 14:19
@Author :
@FileName:
@Software:
@Describe:
"""
import os
import pandas as pd
import docx
import win32com.client as wc
import operator
def is_chinese(uchar):
"""
Check whether a single unicode character is a Chinese (CJK) character.
:param uchar:
:return:
"""
if uchar >= u'\u4e00' and uchar<=u'\u9fa5':
return True
else:
return False
def snetence(text):
bool_ = True
for i in text:
bool_1 = is_chinese(i)
if bool_1 == True:
continue
else:
if i in fuhao:
continue
else:
bool_ = False
break
return bool_
fuhao = ["。", "!", ",", "?"]  # punctuation allowed in a "pure Chinese" sentence (the exact characters were garbled in the source; common CJK marks assumed)
path = '../data/11篇'
path_list = []
for file_name in os.listdir(path):
path_list.append(file_name)
# print(path_list)
# path = "../data/11篇/13139551_于丰源_在线考试系统-原文.docx"
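# For every .docx under ../data/11篇: read its paragraphs, skip very short ones,
# split the rest into sentence-level segments of 10-120 characters, and write one CSV per document.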
for docx_name in path_list:
data = []
data_new = []
file_name = docx_name.split(".")[0]
file_type = docx_name.split(".")[1]
if file_type == "docx":
document = docx.Document(path + "/" + docx_name)
else:
continue
# get all paragraphs
all_paragraphs = document.paragraphs
for paragraph in all_paragraphs:
# collect the text of each paragraph
data.append(paragraph.text)
data = sorted(data,key=lambda x:len(x))
for data_dan in data:
if data_dan == "":
continue
for i in data_dan:
if i == "":
continue
if len(data_dan) < 15:
continue
# else:
# bool_ = snetence(data_dan)
# if bool_ == True:
# data_new.append(data_dan)
else:
data_list = str(data_dan).split("。")  # split on the Chinese full stop (separator was garbled in the source; "。" assumed)
for data_dan_short in data_list:
if data_dan_short == "":
continue
for i in data_dan_short:
if i == "":
continue
if len(data_dan_short) < 10:
continue
if len(data_dan_short) > 120:
continue
data_new.append(data_dan_short)
data_new = sorted(data_new,key=lambda x:len(x))
data_df = []
for i in data_new:
data_df.append([i])
pd.DataFrame(data_df).to_csv("../data/11篇csv/" + file_name + ".csv", index=False)

114
data_do/处理11篇顺序输入.py

@@ -0,0 +1,114 @@
# -*- coding: utf-8 -*-
"""
@Time : 2023/2/14 14:19
@Author :
@FileName:
@Software:
@Describe:
"""
import os
import pandas as pd
import docx
import win32com.client as wc
import operator
def is_chinese(uchar):
"""
Check whether a single unicode character is a Chinese (CJK) character.
:param uchar:
:return:
"""
if uchar >= u'\u4e00' and uchar<=u'\u9fa5':
return True
else:
return False
def snetence(text):
bool_ = True
for i in text:
bool_1 = is_chinese(i)
if bool_1 == True:
continue
else:
if i in fuhao:
continue
else:
bool_ = False
break
return bool_
fuhao = ["。", "!", ",", "?"]  # punctuation allowed in a "pure Chinese" sentence (the exact characters were garbled in the source; common CJK marks assumed)
path = '../data/11篇'
path_list = []
for file_name in os.listdir(path):
path_list.append(file_name)
# print(path_list)
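# chulichangju_2 recursively splits a segment longer than 120 characters: it cuts at the last
# punctuation mark inside the first 120 characters, keeps that piece, prepends the leftover to the
# remainder, and recurses; if no punctuation is found, the 120-character block is kept as is.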
def chulichangju_2(text, chulipangban_return_list):
fuhao = [",", "、", "!", "?"]  # cut points for long segments (garbled in the source; clause-level CJK marks assumed)
text_1 = text[:120]
text_2 = text[120:]
text_1_new = ""
for i in range(len(text_1)-1, -1, -1):
if text_1[i] in fuhao:
text_1_new = text_1[:i]
text_1_new += text_1[i]
chulipangban_return_list.append(text_1_new)
if text_2 != "":
if i+1 != 120:
text_2 = text_1[i+1:] + text_2
break
# else:
# chulipangban_return_list.append(text_1)
if text_1_new == "":
chulipangban_return_list.append(text_1)
if text_2 != "":
chulipangban_return_list = chulichangju_2(text_2, chulipangban_return_list)
return chulipangban_return_list
# path = "../data/11篇/13139551_于丰源_在线考试系统-原文.docx"
for docx_name in path_list:
data = []
data_new = []
file_name = docx_name.split(".")[0]
file_type = docx_name.split(".")[1]
if file_type == "docx":
document = docx.Document(path + "/" + docx_name)
else:
continue
# get all paragraphs
all_paragraphs = document.paragraphs
for paragraph in all_paragraphs:
# collect the text of each paragraph
data.append(paragraph.text)
for data_dan in data:
if data_dan == "":
continue
# else:
# bool_ = snetence(data_dan)
# if bool_ == True:
# data_new.append(data_dan)
else:
data_list = str(data_dan).split("。")  # split on the Chinese full stop (separator was garbled in the source; "。" assumed)
for data_dan_short in data_list:
if data_dan_short == "":
continue
if len(data_dan_short) > 120:
dan_list = chulichangju_2(data_dan_short, [])
dan_list[-1] += "。"  # restore the sentence-final mark dropped by split (assumed "。"; garbled in the source)
data_new.extend(dan_list)
else:
data_dan_short += "。"  # restore the sentence-final mark dropped by split (assumed "。"; garbled in the source)
data_new.append(data_dan_short)
with open("../data/11篇txt/{}.txt".format(file_name), "w", encoding='utf-8') as file:
for i in data_new:
file.write(i + '\n')
file.close()

87
data_do/处理yy数据原始数据.py

@@ -0,0 +1,87 @@
# -*- coding: utf-8 -*-
"""
@Time : 2023/2/1 19:18
@Author :
@FileName:
@Software:
@Describe:
"""
import os
from bs4 import BeautifulSoup
import pandas as pd
import re
from tqdm import tqdm
# walk the directory tree
data_path_list = []
def walkFile(file):
for root, dirs, files in os.walk(file):
# root: path of the directory currently being visited
# dirs: list of subdirectory names under root
# files: list of file names under root
# iterate over the files
for f in files:
# print(os.path.join(root, f))
data_path_list.append(os.path.join(root, f))
# walk all the subdirectories as well
# for d in dirs:
# print(os.path.join(root, d))
def main():
walkFile("../data/yy_reduce_data_20221219-20230131")
main()
data = []
rootpath_list = []
for i in data_path_list:
danpath_list = str(i).split("\\")
rootpath_list.append("\\".join(danpath_list[:-1]))
print(len(rootpath_list))
rootpath_list = list(set(rootpath_list))
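# Each leaf directory holds a "source" and a "result" HTML file for one document.
# Aligned <p><em> nodes with the same id, where the result node has class "similar",
# give the (original, rewritten) sentence pairs.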
for i in tqdm(rootpath_list):
try:
soup_source = BeautifulSoup(open("{}\\source".format(i), encoding='utf-8'),
"html.parser")
soup_result = BeautifulSoup(open("{}\\result".format(i), encoding='utf-8'),
"html.parser")
except:
continue
source_sentence_list = soup_source.select('p > em')
result_sentence_list = soup_result.select('p > em')
for sentence_index in range(len(source_sentence_list)):
try:
if source_sentence_list[sentence_index]["id"] == result_sentence_list[sentence_index]["id"] \
and result_sentence_list[sentence_index]["class"] == ['similar']:
# if source_sentence_list[sentence_index]["id"] == result_sentence_list[sentence_index]["id"]:
source_text = source_sentence_list[sentence_index].string
result_text = result_sentence_list[sentence_index].string
if source_text != None and result_text != None:
data.append([source_sentence_list[sentence_index].string, result_sentence_list[sentence_index].string])
except:
pass
# print(i,sentence_index)
# print(data)
def data_clean(text):
# strip characters that are illegal in Excel cells: rare non-printable control characters such as backspace, bell, etc.
ILLEGAL_CHARACTERS_RE = re.compile(r'[\000-\010]|[\013-\014]|[\016-\037]')
text = ILLEGAL_CHARACTERS_RE.sub(r'', text)
return text
df = pd.DataFrame(data,columns=["原文","yy降重"])
for col in df.columns:
df[col] = df[col].apply(lambda x: data_clean(x))
df.to_excel("../data/论文_yy_小说_1.xlsx",index=None)

33
data_do/汇总.py

@@ -0,0 +1,33 @@
# -*- coding: utf-8 -*-
"""
@Time : 2023/2/15 14:13
@Author :
@FileName:
@Software:
@Describe:
"""
import os
import pandas as pd
path_1 = '../data/11篇excel'
path_2 = "../data/11篇临时拼接"
path_3 = "../data/11篇临时拼接2"
path_list = []
for file_name in os.listdir(path_1):
path_list.append(file_name)
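# For each document, merge the three per-method Excel outputs row by row into one summary workbook
# with columns 原文 / simbert / simbert_datasim07 / bertsim_simsim.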
for file_name in path_list:
data_new = []
data_1 = pd.read_excel(path_1 + "/" + file_name).values.tolist()
data_2 = pd.read_excel(path_2 + "/" + file_name).values.tolist()
file_name_0 = file_name.split(".")[0]
file_name_1 = file_name.split(".")[1]
file_name_ = file_name_0 + "_." + file_name_1
data_3 = pd.read_excel(path_3 + "/" + file_name_).values.tolist()
for i in range(len(data_1)):
data_new.append(data_1[i] + [data_2[i][1]] + [data_3[i][1]])
df = pd.DataFrame(data_new,columns=["原文","simbert","simbert_datasim07","bertsim_simsim"])
df.to_excel("../data/11篇测试excel_汇总_1/{}.xlsx".format(file_name_0), index=None)

209
data_do/筛选训练数据.py

@@ -0,0 +1,209 @@
# -*- coding: utf-8 -*-
"""
@Time : 2023/1/31 19:02
@Author :
@FileName:
@Software:
@Describe:
"""
import os
# os.environ["TF_KERAS"] = "1"
os.environ["CUDA_VISIBLE_DEVICES"] = "0"
import json
import numpy as np
from bert4keras.backend import keras, set_gelu
from bert4keras.tokenizers import Tokenizer, load_vocab
from bert4keras.models import build_transformer_model
from bert4keras.optimizers import Adam, extend_with_piecewise_linear_lr
from bert4keras.snippets import sequence_padding, DataGenerator
from bert4keras.snippets import open
from keras.layers import Lambda, Dense
import tensorflow as tf
from keras.backend import set_session
from sklearn.metrics.pairwise import cosine_similarity
from rouge import Rouge # pip install rouge
from nltk.translate.bleu_score import sentence_bleu, SmoothingFunction
from tqdm import tqdm
import jieba
from gensim.models import KeyedVectors, word2vec, Word2Vec
import random
config = tf.ConfigProto()
config.gpu_options.allow_growth = True
set_session(tf.Session(config=config))  # this part differs (TF1-style session setup)
class Word2vecModel:
def __init__(self):
self.path = "E:\pycharm_workspace\查重分析\word2vec_model\\word2vec_add_new_18.model"
self.model = Word2Vec.load(self.path)
def word2vec_res(self,seg_0_list, seg_1_list):
sentence_0_list = []
sentence_1_list = []
for i in seg_0_list:
a = self.model.wv[i]
sentence_0_list.append(a)
for i in seg_1_list:
a = self.model.wv[i]
sentence_1_list.append(a)
return sentence_0_list, sentence_1_list
class Evaluator(keras.callbacks.Callback):
"""评估与保存
"""
def __init__(self):
self.rouge = Rouge()
self.smooth = SmoothingFunction().method1
self.best_bleu = 0.
# def on_epoch_end(self, epoch, logs=None):
# metrics = self.evaluate(valid_data)  # evaluate the model
# if metrics['bleu'] > self.best_bleu:
# self.best_bleu = metrics['bleu']
# model.save_weights('./best_model.weights')  # save the model
# metrics['best_bleu'] = self.best_bleu
# print('valid_data:', metrics)
def evaluate_t(self, data_1, data_2, topk=1):
total = 0
rouge_1, rouge_2, rouge_l, bleu = 0, 0, 0, 0
scores = self.rouge.get_scores(hyps=[data_1], refs=[data_2])
rouge_1 += scores[0]['rouge-1']['f']
rouge_2 += scores[0]['rouge-2']['f']
rouge_l += scores[0]['rouge-l']['f']
bleu += sentence_bleu(
references=[data_1.split(' ')],
hypothesis=data_2.split(' '),
smoothing_function=self.smooth
)
# rouge_1 /= total
# rouge_2 /= total
# rouge_l /= total
# bleu /= total
return [rouge_1, rouge_2, rouge_l, bleu]
class bertModel:
def __init__(self):
self.config_path = '../chinese_roberta_wwm_ext_L-12_H-768_A-12/bert_config.json'
self.checkpoint_path = '../chinese_roberta_wwm_ext_L-12_H-768_A-12/bert_model.ckpt'
self.dict_path = '../chinese_roberta_wwm_ext_L-12_H-768_A-12/vocab.txt'
self.token_dict, self.keep_tokens = load_vocab(
dict_path=self.dict_path,
simplified=True,
startswith=['[PAD]', '[UNK]', '[CLS]', '[SEP]'],
)
self.tokenizer = Tokenizer(self.token_dict, do_lower_case=True)
self.buildmodel()
def buildmodel(self):
bert = build_transformer_model(
config_path=self.config_path,
checkpoint_path=self.checkpoint_path,
return_keras_model=False,
)
output = Lambda(lambda x: x[:, 0], name='CLS-token')(bert.model.output)
self.model = keras.models.Model(bert.model.input, output)
self.model.summary()
def predict(self,text):
batch_token_ids, batch_segment_ids = [], []
token_ids, segment_ids = self.tokenizer.encode(text, maxlen=256)
batch_token_ids.append(token_ids)
batch_segment_ids.append(segment_ids)
return self.model.predict([batch_token_ids, batch_segment_ids])
def simbert(data_1, data_2):
pass
def word2vec():
pass
def bleu():
pass
if __name__ == '__main__':
file = "../data/train_yy_1.txt"
model = bertModel()
eval_class = Evaluator()
# word2vecmodel = Word2vecModel()
try:
with open(file, 'r', encoding="utf-8") as f:
lines = [x.strip() for x in f if x.strip() != '']
except:
with open(file, 'r', encoding="gbk") as f:
lines = [x.strip() for x in f if x.strip() != '']
bertsim_list = []
bleusim_list = []
word2vecsim_list = []
data_train_text = []
random.shuffle(lines)
print(len(lines))
for txt in tqdm(lines):
text = txt.split('\t')
if len(text) == 3:
data_1 = text[0]
data_2 = text[2]
y1 = model.predict(data_1)[0]
y2 = model.predict(data_2)[0]
cos_sim = cosine_similarity(y1.reshape(1, -1), y2.reshape(1, -1))
bertsim_list.append(cos_sim[0][0])
bertsim_value = cos_sim[0][0]
eval_list = eval_class.evaluate_t(' '.join(data_1), ' '.join(data_2))
bleusim_list.append(eval_list[3])
bleusim_value = eval_list[3]
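# Keep only pairs that were rewritten enough: CLS-embedding cosine similarity <= 0.94
# and character-level BLEU <= 0.4; near-identical pairs are dropped from the training set.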
if bertsim_value <= 0.94 and bleusim_value <= 0.4:
data_train_text.append("\t".join([data_1, "to", data_2]))
# eval_list = eval_class.evaluate_t(' '.join(data_1), ' '.join(data_2))
# bleusim_list.append(eval_list[3])
# word2vec
# seg_0_list = jieba.cut(data_1, cut_all=False)
# seg_1_list = jieba.cut(data_2, cut_all=False)
# seg_0_list = [char for char in seg_0_list]
# seg_1_list = [char for char in seg_1_list]
#
# sentence_0_list, sentence_1_list = word2vecmodel.word2vec_res(seg_0_list, seg_1_list)
# sentence_0_result = np.array(sentence_0_list)
# sentence_1_result = np.array(sentence_1_list)
# sentence_0_array = sentence_0_result.sum(axis=0)
# sentence_1_array = sentence_1_result.sum(axis=0)
# print(sentence_1_array)
# print(sentence_0_array)
# cos_sim = cosine_similarity(sentence_0_array.reshape(1, -1), sentence_1_array.reshape(1, -1))
# word2vecsim_list.append(cos_sim[0][0])
# bertsim_list = sorted(bertsim_list)
# zong_num = len(bertsim_list)
# print(bertsim_list)
# print(bertsim_list[int(zong_num/2)])
# print(sum(bertsim_list)/zong_num)
# bleusim_list = sorted(bleusim_list)
# zong_num = len(bleusim_list)
# print(bleusim_list)
# print(bleusim_list[int(zong_num / 2)])
# print(sum(bleusim_list) / zong_num)
fileName = 'train_yy_1_sim_09.txt'
with open(fileName, 'w', encoding='utf-8') as file:
for i in data_train_text:
file.write(str(i) + '\n')
file.close()

209
data_do/筛选训练数据_str_sim.py

@@ -0,0 +1,209 @@
# -*- coding: utf-8 -*-
"""
@Time : 2023/1/31 19:02
@Author :
@FileName:
@Software:
@Describe:
"""
import os
# os.environ["TF_KERAS"] = "1"
os.environ["CUDA_VISIBLE_DEVICES"] = "0"
import json
import numpy as np
from bert4keras.backend import keras, set_gelu
from bert4keras.tokenizers import Tokenizer, load_vocab
from bert4keras.models import build_transformer_model
from bert4keras.optimizers import Adam, extend_with_piecewise_linear_lr
from bert4keras.snippets import sequence_padding, DataGenerator
from bert4keras.snippets import open
from keras.layers import Lambda, Dense
import tensorflow as tf
from keras.backend import set_session
from sklearn.metrics.pairwise import cosine_similarity
from rouge import Rouge # pip install rouge
from nltk.translate.bleu_score import sentence_bleu, SmoothingFunction
from tqdm import tqdm
import jieba
from gensim.models import KeyedVectors, word2vec, Word2Vec
import random
config = tf.ConfigProto()
config.gpu_options.allow_growth = True
set_session(tf.Session(config=config))  # this part differs (TF1-style session setup)
class Word2vecModel:
def __init__(self):
self.path = "E:\pycharm_workspace\查重分析\word2vec_model\\word2vec_add_new_18.model"
self.model = Word2Vec.load(self.path)
def word2vec_res(self,seg_0_list, seg_1_list):
sentence_0_list = []
sentence_1_list = []
for i in seg_0_list:
a = self.model.wv[i]
sentence_0_list.append(a)
for i in seg_1_list:
a = self.model.wv[i]
sentence_1_list.append(a)
return sentence_0_list, sentence_1_list
class Evaluator(keras.callbacks.Callback):
"""评估与保存
"""
def __init__(self):
self.rouge = Rouge()
self.smooth = SmoothingFunction().method1
self.best_bleu = 0.
# def on_epoch_end(self, epoch, logs=None):
# metrics = self.evaluate(valid_data)  # evaluate the model
# if metrics['bleu'] > self.best_bleu:
# self.best_bleu = metrics['bleu']
# model.save_weights('./best_model.weights')  # save the model
# metrics['best_bleu'] = self.best_bleu
# print('valid_data:', metrics)
def evaluate_t(self, data_1, data_2, topk=1):
total = 0
rouge_1, rouge_2, rouge_l, bleu = 0, 0, 0, 0
scores = self.rouge.get_scores(hyps=[data_1], refs=[data_2])
rouge_1 += scores[0]['rouge-1']['f']
rouge_2 += scores[0]['rouge-2']['f']
rouge_l += scores[0]['rouge-l']['f']
bleu += sentence_bleu(
references=[data_1.split(' ')],
hypothesis=data_2.split(' '),
smoothing_function=self.smooth
)
# rouge_1 /= total
# rouge_2 /= total
# rouge_l /= total
# bleu /= total
return [rouge_1, rouge_2, rouge_l, bleu]
class bertModel:
def __init__(self):
self.config_path = '../chinese_roberta_wwm_ext_L-12_H-768_A-12/bert_config.json'
self.checkpoint_path = '../chinese_roberta_wwm_ext_L-12_H-768_A-12/bert_model.ckpt'
self.dict_path = '../chinese_roberta_wwm_ext_L-12_H-768_A-12/vocab.txt'
self.token_dict, self.keep_tokens = load_vocab(
dict_path=self.dict_path,
simplified=True,
startswith=['[PAD]', '[UNK]', '[CLS]', '[SEP]'],
)
self.tokenizer = Tokenizer(self.token_dict, do_lower_case=True)
self.buildmodel()
def buildmodel(self):
bert = build_transformer_model(
config_path=self.config_path,
checkpoint_path=self.checkpoint_path,
return_keras_model=False,
)
output = Lambda(lambda x: x[:, 0], name='CLS-token')(bert.model.output)
self.model = keras.models.Model(bert.model.input, output)
self.model.summary()
def predict(self,text):
batch_token_ids, batch_segment_ids = [], []
token_ids, segment_ids = self.tokenizer.encode(text, maxlen=256)
batch_token_ids.append(token_ids)
batch_segment_ids.append(segment_ids)
return self.model.predict([batch_token_ids, batch_segment_ids])
def simbert(data_1, data_2):
pass
def word2vec():
pass
def bleu():
pass
if __name__ == '__main__':
file = "../data/train_yy_1.txt"
model = bertModel()
eval_class = Evaluator()
# word2vecmodel = Word2vecModel()
try:
with open(file, 'r', encoding="utf-8") as f:
lines = [x.strip() for x in f if x.strip() != '']
except:
with open(file, 'r', encoding="gbk") as f:
lines = [x.strip() for x in f if x.strip() != '']
bertsim_list = []
bleusim_list = []
word2vecsim_list = []
data_train_text = []
random.shuffle(lines)
print(len(lines))
for txt in tqdm(lines):
text = txt.split('\t')
if len(text) == 3:
data_1 = text[0]
data_2 = text[2]
y1 = model.predict(data_1)[0]
y2 = model.predict(data_2)[0]
cos_sim = cosine_similarity(y1.reshape(1, -1), y2.reshape(1, -1))
bertsim_list.append(cos_sim[0][0])
bertsim_value = cos_sim[0][0]
eval_list = eval_class.evaluate_t(' '.join(data_1), ' '.join(data_2))
bleusim_list.append(eval_list[3])
bleusim_value = eval_list[3]
if bertsim_value <= 0.94 and bleusim_value <= 0.4:
data_train_text.append("\t".join([data_1, "to", data_2]))
# eval_list = eval_class.evaluate_t(' '.join(data_1), ' '.join(data_2))
# bleusim_list.append(eval_list[3])
# word2vec
# seg_0_list = jieba.cut(data_1, cut_all=False)
# seg_1_list = jieba.cut(data_2, cut_all=False)
# seg_0_list = [char for char in seg_0_list]
# seg_1_list = [char for char in seg_1_list]
#
# sentence_0_list, sentence_1_list = word2vecmodel.word2vec_res(seg_0_list, seg_1_list)
# sentence_0_result = np.array(sentence_0_list)
# sentence_1_result = np.array(sentence_1_list)
# sentence_0_array = sentence_0_result.sum(axis=0)
# sentence_1_array = sentence_1_result.sum(axis=0)
# print(sentence_1_array)
# print(sentence_0_array)
# cos_sim = cosine_similarity(sentence_0_array.reshape(1, -1), sentence_1_array.reshape(1, -1))
# word2vecsim_list.append(cos_sim[0][0])
# bertsim_list = sorted(bertsim_list)
# zong_num = len(bertsim_list)
# print(bertsim_list)
# print(bertsim_list[int(zong_num/2)])
# print(sum(bertsim_list)/zong_num)
# bleusim_list = sorted(bleusim_list)
# zong_num = len(bleusim_list)
# print(bleusim_list)
# print(bleusim_list[int(zong_num / 2)])
# print(sum(bleusim_list) / zong_num)
fileName = 'train_yy_1_sim_09.txt'
with open(fileName, 'w', encoding='utf-8') as file:
for i in data_train_text:
file.write(str(i) + '\n')
file.close()

206
data_do/筛选训练数据strsim.py

@@ -0,0 +1,206 @@
# -*- coding: utf-8 -*-
"""
@Time : 2023/1/31 19:02
@Author :
@FileName:
@Software:
@Describe:
"""
import os
# os.environ["TF_KERAS"] = "1"
os.environ["CUDA_VISIBLE_DEVICES"] = "0"
import json
import numpy as np
from bert4keras.backend import keras, set_gelu
from bert4keras.tokenizers import Tokenizer, load_vocab
from bert4keras.models import build_transformer_model
from bert4keras.optimizers import Adam, extend_with_piecewise_linear_lr
from bert4keras.snippets import sequence_padding, DataGenerator
from bert4keras.snippets import open
from keras.layers import Lambda, Dense
import tensorflow as tf
from keras.backend import set_session
from sklearn.metrics.pairwise import cosine_similarity
from rouge import Rouge # pip install rouge
from nltk.translate.bleu_score import sentence_bleu, SmoothingFunction
from tqdm import tqdm
import jieba
from gensim.models import KeyedVectors, word2vec, Word2Vec
import random
import difflib
config = tf.ConfigProto()
config.gpu_options.allow_growth = True
set_session(tf.Session(config=config))  # this part differs (TF1-style session setup)
class Word2vecModel:
def __init__(self):
self.path = "E:\pycharm_workspace\查重分析\word2vec_model\\word2vec_add_new_18.model"
self.model = Word2Vec.load(self.path)
def word2vec_res(self,seg_0_list, seg_1_list):
sentence_0_list = []
sentence_1_list = []
for i in seg_0_list:
a = self.model.wv[i]
sentence_0_list.append(a)
for i in seg_1_list:
a = self.model.wv[i]
sentence_1_list.append(a)
return sentence_0_list, sentence_1_list
class Evaluator(keras.callbacks.Callback):
"""评估与保存
"""
def __init__(self):
self.rouge = Rouge()
self.smooth = SmoothingFunction().method1
self.best_bleu = 0.
# def on_epoch_end(self, epoch, logs=None):
# metrics = self.evaluate(valid_data)  # evaluate the model
# if metrics['bleu'] > self.best_bleu:
# self.best_bleu = metrics['bleu']
# model.save_weights('./best_model.weights')  # save the model
# metrics['best_bleu'] = self.best_bleu
# print('valid_data:', metrics)
def evaluate_t(self, data_1, data_2, topk=1):
total = 0
rouge_1, rouge_2, rouge_l, bleu = 0, 0, 0, 0
scores = self.rouge.get_scores(hyps=[data_1], refs=[data_2])
rouge_1 += scores[0]['rouge-1']['f']
rouge_2 += scores[0]['rouge-2']['f']
rouge_l += scores[0]['rouge-l']['f']
bleu += sentence_bleu(
references=[data_1.split(' ')],
hypothesis=data_2.split(' '),
smoothing_function=self.smooth
)
# rouge_1 /= total
# rouge_2 /= total
# rouge_l /= total
# bleu /= total
return [rouge_1, rouge_2, rouge_l, bleu]
class bertModel:
def __init__(self):
self.config_path = '../chinese_roberta_wwm_ext_L-12_H-768_A-12/bert_config.json'
self.checkpoint_path = '../chinese_roberta_wwm_ext_L-12_H-768_A-12/bert_model.ckpt'
self.dict_path = '../chinese_roberta_wwm_ext_L-12_H-768_A-12/vocab.txt'
self.token_dict, self.keep_tokens = load_vocab(
dict_path=self.dict_path,
simplified=True,
startswith=['[PAD]', '[UNK]', '[CLS]', '[SEP]'],
)
self.tokenizer = Tokenizer(self.token_dict, do_lower_case=True)
self.buildmodel()
def buildmodel(self):
bert = build_transformer_model(
config_path=self.config_path,
checkpoint_path=self.checkpoint_path,
return_keras_model=False,
)
output = Lambda(lambda x: x[:, 0], name='CLS-token')(bert.model.output)
self.model = keras.models.Model(bert.model.input, output)
self.model.summary()
def predict(self,text):
batch_token_ids, batch_segment_ids = [], []
token_ids, segment_ids = self.tokenizer.encode(text, maxlen=256)
batch_token_ids.append(token_ids)
batch_segment_ids.append(segment_ids)
return self.model.predict([batch_token_ids, batch_segment_ids])
def simbert(data_1, data_2):
pass
def word2vec():
pass
def bleu():
pass
if __name__ == '__main__':
file = "../data/train_yy_1.txt"
model = bertModel()
eval_class = Evaluator()
# word2vecmodel = Word2vecModel()
try:
with open(file, 'r', encoding="utf-8") as f:
lines = [x.strip() for x in f if x.strip() != '']
except:
with open(file, 'r', encoding="gbk") as f:
lines = [x.strip() for x in f if x.strip() != '']
bertsim_list = []
bleusim_list = []
word2vecsim_list = []
data_train_text = []
random.shuffle(lines)
print(len(lines))
for txt in tqdm(lines):
text = txt.split('\t')
if len(text) == 3:
data_1 = text[0]
data_2 = text[2]
str_sim_value = difflib.SequenceMatcher(None, data_1, data_2).quick_ratio()
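# If the rewrite is shorter than the original but still more than 80% of its length, the score is
# remapped to 1 - str_sim * (1 - len ratio), which always stays above 0.8, so such pairs never pass
# the < 0.70 filter below.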
if len(data_2) - len(data_1) < 0 and len(data_2) / len(data_1) > 0.8:
num_yu = 1 - len(data_2) / len(data_1)
str_sim_value = 1 - str_sim_value * num_yu
if str_sim_value < 0.70:
data_train_text.append("\t".join([data_1, "to", data_2]))
# eval_list = eval_class.evaluate_t(' '.join(data_1), ' '.join(data_2))
# bleusim_list.append(eval_list[3])
# word2vec
# seg_0_list = jieba.cut(data_1, cut_all=False)
# seg_1_list = jieba.cut(data_2, cut_all=False)
# seg_0_list = [char for char in seg_0_list]
# seg_1_list = [char for char in seg_1_list]
#
# sentence_0_list, sentence_1_list = word2vecmodel.word2vec_res(seg_0_list, seg_1_list)
# sentence_0_result = np.array(sentence_0_list)
# sentence_1_result = np.array(sentence_1_list)
# sentence_0_array = sentence_0_result.sum(axis=0)
# sentence_1_array = sentence_1_result.sum(axis=0)
# print(sentence_1_array)
# print(sentence_0_array)
# cos_sim = cosine_similarity(sentence_0_array.reshape(1, -1), sentence_1_array.reshape(1, -1))
# word2vecsim_list.append(cos_sim[0][0])
# bertsim_list = sorted(bertsim_list)
# zong_num = len(bertsim_list)
# print(bertsim_list)
# print(bertsim_list[int(zong_num/2)])
# print(sum(bertsim_list)/zong_num)
# bleusim_list = sorted(bleusim_list)
# zong_num = len(bleusim_list)
# print(bleusim_list)
# print(bleusim_list[int(zong_num / 2)])
# print(sum(bleusim_list) / zong_num)
print(len(data_train_text))
fileName = 'train_yy_1_sim_10.txt'
with open(fileName, 'w', encoding='utf-8') as file:
for i in data_train_text:
file.write(str(i) + '\n')
file.close()

83
data_do/统计数据的_str_sim_值.py

@@ -0,0 +1,83 @@
# -*- coding: utf-8 -*-
"""
@Time : 2023/1/31 19:02
@Author :
@FileName:
@Software:
@Describe:
"""
import os
# os.environ["TF_KERAS"] = "1"
os.environ["CUDA_VISIBLE_DEVICES"] = "0"
import json
import numpy as np
from bert4keras.backend import keras, set_gelu
from bert4keras.tokenizers import Tokenizer, load_vocab
from bert4keras.models import build_transformer_model
from bert4keras.optimizers import Adam, extend_with_piecewise_linear_lr
from bert4keras.snippets import sequence_padding, DataGenerator
from bert4keras.snippets import open
from keras.layers import Lambda, Dense
import tensorflow as tf
from keras.backend import set_session
from sklearn.metrics.pairwise import cosine_similarity
from rouge import Rouge # pip install rouge
from nltk.translate.bleu_score import sentence_bleu, SmoothingFunction
from tqdm import tqdm
import jieba
from gensim.models import KeyedVectors, word2vec, Word2Vec
import random
import difflib
config = tf.ConfigProto()
config.gpu_options.allow_growth = True
set_session(tf.Session(config=config))  # this part differs (TF1-style session setup)
if __name__ == '__main__':
file = "../data/train_yy.txt"
# word2vecmodel = Word2vecModel()
try:
with open(file, 'r', encoding="utf-8") as f:
lines = [x.strip() for x in f if x.strip() != '']
except:
with open(file, 'r', encoding="gbk") as f:
lines = [x.strip() for x in f if x.strip() != '']
bertsim_list = []
bleusim_list = []
strsim_list = []
word2vecsim_list = []
data_train_text = []
random.shuffle(lines)
print(len(lines))
for txt in tqdm(lines):
text = txt.split('\t')
if len(text) == 3:
data_1 = text[0]
data_2 = text[2]
# difflib.SequenceMatcher(None, one_data[0], one_data_simbert[j]).quick_ratio()
str_sim_value = difflib.SequenceMatcher(None, data_1, data_2).quick_ratio()
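# When the rewrite is shorter than the original, remap the score:
# str_sim -> 1 - str_sim * (1 - len(data_2)/len(data_1)), then record it in the distribution.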
if len(data_2) - len(data_1) < 0:
num_yu = 1- len(data_2) / len(data_1)
str_sim_value = 1 - str_sim_value * num_yu
strsim_list.append(str_sim_value)
strsim_list = sorted(strsim_list)
zong_num = len(strsim_list)
print(strsim_list)
print(strsim_list[int(zong_num / 2)])
print(sum(strsim_list) / zong_num)
fileName = '统计数据的strsim值.txt'
with open(fileName, 'w', encoding='utf-8') as file:
for i in strsim_list:
file.write(str(i) + '\n')
file.close()

276853
data_do/统计数据的strsim值.txt

File diff suppressed because it is too large

40
data_do/统计非中文字符.py

@@ -0,0 +1,40 @@
# -*- coding: utf-8 -*-
"""
@Time : 2022/12/20 16:20
@Author :
@FileName:
@Software:
@Describe:
"""
import pandas as pd
from tqdm import tqdm
import json
path = "../data/论文_yy_小说.xlsx"
df_list = pd.read_excel(path).values.tolist()
fuhao = {}
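# count every character outside the CJK range \u4e00-\u9fff across all cells,
# then dump the sorted counts to ../data/fuhao.json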
for i in tqdm(df_list):
for word in i:
word = str(word)
if word == "nan":
continue
for ch in word:
if u'\u4e00' <= ch <= u'\u9fff':
continue
else:
if ch in fuhao:
fuhao[ch] += 1
else:
fuhao[ch] = 1
test_1 = sorted(fuhao.items(),key=lambda x:x[1],reverse=True)
fuhao_new = {}
for i in test_1:
fuhao_new[i[0]] = i[1]
json_data = json.dumps(fuhao_new, ensure_ascii=False, indent=2)
with open('../data/fuhao.json', 'w', encoding="utf-8") as f_six:
f_six.write(json_data)

77
data_do/读取docx.py

@@ -0,0 +1,77 @@
# -*- coding: utf-8 -*-
"""
@Time : 2023/2/14 14:29
@Author :
@FileName:
@Software:
@Describe:
"""
# from zipfile import ZipFile
# from bs4 import BeautifulSoup
#
# document=ZipFile("../data/11篇/13139551_于丰源_在线考试系统-原文.docx")
# xml=document.read("word/document.xml")
# wordObj=BeautifulSoup(xml.decode("utf-8"))
# texts=wordObj.findAll("w:t")
# for text in texts:
# print(text.text)
import docx
import win32com.client as wc
import operator
# save .doc files as .docx
# path = "E:\pycharm_workspace\drop_weight_rewrite\data\\11篇\\13138572_李菊_谈小学语文教学难点与创新-原文.doc"
# path_new = "E:\pycharm_workspace\drop_weight_rewrite\data\\11篇\\13138572_李菊_谈小学语文教学难点与创新-原文.docx"
# word = wc.Dispatch("Word.Application")
# doc = word.Documents.Open(path)
# # 12 means the output is a .docx file
# doc.SaveAs(path_new, 12)
# doc.Close
# word.Quit
#
# # read the converted .docx
#
# file = docx.Document(path_new)
# for p in file.paragraphs:
# print(p.text)
# from win32com import client as wc
# w = wc.Dispatch('Word.Application')
# # or use the following to start a separate Word process:
# # w = wc.DispatchEx('Word.Application')
# doc=w.Documents.Open(path)
# doc.SaveAs(path_new,16)  # the 16 argument is required, otherwise it errors
import os
from win32com import client as wc
def save_doc_to_docx(rawpath): # convert .doc files to .docx
'''
:param rawpath: folder path used for both input and output
:return: None
'''
word = wc.Dispatch("Word.Application")
# relative paths do not work here; use an absolute path
# directory that holds the files to be converted
filenamelist = os.listdir(rawpath)
for i in os.listdir(rawpath):
# pick files that end in .doc and do not start with ~$ (~$ marks Word temp files)
if i.endswith('.doc') and not i.startswith('~$'):
print(i)
# try
# open the file
doc = word.Documents.Open(rawpath + i)
# # split the file name from its extension
rename = os.path.splitext(i)
# save the file as .docx
doc.SaveAs(pathls + rename[0] + '.docx', 12) # 12 means docx format
doc.Close()
word.Quit()
if __name__ == '__main__':
pathls = "E:\\pycharm_workspace\\drop_weight_rewrite\\data\\11篇\\"
save_doc_to_docx(pathls)

43
data_do/进一步处理降重数据.py

@@ -0,0 +1,43 @@
# -*- coding: utf-8 -*-
"""
@Time : 2022/12/20 17:56
@Author :
@FileName:
@Software:
@Describe:
"""
import pandas as pd
from tqdm import tqdm
import json
path = "../data/论文_yy_小说_1.xlsx"
df_list = pd.read_excel(path).values.tolist()
def sentence_do(source,result):
source = str(source)
result = str(result)
if source == "nan" or result == "nan":
return False, source,result
if len(source) > 120 or len(result) > 120:
return False, source,result
else:
source = str(source).replace("\t", "").replace(" ", "").replace("‚", "")
result = str(result).replace("\t", "").replace(" ", "").replace("‚", "")
return True, source, result
df_list_new = []
for i in df_list:
source = i[0]
result = i[1]
ok, source, result = sentence_do(source, result)
if ok == False:
continue
else:
df_list_new.append([source,result])
df = pd.DataFrame(df_list_new, columns=["原文","yy降重"])
df.to_excel("../data/论文_yy_小说_3.xlsx",index=None)

55
优化点/优化点.txt

@ -0,0 +1,55 @@
Plan 1
1. Handle person-name replacement
2. Use a translation model to generate candidate sentences, then filter them manually
3. Train on the filtered sentences
Training option 1
Standard training setup
Training option 2
Train in the simbert style
Translation-model plan
1. Register a Baidu account (for its translation API)
2. https://www.oschina.net/news/170812/meta-open-source-wmt-21
3. Train a t5 model
Rewriting corpora
1. Existing corpus: 580k pairs
2. The lcqmc corpus
Patterns observed in 写作猫
1.
It is almost certainly model-based: the same short clause comes out as a completely different sentence depending on whether or not something is appended to it.
伴随着风硕哀嚎的声音,白策笑嘻嘻的离开了。 --> 在冯硕的惨叫声中,白策嘿嘿一笑,转身离去。
伴随着风硕哀嚎的声音 --> 伴随着风硕的惨叫声
2.
Person names get special handling
3. (Tentative: the few samples tried came out roughly as expected, so this is not certain)
Mostly it merges sentences, and the preceding or following clause usually modifies the central clause
Examples where the rewrite produces more short clauses
对方无声哭泣着再次说了一遍。 --> 对方带着哭腔,又重复了一遍。
足足等到叶澜歌换好了衣服 --> 一直到叶澜歌穿好衣服,这才停了下来。
Example where short clauses are merged
林天宇瞬间就察觉到问题,对着她,轻声问道,"你是不是察觉到什么?" --> 林天宇立刻意识到了不对劲,小声的问了一句,"有没有发现?"
Plan:
1. Possibly replace adjectives first (using word2vec), then run zh -> en -> zh round-trip translation (which also tends to fill in pronouns); see the sketch below
张三尖叫的喊道
张三尖叫的嚷嚷道 --> 张三大吼一声。 --> 张三咆哮起来。 --> 张三大吼一声。
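
A minimal sketch of the round-trip translation step (assumption: translate_fn is a placeholder for whatever translation client ends up being used, e.g. the Baidu API above; the helper below is illustrative and not part of this repo):

# -*- coding: utf-8 -*-
def back_translate(sentence, translate_fn):
    # translate_fn(text, src, dst) is an assumed placeholder, not a real client in this repo
    english = translate_fn(sentence, src="zh", dst="en")  # zh -> en
    return translate_fn(english, src="en", dst="zh")      # en -> zh, ideally a paraphrase of the input

# usage once a real translation client is plugged in:
# print(back_translate("张三尖叫的喊道", my_translate))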

0
改进方案/改进方案.txt

70
测试range_1.py

@@ -0,0 +1,70 @@
# -*- coding: utf-8 -*-
"""
@Time : 2023/2/3 17:27
@Author :
@FileName:
@Software:
@Describe:
"""
import os
# os.environ["TF_KERAS"] = "1"
os.environ["CUDA_VISIBLE_DEVICES"] = "0"
from bert4keras.backend import keras, set_gelu
import numpy as np
from rouge import Rouge # pip install rouge
from nltk.translate.bleu_score import sentence_bleu, SmoothingFunction
class Evaluator(keras.callbacks.Callback):
"""评估与保存
"""
def __init__(self):
self.rouge = Rouge()
self.smooth = SmoothingFunction().method1
self.best_bleu = 0.
# def on_epoch_end(self, epoch, logs=None):
# metrics = self.evaluate(valid_data)  # evaluate the model
# if metrics['bleu'] > self.best_bleu:
# self.best_bleu = metrics['bleu']
# model.save_weights('./best_model.weights')  # save the model
# metrics['best_bleu'] = self.best_bleu
# print('valid_data:', metrics)
def evaluate_t(self, data_1, data_2, topk=1):
total = 0
rouge_1, rouge_2, rouge_l, bleu = 0, 0, 0, 0
scores = self.rouge.get_scores(hyps=[data_1], refs=[data_2])
rouge_1 += scores[0]['rouge-1']['f']
rouge_2 += scores[0]['rouge-2']['f']
rouge_l += scores[0]['rouge-l']['f']
bleu += sentence_bleu(
references=[data_1.split(' ')],
hypothesis=data_2.split(' '),
smoothing_function=self.smooth
)
# rouge_1 /= total
# rouge_2 /= total
# rouge_l /= total
# bleu /= total
return [rouge_1, rouge_2, rouge_l, bleu]
eval_class = Evaluator()
data_1 = "上海中心大厦"
data_2 = "上海"
eval_list = eval_class.evaluate_t(' '.join(data_1), ' '.join(data_2))
print(eval_list)
# scratch notes on a possible length penalty (length ratio / exp of the length difference)
a = len(data_2) - len(data_1)
if a < 0:
a = len(data_2) / len(data_1)
np.exp(len(data_2) - len(data_1))