
First commit

master · majiahui@haimaqingfan.com · 2 years ago · parent commit da6a11019c
15 changed files (changed lines in parentheses):
  1. .gitignore (4)
  2. README.md (0)
  3. bert_cls.py (100)
  4. faiss_test.py (34)
  5. main.py (16)
  6. range_sim.py (58)
  7. src/basemodel.py (65)
  8. txt_to_csv.py (38)
  9. vec_to_numpy.py (61)
  10. word2vec_sim.py (28)
  11. 处理全文对照.py (34)
  12. 文本处理.py (42)
  13. 查重方案.txt (0)
  14. 读取docx.py (46)
  15. 读取pdf.py (48)

4
.gitignore

@@ -5,3 +5,7 @@
/train_model_67/
/roberta_model6/
/.idea/
/ceshiyuxian.py
/ceshi_xiangliang.py
/ceshi.py
/range_sim_ceshi.py

0
README.md

100
bert_cls.py

@@ -0,0 +1,100 @@
# -*- coding: utf-8 -*-
"""
@Time : 2023/3/9 18:36
@Author :
@FileName:
@Software:
@Describe:
"""
#! -*- coding: utf-8 -*-
# Leftover header from a bert4keras example: Chinese NER with CRF
# Dataset: http://s3.bmio.net/kashgari/china-people-daily-ner-corpus.tar.gz
# Reported F1: 96.18% on the validation set, 95.35% on the test set
import os
os.environ["CUDA_DEVICE_ORDER"] = "PCI_BUS_ID"
os.environ["CUDA_VISIBLE_DEVICES"] = "0"
import tensorflow as tf
import numpy as np
import pandas as pd
from numpy.linalg import norm
from src.basemodel import ClassifyModel

# a = [[1, 3, 2], [2, 2, 1]]
# print(cosine_similarity(a))


def cos_sim(a, b):
    """Cosine similarity between two 1-D vectors."""
    A = np.array(a)
    B = np.array(b)
    cosine = np.dot(A, B) / (norm(A) * norm(B))
    return cosine


if __name__ == '__main__':
    maxlen = 512
    batch_size = 32
    # BERT configuration
    config_path = 'chinese_roberta_wwm_ext_L-12_H-768_A-12/bert_config.json'
    checkpoint_path = 'chinese_roberta_wwm_ext_L-12_H-768_A-12/bert_model.ckpt'
    dict_path = 'chinese_roberta_wwm_ext_L-12_H-768_A-12/vocab.txt'
    lable_vec_path = "data/10235513_大型商业建筑人员疏散设计研究_沈福禹/save_x.npy"
    b = np.load(lable_vec_path)
    df_train_nuoche = pd.read_csv("data/10235513_大型商业建筑人员疏散设计研究_沈福禹/查重.csv", encoding="utf-8").values.tolist()
    classifymodel = ClassifyModel(config_path, checkpoint_path, dict_path, is_train=False, load_weights_path=None)
    # ------------------------------------------------------------------
    # Interactive variant, kept for reference:
    # while True:
    #     text = input("Enter a query: ")
    #     data = classifymodel.data_generator([text], batch_size)
    #     token, segment = data[0][0], data[1][0]
    #     content_cls = classifymodel.predict(token, segment)
    #     content_cls = content_cls.reshape(-1)
    #     print(content_cls.shape)
    #     index_list = []
    #     for vec in b:
    #         cos_value = cos_sim(content_cls, vec)
    #         index_list.append(cos_value)
    #     re1 = sorted(enumerate(index_list), key=lambda x: x[1], reverse=True)
    #     for i in range(0, 10):
    #         print(re1[i])
    #         print(df_train_nuoche[re1[i][0]])
    # ------------------------------------------------------------------
    path_txt = "data/10235513_大型商业建筑人员疏散设计研究_沈福禹/大型商业建筑人员疏散设计研究.txt"
    path_excel = "data/10235513_大型商业建筑人员疏散设计研究_沈福禹/大型商业建筑人员疏散设计研究_2.xlsx"
    with open(path_txt, encoding="utf-8") as f:
        centent = f.read()
    data_zong = []
    centent_list = centent.split("\n")
    for text in centent_list:
        if text[:5] == "*****":  # skip separator lines
            continue
        dan_data = [text]
        data = classifymodel.data_generator([text], batch_size)
        token, segment = data[0][0], data[1][0]
        content_cls = classifymodel.predict(token, segment)
        content_cls = content_cls.reshape(-1)
        # rank every stored sentence vector by cosine similarity to the query
        index_list = []
        for vec in b:
            cos_value = cos_sim(content_cls, vec)
            index_list.append(cos_value)
        re1 = sorted(enumerate(index_list), key=lambda x: x[1], reverse=True)
        # keep the 10 best matches: score, matched sentence, source file
        for i in range(0, 10):
            dan_data.append(re1[i][1])
            dan_data.append(df_train_nuoche[re1[i][0]][0])
            filename = df_train_nuoche[re1[i][0]][1].split("\\")[-1]
            dan_data.append(filename)
        data_zong.append(dan_data)
    pd.DataFrame(data_zong).to_excel(path_excel, index=None)
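Note: the loop above computes one cosine per stored vector in pure Python. A minimal vectorized sketch of the same top-10 lookup in plain numpy (the function name is hypothetical, not part of the commit):

# Vectorized top-k cosine lookup; `b` is the (N, 768) matrix of stored
# vectors, `query` the 768-dim CLS vector of one sentence.
import numpy as np
from numpy.linalg import norm

def top_k_cosine(b, query, k=10):
    # cosine of the query against every row of b in one matrix product
    scores = b @ query / (norm(b, axis=1) * norm(query))
    top = np.argsort(-scores)[:k]  # indices of the k highest scores
    return [(int(i), float(scores[i])) for i in top]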

34
faiss_test.py

@@ -0,0 +1,34 @@
# -*- coding: utf-8 -*-
"""
@Time : 2023/3/7 14:31
@Author :
@FileName:
@Software:
@Describe:
"""
import numpy as np
import faiss

d = 768       # vector dimensionality
nb = 1000000  # number of vectors in the index
nq = 5        # number of query vectors
np.random.seed(1234)
xb = np.random.random((nb, d)).astype('float32')
xb[:, 0] += np.arange(nb) / 1000.  # vectors to be indexed
xq = np.random.random((nq, d)).astype('float32')
xq[:, 0] += np.arange(nq) / 1000.
print("0", xb)
print("1", xq)

index = faiss.IndexFlatL2(d)
print(index.is_trained)  # True: a flat index needs no training, vectors can be added directly
index.add(xb)            # add the vector library to the index
print(index.ntotal)

k = 4  # the K in top-K
D, I = index.search(xq, k)  # I holds each query's top-K neighbor indices, D the corresponding distances
print(D)
print(I)
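IndexFlatL2 ranks by Euclidean distance; if cosine similarity is wanted instead (as in bert_cls.py), the usual faiss pattern is to L2-normalize the vectors and use an inner-product index. A sketch under that assumption, reusing xb, xq, and d from above:

# Cosine search with faiss: after L2 normalization, maximum inner
# product equals maximum cosine similarity.
xb_n = xb.copy()
xq_n = xq.copy()
faiss.normalize_L2(xb_n)         # in-place L2 normalization
faiss.normalize_L2(xq_n)
index_ip = faiss.IndexFlatIP(d)  # inner-product index
index_ip.add(xb_n)
D, I = index_ip.search(xq_n, 4)  # D now holds cosine similarities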

16
main.py

@@ -0,0 +1,16 @@
# This is a sample Python script.
# Press Shift+F10 to execute it or replace it with your code.
# Press Double Shift to search everywhere for classes, files, tool windows, actions, and settings.


def print_hi(name):
    # Use a breakpoint in the code line below to debug your script.
    print(f'Hi, {name}')  # Press Ctrl+F8 to toggle the breakpoint.


# Press the green button in the gutter to run the script.
if __name__ == '__main__':
    print_hi('PyCharm')

# See PyCharm help at https://www.jetbrains.com/help/pycharm/

58
range_sim.py

@@ -0,0 +1,58 @@
# -*- coding: utf-8 -*-
"""
@Time : 2023/3/14 17:52
@Author :
@FileName:
@Software:
@Describe:
"""
from rouge import Rouge
import pandas as pd
from tqdm import tqdm

rouge = Rouge()


def rouge_value(data_1, data_2):
    # space-join the characters so ROUGE treats each Chinese character as a token
    data_1 = ' '.join(data_1)
    data_2 = ' '.join(data_2)
    scores = rouge.get_scores(hyps=[data_1], refs=[data_2])
    rouge_1 = scores[0]['rouge-1']['f']
    rouge_2 = scores[0]['rouge-2']['f']
    rouge_l = scores[0]['rouge-l']['f']
    # rouge_w = scores[0]['rouge-w']['f']
    # rouge_s = scores[0]['rouge-s']['f']
    return rouge_1, rouge_2, rouge_l


if __name__ == '__main__':
    df_train_nuoche = pd.read_csv("data/10235513_大型商业建筑人员疏散设计研究_沈福禹/查重.csv", encoding="utf-8").values.tolist()
    path_txt = "data/10235513_大型商业建筑人员疏散设计研究_沈福禹/大型商业建筑人员疏散设计研究.txt"
    path_excel = "data/10235513_大型商业建筑人员疏散设计研究_沈福禹/大型商业建筑人员疏散设计研究_2_rouge.xlsx"
    with open(path_txt, encoding="utf-8") as f:
        centent = f.read()
    data_zong = []
    centent_list = centent.split("\n")
    for text in tqdm(centent_list):
        dan_data = [text]
        index_list = []
        if text[:5] == "*****":  # skip separator lines
            continue
        # score the sentence against every stored sentence by ROUGE-L
        for data_dan in df_train_nuoche:
            rouge_1, rouge_2, rouge_l = rouge_value(text, data_dan[0])
            index_list.append(rouge_l)
        re1 = sorted(enumerate(index_list), key=lambda x: x[1], reverse=True)
        # keep the 10 best matches: score, matched sentence, source file
        for i in range(0, 10):
            dan_data.append(re1[i][1])
            dan_data.append(df_train_nuoche[re1[i][0]][0])
            filename = df_train_nuoche[re1[i][0]][1].split("\\")[-1]
            dan_data.append(filename)
        data_zong.append(dan_data)
    pd.DataFrame(data_zong).to_excel(path_excel, index=None)
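Because rouge_value space-joins its inputs character by character, the scores here are character-level ROUGE, which is what makes the English-oriented rouge package usable on Chinese text. A quick check of that behavior (example strings are illustrative only):

h, r = "商业建筑疏散", "建筑疏散设计"
print(' '.join(h))        # 商 业 建 筑 疏 散
print(rouge_value(h, r))  # character-level (rouge-1, rouge-2, rouge-l) F1 scores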

65
src/basemodel.py

@@ -0,0 +1,65 @@
# -*- coding: utf-8 -*-
"""
@Time : 2023/3/13 10:15
@Author :
@FileName:
@Software:
@Describe:
"""
from bert4keras.backend import keras, K
from bert4keras.models import build_transformer_model
from bert4keras.tokenizers import Tokenizer
from bert4keras.snippets import sequence_padding
from keras.layers import *


class ClassifyModel:
    def __init__(self, config_path, checkpoint_path, dict_path, is_train, load_weights_path=None):
        self.config_path = config_path
        self.checkpoint_path = checkpoint_path
        self.dict_path = dict_path
        self.is_train = is_train  # was hard-coded to True, which silently ignored the argument
        self.load_weights_path = load_weights_path
        self.model = self.create_model(self.is_train, self.load_weights_path)
        self.tokenizer = Tokenizer(self.dict_path, do_lower_case=True)
        self.maxlen = 256

    def create_model(self, is_train, load_weights_path):
        bert = build_transformer_model(
            config_path=self.config_path,
            checkpoint_path=self.checkpoint_path,
            return_keras_model=False,
        )
        # take the [CLS] position as the sentence vector
        output = Lambda(lambda x: x[:, 0], name='CLS-token')(bert.model.output)
        model = keras.models.Model(bert.model.input, output)
        if not is_train and load_weights_path:
            # only load fine-tuned weights when a path is actually given
            model.load_weights(load_weights_path)
        return model

    def predict(self, token_ids, segment_ids):
        return self.model.predict([token_ids, segment_ids])

    def data_generator(self, texts, batch_size):
        """Tokenize texts and pack them into padded batches."""
        batch_token_ids = []
        batch_segment_ids = []
        batch_dan_token_ids = []
        batch_dan_segment_ids = []
        for id_, text in enumerate(texts):
            token_ids, segment_ids = self.tokenizer.encode(text, maxlen=self.maxlen)
            batch_dan_token_ids.append(token_ids)
            batch_dan_segment_ids.append(segment_ids)
            # flush a full batch, or the final partial batch
            if len(batch_dan_token_ids) == batch_size or id_ == len(texts) - 1:
                batch_dan_token_ids = sequence_padding(batch_dan_token_ids)
                batch_dan_segment_ids = sequence_padding(batch_dan_segment_ids)
                batch_token_ids.append(batch_dan_token_ids)
                batch_segment_ids.append(batch_dan_segment_ids)
                batch_dan_token_ids, batch_dan_segment_ids = [], []
        return batch_token_ids, batch_segment_ids
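A minimal usage sketch of ClassifyModel as the other scripts call it (the checkpoint paths are the ones assumed throughout this commit, and the example sentences are illustrative):

# Encode a few sentences into 768-dim CLS vectors.
from src.basemodel import ClassifyModel

model = ClassifyModel(
    'chinese_roberta_wwm_ext_L-12_H-768_A-12/bert_config.json',
    'chinese_roberta_wwm_ext_L-12_H-768_A-12/bert_model.ckpt',
    'chinese_roberta_wwm_ext_L-12_H-768_A-12/vocab.txt',
    is_train=False, load_weights_path=None)
tokens, segments = model.data_generator(["大型商业建筑", "人员疏散设计"], batch_size=32)
vecs = model.predict(tokens[0], segments[0])  # shape (2, 768): one CLS vector per text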

38
txt_to_csv.py

@@ -0,0 +1,38 @@
# -*- coding: utf-8 -*-
"""
@Time : 2023/3/13 10:38
@Author :
@FileName:
@Software:
@Describe:
"""
import os
import pandas as pd

file = './data/10235513_大型商业建筑人员疏散设计研究_沈福禹/查重txt_new'
file_csv = './data/10235513_大型商业建筑人员疏散设计研究_沈福禹/查重.csv'
path_list = []
data = []
for root, dirs, files in os.walk(file):
    for file_name in files:  # renamed from `file`, which shadowed the directory variable
        path = os.path.join(root, file_name)
        path_list.append(path)
print(path_list)

for path in path_list:
    with open(path, encoding="gbk") as f:
        text = f.read()
    # the body text follows the last "@@@@@@@@@@" separator
    text_list = text.split("@@@@@@@@@@")
    text_zhengwen = text_list[-1]
    text_zhengwen_list = text_zhengwen.split("\n")
    for sentence in text_zhengwen_list:
        if sentence != "":
            data.append([sentence, path])
pd.DataFrame(data, columns=["sentence", "path"]).to_csv(file_csv, index=None)

61
vec_to_numpy.py

@@ -0,0 +1,61 @@
# -*- coding: utf-8 -*-
"""
@Time : 2023/3/10 18:53
@Author :
@FileName:
@Software:
@Describe:
"""
import os
os.environ["CUDA_DEVICE_ORDER"] = "PCI_BUS_ID"
os.environ["CUDA_VISIBLE_DEVICES"] = "0"
# import pickle
# import redis
# from redis import ConnectionPool
# app = Flask(__name__)
import numpy as np
import pandas as pd
from src.basemodel import ClassifyModel

if __name__ == '__main__':
    maxlen = 256
    batch_size = 32
    # BERT configuration
    config_path = 'chinese_roberta_wwm_ext_L-12_H-768_A-12/bert_config.json'
    checkpoint_path = 'chinese_roberta_wwm_ext_L-12_H-768_A-12/bert_model.ckpt'
    dict_path = 'chinese_roberta_wwm_ext_L-12_H-768_A-12/vocab.txt'
    texts = ["我们有个好朋友"] * 34
    print(texts)
    classifymodel = ClassifyModel(config_path, checkpoint_path, dict_path, is_train=False, load_weights_path=None)
    # data = classifymodel.data_generator(texts, batch_size)
    # for token, segment in zip(data[0], data[1]):
    #     print(classifymodel.predict(token, segment).shape)
    df_train_nuoche = pd.read_csv("data/10235513_大型商业建筑人员疏散设计研究_沈福禹/查重.csv", encoding="utf-8")
    Data = []
    for data_dan in df_train_nuoche.values.tolist():
        Data.append(data_dan[0])
    print(Data[0])
    print(len(Data))
    data = classifymodel.data_generator(Data, batch_size)
    print(len(data[0][-1]))
    # print(type(train_generator))
    # d = next(train_generator)
    # print(d)
    # accumulate one (batch, 768) block of CLS vectors per batch
    a1 = np.empty((0, 768), dtype="float32")  # was dtype=int; the embeddings are floats
    for token, segment in zip(data[0], data[1]):
        a2 = classifymodel.predict(token, segment)
        a1 = np.concatenate([a1, a2])
        print(a1.shape)
    np.save('data/10235513_大型商业建筑人员疏散设计研究_沈福禹/save_x', a1)

28
word2vec_sim.py

@@ -0,0 +1,28 @@
# -*- coding: utf-8 -*-
"""
@Time : 2023/3/14 19:01
@Author :
@FileName:
@Software:
@Describe:
"""
from gensim.models.word2vec import LineSentence
import numpy as np
from tqdm import tqdm

path = "word2vec_model/word2vec.txt"


def iter_word(word, txt_path):
    """Fetch one word's vector by scanning the word2vec text file line by line."""
    vec = 0
    iter1 = LineSentence(open(txt_path, 'r', encoding='utf-8'))
    for i, v in tqdm(enumerate(iter1)):
        if i == 0:  # skip the header line (vocabulary size and dimension)
            continue
        if word == v[0]:  # was `v[:1]`, a one-element list that can never equal a string
            vec = np.array([float(j) for j in v[1:]])
            break
    return vec


word = "公共"
print(iter_word(word, path))
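Scanning the text file is O(vocabulary) per lookup; gensim can load the same word2vec text format once and answer lookups from memory. A sketch, assuming word2vec.txt is in the standard text format with a header line:

# One-time load of the word2vec text file, then constant-time lookups.
from gensim.models import KeyedVectors

kv = KeyedVectors.load_word2vec_format("word2vec_model/word2vec.txt", binary=False)
print(kv["公共"])                     # the stored vector for the word
print(kv.similarity("公共", "公用"))  # cosine similarity, if both words are in the vocabulary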

34
处理全文对照.py

@@ -0,0 +1,34 @@
# -*- coding: utf-8 -*-
"""
@Time : 2023/3/15 11:39
@Author :
@FileName:
@Software:
@Describe:
"""
import pandas as pd
import difflib

path_txt = "data/10235513_大型商业建筑人员疏散设计研究_沈福禹/大型商业建筑人员疏散设计研究.txt"
path_csv = "data/10235513_大型商业建筑人员疏散设计研究_沈福禹/大型商业建筑人员疏散设计研究_全文.csv"
path_csv_sim = "data/10235513_大型商业建筑人员疏散设计研究_沈福禹/大型商业建筑人员疏散设计研究_全文对照.csv"
with open(path_txt, encoding="utf-8") as f:
    centent = f.read()

data = []
centent_text_list = centent.split("\n")
centent_csv_list = pd.read_csv(path_csv).values.tolist()
for dan_yuan in centent_csv_list:
    str_sim_text = "##"  # placeholder kept when no near-duplicate line is found
    for dan_lable in centent_text_list:
        str_sim_value = difflib.SequenceMatcher(None, dan_yuan[0], dan_lable).quick_ratio()
        if str_sim_value >= 0.95:
            str_sim_text = dan_lable
            break
    data.append([dan_yuan[0], str_sim_text])
pd.DataFrame(data).to_csv(path_csv_sim, index=None)
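quick_ratio() is an upper bound on ratio(), so the 0.95 test above can accept pairs the exact score would reject; difflib's documented pattern is to filter with the cheap bound and confirm with ratio(). A sketch of that refinement (the helper name is hypothetical):

# Cheap-first matching: quick_ratio() never underestimates ratio(),
# so use it as a filter and confirm candidates with the exact score.
import difflib

def is_near_duplicate(a, b, threshold=0.95):
    sm = difflib.SequenceMatcher(None, a, b)
    return sm.quick_ratio() >= threshold and sm.ratio() >= threshold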

42
文本处理.py

@@ -0,0 +1,42 @@
# -*- coding: utf-8 -*-
"""
@Time : 2023/3/10 17:45
@Author :
@FileName:
@Software:
@Describe:
"""
import os

file = './data/10235513_大型商业建筑人员疏散设计研究_沈福禹/查重txt'
path_list = []
for root, dirs, files in os.walk(file):
    for file_name in files:  # renamed from `file`, which shadowed the directory variable
        path = os.path.join(root, file_name)
        path_list.append(path)

for path in path_list:
    with open(path, encoding="utf-8") as f:
        text = f.read()
    path_dan_list = path.split("\\")
    root_path = path_dan_list[0]
    file_path = path_dan_list[1]
    root_pathdan_list = root_path.split("/")
    root_pathdan_list = root_pathdan_list[:-1]
    print(root_pathdan_list)
    text_list = text.split("@@@@@@@@@@")
    text_zhengwen = text_list[-1]
    text_list = [i.lstrip("\n") for i in text_list[:-1]]
    print(text_list)
    # normalize the body text to one sentence per line
    # ("。" is assumed here; the character was dropped in the rendered diff)
    text_zhengwen = text_zhengwen.strip().replace("\n", "").replace(" ", "").replace("。", "。\n")
    text_list = text_list + [text_zhengwen]
    text_str = "@@@@@@@@@@".join(text_list)
    path_new = "/".join(root_pathdan_list + ["查重txt_new", file_path])
    with open(path_new, "w") as f:
        f.write(text_str)

0
查重方案.txt

46
读取docx.py

@@ -0,0 +1,46 @@
# -*- coding: utf-8 -*-
"""
@Time : 2023/3/15 10:38
@Author :
@FileName:
@Software:
@Describe:
"""
import os
import docx
import pandas as pd


def read_docx(rawpath):
    """Read a .docx file and split its text into sentences."""
    data = []
    data_new = []
    document = docx.Document(rawpath)
    # collect the text of every paragraph
    all_paragraphs = document.paragraphs
    for paragraph in all_paragraphs:
        data.append(paragraph.text)
    # earlier per-paragraph variant, kept for reference:
    # for data_dan in data:
    #     if data_dan == "":
    #         continue
    #     else:
    #         data_list = str(data_dan).split("。")
    #         for data_dan_short in data_list:
    #             if data_dan_short == "":
    #                 continue
    #             data_new.append(data_dan_short)
    data = [dan for dan in data if dan != ""]
    data = "".join(data)
    # split on the sentence-final period and re-append it
    # ("。" is assumed here; the character was dropped in the rendered diff)
    data_list = str(data).split("。")
    data_new = [dan + "。" for dan in data_list if dan != ""]
    return data_new


if __name__ == '__main__':
    pathls = r"E:\pycharm_workspace\duplicate_check\data\10235513_大型商业建筑人员疏散设计研究_沈福禹\10235513_沈福禹_大型商业建筑人员疏散设计研究.docx"
    path_csv = "data/10235513_大型商业建筑人员疏散设计研究_沈福禹/大型商业建筑人员疏散设计研究_全文.csv"
    data = read_docx(pathls)
    data = [[i] for i in data]
    pd.DataFrame(data).to_csv(path_csv, index=None)

48
读取pdf.py

@@ -0,0 +1,48 @@
# -*- coding: utf-8 -*-
"""
@Time : 2023/3/9 15:34
@Author :
@FileName:
@Software:
@Describe:
"""
import pdfplumber
import pandas as pd

path = "./data/新建文件夹/13977991/全文对照.pdf"

# with pdfplumber.open(path) as pdf:
#     first_page = pdf.pages[0]
#     # extract_text() returns one string; its line breaks follow the PDF's
#     # visual line breaks, not the actual paragraphs
#     print(first_page.extract_text())
#     # get all tables on the page; extract_table() returns a single table
#     for table in first_page.extract_tables():
#         # each table is a nested list; a DataFrame is easier to inspect
#         df = pd.DataFrame(table[1:], columns=table[0])
#         print(df)

with pdfplumber.open(path) as pdf:
    content = ''
    for i in range(len(pdf.pages)):
        # read page i+1 of the document
        page = pdf.pages[i]
        # extract_text() reads the page text; dropping the last line removes
        # the page number printed at the bottom of each page
        page_content = '\n'.join(page.extract_text().split('\n')[:-1])
        content = content + page_content
    print(content)

with pdfplumber.open(path) as pdf:
    first_page = pdf.pages[3]
    tables = first_page.extract_tables()
    for table in tables:
        df = pd.DataFrame(table)
        # to use the first row as the header:
        # df = pd.DataFrame(table[1:], columns=table[0])
        print(df)