diff --git a/.gitignore b/.gitignore
index de2f6b5..a83f57a 100644
--- a/.gitignore
+++ b/.gitignore
@@ -5,3 +5,7 @@
 /train_model_67/
 /roberta_model6/
 /.idea/
+/ceshiyuxian.py
+/ceshi_xiangliang.py
+/ceshi.py
+/range_sim_ceshi.py
diff --git a/README.md b/README.md
new file mode 100644
index 0000000..e69de29
diff --git a/bert_cls.py b/bert_cls.py
new file mode 100644
index 0000000..b8359ee
--- /dev/null
+++ b/bert_cls.py
@@ -0,0 +1,100 @@
+# -*- coding: utf-8 -*-
+
+"""
+@Time : 2023/3/9 18:36
+@Author :
+@FileName:
+@Software:
+@Describe: Sentence-level duplicate checking: encode each thesis sentence as a
+           RoBERTa CLS vector and rank the reference corpus by cosine similarity.
+"""
+import os
+os.environ["CUDA_DEVICE_ORDER"] = "PCI_BUS_ID"
+os.environ["CUDA_VISIBLE_DEVICES"] = "0"
+
+import numpy as np
+from numpy.linalg import norm
+import pandas as pd
+
+from src.basemodel import ClassifyModel
+
+
+def cos_sim(a, b):
+    """Cosine similarity between two 1-D vectors."""
+    A = np.array(a)
+    B = np.array(b)
+    return np.dot(A, B) / (norm(A) * norm(B))
+
+
+if __name__ == '__main__':
+    maxlen = 512
+    batch_size = 32
+    # BERT configuration
+    config_path = 'chinese_roberta_wwm_ext_L-12_H-768_A-12/bert_config.json'
+    checkpoint_path = 'chinese_roberta_wwm_ext_L-12_H-768_A-12/bert_model.ckpt'
+    dict_path = 'chinese_roberta_wwm_ext_L-12_H-768_A-12/vocab.txt'
+
+    # Pre-computed CLS vectors of the reference corpus and the matching sentences.
+    lable_vec_path = "data/10235513_大型商业建筑人员疏散设计研究_沈福禹/save_x.npy"
+    b = np.load(lable_vec_path)
+    df_train_nuoche = pd.read_csv("data/10235513_大型商业建筑人员疏散设计研究_沈福禹/查重.csv", encoding="utf-8").values.tolist()
+
+    classifymodel = ClassifyModel(config_path, checkpoint_path, dict_path, is_train=False, load_weights_path=None)
+
+    # Interactive variant, kept for reference:
+    # while True:
+    #     text = input("Enter a sentence: ")
+    #     data = classifymodel.data_generator([text], batch_size)
+    #     token, segment = data[0][0], data[1][0]
+    #     content_cls = classifymodel.predict(token, segment).reshape(-1)
+    #     index_list = [cos_sim(content_cls, vec) for vec in b]
+    #     re1 = sorted(enumerate(index_list), key=lambda x: x[1], reverse=True)
+    #     for i in range(10):
+    #         print(re1[i])
+    #         print(df_train_nuoche[re1[i][0]])
+
+    path_txt = "data/10235513_大型商业建筑人员疏散设计研究_沈福禹/大型商业建筑人员疏散设计研究.txt"
+    path_excel = "data/10235513_大型商业建筑人员疏散设计研究_沈福禹/大型商业建筑人员疏散设计研究_2.xlsx"
+    with open(path_txt, encoding="utf-8") as f:
+        centent = f.read()
+
+    data_zong = []
+    centent_list = centent.split("\n")
+    for text in centent_list:
+        if text[:5] == "*****":  # skip separator lines
+            continue
+        dan_data = [text]
+        data = classifymodel.data_generator([text], batch_size)
+        token, segment = data[0][0], data[1][0]
+        content_cls = classifymodel.predict(token, segment)
+        content_cls = content_cls.reshape(-1)
+
+        # Cosine similarity against every corpus vector, ranked descending.
+        index_list = []
+        for vec in b:
+            index_list.append(cos_sim(content_cls, vec))
+
+        re1 = sorted(enumerate(index_list), key=lambda x: x[1], reverse=True)
+
+        # Keep the top 10 matches: score, matched sentence, source file name.
+        for i in range(0, 10):
+            dan_data.append(re1[i][1])
+            dan_data.append(df_train_nuoche[re1[i][0]][0])
+            filename = df_train_nuoche[re1[i][0]][1].split("\\")[-1]
+            dan_data.append(filename)
+        data_zong.append(dan_data)
+
+    pd.DataFrame(data_zong).to_excel(path_excel, index=None)
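Note: bert_cls.py scores the corpus with a Python-level loop over `cos_sim`. A minimal vectorized sketch, assuming `b` is the `(N, 768)` array loaded from `save_x.npy` and `content_cls` the flattened query vector; it computes all N cosines in one matrix product:

```python
import numpy as np

def top_k_cosine(query_vec, corpus_mat, k=10):
    # Normalize rows once; cosine similarity then reduces to a dot product.
    corpus_norm = corpus_mat / np.linalg.norm(corpus_mat, axis=1, keepdims=True)
    query_norm = query_vec / np.linalg.norm(query_vec)
    scores = corpus_norm @ query_norm            # shape (N,)
    top_idx = np.argsort(-scores)[:k]            # indices of the k best matches
    return [(int(i), float(scores[i])) for i in top_idx]

# re1 = top_k_cosine(content_cls, b)  # hypothetical drop-in for the cos_sim loop
```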
diff --git a/faiss_test.py b/faiss_test.py
new file mode 100644
index 0000000..3e459db
--- /dev/null
+++ b/faiss_test.py
@@ -0,0 +1,34 @@
+# -*- coding: utf-8 -*-
+
+"""
+@Time : 2023/3/7 14:31
+@Author :
+@FileName:
+@Software:
+@Describe: Minimal faiss smoke test: build a flat L2 index and query it.
+"""
+import faiss
+import numpy as np
+
+d = 768                       # vector dimensionality
+nb = 1000000                  # number of vectors in the index
+nq = 5                        # number of query vectors
+np.random.seed(1234)
+xb = np.random.random((nb, d)).astype('float32')
+xb[:, 0] += np.arange(nb) / 1000.   # vectors to index
+xq = np.random.random((nq, d)).astype('float32')
+xq[:, 0] += np.arange(nq) / 1000.
+
+print("0", xb)
+print("1", xq)
+
+index = faiss.IndexFlatL2(d)
+print(index.is_trained)       # True: a flat index needs no training, only add()
+index.add(xb)                 # add the corpus vectors to the index
+print(index.ntotal)
+
+k = 4                         # the K in top-K
+# I holds, per query, the indices of the top-K nearest vectors; D the distances.
+D, I = index.search(xq, k)
+
+print(D)
+print(I)
\ No newline at end of file
diff --git a/main.py b/main.py
new file mode 100644
index 0000000..5596b44
--- /dev/null
+++ b/main.py
@@ -0,0 +1,16 @@
+# This is a sample Python script.
+
+# Press Shift+F10 to execute it or replace it with your code.
+# Press Double Shift to search everywhere for classes, files, tool windows, actions, and settings.
+
+
+def print_hi(name):
+    # Use a breakpoint in the code line below to debug your script.
+    print(f'Hi, {name}')  # Press Ctrl+F8 to toggle the breakpoint.
+
+
+# Press the green button in the gutter to run the script.
+if __name__ == '__main__':
+    print_hi('PyCharm')
+
+# See PyCharm help at https://www.jetbrains.com/help/pycharm/
diff --git a/range_sim.py b/range_sim.py
new file mode 100644
index 0000000..abe67d7
--- /dev/null
+++ b/range_sim.py
@@ -0,0 +1,58 @@
+# -*- coding: utf-8 -*-
+
+"""
+@Time : 2023/3/14 17:52
+@Author :
+@FileName:
+@Software:
+@Describe: Rank the reference corpus against each thesis sentence with ROUGE.
+"""
+from rouge import Rouge
+import pandas as pd
+from tqdm import tqdm
+
+rouge = Rouge()
+
+
+def rouge_value(data_1, data_2):
+    # Space-separate the characters so ROUGE treats each Chinese character as a token.
+    data_1 = ' '.join(data_1)
+    data_2 = ' '.join(data_2)
+    scores = rouge.get_scores(hyps=[data_1], refs=[data_2])
+    rouge_1 = scores[0]['rouge-1']['f']
+    rouge_2 = scores[0]['rouge-2']['f']
+    rouge_l = scores[0]['rouge-l']['f']
+    return rouge_1, rouge_2, rouge_l
+
+
+if __name__ == '__main__':
+    df_train_nuoche = pd.read_csv("data/10235513_大型商业建筑人员疏散设计研究_沈福禹/查重.csv", encoding="utf-8").values.tolist()
+
+    path_txt = "data/10235513_大型商业建筑人员疏散设计研究_沈福禹/大型商业建筑人员疏散设计研究.txt"
+    path_excel = "data/10235513_大型商业建筑人员疏散设计研究_沈福禹/大型商业建筑人员疏散设计研究_2_rouge.xlsx"
+    with open(path_txt, encoding="utf-8") as f:
+        centent = f.read()
+
+    data_zong = []
+    centent_list = centent.split("\n")
+    for text in tqdm(centent_list):
+        if text[:5] == "*****":  # skip separator lines
+            continue
+        dan_data = [text]
+
+        # Score every corpus sentence by ROUGE-L against the current sentence.
+        index_list = []
+        for data_dan in df_train_nuoche:
+            rouge_1, rouge_2, rouge_l = rouge_value(text, data_dan[0])
+            index_list.append(rouge_l)
+
+        re1 = sorted(enumerate(index_list), key=lambda x: x[1], reverse=True)
+
+        # Keep the top 10 matches: score, matched sentence, source file name.
+        for i in range(0, 10):
+            dan_data.append(re1[i][1])
+            dan_data.append(df_train_nuoche[re1[i][0]][0])
+            filename = df_train_nuoche[re1[i][0]][1].split("\\")[-1]
+            dan_data.append(filename)
+        data_zong.append(dan_data)
+    pd.DataFrame(data_zong).to_excel(path_excel, index=None)
\ No newline at end of file
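faiss_test.py only exercises random data. A sketch of how the same index type could serve the duplicate checker, assuming the `(N, 768)` CLS vectors saved by vec_to_numpy.py; with L2-normalized rows, `IndexFlatIP` makes inner product equal cosine similarity:

```python
import faiss
import numpy as np

xb = np.load("data/10235513_大型商业建筑人员疏散设计研究_沈福禹/save_x.npy").astype("float32")
faiss.normalize_L2(xb)                    # in-place row normalization
index = faiss.IndexFlatIP(xb.shape[1])    # exact inner-product (cosine) search
index.add(xb)

# In practice xq would be CLS vectors from ClassifyModel.predict; a random
# stand-in keeps the sketch self-contained.
xq = np.random.random((5, xb.shape[1])).astype("float32")
faiss.normalize_L2(xq)
D, I = index.search(xq, 10)               # top-10 cosine scores and corpus indices
```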
+@Time : 2023/3/13 10:15
+@Author :
+@FileName:
+@Software:
+@Describe: Thin wrapper around a bert4keras RoBERTa encoder that exposes the
+           CLS vector of each input sentence.
+"""
+from bert4keras.backend import keras
+from bert4keras.models import build_transformer_model
+from bert4keras.tokenizers import Tokenizer
+from bert4keras.snippets import sequence_padding
+from keras.layers import Lambda
+
+
+class ClassifyModel:
+    def __init__(self, config_path, checkpoint_path, dict_path, is_train, load_weights_path=None):
+        self.config_path = config_path
+        self.checkpoint_path = checkpoint_path
+        self.dict_path = dict_path
+        self.is_train = is_train  # was hardcoded to True, ignoring the argument
+        self.load_weights_path = load_weights_path
+        self.maxlen = 256
+        self.model = self.create_model(self.is_train, self.load_weights_path)
+        self.tokenizer = Tokenizer(self.dict_path, do_lower_case=True)
+
+    def create_model(self, is_train, load_weights_path):
+        bert = build_transformer_model(
+            config_path=self.config_path,
+            checkpoint_path=self.checkpoint_path,
+            return_keras_model=False,
+        )
+        # Take the [CLS] position of the last hidden layer as the sentence vector.
+        output = Lambda(lambda x: x[:, 0], name='CLS-token')(bert.model.output)
+        model = keras.models.Model(bert.model.input, output)
+        # Only load fine-tuned weights when a path is actually given;
+        # load_weights(None) would raise.
+        if not is_train and load_weights_path:
+            model.load_weights(load_weights_path)
+        return model
+
+    def predict(self, token_ids, segment_ids):
+        return self.model.predict([token_ids, segment_ids])
+
+    def data_generator(self, texts, batch_size):
+        """Tokenize texts and group them into padded batches.
+
+        Returns two parallel lists: padded token-id batches and padded
+        segment-id batches.
+        """
+        batch_token_ids = []
+        batch_segment_ids = []
+        batch_dan_token_ids = []
+        batch_dan_segment_ids = []
+        for id_, text in enumerate(texts):
+            token_ids, segment_ids = self.tokenizer.encode(text, maxlen=self.maxlen)
+            batch_dan_token_ids.append(token_ids)
+            batch_dan_segment_ids.append(segment_ids)
+            # Flush a full batch, or the final partial batch.
+            if len(batch_dan_token_ids) == batch_size or id_ == len(texts) - 1:
+                batch_token_ids.append(sequence_padding(batch_dan_token_ids))
+                batch_segment_ids.append(sequence_padding(batch_dan_segment_ids))
+                batch_dan_token_ids, batch_dan_segment_ids = [], []
+        return batch_token_ids, batch_segment_ids
\ No newline at end of file
diff --git a/txt_to_csv.py b/txt_to_csv.py
new file mode 100644
index 0000000..641dd21
--- /dev/null
+++ b/txt_to_csv.py
@@ -0,0 +1,38 @@
+# -*- coding: utf-8 -*-
+
+"""
+@Time : 2023/3/13 10:38
+@Author :
+@FileName:
+@Software:
+@Describe: Split every reference txt into sentences and collect them into one CSV.
+"""
+import os
+
+import pandas as pd
+
+file = './data/10235513_大型商业建筑人员疏散设计研究_沈福禹/查重txt_new'
+file_csv = './data/10235513_大型商业建筑人员疏散设计研究_沈福禹/查重.csv'
+
+path_list = []
+data = []
+
+# Collect the paths of all reference txt files.
+for root, dirs, files in os.walk(file):
+    for fname in files:
+        path_list.append(os.path.join(root, fname))
+
+print(path_list)
+for path in path_list:
+    with open(path, encoding="gbk") as f:
+        text = f.read()
+
+    # The body text follows the last "@@@@@@@@@@" separator; one sentence per line.
+    text_list = text.split("@@@@@@@@@@")
+    text_zhengwen = text_list[-1]
+    text_zhengwen_list = text_zhengwen.split("\n")
+    for sentence in text_zhengwen_list:
+        if sentence != "":
+            data.append([sentence, path])
+
+pd.DataFrame(data, columns=["sentence", "path"]).to_csv(file_csv, index=None)
\ No newline at end of file
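For reference, a minimal usage sketch of the `ClassifyModel` API above, with two made-up example sentences; `data_generator` returns parallel lists of padded batches:

```python
from src.basemodel import ClassifyModel

model = ClassifyModel(
    'chinese_roberta_wwm_ext_L-12_H-768_A-12/bert_config.json',
    'chinese_roberta_wwm_ext_L-12_H-768_A-12/bert_model.ckpt',
    'chinese_roberta_wwm_ext_L-12_H-768_A-12/vocab.txt',
    is_train=False,
)
tokens, segments = model.data_generator(["第一句。", "第二句。"], batch_size=32)
vecs = model.predict(tokens[0], segments[0])   # (2, 768) CLS vectors
```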
+os.environ["CUDA_VISIBLE_DEVICES"] = "0" +# import pickle +# import redis +# from redis import ConnectionPool +# app = Flask(__name__) +import numpy as np +import pandas as pd + +import json +from keras.layers import * +from tqdm import tqdm +import time +from src.basemodel import ClassifyModel + + +if __name__ == '__main__': + maxlen = 256 + batch_size = 32 + # bert配置 + config_path = 'chinese_roberta_wwm_ext_L-12_H-768_A-12/bert_config.json' + checkpoint_path = 'chinese_roberta_wwm_ext_L-12_H-768_A-12/bert_model.ckpt' + dict_path = 'chinese_roberta_wwm_ext_L-12_H-768_A-12/vocab.txt' + + texts = ["我们有个好朋友"] * 34 + print(texts) + classifymodel = ClassifyModel(config_path, checkpoint_path, dict_path, is_train=False, load_weights_path=None) + # data = classifymodel.data_generator(texts, batch_size) + # for token, segment in zip(data[0],data[1]): + # print(classifymodel.predict(token, segment).shape) + + df_train_nuoche = pd.read_csv("data/10235513_大型商业建筑人员疏散设计研究_沈福禹/查重.csv",encoding="utf-8") + Data = [] + for data_dan in df_train_nuoche.values.tolist(): + Data.append(data_dan[0]) + print(Data[0]) + print(len(Data)) + + data = classifymodel.data_generator(Data, batch_size) + + print(len(data[0][-1])) + # print(type(train_generator)) + # d = next(train_generator) + # print(d) + a1 = np.empty((0, 768), dtype=int) + for token, segment in zip(data[0],data[1]): + a2 = classifymodel.predict(token, segment) + a1 = np.concatenate([a1, a2]) + + print(a1.shape) + np.save('data/10235513_大型商业建筑人员疏散设计研究_沈福禹/save_x', a1) \ No newline at end of file diff --git a/word2vec_sim.py b/word2vec_sim.py new file mode 100644 index 0000000..6e8488d --- /dev/null +++ b/word2vec_sim.py @@ -0,0 +1,28 @@ +# -*- coding: utf-8 -*- + +""" +@Time : 2023/3/14 19:01 +@Author : +@FileName: +@Software: +@Describe: +""" +from gensim.models.word2vec import LineSentence +import numpy as np +from tqdm import tqdm + +path = "word2vec_model/word2vec.txt" +def iter_word(word, txt_path): + """迭代器方法获取词向量""" + vec = 0 + iter1 = LineSentence(open(txt_path, 'r', encoding='utf-8')) + for i,v in tqdm(enumerate(iter1)): + if i == 0: + continue + if word == v[:1]: + vec = np.array([float(j) for j in v[1:]]) + break + return vec + +word = "公共" +print(iter_word(word,path)) \ No newline at end of file diff --git a/处理全文对照.py b/处理全文对照.py new file mode 100644 index 0000000..61a9bcb --- /dev/null +++ b/处理全文对照.py @@ -0,0 +1,34 @@ +# -*- coding: utf-8 -*- + +""" +@Time : 2023/3/15 11:39 +@Author : +@FileName: +@Software: +@Describe: +""" +import pandas as pd +import difflib + + + +path_txt = "data/10235513_大型商业建筑人员疏散设计研究_沈福禹/大型商业建筑人员疏散设计研究.txt" +path_csv = "data/10235513_大型商业建筑人员疏散设计研究_沈福禹/大型商业建筑人员疏散设计研究_全文.csv" +path_csv_sim = "data/10235513_大型商业建筑人员疏散设计研究_沈福禹/大型商业建筑人员疏散设计研究_全文对照.csv" +f = open(path_txt, encoding="utf-8") +centent = f.read() +f.close() + +data = [] +centent_text_list = centent.split("\n") +centent_csv_list = pd.read_csv(path_csv).values.tolist() +for dan_yuan in centent_csv_list: + str_sim_text = "##" + for dan_lable in centent_text_list: + str_sim_value = difflib.SequenceMatcher(None, dan_yuan[0], dan_lable).quick_ratio() + if str_sim_value >= 0.95: + str_sim_text = dan_lable + break + data.append([dan_yuan[0], str_sim_text]) + +pd.DataFrame(data).to_csv(path_csv_sim,index=None) \ No newline at end of file diff --git a/文本处理.py b/文本处理.py new file mode 100644 index 0000000..9f62cd8 --- /dev/null +++ b/文本处理.py @@ -0,0 +1,42 @@ +# -*- coding: utf-8 -*- + +""" +@Time : 2023/3/10 17:45 +@Author : +@FileName: +@Software: +@Describe: +""" 
+import os
+
+file = './data/10235513_大型商业建筑人员疏散设计研究_沈福禹/查重txt'
+
+path_list = []
+
+for root, dirs, files in os.walk(file):
+    for fname in files:
+        path_list.append(os.path.join(root, fname))
+
+for path in path_list:
+    with open(path, encoding="utf-8") as f:
+        text = f.read()
+    # Split the Windows path into directory and file name.
+    path_dan_list = path.split("\\")
+    root_path = path_dan_list[0]
+    file_path = path_dan_list[1]
+    root_pathdan_list = root_path.split("/")[:-1]
+    print(root_pathdan_list)
+
+    # The body text follows the last "@@@@@@@@@@" separator.
+    text_list = text.split("@@@@@@@@@@")
+    text_zhengwen = text_list[-1]
+    text_list = [i.lstrip("\n") for i in text_list[:-1]]
+    print(text_list)
+    # Collapse the body into one string, then put one sentence per line.
+    text_zhengwen = text_zhengwen.strip().replace("\n", "").replace(" ", "").replace("。", "。\n")
+    text_list = text_list + [text_zhengwen]
+    text_str = "@@@@@@@@@@".join(text_list)
+    path_new = "/".join(root_pathdan_list + ["查重txt_new", file_path])
+    with open(path_new, "w", encoding="utf-8") as f:
+        f.write(text_str)
diff --git a/查重方案.txt b/查重方案.txt
new file mode 100644
index 0000000..e69de29
diff --git a/读取docx.py b/读取docx.py
new file mode 100644
index 0000000..9b6b1ac
--- /dev/null
+++ b/读取docx.py
@@ -0,0 +1,46 @@
+# -*- coding: utf-8 -*-
+
+"""
+@Time : 2023/3/15 10:38
+@Author :
+@FileName:
+@Software:
+@Describe: Read a .docx thesis and split the full text into sentences.
+"""
+import docx
+import pandas as pd
+
+
+def read_docx(rawpath):
+    """Extract all paragraph text from a .docx file and split it on 。 into sentences."""
+    data = []
+    document = docx.Document(rawpath)
+
+    # Collect the text of every paragraph.
+    for paragraph in document.paragraphs:
+        data.append(paragraph.text)
+
+    data = [dan for dan in data if dan != ""]
+    data = "".join(data)
+    data_list = str(data).split("。")
+    data_new = [dan + "。" for dan in data_list if dan != ""]
+    return data_new
+
+
+if __name__ == '__main__':
+    pathls = r"E:\pycharm_workspace\duplicate_check\data\10235513_大型商业建筑人员疏散设计研究_沈福禹\10235513_沈福禹_大型商业建筑人员疏散设计研究.docx"
+    path_csv = "data/10235513_大型商业建筑人员疏散设计研究_沈福禹/大型商业建筑人员疏散设计研究_全文.csv"
+    data = read_docx(pathls)
+    data = [[i] for i in data]
+    pd.DataFrame(data).to_csv(path_csv, index=None)
diff --git a/读取pdf.py b/读取pdf.py
new file mode 100644
index 0000000..ef7e0da
--- /dev/null
+++ b/读取pdf.py
@@ -0,0 +1,48 @@
+# -*- coding: utf-8 -*-
+
+"""
+@Time : 2023/3/9 15:34
+@Author :
+@FileName:
+@Software:
+@Describe: Extract the text and one page of tables from the full-text
+           comparison PDF with pdfplumber.
+"""
+import pdfplumber
+import pandas as pd
+
+path = "./data/新建文件夹/13977991/全文对照.pdf"
+
+# Concatenate the text of every page. extract_text() returns the page text with
+# line breaks matching the PDF layout; the last line (the page number) is dropped.
+with pdfplumber.open(path) as pdf:
+    content = ''
+    for i in range(len(pdf.pages)):
+        page = pdf.pages[i]
+        page_content = '\n'.join(page.extract_text().split('\n')[:-1])
+        content = content + page_content
+
+print(content)
+
+# Extract the tables on page 4; note only the last table survives the loop.
+with pdfplumber.open(path) as pdf:
+    first_page = pdf.pages[3]
+    tables = first_page.extract_tables()
+    for table in tables:
+        df = pd.DataFrame(table)
+        # To use the first row as the header:
+        # df = pd.DataFrame(table[1:], columns=table[0])
+
+print(df)
\ No newline at end of file
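读取pdf.py keeps only the last table of page 4. A sketch that instead collects every table on every page into a list of DataFrames, using the first row of each table as its header:

```python
import pdfplumber
import pandas as pd

path = "./data/新建文件夹/13977991/全文对照.pdf"
frames = []
with pdfplumber.open(path) as pdf:
    for page in pdf.pages:
        for table in page.extract_tables():
            # First row becomes the header, remaining rows the body.
            frames.append(pd.DataFrame(table[1:], columns=table[0]))
print(len(frames))
```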