From fcb93c6326558659620b5074f3ecb18e0c7264b6 Mon Sep 17 00:00:00 2001
From: "majiahui@haimaqingfan.com"
Date: Wed, 17 Apr 2024 15:25:33 +0800
Subject: [PATCH] Initial commit: implement reference lookup for journal data
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

---
 generate_references_api.py | 188 ++++++++++++++++++++++++++++++++++++++++++++
 拉去数据.py                 |  33 ++++++++
 数据合并.py                 |  66 ++++++++++++++++
 数据生成ndarray.py          |  80 +++++++++++++++++++
 期刊数据整理.py              |  66 ++++++++++++++++
 测试向量匹配.py              |  53 +++++++++++++
 生成lable.py                |  23 ++++++
 统计学科二级分类.py           |  31 ++++++++
 训练faiss.py                |  34 ++++++++
 9 files changed, 574 insertions(+)
 create mode 100644 generate_references_api.py
 create mode 100644 拉去数据.py
 create mode 100644 数据合并.py
 create mode 100644 数据生成ndarray.py
 create mode 100644 期刊数据整理.py
 create mode 100644 测试向量匹配.py
 create mode 100644 生成lable.py
 create mode 100644 统计学科二级分类.py
 create mode 100644 训练faiss.py

diff --git a/generate_references_api.py b/generate_references_api.py
new file mode 100644
index 0000000..8b2b9ef
--- /dev/null
+++ b/generate_references_api.py
@@ -0,0 +1,188 @@
+import os
+os.environ["CUDA_VISIBLE_DEVICES"] = "0"
+from flask import Flask, jsonify
+from flask import request
+import numpy as np
+import faiss
+import json
+import requests
+import socket
+from sentence_transformers import SentenceTransformer
+
+
+with open("data/lable/id2lable.json", encoding="utf-8") as f:
+    id2lable = json.loads(f.read())
+
+with open("data/lable/lable2id.json", encoding="utf-8") as f:
+    lable2id = json.loads(f.read())
+
+with open("data/discipline_types.json", encoding="utf-8") as f:
+    lable_discipline_types = json.loads(f.read())
+
+
+app = Flask(__name__)
+app.config["JSON_AS_ASCII"] = False
+
+d = 768  # embedding dimension
+model = SentenceTransformer('Dmeta-embedding-zh')
+
+def get_host_ip():
+    """
+    Look up the local machine's IP address.
+    :return: ip
+    """
+    try:
+        s = socket.socket(socket.AF_INET, socket.SOCK_DGRAM)
+        s.connect(('8.8.8.8', 80))
+        ip = s.getsockname()[0]
+    finally:
+        s.close()
+
+    return ip
+
+# url = "http://{}:50003/roformer".format(str(get_host_ip()))
+url = "http://{}:50003/roformer".format("192.168.31.149")
+
+def dialog_line_parse(url, text):
+    """
+    Post the payload to the classification model and return its output.
+    :param url: model service url
+    :param text: request payload
+    :return: model response
+    """
+
+    response = requests.post(
+        url,
+        json=text,
+        timeout=1000
+    )
+    if response.status_code == 200:
+        return response.json()
+    else:
+        # logger.error(
+        #     "【{}】 Failed to get a proper response from remote "
+        #     "server. Status Code: {}. Response: {}"
+        #     "".format(url, response.status_code, response.text)
+        # )
+        print("【{}】 Failed to get a proper response from remote "
+              "server. Status Code: {}. Response: {}"
+              "".format(url, response.status_code, response.text))
+        print(text)
+        return []
+
+def ulit_recall_paper(reference_list):
+    '''
+    Format the recalled papers into citation strings.
+    :param reference_list: rows of [author, title, category, source, year, abstract, ...]
+    :return data: list of citation strings
+    '''
+
+    # data = []
+    # for path in recall_data_list_path:
+    #     filename = path.split("/")[-1]
+    #     with open(path, encoding="gbk") as f:
+    #         text = f.read()
+    #     text_list = text.split("\n")
+    #     for sentence in text_list:
+    #         if sentence != "":
+    #             data.append([sentence, filename])
+    # return data
+
+    # recall_data_list
+    # fields: author, title, category, source, year, abstract
+    # e.g. "[1]赵璐.基于旅游资源开发下的新农村景观营建研究[D].西安建筑科技大学,2014."
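+    # Note: the sample above is a dissertation-style ([D]) entry; the loop
+    # below assembles journal-style ([J]) strings instead, roughly
+    # "author1,author2.title[J].journal,year." from each row's fields.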
+    data = []
+    for data_one in reference_list:
+        paper = ".".join([
+            ",".join([i for i in data_one[0].split(";") if i != ""]),
+            data_one[1] + "[J]",
+            ",".join([
+                data_one[3], str(data_one[4]) + "."
+            ])
+        ])
+
+        data.append(paper)
+
+    return data
+
+
+def main(title, abstract, nums):
+    data = {
+        "title": title,
+        "abst_zh": abstract,
+        "content": ""
+    }
+    # expected response shape:
+    # {
+    #     "label_num": [
+    #         117,
+    #         143
+    #     ]
+    # }
+    result = dialog_line_parse(url, data)
+
+    # print(result['label_num'][0])
+    # print(id2lable[result['label_num'][0]])
+    subject_pinyin = lable_discipline_types[id2lable[str(result['label_num'][0])]]
+
+    # with open(f"data/prompt/{subject_pinyin}.npy") as f:
+    #     zidonghua = np.load('data/prompt/{subject_pinyin}.npy')
+
+    data_subject = np.load(f"data/prompt_qikan/{subject_pinyin}.npy")
+
+    index = faiss.read_index(f'data/prompt_qikan_ivf/{subject_pinyin}.ivf')
+
+    with open(f"data/data_info_qikan/{subject_pinyin}.json", encoding="utf-8") as f:
+        data_info = json.loads(f.read())
+
+    index.add(data_subject)
+    # index.nprobe = 2  # default nprobe is 1, try a few more
+    k = nums
+    prompt = "标题:“{}”,摘要:“{}”".format(title, abstract)
+    embs = model.encode([prompt], normalize_embeddings=True)
+
+    D, I = index.search(embs, int(k))
+    print(I)
+
+    reference_list = []
+    abstract_list = []
+    for i in I[0]:
+        reference_list.append(data_info[i])
+        abstract_list.append(data_info[i][5])
+
+    return "200", ulit_recall_paper(reference_list), abstract_list
+
+
+@app.route("/", methods=["POST"])
+def handle_query():
+    # try:
+    title = request.form.get("title")
+    abstract = request.form.get('abstract')
+    nums = request.form.get('nums')
+
+    # content = ulit_request_file(file)
+
+    status_code, reference, abstract_list = main(title, abstract, nums)
+
+    if status_code == "400":
+        return_text = {"result": "", "probabilities": None, "status_code": 400}
+    else:
+        reference_list = reference
+        print(reference_list)
+        reference = [f"[{str(i+1)}]" + reference_list[i] for i in range(len(reference_list))]
+        if status_code == "200":
+            return_text = {
+                "result": {
+                    "reference": reference,
+                    "abstract": abstract_list
+                },
+                "probabilities": None,
+                "status_code": 200
+            }
+        else:
+            return_text = {"result": "", "probabilities": None, "status_code": 400}
+    # except:
+    #     return_text = {"result": "", "probabilities": None, "status_code": 400}
+    return jsonify(return_text)  # return the result
+
+if __name__ == "__main__":
+    app.run(host="0.0.0.0", port=17001, threaded=True)
\ No newline at end of file
diff --git a/拉去数据.py b/拉去数据.py
new file mode 100644
index 0000000..b6dde0f
--- /dev/null
+++ b/拉去数据.py
@@ -0,0 +1,33 @@
+import pymysql
+import json
+
+# Open the database connection
+connection = pymysql.connect(
+    host='rm-bp11ky2z5f34d2949fo.mysql.rds.aliyuncs.com',
+    user='fabiao_r',
+    password='f5u1w8nfb3b',
+    database='fabiao',
+    cursorclass=pymysql.cursors.DictCursor  # return rows as dicts for easier handling
+)
+
+try:
+    with connection.cursor() as cursor:
+        # Run the query
+        sql = "SELECT * FROM spider_latest_journal_paper_list"
+        cursor.execute(sql)
+
+        # Fetch the results
+        result = cursor.fetchall()
+        print(len(result))
+
+        # Process the results
+
+        # for row in result:
+        #     print(row)
+
+        with open("data/doctor_2018_2021.json", "w", encoding="utf-8") as f:
+            f.write(json.dumps(result, indent=2, ensure_ascii=False))
+
+finally:
+    # Close the connection
+    connection.close()
\ No newline at end of file
diff --git a/数据合并.py b/数据合并.py
new file mode 100644
index 0000000..47c181c
--- /dev/null
+++ b/数据合并.py
@@ -0,0 +1,66 @@
+import json
+
+# json.load()
+
+# with open("t_xuewei_cnki_spider.csv", encoding="utf-8") as f:
+#     a = f.read()
+#     print(a)
+
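+# Read the spider's CSV export in chunks and map positional columns to named
+# fields. (The column positions used below, 1 title, 2 author, 3 journal,
+# 4 year, 7 topic, 10 abstract, are assumed to match the export order.)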
+import pandas as pd
+
+filename = 'data/spider_latest_journal_paper_list.csv'
+chunksize = 10000  # rows to read per chunk; adjust as needed
+
+df_list = []
+# Iterate over the CSV file with the chunksize parameter
+for chunk in pd.read_csv(filename, chunksize=chunksize):
+    # fields: author, title, category, source, year, abstract
+
+    # process each chunk
+
+    # print(chunk.columns)
+    # 9 / 0
+    df_list_dan = chunk.values.tolist()
+    # print(df_list[0])
+    for i in range(len(df_list_dan)):
+        df_list.append({
+            'author': df_list_dan[i][2],
+            'title': df_list_dan[i][1],
+            'special_topic': df_list_dan[i][7],
+            'qikan_name': df_list_dan[i][3],
+            'year': df_list_dan[i][4],
+            'abstract': df_list_dan[i][10],
+        })
+
+# data = []
+# json_list = [
+#     "/home/majiahui/project/爬取目录筛选/t_xuewei_detail_cnki.json",
+#     "/home/majiahui/project/爬取目录筛选/t_xuewei_detail_cnki2.json",
+#     "/home/majiahui/project/爬取目录筛选/t_xuewei_detail_cnki3.json",
+#     "/home/majiahui/project/爬取目录筛选/t_xuewei_detail_cnki6.json",
+#     "/home/majiahui/project/爬取目录筛选/t_xuewei_detail_cnki7.json",
+# ]
+#
+#
+# print("主库数据完成加载")
+# for path in json_list:
+#     name, typr_file = path.split(".")
+#     name = name.split("/")[-1]
+#     a = json.load(open(path))
+#     for i in a:
+#         autoid = "_".join([name, str(i['autoid'])])
+#         if autoid in df_dict:
+#             data.append([i['f_title']] + df_dict[autoid])
+#     print("path完成筛选")
+#
+#
+with open("data/data_0416.json", "w", encoding="utf-8") as f:
+    f.write(json.dumps(df_list, ensure_ascii=False, indent=2))
+
+#
+# with open("data.json", encoding="utf-8") as f:
+#     for i in f.readlines():
+#         a = json.loads(i)
+#
+#
+# print(a)
\ No newline at end of file
diff --git a/数据生成ndarray.py b/数据生成ndarray.py
new file mode 100644
index 0000000..d951bc1
--- /dev/null
+++ b/数据生成ndarray.py
@@ -0,0 +1,80 @@
+import os
+os.environ["CUDA_VISIBLE_DEVICES"] = "0"
+import json
+import numpy as np
+from tqdm import tqdm
+from sentence_transformers import SentenceTransformer
+
+model = SentenceTransformer('Dmeta-embedding-zh')
+print(1)
+with open("data/discipline_types.json", encoding="utf-8") as f:
+    lable_discipline_types = json.loads(f.read())
+
+def erjimul_ulit():
+    # unused placeholder
+    pass
+
+def shengcehng_array(data):
+    embs = model.encode(data, normalize_embeddings=True)
+    return embs
+
+
+if __name__ == '__main__':
+
+    # data = []
+    with open("data/data_0416.json", encoding="utf-8") as f:
+        # for i in f.readlines():
+        #     a = json.loads(i)
+        #     data.append(a)
+        data = json.loads(f.read())
+
+    print(len(data))
+
+    a = 0
+
+    a_ = 0
+    data_info = {}  # fields: author, title, category, source, year, abstract
+    data_prompt = {}
+    for data_dan in data:
+        if str(data_dan["special_topic"]) == "nan":
+            a_ += 1
+            continue
+
+        leibie_list = data_dan["special_topic"].split(";")
+        for leibie in leibie_list:
+            if leibie in lable_discipline_types:
+                if lable_discipline_types[leibie] not in data_prompt:
+                    data_prompt[lable_discipline_types[leibie]] = ["标题:“{}”,摘要:“{}”".format(data_dan["title"], data_dan["abstract"])]
+                    data_info[lable_discipline_types[leibie]] = [[data_dan["author"], data_dan["title"], data_dan["special_topic"], data_dan["qikan_name"], data_dan["year"], data_dan["abstract"], "期刊"]]
+                else:
+                    data_prompt[lable_discipline_types[leibie]].append("标题:“{}”,摘要:“{}”".format(data_dan["title"], data_dan["abstract"]))
+                    data_info[lable_discipline_types[leibie]].append([data_dan["author"], data_dan["title"], data_dan["special_topic"], data_dan["qikan_name"], data_dan["year"], data_dan["abstract"], "期刊"])
+                a += 1
+
+    print(2)
+    start = 0
+    end = 10000
+    print(len(data_prompt))
+    for leibie in tqdm(data_prompt):
+        data_ndarray = np.empty((0, 768))
+        print("len(data_prompt[leibie])", len(data_prompt[leibie]))
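+        # Encode prompts in 10,000-item batches so GPU memory stays bounded,
+        # then stack each batch onto the running (N, 768) embedding matrix.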
+        while True:
+            if end >= len(data_prompt[leibie]):
+                break
+            linshi_data = data_prompt[leibie][start:end]
+            data_ndarray = np.concatenate((data_ndarray, shengcehng_array(linshi_data)))
+            print("data_ndarray.shape", data_ndarray.shape)
+            start = end
+            end += 10000
+
+        linshi_data = data_prompt[leibie][start:len(data_prompt[leibie])]
+        print("len(linshi_data)", len(linshi_data))
+        data_ndarray = np.concatenate((data_ndarray, shengcehng_array(linshi_data)))
+        print("data_ndarray.shape", data_ndarray.shape)
+        np.save(f'data/prompt_qikan/{leibie}.npy', data_ndarray)
+        start = 0
+        end = 10000
+
+    for leibie in data_info:
+        print(len(data_info[leibie]))
+        with open(f"data/data_info_qikan/{leibie}.json", "w", encoding="utf-8") as f:
+            f.write(json.dumps(data_info[leibie], ensure_ascii=False, indent=2))
diff --git a/期刊数据整理.py b/期刊数据整理.py
new file mode 100644
index 0000000..47c181c
--- /dev/null
+++ b/期刊数据整理.py
@@ -0,0 +1,66 @@
+import json
+
+# json.load()
+
+# with open("t_xuewei_cnki_spider.csv", encoding="utf-8") as f:
+#     a = f.read()
+#     print(a)
+
+import pandas as pd
+
+filename = 'data/spider_latest_journal_paper_list.csv'
+chunksize = 10000  # rows to read per chunk; adjust as needed
+
+df_list = []
+# Iterate over the CSV file with the chunksize parameter
+for chunk in pd.read_csv(filename, chunksize=chunksize):
+    # fields: author, title, category, source, year, abstract
+
+    # process each chunk
+
+    # print(chunk.columns)
+    # 9 / 0
+    df_list_dan = chunk.values.tolist()
+    # print(df_list[0])
+    for i in range(len(df_list_dan)):
+        df_list.append({
+            'author': df_list_dan[i][2],
+            'title': df_list_dan[i][1],
+            'special_topic': df_list_dan[i][7],
+            'qikan_name': df_list_dan[i][3],
+            'year': df_list_dan[i][4],
+            'abstract': df_list_dan[i][10],
+        })
+
+# data = []
+# json_list = [
+#     "/home/majiahui/project/爬取目录筛选/t_xuewei_detail_cnki.json",
+#     "/home/majiahui/project/爬取目录筛选/t_xuewei_detail_cnki2.json",
+#     "/home/majiahui/project/爬取目录筛选/t_xuewei_detail_cnki3.json",
+#     "/home/majiahui/project/爬取目录筛选/t_xuewei_detail_cnki6.json",
+#     "/home/majiahui/project/爬取目录筛选/t_xuewei_detail_cnki7.json",
+# ]
+#
+#
+# print("主库数据完成加载")
+# for path in json_list:
+#     name, typr_file = path.split(".")
+#     name = name.split("/")[-1]
+#     a = json.load(open(path))
+#     for i in a:
+#         autoid = "_".join([name, str(i['autoid'])])
+#         if autoid in df_dict:
+#             data.append([i['f_title']] + df_dict[autoid])
+#     print("path完成筛选")
+#
+#
+with open("data/data_0416.json", "w", encoding="utf-8") as f:
+    f.write(json.dumps(df_list, ensure_ascii=False, indent=2))
+
+#
+# with open("data.json", encoding="utf-8") as f:
+#     for i in f.readlines():
+#         a = json.loads(i)
+#
+#
+# print(a)
\ No newline at end of file
diff --git a/测试向量匹配.py b/测试向量匹配.py
new file mode 100644
index 0000000..3099d66
--- /dev/null
+++ b/测试向量匹配.py
@@ -0,0 +1,53 @@
+import os
+os.environ["CUDA_VISIBLE_DEVICES"] = "0"
+
+import numpy as np
+import faiss
+import json
+from sentence_transformers import SentenceTransformer
+
+d = 768  # dimension
+zidonghua = np.load('zidonghua.npy')
+model = SentenceTransformer('Dmeta-embedding-zh')
+
+data = []
+with open("data.json", encoding="utf-8") as f:
+    for i in f.readlines():
+        a = json.loads(i)
+        data.append(a)
+
+mubiaoliebie = "自动化技术"
+data_prompt = []
+for i in data:
+    if str(i[1]) == "nan":
+        continue
+
+    leibie_list = i[1].split(";")
+    for leibie in leibie_list:
+        if leibie == mubiaoliebie:
+            data_prompt.append("标题:“{}”,摘要:“{}”".format(i[0], i[2]))
+
+
+
+# faiss.write_index(index, 'index.ivf')
+index = faiss.read_index('zidonghua.ivf')
+
+index.add(zidonghua)  # add may be a bit slower as well
+# D, I = index.search(xq, k)  # actual search
+# print(I[-5:])  # neighbors of the 5 last queries
+
+
+print("=======================================")
+index.nprobe = 2  # default nprobe is 1, try a few more
+k = 4
+biaoti = "工业机器人视觉导航系统的设计与实现"
+zhaiyoa = "本研究致力于设计和实现工业机器人视觉导航系统,旨在提高工业生产中机器人的自主导航和定位能力。首先,通过综合考虑视觉传感器、定位算法和控制策略,设计了一种高效的机器人视觉导航系统框架。其次,利用深度学习技术对环境中的关键特征进行识别和定位,实现了机器人在复杂工作场景下的精确定位和路径规划。通过实验验证,本系统在提高机器人工作效率、减少人工干预以及降低操作误差等方面取得了显著的成果。因此,本研究为工业机器人在生产领域的应用提供了重要的技术支持,具有一定的实用和推广价值。"
+
+prompt = "标题:“{}”,摘要:“{}”".format(biaoti, zhaiyoa)
+embs = model.encode([prompt], normalize_embeddings=True)
+
+D, I = index.search(embs, k)
+print(I)
+
+for i in I[0]:
+    print(data_prompt[i])
\ No newline at end of file
diff --git a/生成lable.py b/生成lable.py
new file mode 100644
index 0000000..1d6c09e
--- /dev/null
+++ b/生成lable.py
@@ -0,0 +1,23 @@
+import json
+
+with open("label_threshold.txt", encoding="utf-8") as f:
+    data = json.loads(f.read())
+
+
+id2lable = {}
+lable2id = {}
+for i in data:
+    if i not in lable2id:
+        lable2id[i] = data[i][0]
+
+for i in lable2id:
+    if lable2id[i] not in id2lable:
+        id2lable[lable2id[i]] = i
+
+
+with open("data/lable/id2lable.json", "w", encoding="utf-8") as f:
+    f.write(json.dumps(id2lable, indent=2, ensure_ascii=False))
+
+
+with open("data/lable/lable2id.json", "w", encoding="utf-8") as f:
+    f.write(json.dumps(lable2id, indent=2, ensure_ascii=False))
\ No newline at end of file
diff --git a/统计学科二级分类.py b/统计学科二级分类.py
new file mode 100644
index 0000000..a86fe0c
--- /dev/null
+++ b/统计学科二级分类.py
@@ -0,0 +1,31 @@
+import json
+from pypinyin import pinyin, Style
+import pandas as pd
+
+
+def hanzi_to_pinyin(hanzi):
+    # Convert Chinese characters to pinyin; Style.NORMAL outputs plain
+    # (toneless) pinyin
+    pinyin_list = pinyin(hanzi, style=Style.NORMAL, heteronym=False)
+    print(pinyin_list)
+    # Join the pinyin list into a single string
+    pinyin_str = ''.join([i[0] for i in pinyin_list])
+    return pinyin_str
+
+
+if __name__ == '__main__':
+    df_list = pd.read_excel("论文种类分类表1.xls").values.tolist()
+    print(df_list)
+
+    erji_dict = {}
+
+    for i in range(len(df_list)):
+        if str(df_list[i][1]) == "nan":
+            continue
+        if df_list[i][1] not in erji_dict:
+            erji_dict[df_list[i][1]] = hanzi_to_pinyin(df_list[i][1])
+
+    print(erji_dict)
+    print(len(erji_dict))
+
+    with open("discipline_types.json", "w", encoding="utf-8") as f:
+        f.write(json.dumps(erji_dict, ensure_ascii=False, indent=2))
\ No newline at end of file
diff --git a/训练faiss.py b/训练faiss.py
new file mode 100644
index 0000000..a7f075f
--- /dev/null
+++ b/训练faiss.py
@@ -0,0 +1,34 @@
+import numpy as np
+import faiss
+import json
+import math
+
+
+d = 768  # dimension
+# nlist = 1000  # number of clusters
+
+
+with open("data/discipline_types.json", encoding="utf-8") as f:
+    lable_discipline_types = json.loads(f.read())
+
+a = 0
+for leibie_zh in lable_discipline_types:
+
+    xb = np.load(f'data/prompt_qikan/{lable_discipline_types[leibie_zh]}.npy')
+
+    # nlist = math.floor((len(lable_discipline_types[leibie_zh]) ** 0.5))  # number of clusters
+    # print(leibie_zh)
+    # print(len(lable_discipline_types[leibie_zh]))
+    # print(nlist)
+
+    print(xb.shape)
+    nlist = math.floor(xb.shape[0] ** 0.5)  # number of IVF clusters: sqrt of the vector count
+    a += xb.shape[0]
+    print(nlist)
+    quantizer = faiss.IndexFlatL2(d)
+    index = faiss.IndexIVFFlat(quantizer, d, nlist)
+    assert not index.is_trained
+    index.train(xb)  # IndexIVFFlat requires training; this learns the coarse clustering
+    assert index.is_trained
+    faiss.write_index(index, f'data/prompt_qikan_ivf/{lable_discipline_types[leibie_zh]}.ivf')
+print(a)
\ No newline at end of file
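---
A minimal client-side sketch, once the .npy matrices and .ivf indexes have been
built and generate_references_api.py is running (assumes the roformer label
service it calls is reachable; the host below is a placeholder, the sample
title/abstract come from 测试向量匹配.py, and the form fields and the
"result"/"reference" keys follow the handler above):

import requests

resp = requests.post(
    "http://127.0.0.1:17001/",
    data={
        "title": "工业机器人视觉导航系统的设计与实现",
        "abstract": "本研究致力于设计和实现工业机器人视觉导航系统……",  # truncated sample
        "nums": "10",
    },
)
print(resp.json()["result"]["reference"])  # numbered citation strings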