
Initial commit: implement reference lookup over journal paper data

master · majiahui@haimaqingfan.com · 1 year ago · commit fcb93c6326
  1. generate_references_api.py (+188)
  2. 拉去数据.py (+33)
  3. 数据合并.py (+66)
  4. 数据生成ndarray.py (+80)
  5. 期刊数据整理.py (+66)
  6. 测试向量匹配.py (+53)
  7. 生成lable.py (+23)
  8. 统计学科二级分类.py (+31)
  9. 训练faiss.py (+34)

generate_references_api.py (+188)

@@ -0,0 +1,188 @@
import os
os.environ["CUDA_VISIBLE_DEVICES"] = "0"
from flask import Flask, jsonify
from flask import request
import numpy as np
import faiss
import json
import requests
import socket
from sentence_transformers import SentenceTransformer
with open("data/lable/id2lable.json", encoding="utf-8") as f:
id2lable = json.loads(f.read())
with open("data/lable/lable2id.json", encoding="utf-8") as f:
lable2id = json.loads(f.read())
with open("data/discipline_types.json") as f:
lable_discipline_types = json.loads(f.read())
app = Flask(__name__)
app.config["JSON_AS_ASCII"] = False
d = 768 # dimension
model = SentenceTransformer('Dmeta-embedding-zh')
def get_host_ip():
    """
    Look up the local machine's IP address.
    :return: ip
    """
    try:
        s = socket.socket(socket.AF_INET, socket.SOCK_DGRAM)
        s.connect(('8.8.8.8', 80))
        ip = s.getsockname()[0]
    finally:
        s.close()
    return ip


# url = "http://{}:50003/roformer".format(str(get_host_ip()))
url = "http://{}:50003/roformer".format("192.168.31.149")
def dialog_line_parse(url, text):
    """
    Send the payload to the classification model and return its result.
    :param url: model url
    :param text: payload sent to the model
    :return: model response
    """
    response = requests.post(
        url,
        json=text,
        timeout=1000
    )
    if response.status_code == 200:
        return response.json()
    else:
        # logger.error(
        #     "【{}】 Failed to get a proper response from remote "
        #     "server. Status Code: {}. Response: {}"
        #     "".format(url, response.status_code, response.text)
        # )
        print("【{}】 Failed to get a proper response from remote "
              "server. Status Code: {}. Response: {}"
              "".format(url, response.status_code, response.text))
        print(text)
        return []
def ulit_recall_paper(reference_list):
    '''
    Format the recalled papers into citation strings.
    :param reference_list: list of [author, title, category, source, year, abstract, ...] entries
    :return data: list of formatted citation strings
    '''
    # data = []
    # for path in recall_data_list_path:
    #     filename = path.split("/")[-1]
    #     with open(path, encoding="gbk") as f:
    #         text = f.read()
    #     text_list = text.split("\n")
    #     for sentence in text_list:
    #         if sentence != "":
    #             data.append([sentence, filename])
    # return data

    # recall_data_list
    # fields: author, title, category, source, year, abstract
    # target format: "[1]赵璐.基于旅游资源开发下的新农村景观营建研究[D].西安建筑科技大学,2014."
    data = []
    for data_one in reference_list:
        paper = ".".join([
            ",".join([i for i in data_one[0].split(";") if i != ""]),
            data_one[1] + "[J]",
            ",".join([
                data_one[3], str(data_one[4]) + "."
            ])
        ])
        data.append(paper)
    return data
def main(title, abstract, nums):
    data = {
        "title": title,
        "abst_zh": abstract,
        "content": ""
    }
    # {
    #   "label_num": [
    #     117,
    #     143
    #   ]
    # }
    result = dialog_line_parse(url, data)
    # print(result['label_num'][0])
    # print(id2lable[result['label_num'][0]])
    subject_pinyin = lable_discipline_types[id2lable[str(result['label_num'][0])]]

    # with open(f"data/prompt/{subject_pinyin}.npy") as :
    #     zidonghua = np.load('data/prompt/{subject_pinyin}.npy')
    data_subject = np.load(f"data/prompt_qikan/{subject_pinyin}.npy")
    index = faiss.read_index(f'data/prompt_qikan_ivf/{subject_pinyin}.ivf')

    with open(f"data/data_info_qikan/{subject_pinyin}.json") as f:
        data_info = json.loads(f.read())

    index.add(data_subject)
    # index.nprobe = 2  # default nprobe is 1, try a few more
    k = nums
    prompt = "标题:“{}”,摘要:“{}".format(title, abstract)
    embs = model.encode([prompt], normalize_embeddings=True)
    D, I = index.search(embs, int(k))
    print(I)

    reference_list = []
    abstract_list = []
    for i in I[0]:
        reference_list.append(data_info[i])
        abstract_list.append(data_info[i][5])
    return "200", ulit_recall_paper(reference_list), abstract_list
@app.route("/", methods=["POST"])
def handle_query():
# try:
title = request.form.get("title")
abstract = request.form.get('abstract')
nums = request.form.get('nums')
# content = ulit_request_file(file)
status_code, reference, abstract_list = main(title, abstract, nums)
if status_code == "400":
return_text = {"resilt": "", "probabilities": None, "status_code": 400}
else:
reference_list = reference
print(reference_list)
reference = [f"[{str(i+1)}]" + reference_list[i] for i in range(len(reference_list))]
if status_code == "200":
return_text = {
"resilt": {
"reference": reference,
"abstract": abstract_list
},
"probabilities": None,
"status_code": 200
}
else:
return_text = {"resilt": "", "probabilities": None, "status_code": 400}
# except:
# return_text = {"resilt": "", "probabilities": None, "status_code": 400}
return jsonify(return_text) # 返回结果
if __name__ == "__main__":
app.run(host="0.0.0.0", port=17001, threaded=True)
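
For reference, a client call against this service could look like the sketch below. Only the route, the form fields (title, abstract, nums), the response keys, and port 17001 come from the code above; the host is a placeholder and the sample title/abstract values are borrowed from 测试向量匹配.py.

import requests

# Hedged client sketch: host is an assumed placeholder; fields, port, and
# response keys come from generate_references_api.py.
resp = requests.post(
    "http://127.0.0.1:17001/",
    data={
        "title": "工业机器人视觉导航系统的设计与实现",
        "abstract": "本研究致力于设计和实现工业机器人视觉导航系统,旨在提高工业生产中机器人的自主导航和定位能力。",
        "nums": "10",
    },
    timeout=120,
)
result = resp.json()
print(result["status_code"])
print(result["resilt"]["reference"])   # numbered citation strings, e.g. "[1]..."
print(result["resilt"]["abstract"])    # abstracts of the recalled papers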

拉去数据.py (+33)

@@ -0,0 +1,33 @@
import pymysql
import json

# Open the database connection
connection = pymysql.connect(
    host='rm-bp11ky2z5f34d2949fo.mysql.rds.aliyuncs.com',
    user='fabiao_r',
    password='f5u1w8nfb3b',
    database='fabiao',
    cursorclass=pymysql.cursors.DictCursor  # return rows as dicts for easier handling
)

try:
    with connection.cursor() as cursor:
        # Run the query
        sql = "SELECT * FROM spider_latest_journal_paper_list"
        cursor.execute(sql)

        # Fetch the results
        result = cursor.fetchall()
        print(len(result))

        # Process the results
        # for row in result:
        #     print(row)
        with open("data/doctor_2018_2021.json", "w", encoding="utf-8") as f:
            f.write(json.dumps(result, indent=2, ensure_ascii=False))
finally:
    # Close the connection
    connection.close()

数据合并.py (+66)

@@ -0,0 +1,66 @@
import json
# json.load()
# with open("t_xuewei_cnki_spider.csv", encoding="utf-8") as f:
#     a = f.read()
# print(a)
import pandas as pd

filename = 'data/spider_latest_journal_paper_list.csv'
chunksize = 10000  # number of rows to read per chunk, adjust as needed

df_list = []
# Iterate over the CSV file using the chunksize parameter
for chunk in pd.read_csv(filename, chunksize=chunksize):
    # fields: author, title, category, source, year, abstract
    # process each chunk
    # print(chunk.columns)
    # 9 / 0
    df_list_dan = chunk.values.tolist()
    # print(df_list[0])
    for i in range(len(df_list_dan)):
        df_list.append({
            'author': df_list_dan[i][2],
            'title': df_list_dan[i][1],
            'special_topic': df_list_dan[i][7],
            'qikan_name': df_list_dan[i][3],
            'year': df_list_dan[i][4],
            'abstract': df_list_dan[i][10],
        })

# data = []
# json_list = [
#     "/home/majiahui/project/爬取目录筛选/t_xuewei_detail_cnki.json",
#     "/home/majiahui/project/爬取目录筛选/t_xuewei_detail_cnki2.json",
#     "/home/majiahui/project/爬取目录筛选/t_xuewei_detail_cnki3.json",
#     "/home/majiahui/project/爬取目录筛选/t_xuewei_detail_cnki6.json",
#     "/home/majiahui/project/爬取目录筛选/t_xuewei_detail_cnki7.json",
# ]
#
# print("main dataset loaded")
# for path in json_list:
#     name, typr_file = path.split(".")
#     name = name.split("/")[-1]
#     a = json.load(open(path))
#     for i in a:
#         autoid = "_".join([name, str(i['autoid'])])
#         if autoid in df_dict:
#             data.append([i['f_title']] + df_dict[autoid])
#     print("path filtered")

with open("data/data_0416.json", "w") as f:
    f.write(json.dumps(df_list, ensure_ascii=False, indent=2))

# with open("data.json", encoding="utf-8") as f:
#     for i in f.readlines():
#         a = json.loads(i)
#
# print(a)

数据生成ndarray.py (+80)

@@ -0,0 +1,80 @@
import os
os.environ["CUDA_VISIBLE_DEVICES"] = "0"
import json
import numpy as np
from tqdm import tqdm
from sentence_transformers import SentenceTransformer

model = SentenceTransformer('Dmeta-embedding-zh')
print(1)

with open("data/discipline_types.json", encoding="utf-8") as f:
    lable_discipline_types = json.loads(f.read())


def erjimul_ulit():
    pass


def shengcehng_array(data):
    embs = model.encode(data, normalize_embeddings=True)
    return embs


if __name__ == '__main__':
    # data = []
    with open("data/data_0416.json", encoding="utf-8") as f:
        # for i in f.readlines():
        #     a = json.loads(i)
        #     data.append(a)
        data = json.loads(f.read())
    print(len(data))

    a = 0
    a_ = 0
    data_info = {}  # fields: author, title, category, source, year, abstract
    data_prompt = {}
    for data_dan in data:
        if str(data_dan["special_topic"]) == "nan":
            a_ += 1
            continue
        leibie_list = data_dan["special_topic"].split(";")
        for leibie in leibie_list:
            if leibie in lable_discipline_types:
                if lable_discipline_types[leibie] not in data_prompt:
                    data_prompt[lable_discipline_types[leibie]] = ["标题:“{}”,摘要:“{}".format(data_dan["title"], data_dan["abstract"])]
                    data_info[lable_discipline_types[leibie]] = [[data_dan["author"], data_dan["title"], data_dan["special_topic"], data_dan["qikan_name"], data_dan["year"], data_dan["abstract"], "期刊"]]
                else:
                    data_prompt[lable_discipline_types[leibie]].append("标题:“{}”,摘要:“{}".format(data_dan["title"], data_dan["abstract"]))
                    data_info[lable_discipline_types[leibie]].append([data_dan["author"], data_dan["title"], data_dan["special_topic"], data_dan["qikan_name"], data_dan["year"], data_dan["abstract"], "期刊"])
                a += 1

    print(2)
    strat = 0
    end = 10000
    print(len(data_prompt))
    for leibie in tqdm(data_prompt):
        data_ndarray = np.empty((0, 768))
        print("len(data_prompt[leibie])", len(data_prompt[leibie]))
        while True:
            if end >= len(data_prompt[leibie]):
                break
            linshi_data = data_prompt[leibie][strat:end]
            data_ndarray = np.concatenate((data_ndarray, shengcehng_array(linshi_data)))
            print("data_ndarray.shape", data_ndarray.shape)
            strat = end
            end += 10000
        linshi_data = data_prompt[leibie][strat:len(data_prompt[leibie])]
        print("len(linshi_data)", len(linshi_data))
        data_ndarray = np.concatenate((data_ndarray, shengcehng_array(linshi_data)))
        print("data_ndarray.shape", data_ndarray.shape)
        np.save(f'data/prompt_qikan/{leibie}.npy', data_ndarray)
        strat = 0
        end = 10000

    for leibie in data_info:
        print(len(data_info[leibie]))
        with open(f"data/data_info_qikan/{leibie}.json", "w", encoding="utf-8") as f:
            f.write(json.dumps(data_info[leibie], ensure_ascii=False, indent=2))
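
Because generate_references_api.py looks up metadata in data_info_qikan/<subject>.json by the same row position it searches in prompt_qikan/<subject>.npy, a per-subject alignment check can catch mismatches between the two outputs early. This is a sketch under that assumption and is not part of the committed scripts; "zidonghuajishu" is a hypothetical pinyin key.

import json
import numpy as np

subject = "zidonghuajishu"  # hypothetical pinyin key produced by 统计学科二级分类.py
emb = np.load(f"data/prompt_qikan/{subject}.npy")
with open(f"data/data_info_qikan/{subject}.json", encoding="utf-8") as f:
    info = json.loads(f.read())

# Each embedding row must correspond to one metadata entry (768-dim Dmeta-embedding-zh vectors).
assert emb.shape == (len(info), 768), (emb.shape, len(info))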

期刊数据整理.py (+66)

@@ -0,0 +1,66 @@
import json
# json.load()
# with open("t_xuewei_cnki_spider.csv", encoding="utf-8") as f:
#     a = f.read()
# print(a)
import pandas as pd

filename = 'data/spider_latest_journal_paper_list.csv'
chunksize = 10000  # number of rows to read per chunk, adjust as needed

df_list = []
# Iterate over the CSV file using the chunksize parameter
for chunk in pd.read_csv(filename, chunksize=chunksize):
    # fields: author, title, category, source, year, abstract
    # process each chunk
    # print(chunk.columns)
    # 9 / 0
    df_list_dan = chunk.values.tolist()
    # print(df_list[0])
    for i in range(len(df_list_dan)):
        df_list.append({
            'author': df_list_dan[i][2],
            'title': df_list_dan[i][1],
            'special_topic': df_list_dan[i][7],
            'qikan_name': df_list_dan[i][3],
            'year': df_list_dan[i][4],
            'abstract': df_list_dan[i][10],
        })

# data = []
# json_list = [
#     "/home/majiahui/project/爬取目录筛选/t_xuewei_detail_cnki.json",
#     "/home/majiahui/project/爬取目录筛选/t_xuewei_detail_cnki2.json",
#     "/home/majiahui/project/爬取目录筛选/t_xuewei_detail_cnki3.json",
#     "/home/majiahui/project/爬取目录筛选/t_xuewei_detail_cnki6.json",
#     "/home/majiahui/project/爬取目录筛选/t_xuewei_detail_cnki7.json",
# ]
#
# print("main dataset loaded")
# for path in json_list:
#     name, typr_file = path.split(".")
#     name = name.split("/")[-1]
#     a = json.load(open(path))
#     for i in a:
#         autoid = "_".join([name, str(i['autoid'])])
#         if autoid in df_dict:
#             data.append([i['f_title']] + df_dict[autoid])
#     print("path filtered")

with open("data/data_0416.json", "w") as f:
    f.write(json.dumps(df_list, ensure_ascii=False, indent=2))

# with open("data.json", encoding="utf-8") as f:
#     for i in f.readlines():
#         a = json.loads(i)
#
# print(a)

测试向量匹配.py (+53)

@@ -0,0 +1,53 @@
import os
os.environ["CUDA_VISIBLE_DEVICES"] = "0"
import numpy as np
import faiss
import json
from sentence_transformers import SentenceTransformer

d = 768  # dimension
zidonghua = np.load('zidonghua.npy')
model = SentenceTransformer('Dmeta-embedding-zh')

data = []
with open("data.json", encoding="utf-8") as f:
    for i in f.readlines():
        a = json.loads(i)
        data.append(a)

mubiaoliebie = "自动化技术"
data_prompt = []
for i in data:
    if str(i[1]) == "nan":
        continue
    leibie_list = i[1].split(";")
    for leibie in leibie_list:
        if leibie == mubiaoliebie:
            data_prompt.append("标题:“{}”,摘要:“{}".format(i[0], i[2]))

# faiss.write_index(index, 'index.ivf')
index = faiss.read_index('zidonghua.ivf')
index.add(zidonghua)  # add may be a bit slower as well
# D, I = index.search(xq, k)  # actual search
# print(I[-5:])               # neighbors of the 5 last queries

print("=======================================")
index.nprobe = 2  # default nprobe is 1, try a few more
k = 4
biaoti = "工业机器人视觉导航系统的设计与实现"
zhaiyoa = "本研究致力于设计和实现工业机器人视觉导航系统,旨在提高工业生产中机器人的自主导航和定位能力。首先,通过综合考虑视觉传感器、定位算法和控制策略,设计了一种高效的机器人视觉导航系统框架。其次,利用深度学习技术对环境中的关键特征进行识别和定位,实现了机器人在复杂工作场景下的精确定位和路径规划。通过实验验证,本系统在提高机器人工作效率、减少人工干预以及降低操作误差等方面取得了显著的成果。因此,本研究为工业机器人在生产领域的应用提供了重要的技术支持,具有一定的实用和推广价值。"
prompt = "标题:“{}”,摘要:“{}".format(biaoti, zhaiyoa)
embs = model.encode([prompt], normalize_embeddings=True)
D, I = index.search(embs, k)
print(I)
for i in I[0]:
    print(data_prompt[i])

生成lable.py (+23)

@@ -0,0 +1,23 @@
import json

with open("label_threshold.txt", encoding="utf-8") as f:
    data = json.loads(f.read())

id2lable = {}
lable2id = {}

for i in data:
    if i not in lable2id:
        lable2id[i] = data[i][0]

for i in lable2id:
    if lable2id[i] not in id2lable:
        id2lable[lable2id[i]] = i

with open("data/lable/id2lable.json", "w", encoding="utf-8") as f:
    f.write(json.dumps(id2lable, indent=2, ensure_ascii=False))

with open("data/lable/lable2id.json", "w", encoding="utf-8") as f:
    f.write(json.dumps(lable2id, indent=2, ensure_ascii=False))

统计学科二级分类.py (+31)

@@ -0,0 +1,31 @@
import json
from pypinyin import pinyin, Style
import pandas as pd


def hanzi_to_pinyin(hanzi):
    # Convert Chinese characters to pinyin; Style.NORMAL outputs plain pinyin without tone marks
    pinyin_list = pinyin(hanzi, style=Style.NORMAL, heteronym=False)
    print(pinyin_list)
    # Join the pinyin list into a single string
    pinyin_str = ''.join([i[0] for i in pinyin_list])
    return pinyin_str


if __name__ == '__main__':
    df_list = pd.read_excel("论文种类分类表1.xls").values.tolist()
    print(df_list)
    erji_dict = {}
    for i in range(len(df_list)):
        if str(df_list[i][1]) == "nan":
            continue
        if df_list[i][1] not in erji_dict:
            erji_dict[df_list[i][1]] = hanzi_to_pinyin(df_list[i][1])
    print(erji_dict)
    print(len(erji_dict))
    with open("discipline_types.json", "w", encoding="utf-8") as f:
        f.write(json.dumps(erji_dict, ensure_ascii=False, indent=2))

训练faiss.py (+34)

@@ -0,0 +1,34 @@
import numpy as np
import faiss
import json
import math

d = 768  # dimension
# nlist = 1000  # number of clusters

with open("data/discipline_types.json") as f:
    lable_discipline_types = json.loads(f.read())

a = 0
for leibie_zh in lable_discipline_types:
    xb = np.load(f'data/prompt_qikan/{lable_discipline_types[leibie_zh]}.npy')
    # nlist = math.floor((len(lable_discipline_types[leibie_zh]) ** 0.5))  # number of clusters
    # print(leibie_zh)
    # print(len(lable_discipline_types[leibie_zh]))
    # print(nlist)
    print(xb.shape)
    nlist = math.floor((xb.shape[0] ** 0.5))
    a += xb.shape[0]
    print(nlist)
    quantizer = faiss.IndexFlatL2(d)
    index = faiss.IndexIVFFlat(quantizer, d, nlist)
    assert not index.is_trained
    index.train(xb)  # IndexIVFFlat needs training; this step learns the clustering
    assert index.is_trained
    faiss.write_index(index, f'data/prompt_qikan_ivf/{lable_discipline_types[leibie_zh]}.ivf')

print(a)
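
To sanity-check a trained index end to end, the saved .ivf can be reloaded, the subject's vectors re-added, and a query searched, mirroring what generate_references_api.py does at serving time. A minimal sketch, assuming the .npy/.ivf/.json triplet exists for the chosen subject; "zidonghuajishu" is again a hypothetical pinyin key.

import json
import numpy as np
import faiss
from sentence_transformers import SentenceTransformer

subject = "zidonghuajishu"  # hypothetical pinyin key
xb = np.load(f"data/prompt_qikan/{subject}.npy")
index = faiss.read_index(f"data/prompt_qikan_ivf/{subject}.ivf")
index.add(xb)  # the saved index stores only the trained clustering; vectors are re-added here
with open(f"data/data_info_qikan/{subject}.json", encoding="utf-8") as f:
    data_info = json.loads(f.read())

model = SentenceTransformer('Dmeta-embedding-zh')
query = "标题:“工业机器人视觉导航系统的设计与实现”,摘要:“本研究致力于设计和实现工业机器人视觉导航系统。"
embs = model.encode([query], normalize_embeddings=True)
D, I = index.search(embs, 5)
for i in I[0]:
    print(data_info[i][1])  # titles of the recalled papers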