
Update references

master
majiahui@haimaqingfan.com committed 8 months ago
commit 2e736e38b2
  1. README.md (14 lines changed)
  2. generate_reference_faiss_data_info.py (236 lines changed)
  3. generate_references_api_1.py (26 lines changed)
  4. 博士数据整理.py (74 lines changed)
  5. 博士数据生成ndarray.py (134 lines changed)
  6. 数据生成ndarray.py (70 lines changed)
  7. 期刊数据整理.py (2 lines changed)
  8. 训练faiss.py (8 lines changed)

README.md (14 lines changed)

@@ -0,0 +1,14 @@
## Reference Generation
The new reference-generation project uses faiss clustering for retrieval and responds within seconds.
#### Generate the ndarray data and the basic paper metadata
Modify or add data, then run:
python 数据生成ndarray.py
#### Train faiss
Modify or add the .npy file paths, then run:
python 训练faiss.py
#### Deploy the service
python generate_references_api.py
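A minimal sketch of calling the deployed service, assuming it listens on port 17003 and reads form-encoded title and nums fields, as generate_reference_faiss_data_info.py in this commit does (adjust host and port to your deployment):

import requests

# Hypothetical client call; the service in this commit binds 0.0.0.0:17003.
resp = requests.post(
    "http://127.0.0.1:17003/",
    data={"title": "基于旅游资源开发下的新农村景观营建研究", "nums": 10},
)
print(resp.json())  # expected keys: data_info, probabilities, status_code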

generate_reference_faiss_data_info.py (236 lines changed)

@@ -0,0 +1,236 @@
import os
import random
os.environ["CUDA_VISIBLE_DEVICES"] = "0"
from flask import Flask, jsonify
from flask import request
import numpy as np
import faiss
import json
import requests
import socket
from sentence_transformers import SentenceTransformer

with open("data/lable/id2lable.json", encoding="utf-8") as f:
    id2lable = json.loads(f.read())

with open("data/lable/lable2id.json", encoding="utf-8") as f:
    lable2id = json.loads(f.read())

with open("data/discipline_types.json") as f:
    lable_discipline_types = json.loads(f.read())

app = Flask(__name__)
app.config["JSON_AS_ASCII"] = False

d = 768  # dimension
model = SentenceTransformer('Dmeta-embedding-zh')
def get_host_ip():
    """
    Look up the local machine's IP address.
    :return: ip
    """
    try:
        s = socket.socket(socket.AF_INET, socket.SOCK_DGRAM)
        s.connect(('8.8.8.8', 80))
        ip = s.getsockname()[0]
    finally:
        s.close()
    return ip


# url = "http://{}:50003/roformer".format(str(get_host_ip()))
url = "http://{}:50003/roformer".format("192.168.31.149")
def dialog_line_parse(url, text):
    """
    Send the data to the model service for analysis and return the result.
    :param url: model service url
    :param text: payload sent to the model
    :return: model response
    """
    response = requests.post(
        url,
        json=text,
        timeout=1000
    )
    if response.status_code == 200:
        return response.json()
    else:
        # logger.error(
        #     "【{}】 Failed to get a proper response from remote "
        #     "server. Status Code: {}. Response: {}"
        #     "".format(url, response.status_code, response.text)
        # )
        print("【{}】 Failed to get a proper response from remote "
              "server. Status Code: {}. Response: {}"
              "".format(url, response.status_code, response.text))
        print(text)
        return []
def panduan_paper_lable(paper_lable_text):
    paper_lable = {
        "硕士": "D",
        "期刊": "J",
        "博士": "J"
    }
    return paper_lable[paper_lable_text]
def ulit_recall_paper(reference_list, nums):
    '''
    Parse the recalled papers and build the reference entries.
    :param reference_list:
    :return data: list [[sentence, filename], [sentence, filename], [sentence, filename]]
    '''
    # data = []
    # for path in recall_data_list_path:
    #     filename = path.split("/")[-1]
    #     with open(path, encoding="gbk") as f:
    #         text = f.read()
    #     text_list = text.split("\n")
    #     for sentence in text_list:
    #         if sentence != "":
    #             data.append([sentence, filename])
    # return data

    # recall_data_list
    # columns: author, title, category, source, year, abstract, paper type
    # e.g. "[1]赵璐.基于旅游资源开发下的新农村景观营建研究[D].西安建筑科技大学,2014."
    data_info = []
    data_title = []
    for data_one in reference_list:
        if data_one[1] not in data_title:
            print("data_one", data_one)
            print("data_one[0]", data_one[0])
            paper = ".".join([
                ",".join([str(i).replace("\n", "").replace("\r", "") for i in data_one[0].split(";") if i != ""]),
                data_one[1] + f"[{panduan_paper_lable(data_one[6])}]",
                ",".join([
                    data_one[3], str(data_one[4]) + "."
                ])
            ])
            data_title.append(data_one[1])
            data_info.append({
                "author": data_one[0],
                "title": data_one[1],
                "special_topic": data_one[2],
                "qikan_name": data_one[3],
                "year": str(data_one[4]),
                "abstract": data_one[5],
                "classlable": data_one[6],
                "reference": paper
            })

    # print(data)
    print(data_title)
    print(nums)
    random.shuffle(data_info)
    random.shuffle(data_info)
    data_info = data_info[:int(nums)]
    return data_info
def main(title, abstract, nums):
    data = {
        "title": title,
        "abst_zh": abstract,
        "content": ""
    }
    # {
    #     "label_num": [
    #         117,
    #         143
    #     ]
    # }
    result = dialog_line_parse(url, data)

    # print(result['label_num'][0])
    # print(id2lable[result['label_num'][0]])
    subject_pinyin = lable_discipline_types[id2lable[str(result['label_num'][0])]]

    # with open(f"data/prompt/{subject_pinyin}.npy") as :
    #     zidonghua = np.load('data/prompt/{subject_pinyin}.npy')
    data_subject = np.load(f"data/prompt_qikan/{subject_pinyin}.npy")
    data_subject_1 = np.load(f"data/prompt_master/{subject_pinyin}.npy")
    data_subject_2 = np.load(f"data/prompt_doctor/{subject_pinyin}.npy")
    print("xb.shape", data_subject.shape)
    print("xb_1.shape", data_subject_1.shape)
    print("xb_2.shape", data_subject_2.shape)
    data_subject = np.concatenate((data_subject, data_subject_1, data_subject_2))
    print("data_subject.shape", data_subject.shape)

    index = faiss.read_index(f'data/prompt_qikan_master_doctor_ivf/{subject_pinyin}.ivf')

    with open(f"data/data_info_qikan/{subject_pinyin}.json") as f:
        data_info = json.loads(f.read())
    with open(f"data/data_info_master/{subject_pinyin}.json") as f:
        data_info_1 = json.loads(f.read())
    with open(f"data/data_info_doctor/{subject_pinyin}.json") as f:
        data_info_2 = json.loads(f.read())

    print(len(data_info))
    print(len(data_info_1))
    print(len(data_info_2))
    data_info = data_info + data_info_1 + data_info_2
    print(len(data_info))
    print(data_info[0])

    index.add(data_subject)
    # index.nprobe = 2  # default nprobe is 1, try a few more
    # k = nums
    k = 20
    prompt = "标题:“{}”,摘要:“{}".format(title, abstract)
    embs = model.encode([prompt], normalize_embeddings=True)
    D, I = index.search(embs, int(k))
    # print(I)
    reference_list = []
    for i in I[0]:
        reference_list.append(data_info[i])
    data_info = ulit_recall_paper(reference_list, nums)

    return "200", data_info
@app.route("/", methods=["POST"])
def handle_query():
    # try:
    title = request.form.get("title")
    abstract = ""
    nums = request.form.get('nums')
    # content = ulit_request_file(file)

    status_code, data_info_list = main(title, abstract, nums)
    if status_code == "400":
        return_text = {"resilt": "", "probabilities": None, "status_code": 400}
    else:
        if status_code == "200":
            return_text = {
                "data_info": data_info_list,
                "probabilities": None,
                "status_code": 200
            }
        else:
            return_text = {"resilt": "", "probabilities": None, "status_code": 400}
    return jsonify(return_text)  # return the result


if __name__ == "__main__":
    app.run(host="0.0.0.0", port=17003, threaded=True)
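For reference, a successful response from handle_query carries the keys built in ulit_recall_paper. A hypothetical example of the payload shape (values are illustrative, based on the sample citation in the docstring above):

# Hypothetical response shape (illustrative values only):
example_response = {
    "data_info": [
        {
            "author": "赵璐",
            "title": "基于旅游资源开发下的新农村景观营建研究",
            "special_topic": "建筑科学与工程",  # hypothetical discipline label
            "qikan_name": "西安建筑科技大学",
            "year": "2014",
            "abstract": "……",
            "classlable": "硕士",
            "reference": "赵璐.基于旅游资源开发下的新农村景观营建研究[D].西安建筑科技大学,2014."
        }
    ],
    "probabilities": None,
    "status_code": 200
}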

generate_references_api_1.py (26 lines changed)

@@ -75,7 +75,7 @@ def panduan_paper_lable(paper_lable_text):
     paper_lable = {
         "硕士": "D",
         "期刊": "J",
-        "博士": "J"
+        "博士": "D"
     }
     return paper_lable[paper_lable_text]
@@ -106,13 +106,23 @@ def ulit_recall_paper(reference_list, nums):
     for data_one in reference_list:
         print("data_one", data_one)
         print("data_one[0]", data_one[0])
-        paper = ".".join([
-            ",".join([str(i).replace("\n", "").replace("\r", "") for i in data_one[0].split(";") if i != ""]),
-            data_one[1] + f"[{panduan_paper_lable(data_one[6])}]",
-            ",".join([
-                data_one[3], str(data_one[4]) + "."
-            ])
-        ])
+        if panduan_paper_lable(data_one[6]) == "J":
+            paper = ".".join([
+                ",".join([str(i).replace("\n", "").replace("\r", "") for i in data_one[0].split(";") if i != ""]),
+                data_one[1] + f"[{panduan_paper_lable(data_one[6])}]",
+                ",".join([
+                    data_one[3], str(data_one[4])
+                ])
+            ]) + "" + f"({data_one[8]})" + f":{data_one[7]}" + "."
+        else:
+            paper = ".".join([
+                ",".join([str(i).replace("\n", "").replace("\r", "") for i in data_one[0].split(";") if i != ""]),
+                data_one[1] + f"[{panduan_paper_lable(data_one[6])}]",
+                ",".join([
+                    data_one[3], str(data_one[4]) + "."
+                ]),
+            ])
         data.append(paper)
@@ -157,7 +167,7 @@ def main(title, abstract, nums):
     index = faiss.read_index(f'data/prompt_qikan_master_doctor_ivf/{subject_pinyin}.ivf')
 
-    with open(f"data/data_info_qikan/{subject_pinyin}.json") as f:
+    with open(f"data/data_info_qikan_1/{subject_pinyin}.json") as f:
         data_info = json.loads(f.read())
     with open(f"data/data_info_master/{subject_pinyin}.json") as f:

博士数据整理.py (74 lines changed)

@@ -0,0 +1,74 @@
import json
from tqdm import tqdm
# json.load()

# with open("t_xuewei_cnki_spider.csv", encoding="utf-8") as f:
#     a = f.read()
# print(a)
import pandas as pd

filename = 'data/spider_latest_doctor_paper_list.csv'
chunksize = 10000  # number of rows to read per chunk; adjust as needed

df_list = []
# iterate over the CSV file in chunks using the chunksize parameter
for chunk in pd.read_csv(filename, chunksize=chunksize):
    # columns: author, title, category, source, year, abstract
    # process each chunk
    # print(chunk.columns)
    # 9 / 0
    df_list_dan = chunk.values.tolist()
    # print(df_list[0])
    for i in tqdm(range(len(df_list_dan))):
        if str(df_list_dan[i][2]) != "nan" and \
                str(df_list_dan[i][1]) != "nan" and \
                str(df_list_dan[i][6]) != "nan" and \
                str(df_list_dan[i][3]) != "nan" and \
                str(df_list_dan[i][4]) != "nan" and \
                str(df_list_dan[i][13]) != "nan":
            df_list.append({
                'author': df_list_dan[i][2],
                'title': df_list_dan[i][1],
                'special_topic': df_list_dan[i][6],
                'qikan_name': df_list_dan[i][3],
                'year': df_list_dan[i][4],
                'abstract': df_list_dan[i][13],
            })
# data = []
# json_list = [
# "/home/majiahui/project/爬取目录筛选/t_xuewei_detail_cnki.json",
# "/home/majiahui/project/爬取目录筛选/t_xuewei_detail_cnki2.json",
# "/home/majiahui/project/爬取目录筛选/t_xuewei_detail_cnki3.json",
# "/home/majiahui/project/爬取目录筛选/t_xuewei_detail_cnki6.json",
# "/home/majiahui/project/爬取目录筛选/t_xuewei_detail_cnki7.json",
# ]
#
#
# print("主库数据完成加载")
# for path in json_list:
# name, typr_file = path.split(".")
# name = name.split("/")[-1]
# a = json.load(open(path))
# for i in a:
# autoid = "_".join([name, str(i['autoid'])])
# if autoid in df_dict:
# data.append([i['f_title']] + df_dict[autoid])
# print("path完成筛选")
#
print(len(df_list))
with open("data/data_0423_doctor.json", "w", encoding="utf-8") as f:
f.write(json.dumps(df_list, ensure_ascii=False, indent=2))
#
# with open("data.json", encoding="utf-8") as f:
# for i in f.readlines():
# a = json.loads(i)
#
#
# print(a)
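Each record written to data/data_0423_doctor.json takes the shape below; the values are hypothetical placeholders, while the keys and CSV column indices come from the loop above.

# Hypothetical record produced by 博士数据整理.py (values are placeholders):
record = {
    "author": "张三",                            # CSV column 2
    "title": "某博士学位论文题目",                 # CSV column 1
    "special_topic": "计算机软件及计算机应用",      # CSV column 6; may hold ";"-separated labels
    "qikan_name": "某大学",                       # CSV column 3
    "year": 2020,                                 # CSV column 4
    "abstract": "……",                             # CSV column 13
}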

博士数据生成ndarray.py (134 lines changed)

@@ -0,0 +1,134 @@
import os
os.environ["CUDA_VISIBLE_DEVICES"] = "1"
import json
import numpy as np
from tqdm import tqdm
from sentence_transformers import SentenceTransformer
import re

model = SentenceTransformer('Dmeta-embedding-zh')
print(1)

with open("data/discipline_types.json", encoding="utf-8") as f:
    lable_discipline_types = json.loads(f.read())


def erjimul_ulit():
    pass


def shengcehng_array(data):
    embs = model.encode(data, normalize_embeddings=True)
    return embs


def is_contain_chinese(word):
    """
    Check whether the string contains Chinese characters.
    :param word: string
    :return: True if it contains Chinese, False otherwise
    """
    pattern = re.compile(r'[\u4e00-\u9fa5]')
    match = pattern.search(word)
    return True if match else False
if __name__ == '__main__':
    # data = []
    with open("data/data_0423_doctor.json", encoding="utf-8") as f:
        # for i in f.readlines():
        #     a = json.loads(i)
        #     data.append(a)
        data = json.loads(f.read())

    print(len(data))
    a = 0
    a_ = 0
    data_info = {}  # author, title, category, source, year, abstract
    data_prompt = {}
    data_info_en = {}  # author, title, category, source, year, abstract
    data_prompt_en = {}
    for data_dan in data:
        if str(data_dan["special_topic"]) == "nan" or \
                str(data_dan["author"]) == "nan" or \
                str(data_dan["title"]) == "nan" or \
                str(data_dan["qikan_name"]) == "nan" or \
                str(data_dan["year"]) == "nan" or \
                str(data_dan["abstract"]) == "nan":
            a_ += 1
            continue
        leibie_list = data_dan["special_topic"].split(";")
        for leibie in leibie_list:
            if leibie in lable_discipline_types:
                zh_bool = is_contain_chinese(data_dan["title"])
                if zh_bool == True:
                    if lable_discipline_types[leibie] not in data_prompt:
                        dan_data_prompt = "标题:“{}”,摘要:“{}".format(data_dan["title"], data_dan["abstract"])
                        data_prompt[lable_discipline_types[leibie]] = [dan_data_prompt]
                        data_info[lable_discipline_types[leibie]] = [
                            [data_dan["author"], data_dan["title"], data_dan["special_topic"], data_dan["qikan_name"],
                             data_dan["year"], data_dan["abstract"], "博士"]]
                    else:
                        dan_data_prompt = "标题:“{}”,摘要:“{}".format(data_dan["title"], data_dan["abstract"])
                        data_prompt[lable_discipline_types[leibie]].append(dan_data_prompt)
                        data_info[lable_discipline_types[leibie]].append(
                            [data_dan["author"], data_dan["title"], data_dan["special_topic"], data_dan["qikan_name"],
                             data_dan["year"], data_dan["abstract"], "博士"])
                else:
                    if lable_discipline_types[leibie] not in data_prompt_en:
                        dan_data_prompt = "标题:“{}”,摘要:“{}".format(data_dan["title"], data_dan["abstract"])
                        data_prompt_en[lable_discipline_types[leibie]] = [dan_data_prompt]
                        data_info_en[lable_discipline_types[leibie]] = [
                            [data_dan["author"], data_dan["title"], data_dan["special_topic"], data_dan["qikan_name"],
                             data_dan["year"], data_dan["abstract"], "博士"]]
                    else:
                        dan_data_prompt = "标题:“{}”,摘要:“{}".format(data_dan["title"], data_dan["abstract"])
                        data_prompt_en[lable_discipline_types[leibie]].append(dan_data_prompt)
                        data_info_en[lable_discipline_types[leibie]].append(
                            [data_dan["author"], data_dan["title"], data_dan["special_topic"], data_dan["qikan_name"],
                             data_dan["year"], data_dan["abstract"], "博士"])
                a += 1
    print(2)

    strat = 0
    end = 10000
    print(len(data_prompt))
    for leibie in tqdm(data_prompt):
        data_ndarray = np.empty((0, 768))
        print("len(data_prompt[leibie])", len(data_prompt[leibie]))
        while True:
            if end >= len(data_prompt[leibie]):
                break
            linshi_data = data_prompt[leibie][strat:end]
            data_ndarray = np.concatenate((data_ndarray, shengcehng_array(linshi_data)))
            print("data_ndarray.shape", data_ndarray.shape)
            strat = end
            end += 10000
        linshi_data = data_prompt[leibie][strat:len(data_prompt[leibie])]
        print("len(linshi_data)", len(linshi_data))
        data_ndarray = np.concatenate((data_ndarray, shengcehng_array(linshi_data)))
        print("data_ndarray.shape", data_ndarray.shape)
        np.save(f'data/prompt_doctor/{leibie}.npy', data_ndarray)
        strat = 0
        end = 10000

    for leibie in data_info:
        print(len(data_info[leibie]))
        with open(f"data/data_info_doctor/{leibie}.json", "w", encoding="utf-8") as f:
            f.write(json.dumps(data_info[leibie], ensure_ascii=False, indent=2))

    for i in data_prompt_en:
        print(i)
        print(len(data_prompt_en[i]))
    print(len(data))
    print(a_)

数据生成ndarray.py (70 lines changed)

@@ -1,27 +1,42 @@
 import os
-os.environ["CUDA_VISIBLE_DEVICES"] = "0"
+os.environ["CUDA_VISIBLE_DEVICES"] = "1"
 import json
 import numpy as np
 from tqdm import tqdm
 from sentence_transformers import SentenceTransformer
+import re
 
 model = SentenceTransformer('Dmeta-embedding-zh')
 print(1)
 
 with open("data/discipline_types.json", encoding="utf-8") as f:
     lable_discipline_types = json.loads(f.read())
 
 
 def erjimul_ulit():
     pass
 
 
 def shengcehng_array(data):
     embs = model.encode(data, normalize_embeddings=True)
     return embs
 
 
+def is_contain_chinese(word):
+    """
+    Check whether the string contains Chinese characters.
+    :param word: string
+    :return: True if it contains Chinese, False otherwise
+    """
+    pattern = re.compile(r'[\u4e00-\u9fa5]')
+    match = pattern.search(word)
+    return True if match else False
+
+
 if __name__ == '__main__':
     # data = []
-    with open("data/data_0416.json", encoding="utf-8") as f:
+    with open("data/data_0423_qikan.json", encoding="utf-8") as f:
         # for i in f.readlines():
         #     a = json.loads(i)
         #     data.append(a)
@@ -34,20 +49,52 @@ if __name__ == '__main__':
     a_ = 0
     data_info = {}  # author, title, category, source, year, abstract
     data_prompt = {}
+    data_info_en = {}  # author, title, category, source, year, abstract
+    data_prompt_en = {}
     for data_dan in data:
-        if str(data_dan["special_topic"]) == "nan":
+        if str(data_dan["special_topic"]) == "nan" or \
+                str(data_dan["author"]) == "nan" or \
+                str(data_dan["title"]) == "nan" or \
+                str(data_dan["qikan_name"]) == "nan" or \
+                str(data_dan["year"]) == "nan" or \
+                str(data_dan["abstract"]) == "nan":
             a_ += 1
             continue
         leibie_list = data_dan["special_topic"].split(";")
         for leibie in leibie_list:
             if leibie in lable_discipline_types:
-                if lable_discipline_types[leibie] not in data_prompt:
-                    data_prompt[lable_discipline_types[leibie]] = ["标题:“{}”,摘要:“{}".format(data_dan["title"], data_dan["abstract"])]
-                    data_info[lable_discipline_types[leibie]] = [[data_dan["author"], data_dan["title"], data_dan["special_topic"], data_dan["qikan_name"], data_dan["year"], data_dan["abstract"], "期刊"]]
-                else:
-                    data_prompt[lable_discipline_types[leibie]].append("标题:“{}”,摘要:“{}".format(data_dan["title"], data_dan["abstract"]))
-                    data_info[lable_discipline_types[leibie]].append([data_dan["author"], data_dan["title"], data_dan["special_topic"], data_dan["qikan_name"], data_dan["year"], data_dan["abstract"], "期刊"])
+                zh_bool = is_contain_chinese(data_dan["title"])
+                if zh_bool == True:
+                    if lable_discipline_types[leibie] not in data_prompt:
+                        dan_data_prompt = "标题:“{}”,摘要:“{}".format(data_dan["title"], data_dan["abstract"])
+                        data_prompt[lable_discipline_types[leibie]] = [dan_data_prompt]
+                        data_info[lable_discipline_types[leibie]] = [
+                            [data_dan["author"], data_dan["title"], data_dan["special_topic"], data_dan["qikan_name"],
+                             data_dan["year"], data_dan["abstract"], "期刊"]]
+                    else:
+                        dan_data_prompt = "标题:“{}”,摘要:“{}".format(data_dan["title"], data_dan["abstract"])
+                        data_prompt[lable_discipline_types[leibie]].append(dan_data_prompt)
+                        data_info[lable_discipline_types[leibie]].append(
+                            [data_dan["author"], data_dan["title"], data_dan["special_topic"], data_dan["qikan_name"],
+                             data_dan["year"], data_dan["abstract"], "期刊"])
+                else:
+                    if lable_discipline_types[leibie] not in data_prompt_en:
+                        dan_data_prompt = "标题:“{}”,摘要:“{}".format(data_dan["title"], data_dan["abstract"])
+                        data_prompt_en[lable_discipline_types[leibie]] = [dan_data_prompt]
+                        data_info_en[lable_discipline_types[leibie]] = [
+                            [data_dan["author"], data_dan["title"], data_dan["special_topic"], data_dan["qikan_name"],
+                             data_dan["year"], data_dan["abstract"], "期刊"]]
+                    else:
+                        dan_data_prompt = "标题:“{}”,摘要:“{}".format(data_dan["title"], data_dan["abstract"])
+                        data_prompt_en[lable_discipline_types[leibie]].append(dan_data_prompt)
+                        data_info_en[lable_discipline_types[leibie]].append(
+                            [data_dan["author"], data_dan["title"], data_dan["special_topic"], data_dan["qikan_name"],
+                             data_dan["year"], data_dan["abstract"], "期刊"])
                 a += 1
     print(2)
@@ -78,3 +125,10 @@ if __name__ == '__main__':
         print(len(data_info[leibie]))
         with open(f"data/data_info_qikan/{leibie}.json", "w", encoding="utf-8") as f:
             f.write(json.dumps(data_info[leibie], ensure_ascii=False, indent=2))
+
+    for i in data_prompt_en:
+        print(i)
+        print(len(data_prompt_en[i]))
+    print(len(data))
+    print(a_)

期刊数据整理.py (2 lines changed)

@@ -54,7 +54,7 @@ for chunk in pd.read_csv(filename, chunksize=chunksize):
 # print("path完成筛选")
 #
 #
-with open("data/data_0416.json", "w") as f:
+with open("data/data_0423_qikan.json", "w") as f:
     f.write(json.dumps(df_list, ensure_ascii=False, indent=2))
 #

训练faiss.py (8 lines changed)

@@ -15,6 +15,12 @@ a = 0
 for leibie_zh in lable_discipline_types:
     xb = np.load(f'data/prompt_qikan/{lable_discipline_types[leibie_zh]}.npy')
     xb_1 = np.load(f'data/prompt_master/{lable_discipline_types[leibie_zh]}.npy')
+    xb_2 = np.load(f'data/prompt_doctor/{lable_discipline_types[leibie_zh]}.npy')
+    print("xb.shape", xb.shape)
+    print("xb_1.shape", xb_1.shape)
+    print("xb_2.shape", xb_2.shape)
+    xb = np.concatenate((xb, xb_1, xb_2))
     # nlist = math.floor((len(lable_discipline_types[leibie_zh]) ** 0.5))  # number of clusters
     # print(leibie_zh)
@@ -30,5 +36,5 @@ for leibie_zh in lable_discipline_types:
     assert not index.is_trained
     index.train(xb)  # IndexIVFFlat needs training; this step learns the clustering
     assert index.is_trained
-    faiss.write_index(index, f'data/prompt_qikan_ivf/{lable_discipline_types[leibie_zh]}.ivf')
+    faiss.write_index(index, f'data/prompt_qikan_master_doctor_ivf/{lable_discipline_types[leibie_zh]}.ivf')
 print(a)
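For context, 训练faiss.py follows the standard faiss IVF training recipe. A minimal self-contained sketch with random stand-in vectors and a made-up nlist (the quantizer choice here is illustrative, not taken from the repo):

import faiss
import numpy as np

d = 768  # embedding dimension used throughout this project
xb = np.random.random((5000, d)).astype("float32")  # stand-in for the concatenated .npy embeddings

nlist = 100  # hypothetical number of IVF clusters
quantizer = faiss.IndexFlatL2(d)  # coarse quantizer; illustrative choice
index = faiss.IndexIVFFlat(quantizer, d, nlist)

assert not index.is_trained
index.train(xb)  # learn the coarse clustering
assert index.is_trained

faiss.write_index(index, "example.ivf")  # later: read_index, add vectors, then search, as in main()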