
8 changed files with 546 additions and 18 deletions
@ -0,0 +1,14 @@

## Reference Generation

The new references project queries with a faiss cluster-based (IVF) index, achieving second-level response times.

#### Generate the ndarray data and the basic bibliographic information

Modify or add the data, then run:

python 数据生成ndarray.py

#### Train faiss

Modify or add the .npy file paths, then run (a sketch of this indexing step follows below):

python 训练faiss.py
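Since 训练faiss.py itself is not part of this diff, the following is only a minimal sketch of what the indexing step is assumed to do: load the 768-dimensional embeddings produced by the generation script, train an IVF index on them, and write it to the path that generate_references_api.py later reads. The subject name `zidonghua` and the cluster count `nlist` are illustrative.

```python
import faiss
import numpy as np

d = 768        # embedding dimension used throughout this project
nlist = 1024   # number of IVF clusters (illustrative value)

# Example .npy produced by 数据生成ndarray.py; the subject pinyin is a placeholder.
xb = np.load("data/prompt_doctor/zidonghua.npy").astype("float32")

# Embeddings are saved with normalize_embeddings=True, so inner product ~ cosine.
quantizer = faiss.IndexFlatIP(d)
index = faiss.IndexIVFFlat(quantizer, d, nlist, faiss.METRIC_INNER_PRODUCT)
index.train(xb)   # learn the cluster centroids only

# The API adds the vectors itself at query time (index.add), so a trained but
# empty index is enough here.
faiss.write_index(index, "data/prompt_qikan_master_doctor_ivf/zidonghua.ivf")
```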

#### Deploy the service

python generate_references_api.py
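Once the service is running it listens on port 17003 and accepts a form-encoded POST with the paper `title` and the number of references wanted (`nums`); a minimal client call might look like this (host and values are illustrative):

```python
import requests

resp = requests.post(
    "http://127.0.0.1:17003/",
    data={"title": "基于旅游资源开发下的新农村景观营建研究", "nums": 10},
    timeout=60,
)
# Expected shape: {"data_info": [...], "probabilities": null, "status_code": 200}
print(resp.json())
```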
@ -0,0 +1,236 @@

import os
import random

os.environ["CUDA_VISIBLE_DEVICES"] = "0"
from flask import Flask, jsonify
from flask import request
import numpy as np
import faiss
import json
import requests
import socket
from sentence_transformers import SentenceTransformer


# Label mappings and the discipline-type table used to route a paper to its subject index.
with open("data/lable/id2lable.json", encoding="utf-8") as f:
    id2lable = json.loads(f.read())

with open("data/lable/lable2id.json", encoding="utf-8") as f:
    lable2id = json.loads(f.read())

with open("data/discipline_types.json") as f:
    lable_discipline_types = json.loads(f.read())


app = Flask(__name__)
app.config["JSON_AS_ASCII"] = False

d = 768  # embedding dimension
model = SentenceTransformer('Dmeta-embedding-zh')


def get_host_ip():
    """
    Look up the local IP address.
    :return: ip
    """
    try:
        s = socket.socket(socket.AF_INET, socket.SOCK_DGRAM)
        s.connect(('8.8.8.8', 80))
        ip = s.getsockname()[0]
    finally:
        s.close()

    return ip


# url = "http://{}:50003/roformer".format(str(get_host_ip()))
url = "http://{}:50003/roformer".format("192.168.31.149")

def dialog_line_parse(url, text):
    """
    Send the data to the classification model and return its result.
    :param url: model url
    :param text: payload sent to the model
    :return: model response
    """

    response = requests.post(
        url,
        json=text,
        timeout=1000
    )
    if response.status_code == 200:
        return response.json()
    else:
        # logger.error(
        #     "【{}】 Failed to get a proper response from remote "
        #     "server. Status Code: {}. Response: {}"
        #     "".format(url, response.status_code, response.text)
        # )
        print("【{}】 Failed to get a proper response from remote "
              "server. Status Code: {}. Response: {}"
              "".format(url, response.status_code, response.text))
        print(text)
        return []

def panduan_paper_lable(paper_lable_text):
    # GB/T 7714 type codes: dissertations (master's and doctoral) are [D], journal articles are [J].
    paper_lable = {
        "硕士": "D",
        "期刊": "J",
        "博士": "D"
    }
    return paper_lable[paper_lable_text]

def ulit_recall_paper(reference_list, nums):
    '''
    Parse the recalled papers and build the formatted reference strings.
    :param reference_list: recalled rows in the order (author, title, special_topic, source, year, abstract, type)
    :param nums: number of references to return
    :return: data_info, a list of dicts including the formatted "reference" field
    '''

    # data = []
    # for path in recall_data_list_path:
    #     filename = path.split("/")[-1]
    #     with open(path, encoding="gbk") as f:
    #         text = f.read()
    #     text_list = text.split("\n")
    #     for sentence in text_list:
    #         if sentence != "":
    #             data.append([sentence, filename])
    # return data

    # recall_data_list
    # Fields: author, title, category, source, year, abstract, journal/type
    # Target format: "[1]赵璐.基于旅游资源开发下的新农村景观营建研究[D].西安建筑科技大学,2014."

    data_info = []
    data_title = []

    for data_one in reference_list:

        if data_one[1] not in data_title:

            print("data_one", data_one)
            print("data_one[0]", data_one[0])
            paper = ".".join([
                ",".join([str(i).replace("\n", "").replace("\r", "") for i in data_one[0].split(";") if i != ""]),
                data_one[1] + f"[{panduan_paper_lable(data_one[6])}]",
                ",".join([
                    data_one[3], str(data_one[4]) + "."
                ])
            ])

            data_title.append(data_one[1])
            data_info.append({
                "author": data_one[0],
                "title": data_one[1],
                "special_topic": data_one[2],
                "qikan_name": data_one[3],
                "year": str(data_one[4]),
                "abstract": data_one[5],
                "classlable": data_one[6],
                "reference": paper
            })

    # print(data)
    print(data_title)
    print(nums)
    random.shuffle(data_info)
    random.shuffle(data_info)
    data_info = data_info[:int(nums)]
    return data_info

def main(title, abstract, nums):
    data = {
        "title": title,
        "abst_zh": abstract,
        "content": ""
    }
    # Example response from the classification service:
    # {
    #     "label_num": [
    #         117,
    #         143
    #     ]
    # }
    result = dialog_line_parse(url, data)

    # print(result['label_num'][0])
    # print(id2lable[result['label_num'][0]])
    subject_pinyin = lable_discipline_types[id2lable[str(result['label_num'][0])]]

    # with open(f"data/prompt/{subject_pinyin}.npy") as :
    # zidonghua = np.load('data/prompt/{subject_pinyin}.npy')

    # Load the journal, master's and doctoral embeddings for the predicted subject.
    data_subject = np.load(f"data/prompt_qikan/{subject_pinyin}.npy")
    data_subject_1 = np.load(f"data/prompt_master/{subject_pinyin}.npy")
    data_subject_2 = np.load(f"data/prompt_doctor/{subject_pinyin}.npy")
    print("xb.shape", data_subject.shape)
    print("xb_1.shape", data_subject_1.shape)
    print("xb_2.shape", data_subject_2.shape)
    data_subject = np.concatenate((data_subject, data_subject_1, data_subject_2))
    print("data_subject.shape", data_subject.shape)

    index = faiss.read_index(f'data/prompt_qikan_master_doctor_ivf/{subject_pinyin}.ivf')

    with open(f"data/data_info_qikan/{subject_pinyin}.json") as f:
        data_info = json.loads(f.read())

    with open(f"data/data_info_master/{subject_pinyin}.json") as f:
        data_info_1 = json.loads(f.read())

    with open(f"data/data_info_doctor/{subject_pinyin}.json") as f:
        data_info_2 = json.loads(f.read())

    print(len(data_info))
    print(len(data_info_1))
    print(len(data_info_2))
    data_info = data_info + data_info_1 + data_info_2
    print(len(data_info))
    print(data_info[0])
    # Fill the pre-trained IVF index with the subject's vectors; row i corresponds to data_info[i].
    index.add(data_subject)
    # index.nprobe = 2  # default nprobe is 1, try a few more
    # k = nums
    k = 20
    prompt = "标题:“{}”,摘要:“{}”".format(title, abstract)
    embs = model.encode([prompt], normalize_embeddings=True)

    D, I = index.search(embs, int(k))
    # print(I)

    reference_list = []
    for i in I[0]:
        reference_list.append(data_info[i])

    data_info = ulit_recall_paper(reference_list, nums)
    return "200", data_info

@app.route("/", methods=["POST"])
def handle_query():
    # try:
    title = request.form.get("title")
    abstract = ""
    nums = request.form.get('nums')

    # content = ulit_request_file(file)

    status_code, data_info_list = main(title, abstract, nums)

    if status_code == "200":
        return_text = {
            "data_info": data_info_list,
            "probabilities": None,
            "status_code": 200
        }
    else:
        return_text = {"result": "", "probabilities": None, "status_code": 400}

    return jsonify(return_text)  # return the result


if __name__ == "__main__":
    app.run(host="0.0.0.0", port=17003, threaded=True)
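For clarity, given one recalled row in the (author, title, special_topic, source, year, abstract, type) layout used above, ulit_recall_paper assembles the "reference" field roughly as follows; the row values echo the example citation in the code comments, with placeholders ("……") for fields not shown there:

```python
# Hypothetical recalled row: author, title, special_topic, source, year, abstract, type.
row = ["赵璐", "基于旅游资源开发下的新农村景观营建研究", "……", "西安建筑科技大学", 2014, "……", "硕士"]

authors = ",".join(a for a in row[0].split(";") if a)
reference = ".".join([authors, row[1] + "[D]", ",".join([row[3], str(row[4]) + "."])])
print(reference)  # 赵璐.基于旅游资源开发下的新农村景观营建研究[D].西安建筑科技大学,2014.
```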
@ -0,0 +1,74 @@

import json
from tqdm import tqdm
# json.load()

# with open("t_xuewei_cnki_spider.csv", encoding="utf-8") as f:
#     a = f.read()
# print(a)

import pandas as pd

filename = 'data/spider_latest_doctor_paper_list.csv'
chunksize = 10000  # number of rows to read per chunk; adjust as needed

df_list = []
# Iterate over the CSV file in chunks using the chunksize parameter.
for chunk in pd.read_csv(filename, chunksize=chunksize):
    # Columns of interest: author, title, category, source, year, abstract.

    # Process each chunk.

    # print(chunk.columns)
    # 9 / 0
    df_list_dan = chunk.values.tolist()
    # print(df_list[0])
    for i in tqdm(range(len(df_list_dan))):
        # Keep only rows where every required field is present.
        if str(df_list_dan[i][2]) != "nan" and \
                str(df_list_dan[i][1]) != "nan" and \
                str(df_list_dan[i][6]) != "nan" and \
                str(df_list_dan[i][3]) != "nan" and \
                str(df_list_dan[i][4]) != "nan" and \
                str(df_list_dan[i][13]) != "nan":

            df_list.append({
                'author': df_list_dan[i][2],
                'title': df_list_dan[i][1],
                'special_topic': df_list_dan[i][6],
                'qikan_name': df_list_dan[i][3],
                'year': df_list_dan[i][4],
                'abstract': df_list_dan[i][13],
            })

# data = []
# json_list = [
#     "/home/majiahui/project/爬取目录筛选/t_xuewei_detail_cnki.json",
#     "/home/majiahui/project/爬取目录筛选/t_xuewei_detail_cnki2.json",
#     "/home/majiahui/project/爬取目录筛选/t_xuewei_detail_cnki3.json",
#     "/home/majiahui/project/爬取目录筛选/t_xuewei_detail_cnki6.json",
#     "/home/majiahui/project/爬取目录筛选/t_xuewei_detail_cnki7.json",
# ]
#
#
# print("主库数据完成加载")
# for path in json_list:
#     name, typr_file = path.split(".")
#     name = name.split("/")[-1]
#     a = json.load(open(path))
#     for i in a:
#         autoid = "_".join([name, str(i['autoid'])])
#         if autoid in df_dict:
#             data.append([i['f_title']] + df_dict[autoid])
#     print("path完成筛选")
#

print(len(df_list))
with open("data/data_0423_doctor.json", "w", encoding="utf-8") as f:
    f.write(json.dumps(df_list, ensure_ascii=False, indent=2))

#
# with open("data.json", encoding="utf-8") as f:
#     for i in f.readlines():
#         a = json.loads(i)
#
#
# print(a)
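Each record written to data/data_0423_doctor.json therefore has the following shape (all values below are placeholders); 数据生成ndarray.py later reads this file to build the per-discipline embeddings:

```python
{
    "author": "……",
    "title": "……",
    "special_topic": "……;……",   # semicolon-separated discipline names
    "qikan_name": "……",          # granting institution / source
    "year": 2014,
    "abstract": "……",
}
```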
@ -0,0 +1,134 @@

import os

os.environ["CUDA_VISIBLE_DEVICES"] = "1"
import json
import numpy as np
from tqdm import tqdm
from sentence_transformers import SentenceTransformer
import re

model = SentenceTransformer('Dmeta-embedding-zh')
print(1)
with open("data/discipline_types.json", encoding="utf-8") as f:
    lable_discipline_types = json.loads(f.read())


def erjimul_ulit():
    pass


def shengcehng_array(data):
    # Encode a batch of prompts into L2-normalized 768-d embeddings.
    embs = model.encode(data, normalize_embeddings=True)
    return embs


def is_contain_chinese(word):
    """
    Check whether the string contains Chinese characters.
    :param word: string
    :return: True if it contains Chinese characters, otherwise False
    """
    pattern = re.compile(r'[\u4e00-\u9fa5]')
    match = pattern.search(word)
    return True if match else False

if __name__ == '__main__':

    # data = []
    with open("data/data_0423_doctor.json", encoding="utf-8") as f:
        # for i in f.readlines():
        #     a = json.loads(i)
        #     data.append(a)
        data = json.loads(f.read())

    print(len(data))

    a = 0

    a_ = 0
    data_info = {}    # author, title, category, source, year, abstract
    data_prompt = {}

    data_info_en = {}    # same fields, for titles without Chinese characters
    data_prompt_en = {}

    for data_dan in data:
        # Skip records with any missing field.
        if str(data_dan["special_topic"]) == "nan" or \
                str(data_dan["author"]) == "nan" or \
                str(data_dan["title"]) == "nan" or \
                str(data_dan["qikan_name"]) == "nan" or \
                str(data_dan["year"]) == "nan" or \
                str(data_dan["abstract"]) == "nan":
            a_ += 1
            continue

        # A paper can belong to several disciplines, separated by ";".
        leibie_list = data_dan["special_topic"].split(";")
        for leibie in leibie_list:
            if leibie in lable_discipline_types:
                zh_bool = is_contain_chinese(data_dan["title"])

                if zh_bool == True:
                    if lable_discipline_types[leibie] not in data_prompt:
                        dan_data_prompt = "标题:“{}”,摘要:“{}”".format(data_dan["title"], data_dan["abstract"])
                        data_prompt[lable_discipline_types[leibie]] = [dan_data_prompt]
                        data_info[lable_discipline_types[leibie]] = [
                            [data_dan["author"], data_dan["title"], data_dan["special_topic"], data_dan["qikan_name"],
                             data_dan["year"], data_dan["abstract"], "博士"]]
                    else:
                        dan_data_prompt = "标题:“{}”,摘要:“{}”".format(data_dan["title"], data_dan["abstract"])
                        data_prompt[lable_discipline_types[leibie]].append(dan_data_prompt)
                        data_info[lable_discipline_types[leibie]].append(
                            [data_dan["author"], data_dan["title"], data_dan["special_topic"], data_dan["qikan_name"],
                             data_dan["year"], data_dan["abstract"], "博士"])
                else:
                    if lable_discipline_types[leibie] not in data_prompt_en:
                        dan_data_prompt = "标题:“{}”,摘要:“{}”".format(data_dan["title"], data_dan["abstract"])
                        data_prompt_en[lable_discipline_types[leibie]] = [dan_data_prompt]
                        data_info_en[lable_discipline_types[leibie]] = [
                            [data_dan["author"], data_dan["title"], data_dan["special_topic"], data_dan["qikan_name"],
                             data_dan["year"], data_dan["abstract"], "博士"]]
                    else:
                        dan_data_prompt = "标题:“{}”,摘要:“{}”".format(data_dan["title"], data_dan["abstract"])
                        data_prompt_en[lable_discipline_types[leibie]].append(dan_data_prompt)
                        data_info_en[lable_discipline_types[leibie]].append(
                            [data_dan["author"], data_dan["title"], data_dan["special_topic"], data_dan["qikan_name"],
                             data_dan["year"], data_dan["abstract"], "博士"])

        a += 1

    print(2)
    start = 0
    end = 10000
    print(len(data_prompt))
    # Encode each discipline's prompts in batches of 10000 and save the embeddings.
    for leibie in tqdm(data_prompt):
        data_ndarray = np.empty((0, 768), dtype=np.float32)  # keep float32, which is what faiss stores
        print("len(data_prompt[leibie])", len(data_prompt[leibie]))
        while True:
            if end >= len(data_prompt[leibie]):
                break
            linshi_data = data_prompt[leibie][start:end]
            data_ndarray = np.concatenate((data_ndarray, shengcehng_array(linshi_data)))
            print("data_ndarray.shape", data_ndarray.shape)
            start = end
            end += 10000

        linshi_data = data_prompt[leibie][start:len(data_prompt[leibie])]
        print("len(linshi_data)", len(linshi_data))
        data_ndarray = np.concatenate((data_ndarray, shengcehng_array(linshi_data)))
        print("data_ndarray.shape", data_ndarray.shape)
        np.save(f'data/prompt_doctor/{leibie}.npy', data_ndarray)
        start = 0
        end = 10000

    for leibie in data_info:
        print(len(data_info[leibie]))
        with open(f"data/data_info_doctor/{leibie}.json", "w", encoding="utf-8") as f:
            f.write(json.dumps(data_info[leibie], ensure_ascii=False, indent=2))

    for i in data_prompt_en:
        print(i)
        print(len(data_prompt_en[i]))

    print(len(data))
    print(a_)
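The API in generate_references_api.py looks up faiss hits by position, so the i-th row of data/prompt_doctor/{leibie}.npy must correspond to the i-th entry of data/data_info_doctor/{leibie}.json. A quick sanity check after running this script might look like this (the subject name zidonghua is only an example):

```python
import json
import numpy as np

leibie = "zidonghua"  # example subject pinyin; use any discipline actually generated
embs = np.load(f"data/prompt_doctor/{leibie}.npy")
with open(f"data/data_info_doctor/{leibie}.json", encoding="utf-8") as f:
    info = json.load(f)

# The embeddings and the metadata must stay index-aligned.
assert embs.shape[0] == len(info), (embs.shape[0], len(info))
print(embs.shape, len(info))
```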