
8 changed files with 546 additions and 18 deletions
@@ -0,0 +1,14 @@

## Reference Generation

The new reference project queries a clustered faiss index, returning results within seconds.


#### Generate the ndarray data and the basic paper info

Modify or add the data, then run:

python 数据生成ndarray.py


#### Train faiss

Modify or add the .npy file paths, then run:

python 训练faiss.py

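For orientation, here is a minimal sketch of what the training step might look like; 训练faiss.py itself is not part of this excerpt. It assumes an IVF index over the 768-dimensional, normalized Dmeta-embedding-zh vectors with an inner-product metric, trained per subject and saved without vectors, since generate_references_api.py calls index.add() itself after faiss.read_index(). The nlist value and the zidonghua subject name are illustrative assumptions.

```python
# Hypothetical sketch of 训练faiss.py -- not the actual file in this commit.
import faiss
import numpy as np

d = 768                # embedding dimension used throughout the project
nlist = 1024           # number of IVF clusters (assumed value, tune to corpus size)
subject = "zidonghua"  # example subject pinyin (assumed)

# Stack the journal / master / doctor embeddings for one subject.
xb = np.concatenate([
    np.load(f"data/prompt_qikan/{subject}.npy"),
    np.load(f"data/prompt_master/{subject}.npy"),
    np.load(f"data/prompt_doctor/{subject}.npy"),
]).astype("float32")

# Vectors are L2-normalized, so inner product equals cosine similarity.
quantizer = faiss.IndexFlatIP(d)
index = faiss.IndexIVFFlat(quantizer, d, nlist, faiss.METRIC_INNER_PRODUCT)
index.train(xb)  # learn the cluster centroids only; the API adds vectors at query time

faiss.write_index(index, f"data/prompt_qikan_master_doctor_ivf/{subject}.ivf")
```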

#### Deploy the service

python generate_references_api.py
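Once the service is up, it accepts a POST form request. The sketch below is a minimal example client based on the Flask handler in generate_references_api.py (form fields `title` and `nums`, port 17003); the host address and the example title are assumptions for illustration.

```python
# Minimal example client for the reference-generation service (assumed host).
import requests

resp = requests.post(
    "http://127.0.0.1:17003/",
    data={"title": "基于旅游资源开发下的新农村景观营建研究", "nums": 10},
    timeout=60,
)
print(resp.json())
# Expected shape on success:
# {"data_info": [...], "probabilities": null, "status_code": 200}
```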
@@ -0,0 +1,236 @@

import os
import random

os.environ["CUDA_VISIBLE_DEVICES"] = "0"
from flask import Flask, jsonify
from flask import request
import numpy as np
import faiss
import json
import requests
import socket
from sentence_transformers import SentenceTransformer


with open("data/lable/id2lable.json", encoding="utf-8") as f:
    id2lable = json.loads(f.read())

with open("data/lable/lable2id.json", encoding="utf-8") as f:
    lable2id = json.loads(f.read())

with open("data/discipline_types.json", encoding="utf-8") as f:
    lable_discipline_types = json.loads(f.read())


app = Flask(__name__)
app.config["JSON_AS_ASCII"] = False

d = 768  # embedding dimension
model = SentenceTransformer('Dmeta-embedding-zh')


def get_host_ip():
    """
    Look up the local IP address.
    :return: ip
    """
    s = socket.socket(socket.AF_INET, socket.SOCK_DGRAM)
    try:
        s.connect(('8.8.8.8', 80))
        ip = s.getsockname()[0]
    finally:
        s.close()

    return ip


# url = "http://{}:50003/roformer".format(str(get_host_ip()))
url = "http://{}:50003/roformer".format("192.168.31.149")


def dialog_line_parse(url, text):
    """
    Send the data to the classification model and return its result.
    :param url: model url
    :param text: payload sent to the model
    :return: the model's response
    """

    response = requests.post(
        url,
        json=text,
        timeout=1000
    )
    if response.status_code == 200:
        return response.json()
    else:
        # logger.error(
        #     "【{}】 Failed to get a proper response from remote "
        #     "server. Status Code: {}. Response: {}"
        #     "".format(url, response.status_code, response.text)
        # )
        print("【{}】 Failed to get a proper response from remote "
              "server. Status Code: {}. Response: {}"
              "".format(url, response.status_code, response.text))
        print(text)
        return []


def panduan_paper_lable(paper_lable_text):
    # Map the paper type to its GB/T 7714 reference-type code:
    # dissertations (master and doctor) are "D", journal articles are "J".
    paper_lable = {
        "硕士": "D",
        "期刊": "J",
        "博士": "D"
    }
    return paper_lable[paper_lable_text]


def ulit_recall_paper(reference_list, nums):
    '''
    Parse the recalled papers: deduplicate by title, build a reference string
    for each, then sample `nums` of them.
    :param reference_list: list of [author, title, special_topic, qikan_name, year, abstract, classlable]
    :return data: list of dicts with the paper fields plus a formatted "reference" string
    '''

    # data = []
    # for path in recall_data_list_path:
    #     filename = path.split("/")[-1]
    #     with open(path, encoding="gbk") as f:
    #         text = f.read()
    #     text_list = text.split("\n")
    #     for sentence in text_list:
    #         if sentence != "":
    #             data.append([sentence, filename])
    # return data

    # recall_data_list
    # fields: author, title, category, source, year, abstract, type
    # example: "[1]赵璐.基于旅游资源开发下的新农村景观营建研究[D].西安建筑科技大学,2014."

    data_info = []
    data_title = []

    for data_one in reference_list:

        if data_one[1] not in data_title:

            print("data_one", data_one)
            print("data_one[0]", data_one[0])
            # Build "authors.title[type].source,year."
            paper = ".".join([
                ",".join([str(i).replace("\n", "").replace("\r", "") for i in data_one[0].split(";") if i != ""]),
                data_one[1] + f"[{panduan_paper_lable(data_one[6])}]",
                ",".join([
                    data_one[3], str(data_one[4]) + "."
                ])
            ])

            data_title.append(data_one[1])
            data_info.append({
                "author": data_one[0],
                "title": data_one[1],
                "special_topic": data_one[2],
                "qikan_name": data_one[3],
                "year": str(data_one[4]),
                "abstract": data_one[5],
                "classlable": data_one[6],
                "reference": paper
            })

    # print(data)
    print(data_title)
    print(nums)
    random.shuffle(data_info)
    random.shuffle(data_info)
    data_info = data_info[:int(nums)]
    return data_info


def main(title, abstract, nums):
    data = {
        "title": title,
        "abst_zh": abstract,
        "content": ""
    }
    # expected classifier response, e.g.:
    # {
    #     "label_num": [
    #         117,
    #         143
    #     ]
    # }
    result = dialog_line_parse(url, data)

    # print(result['label_num'][0])
    # print(id2lable[result['label_num'][0]])
    subject_pinyin = lable_discipline_types[id2lable[str(result['label_num'][0])]]

    # with open(f"data/prompt/{subject_pinyin}.npy") as :
    #     zidonghua = np.load('data/prompt/{subject_pinyin}.npy')

    # Load the journal, master and doctor embeddings for the predicted subject.
    data_subject = np.load(f"data/prompt_qikan/{subject_pinyin}.npy")
    data_subject_1 = np.load(f"data/prompt_master/{subject_pinyin}.npy")
    data_subject_2 = np.load(f"data/prompt_doctor/{subject_pinyin}.npy")
    print("xb.shape", data_subject.shape)
    print("xb_1.shape", data_subject_1.shape)
    print("xb_2.shape", data_subject_2.shape)
    data_subject = np.concatenate((data_subject, data_subject_1, data_subject_2))
    print("data_subject.shape", data_subject.shape)

    # Read the per-subject IVF index, then add the vectors before searching.
    index = faiss.read_index(f'data/prompt_qikan_master_doctor_ivf/{subject_pinyin}.ivf')

    with open(f"data/data_info_qikan/{subject_pinyin}.json", encoding="utf-8") as f:
        data_info = json.loads(f.read())

    with open(f"data/data_info_master/{subject_pinyin}.json", encoding="utf-8") as f:
        data_info_1 = json.loads(f.read())

    with open(f"data/data_info_doctor/{subject_pinyin}.json", encoding="utf-8") as f:
        data_info_2 = json.loads(f.read())

    print(len(data_info))
    print(len(data_info_1))
    print(len(data_info_2))
    data_info = data_info + data_info_1 + data_info_2
    print(len(data_info))
    print(data_info[0])
    index.add(data_subject)
    # index.nprobe = 2  # default nprobe is 1, try a few more
    # k = nums
    k = 20
    prompt = "标题:“{}”,摘要:“{}”".format(title, abstract)
    embs = model.encode([prompt], normalize_embeddings=True)

    D, I = index.search(embs, int(k))
    # print(I)

    reference_list = []
    for i in I[0]:
        reference_list.append(data_info[i])

    data_info = ulit_recall_paper(reference_list, nums)
    return "200", data_info


@app.route("/", methods=["POST"])
def handle_query():
    # try:
    title = request.form.get("title")
    abstract = ""
    nums = request.form.get('nums')

    # content = ulit_request_file(file)

    status_code, data_info_list = main(title, abstract, nums)

    if status_code == "400":
        return_text = {"resilt": "", "probabilities": None, "status_code": 400}
    else:
        if status_code == "200":
            return_text = {
                "data_info": data_info_list,
                "probabilities": None,
                "status_code": 200
            }
        else:
            return_text = {"resilt": "", "probabilities": None, "status_code": 400}

    return jsonify(return_text)  # return the result


if __name__ == "__main__":
    app.run(host="0.0.0.0", port=17003, threaded=True)
@@ -0,0 +1,74 @@

import json
from tqdm import tqdm
# json.load()

# with open("t_xuewei_cnki_spider.csv", encoding="utf-8") as f:
#     a = f.read()
#     print(a)

import pandas as pd

filename = 'data/spider_latest_doctor_paper_list.csv'
chunksize = 10000  # number of rows to read per chunk; adjust as needed

df_list = []
# Read the CSV file iteratively using the chunksize parameter
for chunk in pd.read_csv(filename, chunksize=chunksize):
    # columns used: author, title, category, source, year, abstract

    # process each chunk

    # print(chunk.columns)
    # 9 / 0
    df_list_dan = chunk.values.tolist()
    # print(df_list[0])
    for i in tqdm(range(len(df_list_dan))):
        # Keep only rows where every required column is present (not NaN).
        if str(df_list_dan[i][2]) != "nan" and \
                str(df_list_dan[i][1]) != "nan" and \
                str(df_list_dan[i][6]) != "nan" and \
                str(df_list_dan[i][3]) != "nan" and \
                str(df_list_dan[i][4]) != "nan" and \
                str(df_list_dan[i][13]) != "nan":

            df_list.append({
                'author': df_list_dan[i][2],
                'title': df_list_dan[i][1],
                'special_topic': df_list_dan[i][6],
                'qikan_name': df_list_dan[i][3],
                'year': df_list_dan[i][4],
                'abstract': df_list_dan[i][13],
            })

# data = []
# json_list = [
#     "/home/majiahui/project/爬取目录筛选/t_xuewei_detail_cnki.json",
#     "/home/majiahui/project/爬取目录筛选/t_xuewei_detail_cnki2.json",
#     "/home/majiahui/project/爬取目录筛选/t_xuewei_detail_cnki3.json",
#     "/home/majiahui/project/爬取目录筛选/t_xuewei_detail_cnki6.json",
#     "/home/majiahui/project/爬取目录筛选/t_xuewei_detail_cnki7.json",
# ]
#
#
# print("main corpus loaded")
# for path in json_list:
#     name, typr_file = path.split(".")
#     name = name.split("/")[-1]
#     a = json.load(open(path))
#     for i in a:
#         autoid = "_".join([name, str(i['autoid'])])
#         if autoid in df_dict:
#             data.append([i['f_title']] + df_dict[autoid])
#     print("path filtered")
#

print(len(df_list))
with open("data/data_0423_doctor.json", "w", encoding="utf-8") as f:
    f.write(json.dumps(df_list, ensure_ascii=False, indent=2))

#
# with open("data.json", encoding="utf-8") as f:
#     for i in f.readlines():
#         a = json.loads(i)
#
#
#     print(a)
@@ -0,0 +1,134 @@

import os

os.environ["CUDA_VISIBLE_DEVICES"] = "1"
import json
import numpy as np
from tqdm import tqdm
from sentence_transformers import SentenceTransformer
import re

model = SentenceTransformer('Dmeta-embedding-zh')
print(1)
with open("data/discipline_types.json", encoding="utf-8") as f:
    lable_discipline_types = json.loads(f.read())


def erjimul_ulit():
    pass


def shengcehng_array(data):
    # Encode a batch of prompts into normalized 768-dim embeddings.
    embs = model.encode(data, normalize_embeddings=True)
    return embs


def is_contain_chinese(word):
    """
    Check whether a string contains Chinese characters.
    :param word: the string to check
    :return: True if it contains Chinese characters, otherwise False
    """
    pattern = re.compile(r'[\u4e00-\u9fa5]')
    match = pattern.search(word)
    return True if match else False


if __name__ == '__main__':

    # data = []
    with open("data/data_0423_doctor.json", encoding="utf-8") as f:
        # for i in f.readlines():
        #     a = json.loads(i)
        #     data.append(a)
        data = json.loads(f.read())

    print(len(data))

    a = 0

    a_ = 0
    data_info = {}  # author, title, category, source, year, abstract
    data_prompt = {}

    data_info_en = {}  # author, title, category, source, year, abstract
    data_prompt_en = {}

    for data_dan in data:
        # Skip records where any required field is missing.
        if str(data_dan["special_topic"]) == "nan" or \
                str(data_dan["author"]) == "nan" or \
                str(data_dan["title"]) == "nan" or \
                str(data_dan["qikan_name"]) == "nan" or \
                str(data_dan["year"]) == "nan" or \
                str(data_dan["abstract"]) == "nan":
            a_ += 1
            continue

        leibie_list = data_dan["special_topic"].split(";")
        for leibie in leibie_list:
            if leibie in lable_discipline_types:
                zh_bool = is_contain_chinese(data_dan["title"])

                if zh_bool == True:
                    # Chinese titles go into the main per-subject buckets.
                    if lable_discipline_types[leibie] not in data_prompt:
                        dan_data_prompt = "标题:“{}”,摘要:“{}”".format(data_dan["title"], data_dan["abstract"])
                        data_prompt[lable_discipline_types[leibie]] = [dan_data_prompt]
                        data_info[lable_discipline_types[leibie]] = [
                            [data_dan["author"], data_dan["title"], data_dan["special_topic"], data_dan["qikan_name"],
                             data_dan["year"], data_dan["abstract"], "博士"]]
                    else:
                        dan_data_prompt = "标题:“{}”,摘要:“{}”".format(data_dan["title"], data_dan["abstract"])
                        data_prompt[lable_discipline_types[leibie]].append(dan_data_prompt)
                        data_info[lable_discipline_types[leibie]].append(
                            [data_dan["author"], data_dan["title"], data_dan["special_topic"], data_dan["qikan_name"],
                             data_dan["year"], data_dan["abstract"], "博士"])
                else:
                    # Titles without Chinese characters go into the separate "_en" buckets.
                    if lable_discipline_types[leibie] not in data_prompt_en:
                        dan_data_prompt = "标题:“{}”,摘要:“{}”".format(data_dan["title"], data_dan["abstract"])
                        data_prompt_en[lable_discipline_types[leibie]] = [dan_data_prompt]
                        data_info_en[lable_discipline_types[leibie]] = [
                            [data_dan["author"], data_dan["title"], data_dan["special_topic"], data_dan["qikan_name"],
                             data_dan["year"], data_dan["abstract"], "博士"]]
                    else:
                        dan_data_prompt = "标题:“{}”,摘要:“{}”".format(data_dan["title"], data_dan["abstract"])
                        data_prompt_en[lable_discipline_types[leibie]].append(dan_data_prompt)
                        data_info_en[lable_discipline_types[leibie]].append(
                            [data_dan["author"], data_dan["title"], data_dan["special_topic"], data_dan["qikan_name"],
                             data_dan["year"], data_dan["abstract"], "博士"])

        a += 1

    print(2)
    strat = 0
    end = 10000
    print(len(data_prompt))
    for leibie in tqdm(data_prompt):
        data_ndarray = np.empty((0, 768))
        print("len(data_prompt[leibie])", len(data_prompt[leibie]))
        # Encode the prompts in batches of 10000 and stack the results.
        while True:
            if end >= len(data_prompt[leibie]):
                break
            linshi_data = data_prompt[leibie][strat:end]
            data_ndarray = np.concatenate((data_ndarray, shengcehng_array(linshi_data)))
            print("data_ndarray.shape", data_ndarray.shape)
            strat = end
            end += 10000

        # Encode the remaining tail batch and save the per-subject matrix.
        linshi_data = data_prompt[leibie][strat:len(data_prompt[leibie])]
        print("len(linshi_data)", len(linshi_data))
        data_ndarray = np.concatenate((data_ndarray, shengcehng_array(linshi_data)))
        print("data_ndarray.shape", data_ndarray.shape)
        np.save(f'data/prompt_doctor/{leibie}.npy', data_ndarray)
        strat = 0
        end = 10000

    for leibie in data_info:
        print(len(data_info[leibie]))
        with open(f"data/data_info_doctor/{leibie}.json", "w", encoding="utf-8") as f:
            f.write(json.dumps(data_info[leibie], ensure_ascii=False, indent=2))

    for i in data_prompt_en:
        print(i)
        print(len(data_prompt_en[i]))

    print(len(data))
    print(a_)