diff --git a/README.md b/README.md
new file mode 100644
index 0000000..35bd47d
--- /dev/null
+++ b/README.md
@@ -0,0 +1,14 @@
+## Reference generation
+The new reference-generation project retrieves candidates with faiss cluster-based (IVF) search, so queries return within seconds.
+
+
+#### Generate the ndarray data and the basic paper metadata
+    Modify or add data as needed
+    python 数据生成ndarray.py
+
+#### Train faiss
+    Modify or add the .npy file paths as needed
+    python 训练faiss.py
+
+#### Deploy the service
+    python generate_references_api.py
\ No newline at end of file
diff --git a/generate_reference_faiss_data_info.py b/generate_reference_faiss_data_info.py
new file mode 100644
index 0000000..199d1e5
--- /dev/null
+++ b/generate_reference_faiss_data_info.py
@@ -0,0 +1,236 @@
+import os
+import random
+
+os.environ["CUDA_VISIBLE_DEVICES"] = "0"
+from flask import Flask, jsonify
+from flask import request
+import numpy as np
+import faiss
+import json
+import requests
+import socket
+from sentence_transformers import SentenceTransformer
+
+
+with open("data/lable/id2lable.json", encoding="utf-8") as f:
+    id2lable = json.loads(f.read())
+
+with open("data/lable/lable2id.json", encoding="utf-8") as f:
+    lable2id = json.loads(f.read())
+
+with open("data/discipline_types.json") as f:
+    lable_discipline_types = json.loads(f.read())
+
+
+app = Flask(__name__)
+app.config["JSON_AS_ASCII"] = False
+
+d = 768  # dimension
+model = SentenceTransformer('Dmeta-embedding-zh')
+
+def get_host_ip():
+    """
+    Look up the local IP address.
+    :return: ip
+    """
+    try:
+        s = socket.socket(socket.AF_INET, socket.SOCK_DGRAM)
+        s.connect(('8.8.8.8', 80))
+        ip = s.getsockname()[0]
+    finally:
+        s.close()
+
+    return ip
+
+# url = "http://{}:50003/roformer".format(str(get_host_ip()))
+url = "http://{}:50003/roformer".format("192.168.31.149")
+
+def dialog_line_parse(url, text):
+    """
+    Send the data to the classification model and return its result.
+    :param url: model service url
+    :param text: payload sent to the model
+    :return: model response
+    """
+
+    response = requests.post(
+        url,
+        json=text,
+        timeout=1000
+    )
+    if response.status_code == 200:
+        return response.json()
+    else:
+        # logger.error(
+        #     "【{}】 Failed to get a proper response from remote "
+        #     "server. Status Code: {}. Response: {}"
+        #     "".format(url, response.status_code, response.text)
+        # )
+        print("【{}】 Failed to get a proper response from remote "
+              "server. Status Code: {}. Response: {}"
+              "".format(url, response.status_code, response.text))
+        print(text)
+        return []
+
+
+def panduan_paper_lable(paper_lable_text):
+    # citation type code: J = journal article, D = degree thesis (master's or doctoral)
+    paper_lable = {
+        "硕士": "D",
+        "期刊": "J",
+        "博士": "D"
+    }
+    return paper_lable[paper_lable_text]
+
+
+def ulit_recall_paper(reference_list, nums):
+    '''
+    Parse the recalled papers and build the formatted reference entries.
+    :param reference_list: recalled paper records
+    :param nums: number of references to keep
+    :return data_info: list of dicts with paper metadata and the formatted reference string
+    '''
+
+    # data = []
+    # for path in recall_data_list_path:
+    #     filename = path.split("/")[-1]
+    #     with open(path, encoding="gbk") as f:
+    #         text = f.read()
+    #     text_list = text.split("\n")
+    #     for sentence in text_list:
+    #         if sentence != "":
+    #             data.append([sentence, filename])
+    # return data
+
+    # recall_data_list
+    # fields: author, title, topic, source, year, abstract, type
+    # "[1]赵璐.基于旅游资源开发下的新农村景观营建研究[D].西安建筑科技大学,2014."
+
+    data_info = []
+    data_title = []
+
+    for data_one in reference_list:
+
+        if data_one[1] not in data_title:
+
+            print("data_one", data_one)
+            print("data_one[0]", data_one[0])
+            paper = ".".join([
+                ",".join([str(i).replace("\n", "").replace("\r", "") for i in data_one[0].split(";") if i != ""]),
+                data_one[1] + f"[{panduan_paper_lable(data_one[6])}]",
+                ",".join([
+                    data_one[3], str(data_one[4]) + "."
+                ])
+            ])
+
+            data_title.append(data_one[1])
+            data_info.append({
+                "author": data_one[0],
+                "title": data_one[1],
+                "special_topic": data_one[2],
+                "qikan_name": data_one[3],
+                "year": str(data_one[4]),
+                "abstract": data_one[5],
+                "classlable": data_one[6],
+                "reference": paper
+            })
+
+    # print(data)
+    print(data_title)
+    print(nums)
+    random.shuffle(data_info)
+    random.shuffle(data_info)
+    data_info = data_info[:int(nums)]
+    return data_info
+
+
+def main(title, abstract, nums):
+    data = {
+        "title": title,
+        "abst_zh": abstract,
+        "content": ""
+    }
+    # {
+    #     "label_num": [
+    #         117,
+    #         143
+    #     ]
+    # }
+    result = dialog_line_parse(url, data)
+
+    # print(result['label_num'][0])
+    # print(id2lable[result['label_num'][0]])
+    subject_pinyin = lable_discipline_types[id2lable[str(result['label_num'][0])]]
+
+    # with open(f"data/prompt/{subject_pinyin}.npy") as :
+    #     zidonghua = np.load('data/prompt/{subject_pinyin}.npy')
+
+    data_subject = np.load(f"data/prompt_qikan/{subject_pinyin}.npy")
+    data_subject_1 = np.load(f"data/prompt_master/{subject_pinyin}.npy")
+    data_subject_2 = np.load(f"data/prompt_doctor/{subject_pinyin}.npy")
+    print("xb.shape", data_subject.shape)
+    print("xb_1.shape", data_subject_1.shape)
+    print("xb_2.shape", data_subject_2.shape)
+    data_subject = np.concatenate((data_subject, data_subject_1, data_subject_2))
+    print("data_subject.shape", data_subject.shape)
+
+    index = faiss.read_index(f'data/prompt_qikan_master_doctor_ivf/{subject_pinyin}.ivf')
+
+    with open(f"data/data_info_qikan/{subject_pinyin}.json") as f:
+        data_info = json.loads(f.read())
+
+    with open(f"data/data_info_master/{subject_pinyin}.json") as f:
+        data_info_1 = json.loads(f.read())
+
+    with open(f"data/data_info_doctor/{subject_pinyin}.json") as f:
+        data_info_2 = json.loads(f.read())
+
+    print(len(data_info))
+    print(len(data_info_1))
+    print(len(data_info_2))
+    data_info = data_info + data_info_1 + data_info_2
+    print(len(data_info))
+    print(data_info[0])
+    index.add(data_subject)
+    # index.nprobe = 2  # default nprobe is 1, try a few more
+    # k = nums
+    k = 20
+    prompt = "标题:“{}”,摘要:“{}”".format(title, abstract)
+    embs = model.encode([prompt], normalize_embeddings=True)
+
+    D, I = index.search(embs, int(k))
+    # print(I)
+
+    reference_list = []
+    for i in I[0]:
+        reference_list.append(data_info[i])
+
+    data_info = ulit_recall_paper(reference_list, nums)
+    return "200", data_info
+
+
+@app.route("/", methods=["POST"])
+def handle_query():
+    # try:
+    title = request.form.get("title")
+    abstract = ""
+    nums = request.form.get('nums')
+
+    # content = ulit_request_file(file)
+
+    status_code, data_info_list = main(title, abstract, nums)
+
+    if status_code == "400":
+        return_text = {"resilt": "", "probabilities": None, "status_code": 400}
+    else:
+        if status_code == "200":
+            return_text = {
+                "data_info": data_info_list,
+                "probabilities": None,
+                "status_code": 200
+            }
+        else:
+            return_text = {"resilt": "", "probabilities": None, "status_code": 400}
+
+    return jsonify(return_text)  # return the result
+
+if __name__ == "__main__":
+    app.run(host="0.0.0.0", port=17003, threaded=True)
diff --git a/generate_references_api_1.py b/generate_references_api_1.py
index 451c874..a0c8ada 100644
--- a/generate_references_api_1.py
+++ b/generate_references_api_1.py
@@ -75,7 +75,7 @@ def panduan_paper_lable(paper_lable_text):
     paper_lable = {
         "硕士": "D",
         "期刊": "J",
-        "博士": "J"
+        "博士": "D"
     }
     return paper_lable[paper_lable_text]
 
@@ -106,13 +106,23 @@ def ulit_recall_paper(reference_list, nums):
 
     for data_one in reference_list:
 
         print("data_one", data_one)
         print("data_one[0]", data_one[0])
-        paper = ".".join([
-            ",".join([str(i).replace("\n", "").replace("\r", "") for i in data_one[0].split(";") if i != ""]),
-            data_one[1] + f"[{panduan_paper_lable(data_one[6])}]",
-            ",".join([
-                data_one[3], str(data_one[4]) + "."
+
+        if panduan_paper_lable(data_one[6]) == "J":
+            paper = ".".join([
+                ",".join([str(i).replace("\n", "").replace("\r", "") for i in data_one[0].split(";") if i != ""]),
+                data_one[1] + f"[{panduan_paper_lable(data_one[6])}]",
+                ",".join([
+                    data_one[3], str(data_one[4])
+                ])
+            ]) + "," + f"({data_one[8]})" + f":{data_one[7]}" + "."
+        else:
+            paper = ".".join([
+                ",".join([str(i).replace("\n", "").replace("\r", "") for i in data_one[0].split(";") if i != ""]),
+                data_one[1] + f"[{panduan_paper_lable(data_one[6])}]",
+                ",".join([
+                    data_one[3], str(data_one[4]) + "."
+                ]),
             ])
-            ])
 
         data.append(paper)
 
@@ -157,7 +167,7 @@ def main(title, abstract, nums):
 
     index = faiss.read_index(f'data/prompt_qikan_master_doctor_ivf/{subject_pinyin}.ivf')
 
-    with open(f"data/data_info_qikan/{subject_pinyin}.json") as f:
+    with open(f"data/data_info_qikan_1/{subject_pinyin}.json") as f:
         data_info = json.loads(f.read())
 
     with open(f"data/data_info_master/{subject_pinyin}.json") as f:
diff --git a/博士数据整理.py b/博士数据整理.py
new file mode 100644
index 0000000..9a3a288
--- /dev/null
+++ b/博士数据整理.py
@@ -0,0 +1,74 @@
+import json
+from tqdm import tqdm
+# json.load()
+
+# with open("t_xuewei_cnki_spider.csv", encoding="utf-8") as f:
+#     a = f.read()
+# print(a)
+
+import pandas as pd
+
+filename = 'data/spider_latest_doctor_paper_list.csv'
+chunksize = 10000  # number of rows to read per chunk; adjust as needed
+
+df_list = []
+# iterate over the CSV file in chunks
+for chunk in pd.read_csv(filename, chunksize=chunksize):
+    # fields: author, title, topic, source, year, abstract
+
+    # process each chunk
+
+    # print(chunk.columns)
+    # 9 / 0
+    df_list_dan = chunk.values.tolist()
+    # print(df_list[0])
+    for i in tqdm(range(len(df_list_dan))):
+        if str(df_list_dan[i][2]) != "nan" and \
+                str(df_list_dan[i][1]) != "nan" and\
+                str(df_list_dan[i][6]) != "nan" and\
+                str(df_list_dan[i][3]) != "nan" and\
+                str(df_list_dan[i][4]) != "nan" and\
+                str(df_list_dan[i][13]) != "nan":
+
+            df_list.append({
+                'author': df_list_dan[i][2],
+                'title': df_list_dan[i][1],
+                'special_topic': df_list_dan[i][6],
+                'qikan_name': df_list_dan[i][3],
+                'year': df_list_dan[i][4],
+                'abstract': df_list_dan[i][13],
+            })
+
+# data = []
+# json_list = [
+#     "/home/majiahui/project/爬取目录筛选/t_xuewei_detail_cnki.json",
+#     "/home/majiahui/project/爬取目录筛选/t_xuewei_detail_cnki2.json",
+#     "/home/majiahui/project/爬取目录筛选/t_xuewei_detail_cnki3.json",
+#     "/home/majiahui/project/爬取目录筛选/t_xuewei_detail_cnki6.json",
+#     "/home/majiahui/project/爬取目录筛选/t_xuewei_detail_cnki7.json",
+# ]
+#
+#
+# print("主库数据完成加载")
+# for path in json_list:
+#     name, typr_file = path.split(".")
+#     name = name.split("/")[-1]
+#     a = json.load(open(path))
+#     for i in a:
+#         autoid = "_".join([name, str(i['autoid'])])
+#         if autoid in df_dict:
+#             data.append([i['f_title']] + df_dict[autoid])
+#     print("path完成筛选")
+#
+
+print(len(df_list))
+with open("data/data_0423_doctor.json", "w", encoding="utf-8") as f:
+    f.write(json.dumps(df_list, ensure_ascii=False, indent=2))
+
+#
+# with open("data.json", encoding="utf-8") as f:
+#     for i in f.readlines():
+#         a = json.loads(i)
+#
+#
+# print(a)
\ No newline at end of file
diff --git a/博士数据生成ndarray.py b/博士数据生成ndarray.py
new file mode 100644
index 0000000..308f1cc
--- /dev/null
+++ b/博士数据生成ndarray.py
@@ -0,0 +1,134 @@
+import os
+
+os.environ["CUDA_VISIBLE_DEVICES"] = "1"
+import json
+import numpy as np
+from tqdm import tqdm
+from sentence_transformers import SentenceTransformer
+import re
+
+model = SentenceTransformer('Dmeta-embedding-zh')
+print(1)
+with open("data/discipline_types.json", encoding="utf-8") as f:
+    lable_discipline_types = json.loads(f.read())
+
+
+def erjimul_ulit():
+    pass
+
+
+def shengcehng_array(data):
+    # encode the prompts into normalized sentence embeddings
+    embs = model.encode(data, normalize_embeddings=True)
+    return embs
+
+
+def is_contain_chinese(word):
+    """
+    Check whether a string contains Chinese characters.
+    :param word: string to check
+    :return: True if the string contains Chinese characters, otherwise False
+    """
+    pattern = re.compile(r'[\u4e00-\u9fa5]')
+    match = pattern.search(word)
+    return True if match else False
+
+
+if __name__ == '__main__':
+
+    # data = []
+    with open("data/data_0423_doctor.json", encoding="utf-8") as f:
+        # for i in f.readlines():
+        #     a = json.loads(i)
+        #     data.append(a)
+        data = json.loads(f.read())
+
+    print(len(data))
+
+    a = 0
+
+    a_ = 0
+    data_info = {}  # fields: author, title, topic, source, year, abstract
+    data_prompt = {}
+
+    data_info_en = {}  # fields: author, title, topic, source, year, abstract
+    data_prompt_en = {}
+
+    for data_dan in data:
+        if str(data_dan["special_topic"]) == "nan" or \
+                str(data_dan["author"]) == "nan" or \
+                str(data_dan["title"]) == "nan" or \
+                str(data_dan["qikan_name"]) == "nan" or \
+                str(data_dan["year"]) == "nan" or \
+                str(data_dan["abstract"]) == "nan":
+            a_ += 1
+            continue
+
+        leibie_list = data_dan["special_topic"].split(";")
+        for leibie in leibie_list:
+            if leibie in lable_discipline_types:
+                zh_bool = is_contain_chinese(data_dan["title"])
+
+                if zh_bool == True:
+                    if lable_discipline_types[leibie] not in data_prompt:
+                        dan_data_prompt = "标题:“{}”,摘要:“{}”".format(data_dan["title"], data_dan["abstract"])
+                        data_prompt[lable_discipline_types[leibie]] = [dan_data_prompt]
+                        data_info[lable_discipline_types[leibie]] = [
+                            [data_dan["author"], data_dan["title"], data_dan["special_topic"], data_dan["qikan_name"],
+                             data_dan["year"], data_dan["abstract"], "博士"]]
+                    else:
+                        dan_data_prompt = "标题:“{}”,摘要:“{}”".format(data_dan["title"], data_dan["abstract"])
+                        data_prompt[lable_discipline_types[leibie]].append(dan_data_prompt)
+                        data_info[lable_discipline_types[leibie]].append(
+                            [data_dan["author"], data_dan["title"], data_dan["special_topic"], data_dan["qikan_name"],
+                             data_dan["year"], data_dan["abstract"], "博士"])
+                else:
+                    if lable_discipline_types[leibie] not in data_prompt_en:
+                        dan_data_prompt = "标题:“{}”,摘要:“{}”".format(data_dan["title"], data_dan["abstract"])
+                        data_prompt_en[lable_discipline_types[leibie]] = [dan_data_prompt]
+                        data_info_en[lable_discipline_types[leibie]] = [
+                            [data_dan["author"], data_dan["title"], data_dan["special_topic"], data_dan["qikan_name"],
+                             data_dan["year"], data_dan["abstract"], "博士"]]
+                    else:
+                        dan_data_prompt = "标题:“{}”,摘要:“{}”".format(data_dan["title"], data_dan["abstract"])
+                        data_prompt_en[lable_discipline_types[leibie]].append(dan_data_prompt)
+                        data_info_en[lable_discipline_types[leibie]].append(
+                            [data_dan["author"], data_dan["title"], data_dan["special_topic"], data_dan["qikan_name"],
+                             data_dan["year"], data_dan["abstract"], "博士"])
+
+        a += 1
+
+    print(2)
+    strat = 0
+    end = 10000
+    print(len(data_prompt))
+    # embed the prompts for each discipline in batches of 10000 rows
+    for leibie in tqdm(data_prompt):
+        data_ndarray = np.empty((0, 768))
+        print("len(data_prompt[leibie])", len(data_prompt[leibie]))
+        while True:
+            if end >= len(data_prompt[leibie]):
+                break
+            linshi_data = data_prompt[leibie][strat:end]
+            data_ndarray = np.concatenate((data_ndarray, shengcehng_array(linshi_data)))
+            print("data_ndarray.shape", data_ndarray.shape)
+            strat = end
+            end += 10000
+
+        linshi_data = data_prompt[leibie][strat:len(data_prompt[leibie])]
+        print("len(linshi_data)", len(linshi_data))
+        data_ndarray = np.concatenate((data_ndarray, shengcehng_array(linshi_data)))
+        print("data_ndarray.shape", data_ndarray.shape)
+        np.save(f'data/prompt_doctor/{leibie}.npy', data_ndarray)
+        strat = 0
+        end = 10000
+
+    for leibie in data_info:
+        print(len(data_info[leibie]))
+        with open(f"data/data_info_doctor/{leibie}.json", "w", encoding="utf-8") as f:
+            f.write(json.dumps(data_info[leibie], ensure_ascii=False, indent=2))
+
+    for i in data_prompt_en:
+        print(i)
+        print(len(data_prompt_en[i]))
+
+    print(len(data))
+    print(a_)
diff --git a/数据生成ndarray.py b/数据生成ndarray.py
index d951bc1..0d5dfee 100644
--- a/数据生成ndarray.py
+++ b/数据生成ndarray.py
@@ -1,27 +1,42 @@
 import os
-os.environ["CUDA_VISIBLE_DEVICES"] = "0"
+
+os.environ["CUDA_VISIBLE_DEVICES"] = "1"
 import json
 import numpy as np
 from tqdm import tqdm
 from sentence_transformers import SentenceTransformer
+import re
 
 model = SentenceTransformer('Dmeta-embedding-zh')
 print(1)
 with open("data/discipline_types.json", encoding="utf-8") as f:
     lable_discipline_types = json.loads(f.read())
 
+
 def erjimul_ulit():
     pass
 
+
 def shengcehng_array(data):
     embs = model.encode(data, normalize_embeddings=True)
     return embs
 
+
+def is_contain_chinese(word):
+    """
+    Check whether a string contains Chinese characters.
+    :param word: string to check
+    :return: True if the string contains Chinese characters, otherwise False
+    """
+    pattern = re.compile(r'[\u4e00-\u9fa5]')
+    match = pattern.search(word)
+    return True if match else False
+
+
 if __name__ == '__main__':
 
     # data = []
-    with open("data/data_0416.json", encoding="utf-8") as f:
+    with open("data/data_0423_qikan.json", encoding="utf-8") as f:
         # for i in f.readlines():
         #     a = json.loads(i)
         #     data.append(a)
         data = json.loads(f.read())
@@ -34,20 +49,52 @@ if __name__ == '__main__':
     a_ = 0
     data_info = {}  # 作者 论文名称 论文类别 论文来源 论文年份 摘要
     data_prompt = {}
+
+    data_info_en = {}  # fields: author, title, topic, source, year, abstract
+    data_prompt_en = {}
+
     for data_dan in data:
-        if str(data_dan["special_topic"]) == "nan":
+        if str(data_dan["special_topic"]) == "nan" or \
+                str(data_dan["author"]) == "nan" or \
+                str(data_dan["title"]) == "nan" or \
+                str(data_dan["qikan_name"]) == "nan" or \
+                str(data_dan["year"]) == "nan" or \
+                str(data_dan["abstract"]) == "nan":
             a_ += 1
             continue
 
         leibie_list = data_dan["special_topic"].split(";")
         for leibie in leibie_list:
             if leibie in lable_discipline_types:
-                if lable_discipline_types[leibie] not in data_prompt:
-                    data_prompt[lable_discipline_types[leibie]] = ["标题:“{}”,摘要:“{}”".format(data_dan["title"], data_dan["abstract"])]
-                    data_info[lable_discipline_types[leibie]] = [[data_dan["author"], data_dan["title"], data_dan["special_topic"], data_dan["qikan_name"], data_dan["year"], data_dan["abstract"], "期刊"]]
+                zh_bool = is_contain_chinese(data_dan["title"])
+
+                if zh_bool == True:
+                    if lable_discipline_types[leibie] not in data_prompt:
+                        dan_data_prompt = "标题:“{}”,摘要:“{}”".format(data_dan["title"], data_dan["abstract"])
+                        data_prompt[lable_discipline_types[leibie]] = [dan_data_prompt]
+                        data_info[lable_discipline_types[leibie]] = [
+                            [data_dan["author"], data_dan["title"], data_dan["special_topic"], data_dan["qikan_name"],
+                             data_dan["year"], data_dan["abstract"], "期刊"]]
+                    else:
+                        dan_data_prompt = "标题:“{}”,摘要:“{}”".format(data_dan["title"], data_dan["abstract"])
+                        data_prompt[lable_discipline_types[leibie]].append(dan_data_prompt)
+                        data_info[lable_discipline_types[leibie]].append(
+                            [data_dan["author"], data_dan["title"], data_dan["special_topic"], data_dan["qikan_name"],
+                             data_dan["year"], data_dan["abstract"], "期刊"])
                 else:
-                    data_prompt[lable_discipline_types[leibie]].append("标题:“{}”,摘要:“{}”".format(data_dan["title"], data_dan["abstract"]))
-                    data_info[lable_discipline_types[leibie]].append([data_dan["author"], data_dan["title"], data_dan["special_topic"], data_dan["qikan_name"], data_dan["year"], data_dan["abstract"], "期刊"])
+                    if lable_discipline_types[leibie] not in data_prompt_en:
+                        dan_data_prompt = "标题:“{}”,摘要:“{}”".format(data_dan["title"], data_dan["abstract"])
+                        data_prompt_en[lable_discipline_types[leibie]] = [dan_data_prompt]
+                        data_info_en[lable_discipline_types[leibie]] = [
+                            [data_dan["author"], data_dan["title"], data_dan["special_topic"], data_dan["qikan_name"],
+                             data_dan["year"], data_dan["abstract"], "期刊"]]
+                    else:
+                        dan_data_prompt = "标题:“{}”,摘要:“{}”".format(data_dan["title"], data_dan["abstract"])
+                        data_prompt_en[lable_discipline_types[leibie]].append(dan_data_prompt)
+                        data_info_en[lable_discipline_types[leibie]].append(
+                            [data_dan["author"], data_dan["title"], data_dan["special_topic"], data_dan["qikan_name"],
+                             data_dan["year"], data_dan["abstract"], "期刊"])
+
         a += 1
 
     print(2)
@@ -78,3 +125,10 @@ if __name__ == '__main__':
         print(len(data_info[leibie]))
         with open(f"data/data_info_qikan/{leibie}.json", "w", encoding="utf-8") as f:
             f.write(json.dumps(data_info[leibie], ensure_ascii=False, indent=2))
+
+    for i in data_prompt_en:
+        print(i)
+        print(len(data_prompt_en[i]))
+
+    print(len(data))
+    print(a_)
diff --git a/期刊数据整理.py b/期刊数据整理.py
index 47c181c..165f867 100644
--- a/期刊数据整理.py
+++ b/期刊数据整理.py
@@ -54,7 +54,7 @@ for chunk in pd.read_csv(filename, chunksize=chunksize):
 # print("path完成筛选")
 #
 #
-with open("data/data_0416.json", "w") as f:
+with open("data/data_0423_qikan.json", "w") as f:
     f.write(json.dumps(df_list, ensure_ascii=False, indent=2))
 
 #
diff --git a/训练faiss.py b/训练faiss.py
index a7f075f..28918ac 100644
--- a/训练faiss.py
+++ b/训练faiss.py
@@ -15,6 +15,12 @@ a = 0
 
 for leibie_zh in lable_discipline_types:
     xb = np.load(f'data/prompt_qikan/{lable_discipline_types[leibie_zh]}.npy')
+    xb_1 = np.load(f'data/prompt_master/{lable_discipline_types[leibie_zh]}.npy')
+    xb_2 = np.load(f'data/prompt_doctor/{lable_discipline_types[leibie_zh]}.npy')
+    print("xb.shape", xb.shape)
+    print("xb_1.shape", xb_1.shape)
+    print("xb_2.shape", xb_2.shape)
+    xb = np.concatenate((xb, xb_1, xb_2))
 
     # nlist = math.floor((len(lable_discipline_types[leibie_zh]) ** 0.5))  # 聚类的数目
     # print(leibie_zh)
@@ -30,5 +36,5 @@ for leibie_zh in lable_discipline_types:
     assert not index.is_trained
     index.train(xb)  # IndexIVFFlat是需要训练的,这边是学习聚类
     assert index.is_trained
-    faiss.write_index(index, f'data/prompt_qikan_ivf/{lable_discipline_types[leibie_zh]}.ivf')
+    faiss.write_index(index, f'data/prompt_qikan_master_doctor_ivf/{lable_discipline_types[leibie_zh]}.ivf')
 print(a)
\ No newline at end of file
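For reference, a minimal client sketch for the service started in the README's deploy step. It assumes the deployed API exposes the same form-based POST endpoint as `generate_reference_faiss_data_info.py` above (fields `title` and `nums`, port 17003); the host and the example title are placeholders.

```python
# Minimal client sketch; endpoint, port, and fields are taken from
# generate_reference_faiss_data_info.py, the host and title are placeholders.
import requests

resp = requests.post(
    "http://127.0.0.1:17003/",  # host is deployment-specific
    data={"title": "基于旅游资源开发下的新农村景观营建研究", "nums": 10},
    timeout=60,
)
result = resp.json()
print(result["status_code"])  # 200 on success
for item in result.get("data_info", []):
    print(item["reference"])  # formatted citation string
```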