
Initial commit: implement reference lookup over journal paper data

master · majiahui@haimaqingfan.com · 1 year ago · commit fcb93c6326
  1. generate_references_api.py (+188)
  2. 拉去数据.py (+33)
  3. 数据合并.py (+66)
  4. 数据生成ndarray.py (+80)
  5. 期刊数据整理.py (+66)
  6. 测试向量匹配.py (+53)
  7. 生成lable.py (+23)
  8. 统计学科二级分类.py (+31)
  9. 训练faiss.py (+34)

generate_references_api.py (+188)

@@ -0,0 +1,188 @@
import os
os.environ["CUDA_VISIBLE_DEVICES"] = "0"
from flask import Flask, jsonify
from flask import request
import numpy as np
import faiss
import json
import requests
import socket
from sentence_transformers import SentenceTransformer
with open("data/lable/id2lable.json", encoding="utf-8") as f:
id2lable = json.loads(f.read())
with open("data/lable/lable2id.json", encoding="utf-8") as f:
lable2id = json.loads(f.read())
with open("data/discipline_types.json") as f:
lable_discipline_types = json.loads(f.read())
app = Flask(__name__)
app.config["JSON_AS_ASCII"] = False
d = 768 # dimension
model = SentenceTransformer('Dmeta-embedding-zh')
def get_host_ip():
    """
    Look up the local machine's IP address.
    :return: ip
    """
    try:
        s = socket.socket(socket.AF_INET, socket.SOCK_DGRAM)
        s.connect(('8.8.8.8', 80))
        ip = s.getsockname()[0]
    finally:
        s.close()
    return ip


# url = "http://{}:50003/roformer".format(str(get_host_ip()))
url = "http://{}:50003/roformer".format("192.168.31.149")
def dialog_line_parse(url, text):
    """
    Send the payload to the classification model and return its result.
    :param url: model url
    :param text: payload sent to the model
    :return: model response
    """
    response = requests.post(
        url,
        json=text,
        timeout=1000
    )
    if response.status_code == 200:
        return response.json()
    else:
        # logger.error(
        #     "【{}】 Failed to get a proper response from remote "
        #     "server. Status Code: {}. Response: {}"
        #     "".format(url, response.status_code, response.text)
        # )
        print("【{}】 Failed to get a proper response from remote "
              "server. Status Code: {}. Response: {}"
              "".format(url, response.status_code, response.text))
        print(text)
        return []
def ulit_recall_paper(reference_list):
    '''
    Format the recalled papers into citation strings.
    :param reference_list: list of [author, title, category, source, year, abstract, ...] entries
    :return data: list of formatted citation strings
    '''
    # data = []
    # for path in recall_data_list_path:
    #     filename = path.split("/")[-1]
    #     with open(path, encoding="gbk") as f:
    #         text = f.read()
    #     text_list = text.split("\n")
    #     for sentence in text_list:
    #         if sentence != "":
    #             data.append([sentence, filename])
    # return data

    # recall_data_list
    # fields: author, title, category, source, year, abstract
    # target format: "[1]赵璐.基于旅游资源开发下的新农村景观营建研究[D].西安建筑科技大学,2014."
    data = []
    for data_one in reference_list:
        paper = ".".join([
            ",".join([i for i in data_one[0].split(";") if i != ""]),
            data_one[1] + "[J]",
            ",".join([
                data_one[3], str(data_one[4]) + "."
            ])
        ])
        data.append(paper)
    return data
def main(title, abstract, nums):
    data = {
        "title": title,
        "abst_zh": abstract,
        "content": ""
    }
    # {
    #   "label_num": [
    #     117,
    #     143
    #   ]
    # }
    result = dialog_line_parse(url, data)
    # print(result['label_num'][0])
    # print(id2lable[result['label_num'][0]])
    subject_pinyin = lable_discipline_types[id2lable[str(result['label_num'][0])]]

    # with open(f"data/prompt/{subject_pinyin}.npy") as :
    #     zidonghua = np.load('data/prompt/{subject_pinyin}.npy')
    data_subject = np.load(f"data/prompt_qikan/{subject_pinyin}.npy")
    index = faiss.read_index(f'data/prompt_qikan_ivf/{subject_pinyin}.ivf')

    with open(f"data/data_info_qikan/{subject_pinyin}.json") as f:
        data_info = json.loads(f.read())

    index.add(data_subject)
    # index.nprobe = 2  # default nprobe is 1, try a few more
    k = nums
    prompt = "标题:“{}”,摘要:“{}".format(title, abstract)
    embs = model.encode([prompt], normalize_embeddings=True)
    D, I = index.search(embs, int(k))
    print(I)

    reference_list = []
    abstract_list = []
    for i in I[0]:
        reference_list.append(data_info[i])
        abstract_list.append(data_info[i][5])
    return "200", ulit_recall_paper(reference_list), abstract_list
@app.route("/", methods=["POST"])
def handle_query():
# try:
title = request.form.get("title")
abstract = request.form.get('abstract')
nums = request.form.get('nums')
# content = ulit_request_file(file)
status_code, reference, abstract_list = main(title, abstract, nums)
if status_code == "400":
return_text = {"resilt": "", "probabilities": None, "status_code": 400}
else:
reference_list = reference
print(reference_list)
reference = [f"[{str(i+1)}]" + reference_list[i] for i in range(len(reference_list))]
if status_code == "200":
return_text = {
"resilt": {
"reference": reference,
"abstract": abstract_list
},
"probabilities": None,
"status_code": 200
}
else:
return_text = {"resilt": "", "probabilities": None, "status_code": 400}
# except:
# return_text = {"resilt": "", "probabilities": None, "status_code": 400}
return jsonify(return_text) # 返回结果
if __name__ == "__main__":
app.run(host="0.0.0.0", port=17001, threaded=True)
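
For reference, a client call against this service could look like the sketch below. Only the route, the form fields (title, abstract, nums), the response keys, and port 17001 come from the code above; the host is a placeholder and the sample title/abstract values are borrowed from 测试向量匹配.py.

import requests

# Hedged client sketch: host is an assumed placeholder; fields, port, and
# response keys come from generate_references_api.py.
resp = requests.post(
    "http://127.0.0.1:17001/",
    data={
        "title": "工业机器人视觉导航系统的设计与实现",
        "abstract": "本研究致力于设计和实现工业机器人视觉导航系统,旨在提高工业生产中机器人的自主导航和定位能力。",
        "nums": "10",
    },
    timeout=120,
)
result = resp.json()
print(result["status_code"])
print(result["resilt"]["reference"])   # numbered citation strings, e.g. "[1]..."
print(result["resilt"]["abstract"])    # abstracts of the recalled papers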

拉去数据.py (+33)

@@ -0,0 +1,33 @@
import pymysql
import json

# Open the database connection
connection = pymysql.connect(
    host='rm-bp11ky2z5f34d2949fo.mysql.rds.aliyuncs.com',
    user='fabiao_r',
    password='f5u1w8nfb3b',
    database='fabiao',
    cursorclass=pymysql.cursors.DictCursor  # return rows as dicts for easier handling
)

try:
    with connection.cursor() as cursor:
        # Run the query
        sql = "SELECT * FROM spider_latest_journal_paper_list"
        cursor.execute(sql)

        # Fetch the results
        result = cursor.fetchall()
        print(len(result))

        # Process the results
        # for row in result:
        #     print(row)
        with open("data/doctor_2018_2021.json", "w", encoding="utf-8") as f:
            f.write(json.dumps(result, indent=2, ensure_ascii=False))
finally:
    # Close the connection
    connection.close()

数据合并.py (+66)

@@ -0,0 +1,66 @@
import json
# json.load()
# with open("t_xuewei_cnki_spider.csv", encoding="utf-8") as f:
#     a = f.read()
# print(a)
import pandas as pd

filename = 'data/spider_latest_journal_paper_list.csv'
chunksize = 10000  # number of rows to read per chunk, adjust as needed

df_list = []
# Iterate over the CSV file using the chunksize parameter
for chunk in pd.read_csv(filename, chunksize=chunksize):
    # fields: author, title, category, source, year, abstract
    # process each chunk
    # print(chunk.columns)
    # 9 / 0
    df_list_dan = chunk.values.tolist()
    # print(df_list[0])
    for i in range(len(df_list_dan)):
        df_list.append({
            'author': df_list_dan[i][2],
            'title': df_list_dan[i][1],
            'special_topic': df_list_dan[i][7],
            'qikan_name': df_list_dan[i][3],
            'year': df_list_dan[i][4],
            'abstract': df_list_dan[i][10],
        })

# data = []
# json_list = [
#     "/home/majiahui/project/爬取目录筛选/t_xuewei_detail_cnki.json",
#     "/home/majiahui/project/爬取目录筛选/t_xuewei_detail_cnki2.json",
#     "/home/majiahui/project/爬取目录筛选/t_xuewei_detail_cnki3.json",
#     "/home/majiahui/project/爬取目录筛选/t_xuewei_detail_cnki6.json",
#     "/home/majiahui/project/爬取目录筛选/t_xuewei_detail_cnki7.json",
# ]
#
# print("main dataset loaded")
# for path in json_list:
#     name, typr_file = path.split(".")
#     name = name.split("/")[-1]
#     a = json.load(open(path))
#     for i in a:
#         autoid = "_".join([name, str(i['autoid'])])
#         if autoid in df_dict:
#             data.append([i['f_title']] + df_dict[autoid])
#     print("path filtered")

with open("data/data_0416.json", "w") as f:
    f.write(json.dumps(df_list, ensure_ascii=False, indent=2))

# with open("data.json", encoding="utf-8") as f:
#     for i in f.readlines():
#         a = json.loads(i)
#
# print(a)

数据生成ndarray.py (+80)

@@ -0,0 +1,80 @@
import os
os.environ["CUDA_VISIBLE_DEVICES"] = "0"
import json
import numpy as np
from tqdm import tqdm
from sentence_transformers import SentenceTransformer

model = SentenceTransformer('Dmeta-embedding-zh')
print(1)

with open("data/discipline_types.json", encoding="utf-8") as f:
    lable_discipline_types = json.loads(f.read())


def erjimul_ulit():
    pass


def shengcehng_array(data):
    embs = model.encode(data, normalize_embeddings=True)
    return embs


if __name__ == '__main__':
    # data = []
    with open("data/data_0416.json", encoding="utf-8") as f:
        # for i in f.readlines():
        #     a = json.loads(i)
        #     data.append(a)
        data = json.loads(f.read())
    print(len(data))

    a = 0
    a_ = 0
    data_info = {}  # fields: author, title, category, source, year, abstract
    data_prompt = {}
    for data_dan in data:
        if str(data_dan["special_topic"]) == "nan":
            a_ += 1
            continue
        leibie_list = data_dan["special_topic"].split(";")
        for leibie in leibie_list:
            if leibie in lable_discipline_types:
                if lable_discipline_types[leibie] not in data_prompt:
                    data_prompt[lable_discipline_types[leibie]] = ["标题:“{}”,摘要:“{}".format(data_dan["title"], data_dan["abstract"])]
                    data_info[lable_discipline_types[leibie]] = [[data_dan["author"], data_dan["title"], data_dan["special_topic"], data_dan["qikan_name"], data_dan["year"], data_dan["abstract"], "期刊"]]
                else:
                    data_prompt[lable_discipline_types[leibie]].append("标题:“{}”,摘要:“{}".format(data_dan["title"], data_dan["abstract"]))
                    data_info[lable_discipline_types[leibie]].append([data_dan["author"], data_dan["title"], data_dan["special_topic"], data_dan["qikan_name"], data_dan["year"], data_dan["abstract"], "期刊"])
                a += 1

    print(2)
    strat = 0
    end = 10000
    print(len(data_prompt))
    for leibie in tqdm(data_prompt):
        data_ndarray = np.empty((0, 768))
        print("len(data_prompt[leibie])", len(data_prompt[leibie]))
        while True:
            if end >= len(data_prompt[leibie]):
                break
            linshi_data = data_prompt[leibie][strat:end]
            data_ndarray = np.concatenate((data_ndarray, shengcehng_array(linshi_data)))
            print("data_ndarray.shape", data_ndarray.shape)
            strat = end
            end += 10000
        linshi_data = data_prompt[leibie][strat:len(data_prompt[leibie])]
        print("len(linshi_data)", len(linshi_data))
        data_ndarray = np.concatenate((data_ndarray, shengcehng_array(linshi_data)))
        print("data_ndarray.shape", data_ndarray.shape)
        np.save(f'data/prompt_qikan/{leibie}.npy', data_ndarray)
        strat = 0
        end = 10000

    for leibie in data_info:
        print(len(data_info[leibie]))
        with open(f"data/data_info_qikan/{leibie}.json", "w", encoding="utf-8") as f:
            f.write(json.dumps(data_info[leibie], ensure_ascii=False, indent=2))
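
Because generate_references_api.py looks up metadata in data_info_qikan/<subject>.json by the same row position it searches in prompt_qikan/<subject>.npy, a per-subject alignment check can catch mismatches between the two outputs early. This is a sketch under that assumption and is not part of the committed scripts; "zidonghuajishu" is a hypothetical pinyin key.

import json
import numpy as np

subject = "zidonghuajishu"  # hypothetical pinyin key produced by 统计学科二级分类.py
emb = np.load(f"data/prompt_qikan/{subject}.npy")
with open(f"data/data_info_qikan/{subject}.json", encoding="utf-8") as f:
    info = json.loads(f.read())

# Each embedding row must correspond to one metadata entry (768-dim Dmeta-embedding-zh vectors).
assert emb.shape == (len(info), 768), (emb.shape, len(info))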

期刊数据整理.py (+66)

@@ -0,0 +1,66 @@
import json
# json.load()
# with open("t_xuewei_cnki_spider.csv", encoding="utf-8") as f:
#     a = f.read()
# print(a)
import pandas as pd

filename = 'data/spider_latest_journal_paper_list.csv'
chunksize = 10000  # number of rows to read per chunk, adjust as needed

df_list = []
# Iterate over the CSV file using the chunksize parameter
for chunk in pd.read_csv(filename, chunksize=chunksize):
    # fields: author, title, category, source, year, abstract
    # process each chunk
    # print(chunk.columns)
    # 9 / 0
    df_list_dan = chunk.values.tolist()
    # print(df_list[0])
    for i in range(len(df_list_dan)):
        df_list.append({
            'author': df_list_dan[i][2],
            'title': df_list_dan[i][1],
            'special_topic': df_list_dan[i][7],
            'qikan_name': df_list_dan[i][3],
            'year': df_list_dan[i][4],
            'abstract': df_list_dan[i][10],
        })

# data = []
# json_list = [
#     "/home/majiahui/project/爬取目录筛选/t_xuewei_detail_cnki.json",
#     "/home/majiahui/project/爬取目录筛选/t_xuewei_detail_cnki2.json",
#     "/home/majiahui/project/爬取目录筛选/t_xuewei_detail_cnki3.json",
#     "/home/majiahui/project/爬取目录筛选/t_xuewei_detail_cnki6.json",
#     "/home/majiahui/project/爬取目录筛选/t_xuewei_detail_cnki7.json",
# ]
#
# print("main dataset loaded")
# for path in json_list:
#     name, typr_file = path.split(".")
#     name = name.split("/")[-1]
#     a = json.load(open(path))
#     for i in a:
#         autoid = "_".join([name, str(i['autoid'])])
#         if autoid in df_dict:
#             data.append([i['f_title']] + df_dict[autoid])
#     print("path filtered")

with open("data/data_0416.json", "w") as f:
    f.write(json.dumps(df_list, ensure_ascii=False, indent=2))

# with open("data.json", encoding="utf-8") as f:
#     for i in f.readlines():
#         a = json.loads(i)
#
# print(a)

测试向量匹配.py (+53)

@@ -0,0 +1,53 @@
import os
os.environ["CUDA_VISIBLE_DEVICES"] = "0"
import numpy as np
import faiss
import json
from sentence_transformers import SentenceTransformer

d = 768  # dimension
zidonghua = np.load('zidonghua.npy')
model = SentenceTransformer('Dmeta-embedding-zh')

data = []
with open("data.json", encoding="utf-8") as f:
    for i in f.readlines():
        a = json.loads(i)
        data.append(a)

mubiaoliebie = "自动化技术"
data_prompt = []
for i in data:
    if str(i[1]) == "nan":
        continue
    leibie_list = i[1].split(";")
    for leibie in leibie_list:
        if leibie == mubiaoliebie:
            data_prompt.append("标题:“{}”,摘要:“{}".format(i[0], i[2]))

# faiss.write_index(index, 'index.ivf')
index = faiss.read_index('zidonghua.ivf')
index.add(zidonghua)  # add may be a bit slower as well
# D, I = index.search(xq, k)  # actual search
# print(I[-5:])               # neighbors of the 5 last queries

print("=======================================")
index.nprobe = 2  # default nprobe is 1, try a few more
k = 4
biaoti = "工业机器人视觉导航系统的设计与实现"
zhaiyoa = "本研究致力于设计和实现工业机器人视觉导航系统,旨在提高工业生产中机器人的自主导航和定位能力。首先,通过综合考虑视觉传感器、定位算法和控制策略,设计了一种高效的机器人视觉导航系统框架。其次,利用深度学习技术对环境中的关键特征进行识别和定位,实现了机器人在复杂工作场景下的精确定位和路径规划。通过实验验证,本系统在提高机器人工作效率、减少人工干预以及降低操作误差等方面取得了显著的成果。因此,本研究为工业机器人在生产领域的应用提供了重要的技术支持,具有一定的实用和推广价值。"
prompt = "标题:“{}”,摘要:“{}".format(biaoti, zhaiyoa)
embs = model.encode([prompt], normalize_embeddings=True)
D, I = index.search(embs, k)
print(I)
for i in I[0]:
    print(data_prompt[i])

生成lable.py (+23)

@@ -0,0 +1,23 @@
import json

with open("label_threshold.txt", encoding="utf-8") as f:
    data = json.loads(f.read())

id2lable = {}
lable2id = {}

for i in data:
    if i not in lable2id:
        lable2id[i] = data[i][0]

for i in lable2id:
    if lable2id[i] not in id2lable:
        id2lable[lable2id[i]] = i

with open("data/lable/id2lable.json", "w", encoding="utf-8") as f:
    f.write(json.dumps(id2lable, indent=2, ensure_ascii=False))

with open("data/lable/lable2id.json", "w", encoding="utf-8") as f:
    f.write(json.dumps(lable2id, indent=2, ensure_ascii=False))

统计学科二级分类.py (+31)

@@ -0,0 +1,31 @@
import json
from pypinyin import pinyin, Style
import pandas as pd


def hanzi_to_pinyin(hanzi):
    # Convert Chinese characters to pinyin; Style.NORMAL outputs plain pinyin without tone marks
    pinyin_list = pinyin(hanzi, style=Style.NORMAL, heteronym=False)
    print(pinyin_list)
    # Join the pinyin list into a single string
    pinyin_str = ''.join([i[0] for i in pinyin_list])
    return pinyin_str


if __name__ == '__main__':
    df_list = pd.read_excel("论文种类分类表1.xls").values.tolist()
    print(df_list)
    erji_dict = {}
    for i in range(len(df_list)):
        if str(df_list[i][1]) == "nan":
            continue
        if df_list[i][1] not in erji_dict:
            erji_dict[df_list[i][1]] = hanzi_to_pinyin(df_list[i][1])
    print(erji_dict)
    print(len(erji_dict))
    with open("discipline_types.json", "w", encoding="utf-8") as f:
        f.write(json.dumps(erji_dict, ensure_ascii=False, indent=2))

训练faiss.py (+34)

@@ -0,0 +1,34 @@
import numpy as np
import faiss
import json
import math

d = 768  # dimension
# nlist = 1000  # number of clusters

with open("data/discipline_types.json") as f:
    lable_discipline_types = json.loads(f.read())

a = 0
for leibie_zh in lable_discipline_types:
    xb = np.load(f'data/prompt_qikan/{lable_discipline_types[leibie_zh]}.npy')
    # nlist = math.floor((len(lable_discipline_types[leibie_zh]) ** 0.5))  # number of clusters
    # print(leibie_zh)
    # print(len(lable_discipline_types[leibie_zh]))
    # print(nlist)
    print(xb.shape)
    nlist = math.floor((xb.shape[0] ** 0.5))
    a += xb.shape[0]
    print(nlist)
    quantizer = faiss.IndexFlatL2(d)
    index = faiss.IndexIVFFlat(quantizer, d, nlist)
    assert not index.is_trained
    index.train(xb)  # IndexIVFFlat needs training; this step learns the clustering
    assert index.is_trained
    faiss.write_index(index, f'data/prompt_qikan_ivf/{lable_discipline_types[leibie_zh]}.ivf')

print(a)
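
To sanity-check a trained index end to end, the saved .ivf can be reloaded, the subject's vectors re-added, and a query searched, mirroring what generate_references_api.py does at serving time. A minimal sketch, assuming the .npy/.ivf/.json triplet exists for the chosen subject; "zidonghuajishu" is again a hypothetical pinyin key.

import json
import numpy as np
import faiss
from sentence_transformers import SentenceTransformer

subject = "zidonghuajishu"  # hypothetical pinyin key
xb = np.load(f"data/prompt_qikan/{subject}.npy")
index = faiss.read_index(f"data/prompt_qikan_ivf/{subject}.ivf")
index.add(xb)  # the saved index stores only the trained clustering; vectors are re-added here
with open(f"data/data_info_qikan/{subject}.json", encoding="utf-8") as f:
    data_info = json.loads(f.read())

model = SentenceTransformer('Dmeta-embedding-zh')
query = "标题:“工业机器人视觉导航系统的设计与实现”,摘要:“本研究致力于设计和实现工业机器人视觉导航系统。"
embs = model.encode([query], normalize_embeddings=True)
D, I = index.search(embs, 5)
for i in I[0]:
    print(data_info[i][1])  # titles of the recalled papers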