根据标题和摘要生成参考文献
You cannot select more than 25 topics Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.
 
 

207 lines
6.5 KiB

import json
import datetime
import pymysql
import re
import requests
from flask import Flask, jsonify
from flask import request
import uuid
import time
import redis
from threading import Thread
# Shared Redis connection pool for all queue operations in this worker.
# NOTE(review): port 63179 is unusual (Redis default is 6379) — confirm it
# is not a typo.
pool = redis.ConnectionPool(host='localhost', port=63179, max_connections=100, db=8, password="zhicheng123*")
# NOTE(review): decode_responses=True here is ineffective — when a
# connection_pool is supplied, decode_responses must be set on the pool.
# Downstream code relies on this: it calls .decode('UTF-8') on lpop results,
# which only works while responses stay as bytes.
redis_ = redis.Redis(connection_pool=pool, decode_responses=True)
# Redis list/key names used by the task pipeline.
db_key_query = 'query_check_task'
db_key_querying = 'querying_check_task'
db_key_queryset = 'queryset_check_task'
db_key_query_recall = 'query_recall'
def run_query(conn, sql, params):
    """Execute *sql* with *params* on *conn* and return every fetched row.

    The cursor is opened as a context manager so it is closed even when
    execution raises.
    """
    with conn.cursor() as cursor:
        cursor.execute(sql, params)
        rows = cursor.fetchall()
    return rows
# def processing_one_text(paper_id):
# conn = pymysql.connect(
# host='192.168.31.145',
# port=3306,
# user='root',
# password='123456',
# db='zhiwang_db',
# charset='utf8mb4',
# cursorclass=pymysql.cursors.DictCursor
# )
#
# sql = 'SELECT * FROM main_table_paper_detail_message WHERE doc_id=%s'
# params = (paper_id,)
#
# result = run_query(conn, sql, params)
#
# conn.close()
# print(result[0]['title'], result[0]['author'])
# title = result[0]['title']
# author = result[0]['author']
# degree = result[0]['degree']
# year = result[0]['content'].split("/")[5]
# content_path = result[0]['content']
# school = result[0]['school']
# qikan_name = result[0]['qikan_name']
# author = str(author).strip(";")
# author = str(author).replace(";", ",")
# # select
# # school, qikan_name
# # from main_table_paper_detail_message limit
# # 10000 \G;;
#
# try:
# with open(content_path, encoding="utf-8") as f:
# text = f.read()
# except:
# with open(content_path, encoding="gbk") as f:
# text = f.read()
#
# paper_info = {
# "title": title,
# "author": author,
# "degree": degree,
# "year": year,
# "paper_len_word": len(text),
# "school": school,
# "qikan_name": qikan_name
# }
# return paper_info
from clickhouse_driver import Client
class PureClient:
    """Thin wrapper around a clickhouse_driver Client bound to one database."""

    def __init__(self, database='test_db'):
        # Fixed local ClickHouse node; only the database name varies.
        self.client = Client(host='192.168.31.74', port=9000, user='default',
                             password='zhicheng123*', database=database)

    def run(self, sql):
        """Run *sql* on the wrapped client and return a pandas DataFrame."""
        return self.client.query_dataframe(sql)
def processing_one_text(paper_id):
    """Fetch one paper's metadata from ClickHouse by its numeric doc_id.

    :param paper_id: numeric document id (int, or a string of digits)
    :return: dict with keys title, author, degree, year, school, qikan_name
    :raises ValueError: if paper_id cannot be interpreted as an integer
    """
    pureclient = PureClient()
    print("paper_id", paper_id)
    # Cast to int before interpolation: doc_id is numeric (the original query
    # embedded it unquoted), and the cast blocks SQL injection through this
    # string-built query.
    sql = 'SELECT * FROM main_paper_message WHERE doc_id={}'.format(int(paper_id))
    result = pureclient.run(sql)
    print("result", result)
    title = result['title'][0]
    author = result['author'][0]
    degree = result['degree'][0]
    # The 6th path component of the content path encodes the year.
    # NOTE(review): assumes paths shaped like /a/b/c/d/e/<year>/... — confirm.
    year = result['content'][0].split("/")[5]
    school = result['school'][0]
    qikan_name = result['qikan_name'][0]
    # Normalise the author list: drop leading/trailing ';', then use ','
    # as the separator between remaining names.
    author = str(author).strip(";").replace(";", ",")
    paper_info = {
        "title": title,
        "author": author,
        "degree": degree,
        "year": year,
        "school": school,
        "qikan_name": qikan_name,
    }
    print("paper_info", paper_info)
    return paper_info
def ulit_recall_paper(recall_data_list_dict):
    '''
    Build citation strings for the recalled papers.

    :param recall_data_list_dict: dict keyed by paper doc_id (values unused)
    :return: list of formatted reference strings, deduplicated in
             first-seen order, e.g. "author.title[J].source,year."
    '''
    references = []
    # Only the keys (doc_ids) are needed, so iterate the dict directly
    # instead of .items().
    for paper_id in recall_data_list_dict:
        info = processing_one_text(paper_id)
        # [J] = journal article ("期刊"), otherwise [D] = dissertation/thesis.
        degree_tag = "[J]" if info['degree'] == "期刊" else "[D]"
        # NOTE(review): an absent school appears to be stored as a single
        # space, in which case the journal name is used instead — confirm.
        source = info['school'] if info['school'] != " " else info['qikan_name']
        reference = ".".join([info['author'],
                              info['title'] + degree_tag,
                              ",".join([source, info['year']])]) + "."
        references.append(reference)
    # Deduplicate while keeping first-seen order; the previous
    # list(set(...)) produced a nondeterministic ordering.
    return list(dict.fromkeys(references))
def classify_accurate_check():
    """Worker loop: pop recall results from Redis, format the references,
    persist the outcome as a JSON file, and register its path in Redis
    under the task uuid (24h TTL).

    Runs forever; intended to be started in a background thread.
    """
    while True:
        # Poll politely while the recall queue is empty.
        if redis_.llen(db_key_query_recall) == 0:
            time.sleep(1)
            continue
        print("计算结果")
        # lpop returns bytes here (the pool was created without
        # decode_responses), hence the explicit decode.
        query_recall = redis_.lpop(db_key_query_recall).decode('UTF-8')
        query_recall_dict = json.loads(query_recall)
        query_recall_uuid = query_recall_dict["uuid"]
        recall_data_list_dict = query_recall_dict["data"]
        is_success = query_recall_dict["is_success"]
        # NOTE(review): "resilt" is misspelled but it is the key consumers
        # read from the result file — do not rename silently.
        try:
            if is_success == "0":
                return_text = {"resilt": "宇鹏接口不成功", "probabilities": None, "status_code": 400}
            else:
                if recall_data_list_dict == "{}":
                    return_text = {"resilt": "查询结果为空", "probabilities": None, "status_code": 400}
                else:
                    recall_data_list = ulit_recall_paper(recall_data_list_dict)
                    recall_data = "\n".join(recall_data_list)
                    return_text = {"resilt": recall_data, "probabilities": None, "status_code": 200}
        except Exception:
            # Catch Exception (not a bare except) so KeyboardInterrupt and
            # SystemExit can still stop the worker thread.
            return_text = {"resilt": "计算有问题", "probabilities": None, "status_code": 400}
        load_result_path = "./new_data_logs/{}.json".format(query_recall_uuid)
        print("queue_uuid: ", query_recall_uuid)
        print("load_result_path: ", load_result_path)
        with open(load_result_path, 'w', encoding='utf8') as f2:
            # ensure_ascii=False keeps the Chinese text readable in the file.
            json.dump(return_text, f2, ensure_ascii=False, indent=4)
        # Expose the result-file path for 24 hours under the task uuid.
        redis_.set(query_recall_uuid, load_result_path, 86400)
if __name__ == '__main__':
    # Run the queue consumer in a background thread.
    worker = Thread(target=classify_accurate_check)
    worker.start()