|
|
@@ -3,6 +3,7 @@
|
|
|
# Press Shift+F10 to run, or replace this with your own code.
|
|
|
# Double-press Shift to search everywhere for classes, files, tool windows, actions, and settings.
|
|
|
import os |
|
|
|
os.environ["CUDA_VISIBLE_DEVICES"] = "-1" |
|
|
|
import faiss |
|
|
|
import numpy as np |
|
|
|
from tqdm import tqdm |
|
|
@@ -15,20 +16,12 @@ from flask_cors import CORS
|
|
|
import pandas as pd |
|
|
|
import concurrent.futures |
|
|
|
import json |
|
|
|
from threading import Thread |
|
|
|
import redis |
|
|
|
|
|
|
|
|
|
|
|
app = Flask(__name__) |
|
|
|
CORS(app) |
|
|
|
app.config["JSON_AS_ASCII"] = False |
|
|
|
|
|
|
|
pool = redis.ConnectionPool(host='localhost', port=63179, max_connections=100, db=1, password="zhicheng123*") |
|
|
|
redis_ = redis.Redis(connection_pool=pool, decode_responses=True) |
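# Note: redis-py takes connection options from the pool when an explicit
# connection_pool is passed, so decode_responses=True above has no effect here;
# values read from Redis are therefore decoded manually (e.g. .decode("utf-8"))
# further down.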
|
|
|
|
|
|
|
db_key_query = 'query' |
|
|
|
db_key_querying = 'querying' |
|
|
|
batch_size = 32 |
|
|
|
|
|
|
|
openai_api_key = "token-abc123" |
|
|
|
openai_api_base = "http://127.0.0.1:12011/v1" |
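# dialog_line_parse() is called below but its definition is not part of this
# diff. The helper sketched here is an illustrative assumption only: it POSTs
# the prepared payload to the OpenAI-compatible endpoint configured above and
# returns the parsed JSON response (result["choices"][0]["message"]["content"]
# is read from it later).
def dialog_line_parse_sketch(payload: dict) -> dict:
    """Illustrative stand-in for the dialog_line_parse() helper used below."""
    import requests  # local import keeps the sketch self-contained

    resp = requests.post(
        f"{openai_api_base}/chat/completions",
        headers={"Authorization": f"Bearer {openai_api_key}"},
        json=payload,
        timeout=300,
    )
    resp.raise_for_status()
    return resp.json()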
|
|
|
|
|
|
@@ -157,6 +150,17 @@ def delete_data(title, data_id):
|
|
|
df.loc[df["ID"] == data_id, "有效"] = False |
|
|
|
df.to_csv(csv_path, sep="\t", index=False) |
|
|
|
|
|
|
|
# Update the index flag
|
|
|
index_path = f"data_np/{title}_index.json" |
|
|
|
if os.path.exists(index_path): |
|
|
|
with open(index_path, "r+") as f: |
|
|
|
index_data = json.load(f) |
|
|
|
if data_id in index_data: |
|
|
|
index_data[data_id]["valid"] = False |
|
|
|
f.seek(0) |
|
|
|
json.dump(index_data, f) |
|
|
|
f.truncate() |
|
|
|
|
|
|
|
|
|
|
|
def check_file_exists(file_path): |
|
|
|
""" |
|
|
@@ -177,20 +181,20 @@ def ulit_request_file(new_id, sentence, title):
|
|
|
# Initialize or read the CSV file
|
|
|
if os.path.exists(file_name_res_save): |
|
|
|
df = pd.read_csv(file_name_res_save, sep="\t") |
|
|
|
# # Check whether the same body text already exists
|
|
|
# if sentence in df["正文"].values: |
|
|
|
# print("正文已存在,跳过处理") |
|
|
|
# return df |
|
|
|
# Check whether the same body text already exists
|
|
|
if sentence in df["正文"].values: |
|
|
|
print("正文已存在,跳过处理") |
|
|
|
return df |
|
|
|
else: |
|
|
|
df = pd.DataFrame(columns=["ID", "正文", "总结", "有效", "已向量化", "向量"]) |
|
|
|
df = pd.DataFrame(columns=["ID", "正文", "总结", "有效", "已向量化"]) |
|
|
|
|
|
|
|
# Add the new record (generate a unique ID)
|
|
|
new_row = { |
|
|
|
"ID": new_id, |
|
|
|
"ID": str(new_id), |
|
|
|
"正文": sentence, |
|
|
|
"总结": None, |
|
|
|
"有效": True, |
|
|
|
"已向量化": False, |
|
|
|
"向量": None, |
|
|
|
"已向量化": False |
|
|
|
} |
|
|
|
df = pd.concat([df, pd.DataFrame([new_row])], ignore_index=True) |
|
|
|
|
|
|
@@ -219,19 +223,9 @@ def ulit_request_file(new_id, sentence, title):
|
|
|
summary = result['choices'][0]['message']['content'] |
|
|
|
df.at[idx, "总结"] = summary |
|
|
|
|
|
|
|
# df.loc[df.index[2], "总结"] = None |
|
|
|
# df.loc[df.index[3], "总结"] = None |
|
|
|
# df.loc[df.index[4], "总结"] = None |
|
|
|
# df.loc[df.index[5], "总结"] = None |
|
|
|
|
|
|
|
df_ce = df[(df["有效"] == True) & (df["总结"].notnull())] |
|
|
|
for idx in df_ce.index: |
|
|
|
a = shengcehng_array([df_ce.at[idx, "总结"]]) |
|
|
|
df.at[idx, "向量"] = json.dumps(a[0].tolist()) |
|
|
|
df.at[idx, "已向量化"] = True |
|
|
|
|
|
|
|
# Save the updated CSV
|
|
|
df.to_csv(file_name_res_save, sep="\t", index=False) |
|
|
|
|
|
|
|
return df |
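# shengcehng_array() turns summaries into embedding vectors above, but its
# definition is outside this diff. A minimal sketch, assuming a
# sentence-transformers encoder (the model name is only a placeholder):
def shengcehng_array_sketch(texts):
    """Encode a list of strings into a float32 numpy matrix of embeddings."""
    from sentence_transformers import SentenceTransformer  # assumed dependency

    # In practice the model would be loaded once at module level, not per call.
    model = SentenceTransformer("BAAI/bge-small-zh-v1.5")  # placeholder model
    return np.asarray(model.encode(texts, batch_size=batch_size), dtype="float32")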
|
|
|
|
|
|
|
def main(question, title, top): |
|
|
|
db_dict = { |
|
|
@@ -267,28 +261,20 @@ def main(question, title, top):
|
|
|
index = faiss.IndexFlatIP(d)  # build the index
|
|
|
|
|
|
|
# Look up the stored vectors
|
|
|
file_name_res_save = f"data_file_res/{title_dan}.csv" |
|
|
|
df = pd.read_csv(file_name_res_save, sep="\t", encoding="utf-8") |
|
|
|
df_ce = df[df["有效"] == True] |
|
|
|
vector_path = f"data_np/{title_dan}.npy" |
|
|
|
vectors = np.load(vector_path) |
|
|
|
|
|
|
|
print(df_ce.shape) |
|
|
|
data_np = [] |
|
|
|
for idx in df_ce.index: |
|
|
|
data_np.append(json.loads(df.loc[idx, "向量"])) |
|
|
|
|
|
|
|
vectors = np.array(data_np, dtype="float32")  # faiss expects a 2-D float32 matrix, not dtype=object
|
|
|
|
|
|
|
# data_str = pd.read_csv(file_name_res_save, sep="\t", encoding="utf-8").values.tolist() |
|
|
|
data_str = pd.read_csv(f"data_file/{title_dan}.csv", sep="\t", encoding="utf-8").values.tolist() |
|
|
|
index.add(vectors) |
|
|
|
D, I = index.search(embs, int(top)) |
|
|
|
print(I) |
|
|
|
|
|
|
|
reference_list = [] |
|
|
|
for i,j in zip(I[0], D[0]): |
|
|
|
reference_list.append([df_ce.loc[df_ce.index[i], "正文"], j]) |
|
|
|
reference_list.append([data_str[i], j]) |
|
|
|
|
|
|
|
for i,j in enumerate(reference_list): |
|
|
|
paper_list_str += "第{}篇\n{},此篇文章跟问题的相关度为{}%\n".format(str(i+1), j[0], j[1]) |
|
|
|
paper_list_str += "第{}篇\n{},此篇文章跟问题的相关度为{}%\n".format(str(i+1), j[0][0], j[1]) |
|
|
|
|
|
|
|
|
|
|
|
''' |
|
|
@@ -306,60 +292,6 @@ def main(question, title, top):
|
|
|
''' |
|
|
|
return model_generate_stream(propmt_connect_input) |
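# The IndexFlatIP scores returned above are raw inner products, not
# percentages. If cosine similarity is intended, the usual approach (an
# illustrative assumption, not part of this change) is to L2-normalise both
# the stored vectors and the query before add()/search():
def cosine_search_sketch(doc_vectors, query_vectors, top_k):
    """Cosine-similarity search with a flat inner-product faiss index."""
    xb = np.ascontiguousarray(doc_vectors, dtype="float32")
    xq = np.ascontiguousarray(query_vectors, dtype="float32")
    faiss.normalize_L2(xb)  # in-place L2 normalisation
    faiss.normalize_L2(xq)
    index = faiss.IndexFlatIP(xb.shape[1])
    index.add(xb)
    return index.search(xq, top_k)  # (scores in [-1, 1], row indices)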
|
|
|
|
|
|
|
def classify():  # call the model, setting the maximum batch_size
|
|
|
while True: |
|
|
|
if redis_.llen(db_key_query) == 0:  # if the queue is empty, keep polling
|
|
|
time.sleep(3) |
|
|
|
continue |
|
|
|
query = redis_.lpop(db_key_query).decode('UTF-8')  # fetch the query text
|
|
|
data_dict = json.loads(query) |
|
|
|
if data_dict["state"] == "1": |
|
|
|
new_id = data_dict["id"] |
|
|
|
sentence = data_dict["sentence"] |
|
|
|
title = data_dict["title"] |
|
|
|
ulit_request_file(new_id, sentence, title) |
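# Each queue entry consumed by classify() is the JSON document pushed by the
# upload route below, e.g.
#   {"id": "...", "sentence": "...", "state": "1", "title": "..."}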
|
|
|
|
|
|
|
|
|
|
|
def add_dan_data(new_id, sentence, title): |
|
|
|
file_name_res_save = f"data_file_res/{title}.csv" |
|
|
|
|
|
|
|
# Initialize or read the CSV file
|
|
|
|
|
|
|
df = pd.read_csv(file_name_res_save, sep="\t") |
|
|
|
|
|
|
|
if sentence in df["正文"].values: |
|
|
|
print("正文已存在,跳过处理") |
|
|
|
return False |
|
|
|
else: |
|
|
|
ulit_request_file(new_id, sentence, title) |
|
|
|
return True |
|
|
|
|
|
|
|
def updata_dan_data(new_id, sentence, title): |
|
|
|
file_name_res_save = f"data_file_res/{title}.csv" |
|
|
|
df = pd.read_csv(file_name_res_save, sep="\t") |
|
|
|
|
|
|
|
|
|
|
|
# Filter the records that need processing
|
|
|
propmt_connect = { |
|
|
|
"model": "gpt-4-turbo", |
|
|
|
"messages": [{ |
|
|
|
"role": "user", |
|
|
|
"content": f"{sentence}\n以上这条中可能包含了一些病情或者症状,请帮我归纳这条中所对应的病情或者症状是哪些,总结出来,不需要很长,简单归纳即可,直接输出症状或者病情,可以包含一些形容词来辅助描述,不需要有辅助词汇" |
|
|
|
}], |
|
|
|
"top_p": 0.9, |
|
|
|
"temperature": 0.6 |
|
|
|
} |
|
|
|
result = dialog_line_parse(propmt_connect) |
|
|
|
print(result) |
|
|
|
summary = result['choices'][0]['message']['content'] |
|
|
|
# Update the "总结" (summary) and "正文" (body text) fields
|
|
|
df.loc[df["ID"] == new_id, "总结"] = summary |
|
|
|
df.loc[df["ID"] == new_id, "正文"] = sentence |
|
|
|
|
|
|
|
a = shengcehng_array([summary]) |
|
|
|
df.loc[df["ID"] == new_id, "向量"] = json.dumps(a[0].tolist()) |
|
|
|
df.to_csv(file_name_res_save, sep="\t", index=False) |
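# Building_vector_database() is called by the upload route below but is not
# shown in this diff. A minimal sketch, assuming it gathers the JSON-encoded
# vectors of the valid, vectorised rows and saves them as the
# data_np/{title}.npy matrix that main() loads:
def building_vector_database_sketch(title, df):
    """Persist the vectors of all valid, vectorised rows to a .npy file."""
    df_ok = df[(df["有效"] == True) & (df["已向量化"] == True)]
    vectors = np.array([json.loads(v) for v in df_ok["向量"]], dtype="float32")
    os.makedirs("data_np", exist_ok=True)
    np.save(f"data_np/{title}.npy", vectors)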
|
|
|
|
|
|
|
|
|
|
|
def model_generate_stream(prompt): |
|
|
|
messages = [ |
|
|
@@ -396,13 +328,13 @@ def model_generate_stream(prompt):
|
|
|
yield content |
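# The streaming body of model_generate_stream() is elided from this diff; it
# presumably iterates a chat-completions stream from the OpenAI-compatible
# endpoint configured above. A minimal sketch of such a generator (the model
# name is a placeholder, matching the one used earlier in this file):
def model_generate_stream_sketch(prompt):
    from openai import OpenAI  # assumed dependency

    client = OpenAI(api_key=openai_api_key, base_url=openai_api_base)
    stream = client.chat.completions.create(
        model="gpt-4-turbo",  # placeholder
        messages=[{"role": "user", "content": prompt}],
        stream=True,
        temperature=0.6,
        top_p=0.9,
    )
    for chunk in stream:
        content = chunk.choices[0].delta.content
        if content:
            yield content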
|
|
|
|
|
|
|
|
|
|
|
@app.route("/upload_file", methods=["POST"]) |
|
|
|
def upload_file(): |
|
|
|
@app.route("/upload_file_check", methods=["POST"]) |
|
|
|
def upload_file_check(): |
|
|
|
print(request.remote_addr) |
|
|
|
sentence = request.json['sentence'] |
|
|
|
title = request.json["title"] |
|
|
|
new_id = request.json["id"] |
|
|
|
state = request.json["state"]  # 1: batch add  2: add a single record  3: update a single record  4: delete a single record
|
|
|
sentence = request.form.get('sentence') |
|
|
|
title = request.form.get("title") |
|
|
|
new_id = request.form.get("id") |
|
|
|
state = request.form.get("state") |
|
|
|
''' |
|
|
|
{ |
|
|
|
"1": "csv", |
|
|
@@ -413,28 +345,12 @@ def upload_file():
|
|
|
''' |
|
|
|
state_res = "" |
|
|
|
if state == "1": |
|
|
|
redis_.rpush(db_key_query, json.dumps({ |
|
|
|
"id": new_id, |
|
|
|
"sentence": sentence, |
|
|
|
"state": state, |
|
|
|
"title": title |
|
|
|
}))  # push onto the Redis queue
|
|
|
state_res = "上传完成,正在排队处理数据" |
|
|
|
df = ulit_request_file(new_id, sentence, title) |
|
|
|
Building_vector_database(title, df) |
|
|
|
state_res = "上传完成" |
|
|
|
elif state == "2": |
|
|
|
info_bool = add_dan_data(new_id, sentence, title) |
|
|
|
if info_bool == True: |
|
|
|
state_res = "上传完成" |
|
|
|
else: |
|
|
|
state_res = "上传失败,库中有重复数据" |
|
|
|
|
|
|
|
elif state == "3": |
|
|
|
updata_dan_data(new_id, sentence, title) |
|
|
|
state_res = "修改完成" |
|
|
|
|
|
|
|
elif state == "4": |
|
|
|
delete_data(title, new_id) |
|
|
|
state_res = "删除完成" |
|
|
|
|
|
|
|
return_json = { |
|
|
|
"code": 200, |
|
|
|
"info": state_res |
|
|
@@ -442,30 +358,6 @@ def upload_file():
|
|
|
return jsonify(return_json)  # return the result
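# Example client call for the form-based upload endpoint above (illustrative;
# the path follows the /upload_file decorator, and host/port are taken from
# the app.run() call at the bottom of this file):
#
#   import requests
#   resp = requests.post(
#       "http://127.0.0.1:27000/upload_file",
#       data={"id": "1", "sentence": "...", "title": "demo", "state": "2"},
#   )
#   print(resp.json())  # {"code": 200, "info": "上传完成"}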
|
|
|
|
|
|
|
|
|
|
|
@app.route("/upload_file_check", methods=["POST"]) |
|
|
|
def upload_file_check(): |
|
|
|
print(request.remote_addr) |
|
|
|
new_id = request.json["id"] |
|
|
|
data_list = redis_.lrange(db_key_query, 0, -1)  # 0 = start, -1 = end (the whole list)
|
|
|
# Parse the JSON payloads
|
|
|
data_list_id_ = [] |
|
|
|
for item in data_list: |
|
|
|
data = json.loads(item.decode("utf-8"))  # Redis returns bytes, so decode + json.loads
|
|
|
data_list_id_.append(data["id"]) |
|
|
|
if new_id in data_list_id_: |
|
|
|
return_json = { |
|
|
|
"code": 200, |
|
|
|
"info": "上传中" |
|
|
|
} |
|
|
|
return jsonify(return_json) |
|
|
|
else: |
|
|
|
return_json = { |
|
|
|
"code": 200, |
|
|
|
"info": "已入库" |
|
|
|
} |
|
|
|
return jsonify(return_json) |
|
|
|
|
|
|
|
|
|
|
|
@app.route("/search", methods=["POST"]) |
|
|
|
def search(): |
|
|
|
print(request.remote_addr) |
|
|
@@ -475,8 +367,6 @@ def search():
|
|
|
response = main(texts, title, top) |
|
|
|
return Response(response, mimetype='text/plain; charset=utf-8')  # return the result
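# Example client call for the streaming /search endpoint above (illustrative;
# the request field names are an assumption, since the request parsing is
# elided from this diff):
#
#   import requests
#   with requests.post(
#       "http://127.0.0.1:27000/search",
#       json={"texts": "...", "title": "demo", "top": 5},
#       stream=True,
#   ) as resp:
#       for chunk in resp.iter_content(chunk_size=None, decode_unicode=True):
#           print(chunk, end="")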
|
|
|
|
|
|
|
t = Thread(target=classify) |
|
|
|
t.start() |
|
|
|
|
|
|
|
if __name__ == "__main__": |
|
|
|
app.run(host="0.0.0.0", port=27000, threaded=True, debug=False) |
|
|
|