diff --git a/main.py b/main.py
index 486d827..0b30e43 100644
--- a/main.py
+++ b/main.py
@@ -3,6 +3,7 @@
 # Press Shift+F10 to run it, or replace this with your own code.
 # Double-press Shift to search classes, files, tool windows, actions and settings everywhere.
 import os
+os.environ["CUDA_VISIBLE_DEVICES"] = "-1"
 import faiss
 import numpy as np
 from tqdm import tqdm
@@ -15,20 +16,12 @@ from flask_cors import CORS
 import pandas as pd
 import concurrent.futures
 import json
-from threading import Thread
-import redis
+
 app = Flask(__name__)
 CORS(app)
 app.config["JSON_AS_ASCII"] = False
-pool = redis.ConnectionPool(host='localhost', port=63179, max_connections=100, db=1, password="zhicheng123*")
-redis_ = redis.Redis(connection_pool=pool, decode_responses=True)
-
-db_key_query = 'query'
-db_key_querying = 'querying'
-batch_size = 32
-
 openai_api_key = "token-abc123"
 openai_api_base = "http://127.0.0.1:12011/v1"
@@ -157,6 +150,17 @@ def delete_data(title, data_id):
     df.loc[df["ID"] == data_id, "有效"] = False
     df.to_csv(csv_path, sep="\t", index=False)
 
+    # Update the index marker as well
+    index_path = f"data_np/{title}_index.json"
+    if os.path.exists(index_path):
+        with open(index_path, "r+") as f:
+            index_data = json.load(f)
+            if data_id in index_data:
+                index_data[data_id]["valid"] = False
+                f.seek(0)
+                json.dump(index_data, f)
+                f.truncate()
+
 
 def check_file_exists(file_path):
     """
@@ -177,20 +181,20 @@ def ulit_request_file(new_id, sentence, title):
     # Initialize or read the CSV file
     if os.path.exists(file_name_res_save):
         df = pd.read_csv(file_name_res_save, sep="\t")
-        # # Check whether the same body text already exists
-        # if sentence in df["正文"].values:
-        #     print("正文已存在,跳过处理")
-        #     return df
+        # Check whether the same body text already exists
+        if sentence in df["正文"].values:
+            print("正文已存在,跳过处理")
+            return df
     else:
-        df = pd.DataFrame(columns=["ID", "正文", "总结", "有效", "已向量化", "向量"])
+        df = pd.DataFrame(columns=["ID", "正文", "总结", "有效", "已向量化"])
+
     # Add the new record (generate a unique ID)
     new_row = {
-        "ID": new_id,
+        "ID": str(new_id),
         "正文": sentence,
         "总结": None,
         "有效": True,
-        "已向量化": False,
-        "向量": None,
+        "已向量化": False
     }
     df = pd.concat([df, pd.DataFrame([new_row])], ignore_index=True)
@@ -219,19 +223,9 @@ def ulit_request_file(new_id, sentence, title):
         summary = result['choices'][0]['message']['content']
         df.at[idx, "总结"] = summary
 
-    # df.loc[df.index[2], "总结"] = None
-    # df.loc[df.index[3], "总结"] = None
-    # df.loc[df.index[4], "总结"] = None
-    # df.loc[df.index[5], "总结"] = None
-
-    df_ce = df[(df["有效"] == True) & (df["总结"].notnull())]
-    for idx in df_ce.index:
-        a = shengcehng_array([df_ce.at[idx, "总结"]])
-        df.at[idx, "向量"] = json.dumps(a[0].tolist())
-        df.at[idx, "已向量化"] = True
-
+    # Save the updated CSV
     df.to_csv(file_name_res_save, sep="\t", index=False)
-
+    return df
 
 def main(question, title, top):
     db_dict = {
@@ -267,28 +261,20 @@ def main(question, title, top):
         index = faiss.IndexFlatIP(d)  # build the index
 
         # Look up the vectors
-        file_name_res_save = f"data_file_res/{title_dan}.csv"
-        df = pd.read_csv(file_name_res_save, sep="\t", encoding="utf-8")
-        df_ce = df[df["有效"] == True]
+        vector_path = f"data_np/{title_dan}.npy"
+        vectors = np.load(vector_path)
 
-        print(df_ce.shape)
-        data_np = []
-        for idx in df_ce.index:
-            data_np.append(json.loads(df.loc[idx, "向量"]))
-
-        vectors = np.array(data_np, dtype=object)
-
-        # data_str = pd.read_csv(file_name_res_save, sep="\t", encoding="utf-8").values.tolist()
+        data_str = pd.read_csv(f"data_file/{title_dan}.csv", sep="\t", encoding="utf-8").values.tolist()
 
         index.add(vectors)
         D, I = index.search(embs, int(top))
         print(I)
 
         reference_list = []
        for i,j in zip(I[0], D[0]):
-            reference_list.append([df_ce.loc[df_ce.index[i], "正文"], j])
+            reference_list.append([data_str[i], j])
 
         for i,j in enumerate(reference_list):
-            paper_list_str += "第{}篇\n{},此篇文章跟问题的相关度为{}%\n".format(str(i+1), j[0], j[1])
+            paper_list_str += "第{}篇\n{},此篇文章跟问题的相关度为{}%\n".format(str(i+1), j[0][0], j[1])
 
     '''
@@ -306,60 +292,6 @@ def main(question, title, top):
     '''
     return model_generate_stream(propmt_connect_input)
 
-def classify():  # invoke the model, set the maximum batch_size
-    while True:
-        if redis_.llen(db_key_query) == 0:  # keep polling while the queue is empty
-            time.sleep(3)
-            continue
-        query = redis_.lpop(db_key_query).decode('UTF-8')  # get the query text
-        data_dict = json.loads(query)
-        if data_dict["state"] == "1":
-            new_id = data_dict["id"]
-            sentence = data_dict["sentence"]
-            title = data_dict["title"]
-            ulit_request_file(new_id, sentence, title)
-
-
-def add_dan_data(new_id, sentence, title):
-    file_name_res_save = f"data_file_res/{title}.csv"
-
-    # Initialize or read the CSV file
-
-    df = pd.read_csv(file_name_res_save, sep="\t")
-
-    if sentence in df["正文"].values:
-        print("正文已存在,跳过处理")
-        return False
-    else:
-        ulit_request_file(new_id, sentence, title)
-        return True
-
-def updata_dan_data(new_id, sentence, title):
-    file_name_res_save = f"data_file_res/{title}.csv"
-    df = pd.read_csv(file_name_res_save, sep="\t")
-
-
-    # Filter the record that needs to be processed
-    propmt_connect = {
-        "model": "gpt-4-turbo",
-        "messages": [{
-            "role": "user",
-            "content": f"{sentence}\n以上这条中可能包含了一些病情或者症状,请帮我归纳这条中所对应的病情或者症状是哪些,总结出来,不需要很长,简单归纳即可,直接输出症状或者病情,可以包含一些形容词来辅助描述,不需要有辅助词汇"
-        }],
-        "top_p": 0.9,
-        "temperature": 0.6
-    }
-    result = dialog_line_parse(propmt_connect)
-    print(result)
-    summary = result['choices'][0]['message']['content']
-    # Update the summary and body fields
-    df.loc[df["ID"] == new_id, "总结"] = summary
-    df.loc[df["ID"] == new_id, "正文"] = sentence
-
-    a = shengcehng_array([summary])
-    df.loc[df["ID"] == new_id, "向量"] = json.dumps(a[0].tolist())
-    df.to_csv(file_name_res_save, sep="\t", index=False)
-
 
 def model_generate_stream(prompt):
     messages = [
@@ -396,13 +328,13 @@ def model_generate_stream(prompt):
             yield content
 
 
-@app.route("/upload_file", methods=["POST"])
-def upload_file():
+@app.route("/upload_file_check", methods=["POST"])
+def upload_file_check():
     print(request.remote_addr)
-    sentence = request.json['sentence']
-    title = request.json["title"]
-    new_id = request.json["id"]
-    state = request.json["state"]  # 1: batch add  2: add one record  3: update one record  4: delete one record
+    sentence = request.form.get('sentence')
+    title = request.form.get("title")
+    new_id = request.form.get("id")
+    state = request.form.get("state")
     '''
     {
     "1": "csv",
@@ -413,28 +345,12 @@
     '''
     state_res = ""
     if state == "1":
-        redis_.rpush(db_key_query, json.dumps({
-            "id": new_id,
-            "sentence": sentence,
-            "state": state,
-            "title": title
-        }))  # push onto the redis queue
-        state_res = "上传完成,正在排队处理数据"
+        df = ulit_request_file(new_id, sentence, title)
+        Building_vector_database(title, df)
+        state_res = "上传完成"
     elif state == "2":
-        info_bool = add_dan_data(new_id, sentence, title)
-        if info_bool == True:
-            state_res = "上传完成"
-        else:
-            state_res = "上传失败,库中有重复数据"
-
-    elif state == "3":
-        updata_dan_data(new_id, sentence, title)
-        state_res = "修改完成"
-
-    elif state == "4":
         delete_data(title, new_id)
         state_res = "删除完成"
-
     return_json = {
         "code": 200,
         "info": state_res
     }
     return jsonify(return_json)  # return the result
 
 
-@app.route("/upload_file_check", methods=["POST"])
-def upload_file_check():
-    print(request.remote_addr)
-    new_id = request.json["id"]
-    data_list = redis_.lrange(db_key_query, 0, -1)  # 0 is the start, -1 the end (everything)
-    # Parse the JSON data
-    data_list_id_ = []
-    for item in data_list:
-        data = json.loads(item.decode("utf-8"))  # Redis returns bytes, so decode + json.loads
-        data_list_id_.append(data["id"])
-    if new_id in data_list_id_:
-        return_json = {
-            "code": 200,
-            "info": "上传中"
-        }
-        return jsonify(return_json)
-    else:
-        return_json = {
-            "code": 200,
-            "info": "已入库"
-        }
-        return jsonify(return_json)
-
-
 @app.route("/search", methods=["POST"])
 def search():
     print(request.remote_addr)
     texts = request.json["texts"]
     title = request.json["title"]
     top = request.json["top"]
     response = main(texts, title, top)
     return Response(response, mimetype='text/plain; charset=utf-8')  # return the result
 
-t = Thread(target=classify)
-t.start()
 
 if __name__ == "__main__":
     app.run(host="0.0.0.0", port=27000, threaded=True, debug=False)
diff --git a/main_scokt.py b/main_scokt.py
index bf75674..0dbad79 100644
--- a/main_scokt.py
+++ b/main_scokt.py
@@ -154,6 +154,8 @@ def main(question, title, top):
 
         reference_list = []
         for i, j in zip(I[0], D[0]):
+            print("i", i)
+            print("data_str[i]", data_str[i])
             reference_list.append([data_str[i], j])
 
         for i, j in enumerate(reference_list):
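
Note on a piece this patch calls but does not define: the embeddings move out of the CSV's 向量 column into per-title NumPy files, since main() now runs np.load(f"data_np/{title_dan}.npy") and feeds the result straight into faiss.IndexFlatIP, and the reworked upload handler calls Building_vector_database(title, df). A minimal sketch of what that helper could look like, assuming shengcehng_array() is the same embedding helper the removed vectorization loop used and that it returns one vector per input string:

```python
import os
import numpy as np

def Building_vector_database(title, df):
    # Hypothetical sketch - the real implementation is not part of this diff.
    # Embed the summaries of all valid rows and persist them as data_np/{title}.npy,
    # the exact path that main() later reads back with np.load().
    df_ce = df[(df["有效"] == True) & (df["总结"].notnull())]
    vectors = shengcehng_array(df_ce["总结"].tolist())  # assumed embedding helper from main.py
    vectors = np.asarray(vectors, dtype="float32")      # faiss only accepts float32 matrices
    os.makedirs("data_np", exist_ok=True)
    np.save(f"data_np/{title}.npy", vectors)
```

Two details worth noting: the removed search path built np.array(data_np, dtype=object), which faiss cannot ingest, so persisting a float32 .npy file also removes that failure mode; and because the index is IndexFlatIP, the stored vectors and the query embedding need to be L2-normalized if the inner product is meant to behave like cosine similarity — whether shengcehng_array() already normalizes is not visible here.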
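Because the old /upload_file route (JSON body, four states, Redis queue plus a separate status-check endpoint) collapses into a synchronous /upload_file_check that reads form fields, callers have to switch from a JSON payload to form data. A minimal client call against the reworked endpoint, with field names taken from the diff; the host and port are the ones main.py binds to, and the sentence value is only a placeholder:

```python
import requests

resp = requests.post(
    "http://127.0.0.1:27000/upload_file_check",
    data={                 # form fields, read via request.form.get() on the server
        "id": "1001",
        "sentence": "患者自述头痛三天,伴有低热。",
        "title": "demo",
        "state": "1",      # "1": add the record and rebuild vectors, "2": delete by id
    },
)
print(resp.json())         # e.g. {"code": 200, "info": "上传完成"}
```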