diff --git a/main.py b/main.py
index c38157d..486d827 100644
--- a/main.py
+++ b/main.py
@@ -29,17 +29,17 @@
 db_key_query = 'query'
 db_key_querying = 'querying'
 batch_size = 32
-# openai_api_key = "token-abc123"
-# openai_api_base = "http://127.0.0.1:12011/v1"
-#
-# client = OpenAI(
-#     api_key=openai_api_key,
-#     base_url=openai_api_base,
-# )
-
-# models = client.models.list()
-# model = models.data[0].id
-model = "1"
+openai_api_key = "token-abc123"
+openai_api_base = "http://127.0.0.1:12011/v1"
+
+client = OpenAI(
+    api_key=openai_api_key,
+    base_url=openai_api_base,
+)
+
+models = client.models.list()
+model = models.data[0].id
+# model = "1"
 model_encode = SentenceTransformer('/home/majiahui/project/models-llm/bge-large-zh-v1.5')
 propmt_connect = '''我是一名中医,你是一个中医的医生的助理,我的患者有一个症状,症状如下:
 {}
@@ -157,17 +157,6 @@ def delete_data(title, data_id):
     df.loc[df["ID"] == data_id, "有效"] = False
     df.to_csv(csv_path, sep="\t", index=False)

-    # 更新索引标记
-    index_path = f"data_np/{title}_index.json"
-    if os.path.exists(index_path):
-        with open(index_path, "r+") as f:
-            index_data = json.load(f)
-            if data_id in index_data:
-                index_data[data_id]["valid"] = False
-                f.seek(0)
-                json.dump(index_data, f)
-                f.truncate()
-

 def check_file_exists(file_path):
     """
@@ -188,20 +177,20 @@ def ulit_request_file(new_id, sentence, title):
     # 初始化或读取CSV文件
     if os.path.exists(file_name_res_save):
         df = pd.read_csv(file_name_res_save, sep="\t")
-        # 检查是否已存在相同正文
-        if sentence in df["正文"].values:
-            print("正文已存在,跳过处理")
-            return df
+        # # 检查是否已存在相同正文
+        # if sentence in df["正文"].values:
+        #     print("正文已存在,跳过处理")
+        #     return df
     else:
-        df = pd.DataFrame(columns=["ID", "正文", "总结", "有效", "已向量化"])
+        df = pd.DataFrame(columns=["ID", "正文", "总结", "有效", "已向量化", "向量"])

-    # 添加新数据(生成唯一ID)
     new_row = {
-        "ID": str(new_id),
+        "ID": new_id,
         "正文": sentence,
         "总结": None,
         "有效": True,
-        "已向量化": False
+        "已向量化": False,
+        "向量": None,
     }

     df = pd.concat([df, pd.DataFrame([new_row])], ignore_index=True)
@@ -230,9 +219,19 @@ def ulit_request_file(new_id, sentence, title):
         summary = result['choices'][0]['message']['content']
         df.at[idx, "总结"] = summary

-    # 保存更新后的CSV
+    # df.loc[df.index[2], "总结"] = None
+    # df.loc[df.index[3], "总结"] = None
+    # df.loc[df.index[4], "总结"] = None
+    # df.loc[df.index[5], "总结"] = None
+
+    df_ce = df[(df["有效"] == True) & (df["总结"].notnull())]
+    for idx in df_ce.index:
+        a = shengcehng_array([df_ce.at[idx, "总结"]])
+        df.at[idx, "向量"] = json.dumps(a[0].tolist())
+        df.at[idx, "已向量化"] = True
+
     df.to_csv(file_name_res_save, sep="\t", index=False)
-    return df
+


 def main(question, title, top):
@@ -268,28 +267,28 @@ def main(question, title, top):
     index = faiss.IndexFlatIP(d)   # buid the index

     # 查找向量
-    vector_path = f"data_np/{title_dan}.npy"
-    index_path = f"data_np/{title_dan}_index.json"
+    file_name_res_save = f"data_file_res/{title_dan}.csv"
+    df = pd.read_csv(file_name_res_save, sep="\t", encoding="utf-8")
+    df_ce = df[df["有效"] == True]

-    if not os.path.exists(vector_path) or not os.path.exists(index_path):
-        return np.empty((0, 1024))
+    print(df_ce.shape)
+    data_np = []
+    for idx in df_ce.index:
+        data_np.append(json.loads(df.loc[idx, "向量"]))

-    vectors = np.load(vector_path)
-    with open(index_path, "r") as f:
-        index_data = json.load(f)
+    vectors = np.array(data_np, dtype=np.float32)

-    data_str = pd.read_csv(f"data_file_res/{title_dan}.csv", sep="\t", encoding="utf-8").values.tolist()
+    # data_str = pd.read_csv(file_name_res_save, sep="\t", encoding="utf-8").values.tolist()

     index.add(vectors)
     D, I = index.search(embs, int(top))
     print(I)

     reference_list = []
     for i,j in zip(I[0], D[0]):
-        if data_str[i][3] == True:
-            reference_list.append([data_str[i], j])
+        reference_list.append([df_ce.loc[df_ce.index[i], "正文"], j])

     for i,j in enumerate(reference_list):
-        paper_list_str += "第{}篇\n{},此篇文章跟问题的相关度为{}%\n".format(str(i+1), j[0][1], j[1])
+        paper_list_str += "第{}篇\n{},此篇文章跟问题的相关度为{}%\n".format(str(i+1), j[0], j[1])

     '''
@@ -318,8 +317,48 @@ def classify():  # 调用模型,设置最大batch_size
         new_id = data_dict["id"]
         sentence = data_dict["sentence"]
         title = data_dict["title"]
-        df = ulit_request_file(new_id, sentence, title)
-        Building_vector_database(title, df)
+        ulit_request_file(new_id, sentence, title)
+
+
+def add_dan_data(new_id, sentence, title):
+    file_name_res_save = f"data_file_res/{title}.csv"
+
+    # 初始化或读取CSV文件
+
+    df = pd.read_csv(file_name_res_save, sep="\t")
+
+    if sentence in df["正文"].values:
+        print("正文已存在,跳过处理")
+        return False
+    else:
+        ulit_request_file(new_id, sentence, title)
+        return True
+
+def updata_dan_data(new_id, sentence, title):
+    file_name_res_save = f"data_file_res/{title}.csv"
+    df = pd.read_csv(file_name_res_save, sep="\t")
+
+
+    # 筛选需要处理的记录
+    propmt_connect = {
+        "model": "gpt-4-turbo",
+        "messages": [{
+            "role": "user",
+            "content": f"{sentence}\n以上这条中可能包含了一些病情或者症状,请帮我归纳这条中所对应的病情或者症状是哪些,总结出来,不需要很长,简单归纳即可,直接输出症状或者病情,可以包含一些形容词来辅助描述,不需要有辅助词汇"
+        }],
+        "top_p": 0.9,
+        "temperature": 0.6
+    }
+    result = dialog_line_parse(propmt_connect)
+    print(result)
+    summary = result['choices'][0]['message']['content']
+    # 更新总结,正文字段
+    df.loc[df["ID"] == new_id, "总结"] = summary
+    df.loc[df["ID"] == new_id, "正文"] = sentence
+
+    a = shengcehng_array([summary])
+    df.loc[df["ID"] == new_id, "向量"] = json.dumps(a[0].tolist())
+    df.to_csv(file_name_res_save, sep="\t", index=False)


 def model_generate_stream(prompt):
@@ -357,13 +396,13 @@ def model_generate_stream(prompt):
             yield content


-@app.route("/upload_file_check", methods=["POST"])
-def upload_file_check():
+@app.route("/upload_file", methods=["POST"])
+def upload_file():
     print(request.remote_addr)
     sentence = request.json['sentence']
     title = request.json["title"]
     new_id = request.json["id"]
-    state = request.json["state"]
+    state = request.json["state"]  # 1: 批量新增 2:单条新增 3:单条修改 4: 单条删除
     '''
     {
     "1": "csv",
@@ -374,18 +413,28 @@ def upload_file_check():
     '''
     state_res = ""
     if state == "1":
-        # df = ulit_request_file(new_id, sentence, title)
-        # Building_vector_database(title, df)
         redis_.rpush(db_key_query, json.dumps({
            "id": new_id,
            "sentence": sentence,
            "state": state,
            "title": title
        }))  # 加入redis
-        state_res = "上传完成"
+        state_res = "上传完成,正在排队处理数据"
     elif state == "2":
+        info_bool = add_dan_data(new_id, sentence, title)
+        if info_bool == True:
+            state_res = "上传完成"
+        else:
+            state_res = "上传失败,库中有重复数据"
+
+    elif state == "3":
+        updata_dan_data(new_id, sentence, title)
+        state_res = "修改完成"
+
+    elif state == "4":
         delete_data(title, new_id)
         state_res = "删除完成"
+
     return_json = {
         "code": 200,
         "info": state_res
@@ -393,6 +442,30 @@ def upload_file_check():
     return jsonify(return_json)  # 返回结果


+@app.route("/upload_file_check", methods=["POST"])
+def upload_file_check():
+    print(request.remote_addr)
+    new_id = request.json["id"]
+    data_list = redis_.lrange(db_key_query, 0, -1)  # 0 表示开始,-1 表示结束(全部)
+    # 解析 JSON 数据
+    data_list_id_ = []
+    for item in data_list:
+        data = json.loads(item.decode("utf-8"))  # Redis 返回的是 bytes,需要 decode + json.loads
+        data_list_id_.append(data["id"])
+    if new_id in data_list_id_:
+        return_json = {
+            "code": 200,
+            "info": "上传中"
+        }
+        return jsonify(return_json)
+    else:
+        return_json = {
+            "code": 200,
+            "info": "已入库"
+        }
+        return jsonify(return_json)
+
+
 @app.route("/search", methods=["POST"])
 def search():
     print(request.remote_addr)