
Fold the vectors into the CSV file and support create/read/update/delete operations

div_测试
majiahui@haimaqingfan.com 1 month ago
parent commit 90af48046f
1 changed file: main.py (175 lines changed)

@@ -29,17 +29,17 @@ db_key_query = 'query'
db_key_querying = 'querying'
batch_size = 32
# openai_api_key = "token-abc123"
# openai_api_base = "http://127.0.0.1:12011/v1"
#
# client = OpenAI(
# api_key=openai_api_key,
# base_url=openai_api_base,
# )
# models = client.models.list()
# model = models.data[0].id
model = "1"
openai_api_key = "token-abc123"
openai_api_base = "http://127.0.0.1:12011/v1"
client = OpenAI(
api_key=openai_api_key,
base_url=openai_api_base,
)
models = client.models.list()
model = models.data[0].id
# model = "1"
model_encode = SentenceTransformer('/home/majiahui/project/models-llm/bge-large-zh-v1.5')
propmt_connect = '''我是一名中医,你是一个中医的医生的助理,我的患者有一个症状,症状如下:
{}
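The hunk above replaces the hard-coded placeholder model = "1" with a live lookup against the local OpenAI-compatible endpoint (http://127.0.0.1:12011/v1 in the diff) and keeps bge-large-zh-v1.5 as the embedding encoder. A minimal startup sketch under those assumptions (that the server is vLLM-style and serves exactly one model is inferred, not stated in the commit):

from openai import OpenAI
from sentence_transformers import SentenceTransformer

# local OpenAI-compatible server; key and URL copied from the diff
client = OpenAI(api_key="token-abc123", base_url="http://127.0.0.1:12011/v1")
model = client.models.list().data[0].id  # use the first model the server advertises

# bge-large-zh-v1.5 yields 1024-dim vectors, matching the IndexFlatIP(d) used later
model_encode = SentenceTransformer('/home/majiahui/project/models-llm/bge-large-zh-v1.5')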
@@ -157,17 +157,6 @@ def delete_data(title, data_id):
df.loc[df["ID"] == data_id, "有效"] = False
df.to_csv(csv_path, sep="\t", index=False)
# update the index marker
index_path = f"data_np/{title}_index.json"
if os.path.exists(index_path):
with open(index_path, "r+") as f:
index_data = json.load(f)
if data_id in index_data:
index_data[data_id]["valid"] = False
f.seek(0)
json.dump(index_data, f)
f.truncate()
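With the vectors stored inside the CSV, the hunk above drops the data_np/{title}_index.json bookkeeping; delete_data now reduces to a soft delete that flips the 有效 (valid) flag. A minimal sketch of the surviving behaviour, assuming csv_path is built from the title as elsewhere in the file:

import pandas as pd

def delete_data(title, data_id):
    csv_path = f"data_file_res/{title}.csv"
    df = pd.read_csv(csv_path, sep="\t")
    # soft delete: keep the row but mark it invalid so search skips it
    df.loc[df["ID"] == data_id, "有效"] = False
    df.to_csv(csv_path, sep="\t", index=False)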
def check_file_exists(file_path):
"""
@@ -188,20 +177,20 @@ def ulit_request_file(new_id, sentence, title):
# initialize or read the CSV file
if os.path.exists(file_name_res_save):
df = pd.read_csv(file_name_res_save, sep="\t")
# check whether the same body text already exists
if sentence in df["正文"].values:
print("正文已存在,跳过处理")
return df
# # check whether the same body text already exists
# if sentence in df["正文"].values:
# print("正文已存在,跳过处理")
# return df
else:
df = pd.DataFrame(columns=["ID", "正文", "总结", "有效", "已向量化"])
df = pd.DataFrame(columns=["ID", "正文", "总结", "有效", "已向量化", "向量"])
# add a new record (with a unique ID)
new_row = {
"ID": str(new_id),
"ID": new_id,
"正文": sentence,
"总结": None,
"有效": True,
"已向量化": False
"已向量化": False,
"向量": None,
}
df = pd.concat([df, pd.DataFrame([new_row])], ignore_index=True)
@@ -230,9 +219,19 @@ def ulit_request_file(new_id, sentence, title):
summary = result['choices'][0]['message']['content']
df.at[idx, "总结"] = summary
# save the updated CSV
# df.loc[df.index[2], "总结"] = None
# df.loc[df.index[3], "总结"] = None
# df.loc[df.index[4], "总结"] = None
# df.loc[df.index[5], "总结"] = None
df_ce = df[(df["有效"] == True) & (df["总结"].notnull())]
for idx in df_ce.index:
a = shengcehng_array([df_ce.at[idx, "总结"]])
df.at[idx, "向量"] = json.dumps(a[0].tolist())
df.at[idx, "已向量化"] = True
df.to_csv(file_name_res_save, sep="\t", index=False)
return df
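The added block embeds every valid row that already has a summary and writes the embedding straight into the new 向量 column as a JSON-encoded list, setting 已向量化 accordingly, so no separate .npy file is needed. A condensed sketch of that loop, assuming shengcehng_array wraps model_encode.encode and returns one vector per input string:

import json

def vectorize_summaries(df):
    pending = df[(df["有效"] == True) & (df["总结"].notnull())]
    for idx in pending.index:
        vec = shengcehng_array([df.at[idx, "总结"]])[0]  # one embedding per summary
        df.at[idx, "向量"] = json.dumps(vec.tolist())     # stored inline as JSON text
        df.at[idx, "已向量化"] = True
    return df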
def main(question, title, top):
db_dict = {
@@ -268,28 +267,28 @@ def main(question, title, top):
index = faiss.IndexFlatIP(d) # build the index
# look up the vectors
vector_path = f"data_np/{title_dan}.npy"
index_path = f"data_np/{title_dan}_index.json"
file_name_res_save = f"data_file_res/{title_dan}.csv"
df = pd.read_csv(file_name_res_save, sep="\t", encoding="utf-8")
df_ce = df[df["有效"] == True]
if not os.path.exists(vector_path) or not os.path.exists(index_path):
return np.empty((0, 1024))
print(df_ce.shape)
data_np = []
for idx in df_ce.index:
data_np.append(json.loads(df.loc[idx, "向量"]))
vectors = np.load(vector_path)
with open(index_path, "r") as f:
index_data = json.load(f)
vectors = np.array(data_np, dtype=object)
data_str = pd.read_csv(f"data_file_res/{title_dan}.csv", sep="\t", encoding="utf-8").values.tolist()
# data_str = pd.read_csv(file_name_res_save, sep="\t", encoding="utf-8").values.tolist()
index.add(vectors)
D, I = index.search(embs, int(top))
print(I)
reference_list = []
for i,j in zip(I[0], D[0]):
if data_str[i][3] == True:
reference_list.append([data_str[i], j])
reference_list.append([df_ce.loc[df_ce.index[i], "正文"], j])
for i,j in enumerate(reference_list):
paper_list_str += "{}\n{},此篇文章跟问题的相关度为{}%\n".format(str(i+1), j[0][1], j[1])
paper_list_str += "{}\n{},此篇文章跟问题的相关度为{}%\n".format(str(i+1), j[0], j[1])
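Search now rebuilds the FAISS index on the fly from the 向量 column instead of loading a .npy file: valid rows are decoded from JSON, stacked, and queried with an inner-product index, and hits are mapped back to the 正文 column. A minimal sketch of that path; note that FAISS expects a contiguous float32 matrix (the diff builds the array with dtype=object), so the sketch casts explicitly:

import json
import faiss
import numpy as np
import pandas as pd

def search_csv(title, query_emb, top):
    df = pd.read_csv(f"data_file_res/{title}.csv", sep="\t", encoding="utf-8")
    df_ce = df[df["有效"] == True]  # skip soft-deleted rows
    vectors = np.array([json.loads(v) for v in df_ce["向量"]], dtype="float32")
    index = faiss.IndexFlatIP(vectors.shape[1])  # inner-product index
    index.add(vectors)
    D, I = index.search(np.asarray(query_emb, dtype="float32"), int(top))
    # FAISS positions index into the filtered frame, not the raw CSV
    return [(df_ce.iloc[i]["正文"], float(d)) for i, d in zip(I[0], D[0])]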
'''
@@ -318,8 +317,48 @@ def classify(): # call the model and set the maximum batch_size
new_id = data_dict["id"]
sentence = data_dict["sentence"]
title = data_dict["title"]
df = ulit_request_file(new_id, sentence, title)
Building_vector_database(title, df)
ulit_request_file(new_id, sentence, title)
def add_dan_data(new_id, sentence, title):
file_name_res_save = f"data_file_res/{title}.csv"
# initialize or read the CSV file
df = pd.read_csv(file_name_res_save, sep="\t")
if sentence in df["正文"].values:
print("正文已存在,跳过处理")
return False
else:
ulit_request_file(new_id, sentence, title)
return True
def updata_dan_data(new_id, sentence, title):
file_name_res_save = f"data_file_res/{title}.csv"
df = pd.read_csv(file_name_res_save, sep="\t")
# filter the records that need processing
propmt_connect = {
"model": "gpt-4-turbo",
"messages": [{
"role": "user",
"content": f"{sentence}\n以上这条中可能包含了一些病情或者症状,请帮我归纳这条中所对应的病情或者症状是哪些,总结出来,不需要很长,简单归纳即可,直接输出症状或者病情,可以包含一些形容词来辅助描述,不需要有辅助词汇"
}],
"top_p": 0.9,
"temperature": 0.6
}
result = dialog_line_parse(propmt_connect)
print(result)
summary = result['choices'][0]['message']['content']
# update the summary (总结) and body-text (正文) fields
df.loc[df["ID"] == new_id, "总结"] = summary
df.loc[df["ID"] == new_id, "正文"] = sentence
a = shengcehng_array([summary])
df.loc[df["ID"] == new_id, "向量"] = json.dumps(a[0].tolist())
df.to_csv(file_name_res_save, sep="\t", index=False)
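updata_dan_data re-runs the summarisation prompt through dialog_line_parse, overwrites the 总结 and 正文 fields for the matching ID, and re-embeds the new summary. The payload it builds is an OpenAI chat-completions body; for reference, roughly the same call issued through the client configured at the top of the file would look like the sketch below (whether dialog_line_parse proxies that same endpoint is an assumption, and the prompt is abbreviated):

def summarize(sentence):
    # equivalent of the dialog_line_parse payload, sent through the local client
    resp = client.chat.completions.create(
        model=model,
        messages=[{"role": "user", "content": f"{sentence}\n以上这条中可能包含了一些病情或者症状,请帮我归纳……"}],
        top_p=0.9,
        temperature=0.6,
    )
    return resp.choices[0].message.content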
def model_generate_stream(prompt):
@@ -357,13 +396,13 @@ def model_generate_stream(prompt):
yield content
@app.route("/upload_file_check", methods=["POST"])
def upload_file_check():
@app.route("/upload_file", methods=["POST"])
def upload_file():
print(request.remote_addr)
sentence = request.json['sentence']
title = request.json["title"]
new_id = request.json["id"]
state = request.json["state"]
state = request.json["state"] # 1: 批量新增 2:单条新增 3:单挑修改 4: 单条删除
'''
{
"1": "csv",
@@ -374,18 +413,28 @@ def upload_file_check():
'''
state_res = ""
if state == "1":
# df = ulit_request_file(new_id, sentence, title)
# Building_vector_database(title, df)
redis_.rpush(db_key_query, json.dumps({
"id": new_id,
"sentence": sentence,
"state": state,
"title": title
})) # push onto the Redis queue
state_res = "上传完成"
state_res = "上传完成,正在排队处理数据"
elif state == "2":
info_bool = add_dan_data(new_id, sentence, title)
if info_bool == True:
state_res = "上传完成"
else:
state_res = "上传失败,库中有重复数据"
elif state == "3":
updata_dan_data(new_id, sentence, title)
state_res = "修改完成"
elif state == "4":
delete_data(title, new_id)
state_res = "删除完成"
return_json = {
"code": 200,
"info": state_res
@@ -393,6 +442,30 @@ def upload_file_check():
return jsonify(return_json) # 返回结果
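The renamed /upload_file route multiplexes all four write operations on the state field: "1" enqueues a batch add through Redis, "2" adds a single record, "3" updates one, "4" soft-deletes one. A small client-side call, sketched with an assumed host/port and placeholder id, title, and sentence:

import requests

payload = {"id": "42", "sentence": "患者症状描述", "title": "demo", "state": "2"}
resp = requests.post("http://127.0.0.1:27000/upload_file", json=payload)  # host/port assumed
print(resp.json())  # e.g. {"code": 200, "info": "上传完成"} or the duplicate warning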
@app.route("/upload_file_check", methods=["POST"])
def upload_file_check():
print(request.remote_addr)
new_id = request.json["id"]
data_list = redis_.lrange(db_key_query, 0, -1) # 0 is the start, -1 the end (the whole list)
# parse the JSON data
data_list_id_ = []
for item in data_list:
data = json.loads(item.decode("utf-8")) # Redis returns bytes, so decode + json.loads
data_list_id_.append(data["id"])
if new_id in data_list_id_:
return_json = {
"code": 200,
"info": "上传中"
}
return jsonify(return_json)
else:
return_json = {
"code": 200,
"info": "已入库"
}
return jsonify(return_json)
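The new /upload_file_check route only reports whether a given id is still waiting in the Redis queue ("上传中") or has already been processed ("已入库"), so batch uploads can be polled until they land. A matching poll loop, again with an assumed URL:

import time
import requests

while True:
    info = requests.post("http://127.0.0.1:27000/upload_file_check", json={"id": "42"}).json()["info"]
    if info == "已入库":
        break
    time.sleep(2)  # still queued; try again shortly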
@app.route("/search", methods=["POST"])
def search():
print(request.remote_addr)
