
Fix bug

div_单条上传
majiahui@haimaqingfan.com 1 month ago
parent commit: f84c17d1c4

1 changed file: main.py (337 changed lines)

main.py

@@ -2,21 +2,37 @@
 # 按 Shift+F10 执行或将其替换为您的代码。
 # 按 双击 Shift 在所有地方搜索类、文件、工具窗口、操作和设置。
+import os
 import faiss
 import numpy as np
 from tqdm import tqdm
 from sentence_transformers import SentenceTransformer
 import requests
 import time
-from flask import Flask, jsonify
-from flask import request
+from flask import Flask, jsonify, Response, request
+from openai import OpenAI
+from flask_cors import CORS
 import pandas as pd
+import concurrent.futures
+import json
 
 app = Flask(__name__)
+CORS(app)
 app.config["JSON_AS_ASCII"] = False
 
-model = SentenceTransformer('/home/majiahui/project/models-llm/bge-large-zh-v1.5')
+openai_api_key = "token-abc123"
+openai_api_base = "http://127.0.0.1:12011/v1"
+client = OpenAI(
+    api_key=openai_api_key,
+    base_url=openai_api_base,
+)
+
+models = client.models.list()
+model = models.data[0].id
+# model = "1"
+model_encode = SentenceTransformer('/home/majiahui/project/models-llm/bge-large-zh-v1.5')
 
 propmt_connect = '''我是一名中医,你是一个中医的医生的助理,我的患者有一个症状,症状如下:
 {}
 根据这些症状我通过查找资料{}
@@ -26,7 +42,7 @@ propmt_connect_ziliao = '''在“{}”资料中,有如下相关内容:
 {}'''
 
 
-def dialog_line_parse(url, text):
+def dialog_line_parse(text):
     """
     将数据输入模型进行分析并输出结果
     :param url: 模型url
@@ -34,8 +50,9 @@ def dialog_line_parse(url, text):
     :return: 模型返回结果
     """
 
+    url_predict = "http://118.178.228.101:12004/predict"
     response = requests.post(
-        url,
+        url_predict,
         json=text,
         timeout=100000
     )
@@ -49,46 +66,167 @@ def dialog_line_parse(url, text):
         # )
         print("{}】 Failed to get a proper response from remote "
               "server. Status Code: {}. Response: {}"
-              "".format(url, response.status_code, response.text))
+              "".format(url_predict, response.status_code, response.text))
         return {}
 
+    # ['choices'][0]['message']['content']
+    #
+    # text = text['messages'][0]['content']
+    # return_text = {
+    #     'code': 200,
+    #     'id': "1",
+    #     'object': 0,
+    #     'created': 0,
+    #     'model': 0,
+    #     'choices': [
+    #         {
+    #             'index': 0,
+    #             'message': {
+    #                 'role': 'assistant',
+    #                 'content': text
+    #             },
+    #             'logprobs': None,
+    #             'finish_reason': 'stop'
+    #         }
+    #     ],
+    #     'usage': 0,
+    #     'system_fingerprint': 0
+    # }
+    # return return_text
 
 
 def shengcehng_array(data):
-    embs = model.encode(data, normalize_embeddings=True)
+    embs = model_encode.encode(data, normalize_embeddings=True)
     return embs
 
 
-def Building_vector_database(type, name, df):
-    data_ndarray = np.empty((0, 1024))
-    for sen in df:
-        data_ndarray = np.concatenate((data_ndarray, shengcehng_array([sen[0]])))
-        print("data_ndarray.shape", data_ndarray.shape)
-    print("data_ndarray.shape", data_ndarray.shape)
-    np.save(f'data_np/{name}.npy', data_ndarray)
+def Building_vector_database(title, df):
+    # 加载需要处理的数据(有效且未向量化)
+    to_process = df[(df["有效"] == True) & (df["已向量化"] == False)]
+    if len(to_process) == 0:
+        print("无新增数据需要向量化")
+        return
+
+    # 生成向量数组
+    new_vectors = shengcehng_array(to_process["总结"].tolist())  # 假设这是你的向量生成函数
+
+    # 加载现有向量库和索引
+    vector_path = f"data_np/{title}.npy"
+    index_path = f"data_np/{title}_index.json"
+    vectors = np.load(vector_path) if os.path.exists(vector_path) else np.empty((0, 1024))
+    index_data = {}
+    if os.path.exists(index_path):
+        with open(index_path, "r") as f:
+            index_data = json.load(f)
+
+    # 更新索引和向量库
+    start_idx = len(vectors)
+    vectors = np.vstack([vectors, new_vectors])
+    for i, (_, row) in enumerate(to_process.iterrows()):
+        index_data[row["ID"]] = {
+            "row": start_idx + i,
+            "valid": True
+        }
+
+    # 保存数据
+    np.save(vector_path, vectors)
+    with open(index_path, "w") as f:
+        json.dump(index_data, f)
+
+    # 标记已向量化
+    df.loc[to_process.index, "已向量化"] = True
+    df.to_csv(f"data_file_res/{title}.csv", sep="\t", index=False)
+
+
+def delete_data(title, data_id):
+    # 更新CSV标记
+    csv_path = f"data_file_res/{title}.csv"
+    df = pd.read_csv(csv_path, sep="\t", dtype={"ID": str})
+    df.loc[df["ID"] == data_id, "有效"] = False
+    df.to_csv(csv_path, sep="\t", index=False)
+
+    # 更新索引标记
+    index_path = f"data_np/{title}_index.json"
+    if os.path.exists(index_path):
+        with open(index_path, "r+") as f:
+            index_data = json.load(f)
+            if data_id in index_data:
+                index_data[data_id]["valid"] = False
+                f.seek(0)
+                json.dump(index_data, f)
+                f.truncate()
 
 
-def ulit_request_file(file, title):
-    file_name = file.filename
-    file_name_save = "data_file/{}.csv".format(title)
-    file.save(file_name_save)
-
-    # try:
-    #     with open(file_name_save, encoding="gbk") as f:
-    #         content = f.read()
-    # except:
-    #     with open(file_name_save, encoding="utf-8") as f:
-    #         content = f.read()
-
-    # elif file_name.split(".")[-1] == "docx":
-    #     content = docx2txt.process(file_name_save)
-
-    # content_list = [i for i in content.split("\n")]
-    df = pd.read_csv(file_name_save, sep="\t", encoding="utf-8").values.tolist()
-
-    return df
+def check_file_exists(file_path):
+    """
+    检查文件是否存在
+
+    参数:
+    file_path (str): 要检查的文件路径
+
+    返回:
+    bool: 文件存在返回True否则返回False
+    """
+    return os.path.isfile(file_path)
+
+
+def ulit_request_file(new_id, sentence, title):
+    file_name_res_save = f"data_file_res/{title}.csv"
+
+    # 初始化或读取CSV文件
+    if os.path.exists(file_name_res_save):
+        df = pd.read_csv(file_name_res_save, sep="\t")
+        # 检查是否已存在相同正文
+        if sentence in df["正文"].values:
+            print("正文已存在,跳过处理")
+            return df
+    else:
+        df = pd.DataFrame(columns=["ID", "正文", "总结", "有效", "已向量化"])
+
+    # 添加新数据(生成唯一ID)
+    new_row = {
+        "ID": str(new_id),
+        "正文": sentence,
+        "总结": None,
+        "有效": True,
+        "已向量化": False
+    }
+    df = pd.concat([df, pd.DataFrame([new_row])], ignore_index=True)
+
+    # 筛选需要处理的记录
+    to_process = df[(df["总结"].isna()) & (df["有效"] == True)]
+
+    # 调用API生成总结(示例保留原有逻辑)
+    data_list = []
+    for _, row in to_process.iterrows():
+        data_list.append({
+            "model": "gpt-4-turbo",
+            "messages": [{
+                "role": "user",
+                "content": f"{row['正文']}\n以上这条中可能包含了一些病情或者症状,请帮我归纳这条中所对应的病情或者症状是哪些,总结出来,不需要很长,简单归纳即可,直接输出症状或者病情,可以包含一些形容词来辅助描述,不需要有辅助词汇"
+            }],
+            "top_p": 0.9,
+            "temperature": 0.6
+        })
+
+    # 并发处理请求
+    with concurrent.futures.ThreadPoolExecutor(200) as executor:
+        results = list(executor.map(dialog_line_parse, data_list))
+
+    # 更新总结字段
+    for idx, result in zip(to_process.index, results):
+        summary = result['choices'][0]['message']['content']
+        df.at[idx, "总结"] = summary
+
+    # 保存更新后的CSV
+    df.to_csv(file_name_res_save, sep="\t", index=False)
+    return df
 
 
-def main(question, db_type, top):
+def main(question, title, top):
     db_dict = {
         "1": "yetianshi"
     }
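Taken together, the new Building_vector_database, delete_data and ulit_request_file imply a simple per-title on-disk layout. The sketch below is illustrative only and not part of this commit; the title value and the ID are made up, but the file names, CSV columns and JSON index shape come from the functions above.

# data_file_res/<title>.csv   tab-separated, columns: ID, 正文, 总结, 有效, 已向量化
# data_np/<title>.npy         one 1024-dim BGE vector per summarised row
# data_np/<title>_index.json  maps ID -> {"row": <row in the .npy file>, "valid": true/false}

import json
import numpy as np

title = "demo"                                # hypothetical knowledge-base name
vectors = np.load(f"data_np/{title}.npy")     # shape: (n_entries, 1024)
with open(f"data_np/{title}_index.json") as f:
    index_data = json.load(f)

# Look up the vector that belongs to a given CSV row ID, skipping deleted entries.
entry = index_data.get("42", {})
if entry.get("valid"):
    vec = vectors[entry["row"]]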
@@ -114,30 +252,43 @@ def main(question, db_type, top):
     根据提问匹配上下文
     '''
     d = 1024
-    db_type_list = db_type.split(",")
+    db_type_list = title.split(",")
 
     paper_list_str = ""
-    for i in db_type_list:
+    for title_dan in db_type_list:
         embs = shengcehng_array([question])
         index = faiss.IndexFlatIP(d)   # buid the index
-        data_np = np.load(f"data_np/{i}.npy")
-        # data_str = open(f"data_file/{i}.txt").read().split("\n")
-        data_str = pd.read_csv(f"data_file/{i}.csv", sep="\t", encoding="utf-8").values.tolist()
-        index.add(data_np)
+
+        # 查找向量
+        vector_path = f"data_np/{title_dan}.npy"
+        index_path = f"data_np/{title_dan}_index.json"
+        if not os.path.exists(vector_path) or not os.path.exists(index_path):
+            return np.empty((0, 1024))
+
+        vectors = np.load(vector_path)
+        with open(index_path, "r") as f:
+            index_data = json.load(f)
+
+        data_str = pd.read_csv(f"data_file_res/{title_dan}.csv", sep="\t", encoding="utf-8").values.tolist()
+
+        index.add(vectors)
         D, I = index.search(embs, int(top))
         print(I)
 
         reference_list = []
         for i,j in zip(I[0], D[0]):
-            reference_list.append([data_str[i], j])
+            if data_str[i][3] == True:
+                reference_list.append([data_str[i], j])
 
         for i,j in enumerate(reference_list):
-            paper_list_str += "{}\n{},此篇文章的转发数为{},评论数为{},点赞数为{}\n,此篇文章跟问题的相关度为{}%\n".format(str(i+1), j[0][0], j[0][1], j[0][2], j[0][3], j[1])
+            paper_list_str += "{}\n{},此篇文章跟问题的相关度为{}%\n".format(str(i+1), j[0][1], j[1])
 
     '''
     构造prompt
     '''
     print("paper_list_str", paper_list_str)
+    9/0
     propmt_connect_ziliao_input = []
     for i in db_type_list:
         propmt_connect_ziliao_input.append(propmt_connect_ziliao.format(i, paper_list_str))
@@ -147,61 +298,70 @@ def main(question, db_type, top):
     '''
     生成回答
     '''
-    url_predict = "http://192.168.31.74:26000/predict"
-    url_search = "http://192.168.31.74:26000/search"
-
-    # data = {
-    #     "content": propmt_connect_input
-    # }
-    data = {
-        "content": propmt_connect_input,
-        "model": "qwq-32",
-        "top_p": 0.9,
-        "temperature": 0.6
-    }
-
-    res = dialog_line_parse(url_predict, data)
-    id_ = res["texts"]["id"]
-
-    data = {
-        "id": id_
-    }
-
-    while True:
-        res = dialog_line_parse(url_search, data)
-        if res["code"] == 200:
-            break
-        else:
-            time.sleep(1)
-    spilt_str = "</think>"
-    think, response = str(res["text"]).split(spilt_str)
-    return think, response
-
-
-@app.route("/upload_file", methods=["POST"])
-def upload_file():
-    print(request.remote_addr)
-    file = request.files.get('file')
-    title = request.form.get("title")
-    df = ulit_request_file(file, title)
-    Building_vector_database("1", title, df)
-    return_json = {
-        "code": 200,
-        "info": "上传完成"
-    }
-    return jsonify(return_json)  # 返回结果
+    return model_generate_stream(propmt_connect_input)
+
+
+def model_generate_stream(prompt):
+    messages = [
+        {"role": "user", "content": prompt}
+    ]
+
+    stream = client.chat.completions.create(model=model,
+                                            messages=messages,
+                                            stream=True)
+    printed_reasoning_content = False
+    printed_content = False
+
+    for chunk in stream:
+        reasoning_content = None
+        content = None
+        # Check the content is reasoning_content or content
+        if hasattr(chunk.choices[0].delta, "reasoning_content"):
+            reasoning_content = chunk.choices[0].delta.reasoning_content
+        elif hasattr(chunk.choices[0].delta, "content"):
+            content = chunk.choices[0].delta.content
+
+        if reasoning_content is not None:
+            if not printed_reasoning_content:
+                printed_reasoning_content = True
+                print("reasoning_content:", end="", flush=True)
+            print(reasoning_content, end="", flush=True)
+        elif content is not None:
+            if not printed_content:
+                printed_content = True
+                print("\ncontent:", end="", flush=True)
+            # Extract and print the content
+            # print(content, end="", flush=True)
+            print(content)
+            yield content
 
 
 @app.route("/upload_file_check", methods=["POST"])
 def upload_file_check():
     print(request.remote_addr)
-    file = request.files.get('file')
+    sentence = request.form.get('sentence')
     title = request.form.get("title")
-    df = ulit_request_file(file, title)
-    Building_vector_database("1", title, df)
+    new_id = request.form.get("id")
+    state = request.form.get("state")
+    '''
+    {
+    "1": "csv",
+    "2": "xlsx",
+    "3": "txt",
+    "4": "pdf"
+    }
+    '''
+    state_res = ""
+    if state == "1":
+        df = ulit_request_file(new_id, sentence, title)
+        Building_vector_database(title, df)
+        state_res = "上传完成"
+    elif state == "2":
+        delete_data(title, new_id)
+        state_res = "删除完成"
+
     return_json = {
         "code": 200,
-        "info": "上传完成"
+        "info": state_res
     }
     return jsonify(return_json)  # 返回结果
@@ -210,15 +370,10 @@ def upload_file_check():
 def search():
     print(request.remote_addr)
     texts = request.json["texts"]
-    text_type = request.json["text_type"]
+    title = request.json["title"]
     top = request.json["top"]
-    think, response = main(texts, text_type, top)
-    return_json = {
-        "code": 200,
-        "think": think,
-        "response": response
-    }
-    return jsonify(return_json)  # 返回结果
+    response = main(texts, title, top)
+    return Response(response, mimetype='text/plain; charset=utf-8')  # 返回结果
 
 
 if __name__ == "__main__":
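For reference, a minimal client for the two handlers as they look after this commit might be written as below. The host and port are assumptions (the run configuration is not part of this diff); only the field names, the state values and the plain-text streaming of /search come from the code above.

import requests

BASE = "http://127.0.0.1:27000"  # hypothetical host/port, not specified in this diff

# state "1" inserts a single sentence and re-vectorises it, state "2" marks an ID as deleted.
requests.post(f"{BASE}/upload_file_check",
              data={"sentence": "患者咳嗽三天,夜间加重", "id": "42", "title": "demo", "state": "1"})

# /search now streams plain text instead of returning a JSON body with think/response fields.
with requests.post(f"{BASE}/search",
                   json={"texts": "咳嗽", "title": "demo", "top": 5},
                   stream=True) as r:
    for chunk in r.iter_content(chunk_size=None, decode_unicode=True):
        print(chunk, end="", flush=True)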
